In [1]:
import pandas as pd
import numpy as np
from subprocess import check_output
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
X = pd.read_csv('Clustered Data');

In [3]:
y= X[['Power Outage']];
X = X.drop(['Power Outage'], axis=1);

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

class ReduceVIF(BaseEstimator, TransformerMixin):
    def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = Imputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        print('ReduceVIF transform')
        columns = X.columns.tolist()
        if hasattr(self, 'imputer'):
            X = pd.DataFrame(self.imputer.transform(X), columns=columns)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped=True
        while dropped:
            variables = X.columns
            dropped = False
            vif = [variance_inflation_factor(X[variables].values, X.columns.get_loc(var)) for var in X.columns]
            
            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped=True
        return X

In [5]:
transformer = ReduceVIF()
X= transformer.fit_transform(X,y);
X.head()

ReduceVIF fit
ReduceVIF transform
Dropping TMIN with vif=89.8320184398733
Dropping Fastest 2 minute wind speed with vif=67.89729815482103
Dropping Average Wind Speed with vif=36.037016291349325
Dropping Fastest 5 second wind speed with vif=34.97478399117493


Unnamed: 0,PRCP,TMAX,5 second wind speed squared,2 min wind speed squared,Avg Wind Speed Squared,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze
0,0.037842,0.591765,0.013388,0.037429,0.135637,0.3,0.06,0.07,0.09
1,0.184211,0.392157,0.757609,0.371606,0.26347,1.0,4.1633360000000003e-17,1.0,2.775558e-17
2,0.0,0.960784,0.208811,0.078352,0.139,0.5,4.1633360000000003e-17,2.0816680000000002e-17,0.5
3,0.005263,0.843137,0.446936,0.055064,0.012114,1.0,4.1633360000000003e-17,2.0816680000000002e-17,2.775558e-17
4,0.083333,0.576797,0.0466,0.108467,0.232044,0.666667,0.1666667,0.1666667,2.775558e-17


In [6]:
finalData = pd.concat([X,y], axis=1);
finalData.to_csv('Final Data', index=False);

In [7]:
X_new = SelectKBest(chi2, k=5).fit_transform(X.values,y.values)
X = pd.DataFrame(X_new);

In [8]:
X.head()

Unnamed: 0,0,1,2,3,4
0,0.013388,0.037429,0.3,0.06,0.07
1,0.757609,0.371606,1.0,4.1633360000000003e-17,1.0
2,0.208811,0.078352,0.5,4.1633360000000003e-17,2.0816680000000002e-17
3,0.446936,0.055064,1.0,4.1633360000000003e-17,2.0816680000000002e-17
4,0.0466,0.108467,0.666667,0.1666667,0.1666667


In [9]:
finalData.head()

Unnamed: 0,PRCP,TMAX,5 second wind speed squared,2 min wind speed squared,Avg Wind Speed Squared,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze,Power Outage
0,0.037842,0.591765,0.013388,0.037429,0.135637,0.3,0.06,0.07,0.09,0.0
1,0.184211,0.392157,0.757609,0.371606,0.26347,1.0,4.1633360000000003e-17,1.0,2.775558e-17,0.0
2,0.0,0.960784,0.208811,0.078352,0.139,0.5,4.1633360000000003e-17,2.0816680000000002e-17,0.5,0.0
3,0.005263,0.843137,0.446936,0.055064,0.012114,1.0,4.1633360000000003e-17,2.0816680000000002e-17,2.775558e-17,0.0
4,0.083333,0.576797,0.0466,0.108467,0.232044,0.666667,0.1666667,0.1666667,2.775558e-17,0.0
