In [173]:
import pandas as pd
import numpy as np
from subprocess import check_output
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [174]:
# Load the shuffled weather / power-outage observations and preview the first rows.
X = pd.read_csv('Shuffled Data')
X.head()

Unnamed: 0,DATE,Average Wind Speed,PRCP,TMAX,TMIN,Fastest 2 minute wind speed,Fastest 5 second wind speed,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze,Power Outage
0,2011-02-02,11.41,0.14,25.0,4.0,25.9,35.1,1.0,0.0,0.0,1.0,0.0
1,2013-12-28,8.5,0.0,47.0,26.0,19.9,25.1,0.0,0.0,0.0,0.0,1.0
2,2017-06-07,9.17,0.0,74.0,45.0,23.0,30.0,0.0,0.0,0.0,0.0,0.0
3,2011-07-19,2.91,0.0,94.0,70.0,12.1,18.1,1.0,1.0,0.0,1.0,1.0
4,2017-04-06,16.33,0.44,42.0,34.0,32.0,42.9,0.0,0.0,0.0,0.0,0.0


In [175]:
# Split the target from the predictors. The target is kept as a one-column
# DataFrame; DATE is removed from the predictors together with the target.
y = X.loc[:, ['Power Outage']]
X = X.drop(['Power Outage', 'DATE'], axis='columns')
y.head()

Unnamed: 0,Power Outage
0,0.0
1,1.0
2,0.0
3,1.0
4,0.0


In [176]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Drop multicollinear DataFrame columns using the variance inflation factor (VIF).

    ``fit`` optionally imputes missing values, then repeatedly removes the
    column with the largest VIF until no remaining column exceeds ``thresh``.
    The names of the surviving columns are stored in ``columns_`` and
    ``transform`` selects exactly those columns, so every frame passed through
    a fitted instance comes out with the same feature set.

    Parameters
    ----------
    thresh : float, default 5.0
        A column is dropped while the largest VIF is strictly greater than this.
    impute : bool, default True
        Whether NaN values are filled before VIFs are computed / columns selected.
    impute_strategy : str, default 'median'
        Passed to ``Imputer(strategy=...)``.

    Attributes
    ----------
    imputer_ : Imputer or None
        Imputer fitted in ``fit``; ``None`` when ``impute`` is false.
    columns_ : list
        Column names retained by the VIF elimination performed in ``fit``.
    """

    def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh

        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        #
        # Every constructor argument is stored verbatim under its own name so that
        # BaseEstimator.get_params() / set_params() and sklearn.base.clone() work;
        # the imputer itself is only built in fit().
        self.impute = impute
        self.impute_strategy = impute_strategy

    def fit(self, X, y=None):
        """Fit the imputer (if enabled) and learn which columns survive VIF elimination.

        ``X`` must be a DataFrame; ``y`` is accepted for API compatibility and ignored.
        Returns ``self``.
        """
        print('ReduceVIF fit')
        # NOTE(review): `Imputer` is the name imported at file level from
        # sklearn.preprocessing; newer scikit-learn releases provide
        # sklearn.impute.SimpleImputer instead -- confirm the installed version.
        # Reset to None on every fit so a refit with impute=False leaves no stale imputer.
        self.imputer_ = Imputer(strategy=self.impute_strategy).fit(X) if self.impute else None
        reduced = ReduceVIF.calculate_vif(self._impute(X), self.thresh)
        self.columns_ = reduced.columns.tolist()
        return self

    def transform(self, X, y=None):
        """Impute ``X`` (if enabled) and return only the columns selected during ``fit``.

        Raises ``AttributeError`` if called before ``fit``.
        """
        print('ReduceVIF transform')
        if not hasattr(self, 'columns_'):
            raise AttributeError('This ReduceVIF instance is not fitted yet; '
                                 'call fit() before transform().')
        return self._impute(X)[self.columns_]

    def _impute(self, X):
        """Return ``X`` with NaNs filled by the fitted imputer, or ``X`` itself when imputation is off."""
        if self.imputer_ is None:
            return X
        # Keep the caller's index: the imputer returns a bare array, and a fresh
        # RangeIndex would misalign a later index-based pd.concat / join.
        return pd.DataFrame(self.imputer_.transform(X),
                            columns=X.columns.tolist(),
                            index=X.index)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        """Iteratively drop the column with the highest VIF until none exceeds ``thresh``.

        Prints one line per dropped column and returns the reduced DataFrame.
        Frames with fewer than two columns are returned unchanged, since there
        is nothing to regress a lone column against.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        # NOTE(review): no intercept column is added before the VIFs are computed;
        # confirm against the statsmodels documentation whether one is expected.
        while X.shape[1] > 1:
            values = X.values  # hoisted: one array per pass instead of one per column
            # Positional indices avoid the label lookup, which does not yield a
            # single integer when column names are duplicated.
            vif = [variance_inflation_factor(values, idx) for idx in range(values.shape[1])]

            max_vif = max(vif)
            if not max_vif > thresh:
                break
            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
            X = X.drop([X.columns[maxloc]], axis=1)
        return X

In [177]:
# Fit the VIF filter on the predictors and keep only the columns it retains.
# y is passed along for API symmetry; ReduceVIF.fit does not use it.
transformer = ReduceVIF()
X = transformer.fit_transform(X, y)
X.head()


ReduceVIF fit
ReduceVIF transform
Dropping TMAX with vif=42.751292463214675
Dropping Fastest 2 minute wind speed with vif=27.281675515123197


Unnamed: 0,Average Wind Speed,PRCP,TMIN,Fastest 5 second wind speed,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze
0,11.41,0.14,4.0,35.1,1.0,0.0,0.0,1.0
1,8.5,0.0,26.0,25.1,0.0,0.0,0.0,0.0
2,9.17,0.0,45.0,30.0,0.0,0.0,0.0,0.0
3,2.91,0.0,70.0,18.1,1.0,1.0,0.0,1.0
4,16.33,0.44,34.0,42.9,0.0,0.0,0.0,0.0


In [178]:
# Re-attach the target column to the reduced predictors (aligned on the row index)
# and write the result to disk without the index column.
finalData = pd.concat([X, y], axis='columns')
finalData.to_csv('Final Data', index=False)

In [179]:
# Summary statistics for every column of the frame that was just exported.
finalData.describe()

Unnamed: 0,Average Wind Speed,PRCP,TMIN,Fastest 5 second wind speed,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze,Power Outage
count,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0,318.0
mean,7.932956,0.095786,39.827044,30.706289,0.374214,0.103774,0.106918,0.179245,0.5
std,4.119541,0.234191,19.021115,17.701957,0.484682,0.305447,0.309496,0.384162,0.500788
min,0.0,0.0,-8.0,11.0,0.0,0.0,0.0,0.0,0.0
25%,4.92,0.0,26.0,21.225,0.0,0.0,0.0,0.0,0.0
50%,7.16,0.0,42.0,25.9,0.0,0.0,0.0,0.0,0.5
75%,10.07,0.05,56.0,34.825,1.0,0.0,0.0,0.0,1.0
max,22.15,1.76,73.0,181.0,1.0,1.0,1.0,1.0,1.0


In [180]:
#https://machinelearningmastery.com/feature-selection-machine-learning-python/
#Tree forest
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees are randomized; a fixed seed makes the importances reproducible
# across Restart & Run All.
model = ExtraTreesClassifier(random_state=0)
# y is a one-column DataFrame, while the classifier expects a 1-D target,
# so flatten it explicitly instead of relying on an implicit conversion.
target = y.values.ravel()
model.fit(X, target)
print(model.feature_importances_)
# NOTE: this is accuracy on the very rows the model was fitted on, not an
# estimate of how it generalizes.
print(model.score(X, target))

[ 0.27280403  0.10925922  0.28066677  0.23007349  0.02782142  0.01604124
  0.04353876  0.01979507]
1.0


  


In [181]:
#RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination: keep the 3 predictors ranked best by a
# logistic-regression model.
model = LogisticRegression()
# n_features_to_select is passed by keyword: it is accepted by name in every
# scikit-learn release, whereas the positional form is rejected by newer ones.
rfe = RFE(model, n_features_to_select=3)
# y is a one-column DataFrame; flatten it so fit() receives the 1-D target it
# expects (the column-vector form triggers the column_or_1d warning).
rfe = rfe.fit(X, y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

[False  True False False  True False  True False]
[4 1 5 6 1 3 1 2]


  y = column_or_1d(y, warn=True)
