In [1]:
import pandas as pd
import pandas_ml as pdml
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

from subprocess import check_output
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#Load data
df = pd.read_csv('Initial Data.csv');
#Replace NaN with 0
df = df.fillna(0);
df['DATE'] = pd.to_datetime(df['DATE']);
df.head()

Unnamed: 0,DATE,Average Wind Speed,PRCP,TMAX,TMIN,Fastest 2 minute wind speed,Fastest 5 second wind speed,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze,Power Outage
0,2017-12-31,4.25,0.0,15.0,-11.0,10.1,14.1,0.0,0.0,0.0,0.0,0.0
1,2017-12-30,8.72,0.0,15.0,-2.0,19.9,25.9,0.0,0.0,0.0,0.0,0.0
2,2017-12-29,7.16,0.0,17.0,8.0,16.1,19.9,0.0,0.0,0.0,0.0,0.0
3,2017-12-28,3.13,0.02,11.0,-20.0,8.1,11.0,1.0,0.0,0.0,0.0,0.0
4,2017-12-27,4.92,0.0,11.0,-17.0,17.0,21.0,0.0,0.0,0.0,0.0,0.0


In [3]:
#Add column for Delta T Max
temp = pd.Series(df.iloc[:,3]);
deltaTemp = pd.Series(df.iloc[:,3]);
deltaTemp[0] = 0;
for i in range (1, temp.shape[0]):
    deltaTemp[i] = temp[i] - temp[i-1];
df['Delta T Max'] = pd.Series(np.square(deltaTemp));

In [4]:
#Add column for Delta T min
temp = pd.Series(df.iloc[:,4]);
deltaTemp = pd.Series(df.iloc[:,4]);
deltaTemp[0] = 0;
for i in range (1, temp.shape[0]):
    deltaTemp[i] = temp[i] - temp[i-1];
df['Delta T Min'] = pd.Series(np.square(deltaTemp));

In [5]:
#Add columns for wind squared
df['Avg Wind Speed Squared'] = pd.Series(np.square(df['Average Wind Speed']), index=df.index)
df['5 second wind speed squared'] = pd.Series(np.square(df['Fastest 5 second wind speed']));
df['2 min wind speed squared'] = pd.Series(np.square(df['Fastest 2 minute wind speed']));
df = df[['Power Outage', 'PRCP','Delta T Min', 'Delta T Max', 'TMAX','TMIN','Fastest 5 second wind speed','5 second wind speed squared', 'Fastest 2 minute wind speed' ,'2 min wind speed squared','Average Wind Speed','Avg Wind Speed Squared','Fog/Ice', 'Heavy/Freezing Fog', 'Thunder','Smoke/Haze']];


In [6]:
#Normalize data
scaledData = preprocessing.MinMaxScaler().fit_transform(df.values);
X = pd.DataFrame(scaledData, index = df.index, columns = df.columns);

In [7]:
# finalData.to_csv('Clustered Data', index = False);
# X = pd.read_csv('Clustered Data');
# X.head()


In [8]:
y= X[['Power Outage']];
X = X.drop(['Power Outage'], axis=1);

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

class ReduceVIF(BaseEstimator, TransformerMixin):
    def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = Imputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        print('ReduceVIF transform')
        columns = X.columns.tolist()
        if hasattr(self, 'imputer'):
            X = pd.DataFrame(self.imputer.transform(X), columns=columns)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped=True
        while dropped:
            variables = X.columns
            dropped = False
            vif = [variance_inflation_factor(X[variables].values, X.columns.get_loc(var)) for var in X.columns]
            
            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped=True
        return X

In [10]:
transformer = ReduceVIF()
X= transformer.fit_transform(X,y);
X.head()

ReduceVIF fit
ReduceVIF transform
Dropping Fastest 2 minute wind speed with vif=116.40926312615994
Dropping Average Wind Speed with vif=41.15475553558485
Dropping Fastest 5 second wind speed with vif=16.96519189561377
Dropping TMIN with vif=11.935424694979043


Unnamed: 0,PRCP,Delta T Min,Delta T Max,TMAX,5 second wind speed squared,2 min wind speed squared,Avg Wind Speed Squared,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze
0,0.0,0.0,0.0,0.471698,0.005809,0.014808,0.036815,0.0,0.0,0.0,0.0
1,0.0,5.4e-05,0.000886,0.487421,0.0196,0.057484,0.154984,0.0,0.0,0.0,0.0
2,0.0,0.001362,1.6e-05,0.473795,0.011571,0.037627,0.104491,0.0,0.0,0.0,0.0
3,0.005,0.012255,0.000319,0.481132,0.003535,0.009524,0.019968,1.0,0.0,0.0,0.0
4,0.0,0.002301,1.6e-05,0.473795,0.012885,0.041951,0.049338,0.0,0.0,0.0,0.0


In [11]:
finalData = pd.concat([X,y], axis=1);
finalData.to_csv('Final Data', index=False);

In [12]:
X_new = SelectKBest(chi2, k=5).fit_transform(X.values,y.values)
X = pd.DataFrame(X_new);
print('Done');

Done
