In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

In [2]:
tornados_outage_2019 = pd.read_csv('../merged/tornados_outage_2023.csv', parse_dates=['DATE'])



In [3]:
tornados_outage_2019.head()

Unnamed: 0.1,Unnamed: 0,DATE,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max,location,county,state,Event Month,power_outage
0,0,2023-01-01,36,49,56,7,6.5,29,6.6,"(32.39305, -110.68147)",Pima County,Arizona,,False
1,1,2023-01-01,52,96,96,2,5.9,86,1.8,"(32.18141, -110.52664)",Cochise County,Arizona,,False
2,2,2023-01-01,31,50,50,4,9.1,17,4.2,"(34.34357, -117.82177)",San Bernardino County,California,,False
3,3,2023-01-01,35,62,62,6,8.3,21,6.4,"(34.29869, -117.62836)",San Bernardino County,California,,False
4,4,2023-01-01,39,52,52,4,5.0,24,2.6,"(35.0269, -118.24596)",Kern County,California,,False


In [17]:
tornados_grouped_by = tornados_outage_2019.groupby(['power_outage'])
tornados_balanced = tornados_grouped_by.apply(lambda x: x.sample(tornados_grouped_by.size().min()).reset_index(drop=True))
tornados_balanced =tornados_balanced tornados_balanced.droplevel(['power_outage'])

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
tornados_train, tornados_test = train_test_split(tornados_balanced.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.1,
                                              stratify=tornados_balanced.power_outage.values)

In [20]:
## import random forest classifier
from sklearn.ensemble import RandomForestClassifier

## import kfold
from sklearn.model_selection import StratifiedKFold

## import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score



In [21]:
## this will isolate the feature columns
features = tornados_train.columns[2: 9]

In [22]:
features

Index(['AVGDV_max', 'LLDV_max', 'MXDV_max', 'MXDV_HEIGHT_max', 'DEPTH_max',
       'MAX_SHEAR_max', 'MAX_SHEAR_HEIGHT_max'],
      dtype='object')

In [23]:
## set the number of CV folds
n_splits = 5

## Make the kfold object
kfold = StratifiedKFold(n_splits, 
                        random_state=216, 
                        shuffle=True)

In [10]:
max_depths = range(1, 11)
n_trees = [100, 500]

rf_accs = np.zeros((n_splits, len(max_depths), len(n_trees)))



for i,(train_index, test_index) in enumerate(kfold.split(tornados_train, tornados_train.power_outage)):
    tornados_tt = tornados_train.iloc[train_index]
    tornados_ho = tornados_train.iloc[test_index]

    for j, max_depth in enumerate(max_depths):
        for k, n_estimators in enumerate(n_trees):
            print(i,j,k)
            rf = RandomForestClassifier(n_estimators = n_estimators,
                                           max_depth = max_depth,
                                           max_samples = 0.8,
                                           random_state = 216)
                                           
            rf.fit(tornados_tt[features], tornados_tt.power_outage)
            
            pred = rf.predict(tornados_ho[features])
            
            rf_accs[i,j,k] = accuracy_score(tornados_ho.power_outage,  pred)

0 0 0
0 0 1
0 1 0
0 1 1
0 2 0
0 2 1
0 3 0


KeyboardInterrupt: 

In [25]:
max_depths = range(1, 11)
n_trees = [100, 500]

rf_accs = np.zeros((n_splits, len(max_depths), len(n_trees)))
rf_prec = np.zeros((n_splits, len(max_depths), len(n_trees)))
rf_recall = np.zeros((n_splits, len(max_depths), len(n_trees)))



for i,(train_index, test_index) in enumerate(kfold.split(tornados_train, tornados_train.power_outage)):
    tornados_tt = tornados_train.iloc[train_index]
    tornados_ho = tornados_train.iloc[test_index]

    for j, max_depth in enumerate(max_depths):
        for k, n_estimators in enumerate(n_trees):
            print(i,j,k)
            rf = RandomForestClassifier(n_estimators = n_estimators,
                                           max_depth = max_depth,
                                           max_samples = 0.8,
                                           random_state = 216)
                                           
            rf.fit(tornados_tt[features], tornados_tt.power_outage)
            
            pred = rf.predict(tornados_ho[features])
            
            rf_accs[i,j,k] = accuracy_score(tornados_ho.power_outage,  pred)
            rf_prec[i,j,k]= precision_score(tornados_ho.power_outage,  pred)
            rf_recall[i,j,k]= recall_score(tornados_ho.power_outage,  pred)

0 0 0
0 0 1
0 1 0
0 1 1
0 2 0
0 2 1
0 3 0
0 3 1
0 4 0
0 4 1
0 5 0
0 5 1
0 6 0
0 6 1
0 7 0
0 7 1
0 8 0
0 8 1
0 9 0
0 9 1
1 0 0
1 0 1
1 1 0
1 1 1
1 2 0
1 2 1
1 3 0
1 3 1
1 4 0
1 4 1
1 5 0
1 5 1
1 6 0
1 6 1
1 7 0
1 7 1
1 8 0
1 8 1
1 9 0
1 9 1
2 0 0
2 0 1
2 1 0
2 1 1
2 2 0
2 2 1
2 3 0
2 3 1
2 4 0
2 4 1
2 5 0
2 5 1
2 6 0
2 6 1
2 7 0
2 7 1
2 8 0
2 8 1
2 9 0
2 9 1
3 0 0
3 0 1
3 1 0
3 1 1
3 2 0
3 2 1
3 3 0
3 3 1
3 4 0
3 4 1
3 5 0
3 5 1
3 6 0
3 6 1
3 7 0
3 7 1
3 8 0
3 8 1
3 9 0
3 9 1
4 0 0
4 0 1
4 1 0
4 1 1
4 2 0
4 2 1
4 3 0
4 3 1
4 4 0
4 4 1
4 5 0
4 5 1
4 6 0
4 6 1
4 7 0
4 7 1
4 8 0
4 8 1
4 9 0
4 9 1


We will do `GridSearchCV

In [26]:
max_index_prec = np.unravel_index(np.argmax(np.mean(rf_prec, axis=0)), 
                                       np.mean(rf_prec, axis=0).shape)


print(max_depths[max_index_prec[0]],n_trees[max_index_prec[1]])

10 500


In [27]:
max_index_recall = np.unravel_index(np.argmax(np.mean(rf_recall, axis=0)), 
                                       np.mean(rf_recall, axis=0).shape)


print(max_depths[max_index_recall[0]],n_trees[max_index_recall[1]])

7 500


In [28]:
np.mean(rf_recall, axis=0)

array([[0.52926716, 0.53623679],
       [0.54539898, 0.54426948],
       [0.55689987, 0.5477244 ],
       [0.56380307, 0.55344495],
       [0.5730184 , 0.56265364],
       [0.58916351, 0.57877882],
       [0.58111089, 0.60416584],
       [0.59148229, 0.59723606],
       [0.59609993, 0.58920338],
       [0.59493721, 0.59838549]])

In [29]:
np.mean(rf_prec, axis=0)

array([[0.57285685, 0.57339531],
       [0.5752494 , 0.57232622],
       [0.57284238, 0.56619972],
       [0.56825947, 0.5719408 ],
       [0.57047971, 0.57406423],
       [0.58275969, 0.58172524],
       [0.57492039, 0.58402227],
       [0.58176093, 0.58606672],
       [0.58925088, 0.5826173 ],
       [0.59219218, 0.59646313]])

In [30]:
max_index = np.unravel_index(np.argmax(np.mean(rf_accs, axis=0)), 
                                       np.mean(rf_accs, axis=0).shape)


print(max_depths[max_index[0]],n_trees[max_index[1]])

10 500


In [31]:
## first import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [32]:
grid_cv = GridSearchCV(RandomForestClassifier(), # first put the model object here
                          param_grid = {'max_depth':max_depths, # place the grid values for max_depth and
                                        'n_estimators':n_trees}, # and n_estimators here
                          scoring = 'accuracy', # put the metric we are trying to optimize here as a string, "accuracy"
                          cv = 5) # put the number of cv splits here

## you fit it just like a model
grid_cv.fit(tornados_train[features], tornados_train.power_outage)

In [33]:
## You can find the hyperparameter grid point that
## gave the best performance like so
## .best_params_
grid_cv.best_params_

{'max_depth': 7, 'n_estimators': 100}

In [34]:
## You can find the best score like so
## .best_score_
grid_cv.best_score_

0.5866424856735897

In [35]:
## Calling best_estimator_ returns the model with the 
## best avg cv performance after it has been refit on the
## entire data set
grid_cv.best_estimator_

In [36]:
grid_cv.best_estimator_.predict(tornados_train[features])

array([ True, False, False, ...,  True, False, False])

In [38]:
## You can get all of the results with cv_results_
grid_cv.cv_results_

{'mean_fit_time': array([0.1915514 , 0.98334384, 0.21898255, 0.90960102, 0.17941155,
        0.97599359, 0.21707926, 1.07329369, 0.24518127, 1.2178988 ,
        0.27318544, 1.37791076, 0.30434823, 1.53222799, 0.33487511,
        1.67176723, 0.36495676, 1.87377162, 0.40216088, 1.98587222]),
 'std_fit_time': array([0.03534748, 0.13759033, 0.0285225 , 0.10095016, 0.00787123,
        0.12003976, 0.01159485, 0.06811716, 0.00436472, 0.02743494,
        0.01076415, 0.03387321, 0.00244344, 0.0196477 , 0.00867133,
        0.03100106, 0.00191522, 0.0487714 , 0.01316863, 0.04955759]),
 'mean_score_time': array([0.01393371, 0.06116695, 0.01410327, 0.05352926, 0.01101747,
        0.04995871, 0.01336432, 0.05238981, 0.01362324, 0.05661116,
        0.01331501, 0.05703487, 0.01830339, 0.05962663, 0.01610136,
        0.06469812, 0.01719508, 0.06434183, 0.0154603 , 0.07352419]),
 'std_score_time': array([0.00133443, 0.02155372, 0.00219232, 0.00774945, 0.000918  ,
        0.00330346, 0.00219803, 0.005390

Using either the `best_estimator_` fitted model or a refitted model according to our results from the `for` loop cross-validation we will find the feature importance scores. 

In [39]:
pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_},
                 index=features).sort_values('feature_importance_score',
                                                ascending=False)

Unnamed: 0,feature_importance_score
DEPTH_max,0.175802
MAX_SHEAR_HEIGHT_max,0.158524
MXDV_max,0.157887
AVGDV_max,0.145169
MAX_SHEAR_max,0.142298
LLDV_max,0.126842
MXDV_HEIGHT_max,0.093478


Bagging

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

In [41]:
pipe = Pipeline([('scale', StandardScaler()),('knn',KNeighborsClassifier())])
bag_pipe = BaggingClassifier(pipe, bootstrap = True, max_samples = 0.90)
bag_cv = GridSearchCV(bag_pipe, 
                          param_grid = {'estimator__knn__n_neighbors':[1,2,3], 
                                        'n_estimators':np.arange(1,100,10)}, 
                          scoring = 'accuracy', 
                          cv = 5)
bag_cv.fit(tornados_train[features], tornados_train.power_outage)

In [42]:
print(f"The best mean cv accuracy of {bag_cv.best_score_:.3f} was achieved using k = {bag_cv.best_estimator_.estimator['knn'].n_neighbors} and {bag_cv.best_estimator_.n_estimators} estimators")

The best mean cv accuracy of 0.571 was achieved using k = 3 and 51 estimators


In [44]:
single_pipe = Pipeline([('scale', StandardScaler()),('knn',KNeighborsClassifier(n_neighbors=3))])
single_cv = cross_validate(single_pipe, tornados_train[features], tornados_train.power_outage, cv = 5, scoring = 'accuracy')

In [45]:
print(f"The mean cv accuracy of a single kNN model with k=3 is {single_cv['test_score'].mean():.3f}")

The mean cv accuracy of a single kNN model with k=3 is 0.558


Model with `GridSearchCV`

In [46]:
model = grid_cv.best_estimator_

model.fit(tornados_train[features], tornados_train.power_outage)

In [47]:
accuracy_score(model.predict(tornados_train[features]), tornados_train.power_outage)

0.7869890616004606

In [48]:
accuracy_score(model.predict(tornados_test[features]), tornados_test.power_outage)

0.6683937823834197