In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

In [3]:
tornados_outage_2019 = pd.read_csv('../merged/tornados_outage_2023.csv', parse_dates=['DATE'])



In [4]:
tornados_outage_2019.head()

Unnamed: 0.1,Unnamed: 0,DATE,AVGDV_max,LLDV_max,MXDV_max,MXDV_HEIGHT_max,DEPTH_max,MAX_SHEAR_max,MAX_SHEAR_HEIGHT_max,location,county,state,Event Month,power_outage
0,0,2023-01-01,36,49,56,7,6.5,29,6.6,"(32.39305, -110.68147)",Pima County,Arizona,,False
1,1,2023-01-01,52,96,96,2,5.9,86,1.8,"(32.18141, -110.52664)",Cochise County,Arizona,,False
2,2,2023-01-01,31,50,50,4,9.1,17,4.2,"(34.34357, -117.82177)",San Bernardino County,California,,False
3,3,2023-01-01,35,62,62,6,8.3,21,6.4,"(34.29869, -117.62836)",San Bernardino County,California,,False
4,4,2023-01-01,39,52,52,4,5.0,24,2.6,"(35.0269, -118.24596)",Kern County,California,,False


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
tornados_train, tornados_test = train_test_split(tornados_outage_2019.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.1,
                                              stratify=tornados_outage_2019.power_outage.values)

In [7]:
## import random forest classifier
from sklearn.ensemble import RandomForestClassifier

## import kfold
from sklearn.model_selection import StratifiedKFold

## import accuracy_score
from sklearn.metrics import accuracy_score

In [12]:
## this will isolate the feature columns
features = tornados_train.columns[2: 9]

In [13]:
features

Index(['AVGDV_max', 'LLDV_max', 'MXDV_max', 'MXDV_HEIGHT_max', 'DEPTH_max',
       'MAX_SHEAR_max', 'MAX_SHEAR_HEIGHT_max'],
      dtype='object')

In [14]:
## set the number of CV folds
n_splits = 5

## Make the kfold object
kfold = StratifiedKFold(n_splits, 
                        random_state=216, 
                        shuffle=True)

In [20]:
max_depths = range(1, 11)
n_trees = [100, 500]

rf_accs = np.zeros((n_splits, len(max_depths), len(n_trees)))



for i,(train_index, test_index) in enumerate(kfold.split(tornados_train, tornados_train.power_outage)):
    tornados_tt = tornados_train.iloc[train_index]
    tornados_ho = tornados_train.iloc[test_index]

    for j, max_depth in enumerate(max_depths):
        for k, n_estimators in enumerate(n_trees):
            print(i,j,k)
            rf = RandomForestClassifier(n_estimators = n_estimators,
                                           max_depth = max_depth,
                                           max_samples = 0.8,
                                           random_state = 216)
                                           
            rf.fit(tornados_tt[features], tornados_tt.power_outage)
            
            pred = rf.predict(tornados_ho[features])
            
            rf_accs[i,j,k] = accuracy_score(tornados_ho.power_outage,  pred)

0 0 0
0 0 1
0 1 0
0 1 1
0 2 0
0 2 1
0 3 0
0 3 1
0 4 0
0 4 1
0 5 0
0 5 1
0 6 0
0 6 1
0 7 0
0 7 1
0 8 0
0 8 1
0 9 0
0 9 1
1 0 0
1 0 1
1 1 0
1 1 1
1 2 0
1 2 1
1 3 0
1 3 1
1 4 0
1 4 1
1 5 0
1 5 1
1 6 0
1 6 1
1 7 0
1 7 1
1 8 0
1 8 1
1 9 0
1 9 1
2 0 0
2 0 1
2 1 0
2 1 1
2 2 0
2 2 1
2 3 0
2 3 1
2 4 0
2 4 1
2 5 0
2 5 1
2 6 0
2 6 1
2 7 0
2 7 1
2 8 0
2 8 1
2 9 0
2 9 1
3 0 0
3 0 1
3 1 0
3 1 1
3 2 0
3 2 1
3 3 0
3 3 1
3 4 0
3 4 1
3 5 0
3 5 1
3 6 0
3 6 1
3 7 0
3 7 1
3 8 0
3 8 1
3 9 0
3 9 1
4 0 0
4 0 1
4 1 0
4 1 1
4 2 0
4 2 1
4 3 0
4 3 1
4 4 0
4 4 1
4 5 0
4 5 1
4 6 0
4 6 1
4 7 0
4 7 1
4 8 0
4 8 1
4 9 0
4 9 1


We will do `GridSearchCV

In [21]:
max_index = np.unravel_index(np.argmax(np.mean(rf_accs, axis=0)), 
                                       np.mean(rf_accs, axis=0).shape)


print(max_depths[max_index[0]],n_trees[max_index[1]])

1 100


In [22]:
## first import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [23]:
grid_cv = GridSearchCV(RandomForestClassifier(), # first put the model object here
                          param_grid = {'max_depth':max_depths, # place the grid values for max_depth and
                                        'n_estimators':n_trees}, # and n_estimators here
                          scoring = 'accuracy', # put the metric we are trying to optimize here as a string, "accuracy"
                          cv = 5) # put the number of cv splits here

## you fit it just like a model
grid_cv.fit(tornados_train[features], tornados_train.power_outage)

In [24]:
## You can find the hyperparameter grid point that
## gave the best performance like so
## .best_params_
grid_cv.best_params_

{'max_depth': 1, 'n_estimators': 100}

In [25]:
## You can find the best score like so
## .best_score_
grid_cv.best_score_

0.9753814784159458

In [26]:
## Calling best_estimator_ returns the model with the 
## best avg cv performance after it has been refit on the
## entire data set
grid_cv.best_estimator_

In [27]:
grid_cv.best_estimator_.predict(tornados_train[features])

array([False, False, False, ..., False, False, False])

In [28]:
## You can get all of the results with cv_results_
grid_cv.cv_results_

{'mean_fit_time': array([0.39612818, 2.0815752 , 0.56130295, 2.58751307, 0.63653431,
        3.26518903, 0.75406027, 4.17678232, 0.96916757, 5.11800051,
        1.18915925, 5.40857201, 1.26096864, 6.21301508, 1.37687707,
        7.37866063, 1.40727692, 7.59948225, 1.75328717, 8.60942907]),
 'std_fit_time': array([0.0219131 , 0.21669958, 0.07042406, 0.28284362, 0.11139402,
        0.4007571 , 0.03601968, 0.18250822, 0.10741978, 0.32407857,
        0.13917095, 0.63920363, 0.1815284 , 0.32496427, 0.05802689,
        0.50352836, 0.01056947, 0.46857013, 0.02580312, 0.38340912]),
 'mean_score_time': array([0.02799549, 0.14757686, 0.03944941, 0.15652781, 0.03646727,
        0.18106275, 0.04060616, 0.213521  , 0.04937239, 0.23427014,
        0.04998093, 0.25081091, 0.0566258 , 0.25142808, 0.05816293,
        0.28751826, 0.06009717, 0.3164897 , 0.07214794, 0.34162097]),
 'std_score_time': array([0.00240856, 0.01553706, 0.00409303, 0.02029532, 0.00729673,
        0.0266211 , 0.00430372, 0.019092

Using either the `best_estimator_` fitted model or a refitted model according to our results from the `for` loop cross-validation we will find the feature importance scores. 

In [29]:
pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_},
                 index=features).sort_values('feature_importance_score',
                                                ascending=False)

Unnamed: 0,feature_importance_score
AVGDV_max,0.3
MXDV_max,0.22
LLDV_max,0.18
MAX_SHEAR_max,0.17
DEPTH_max,0.08
MAX_SHEAR_HEIGHT_max,0.05
MXDV_HEIGHT_max,0.0


Bagging

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

In [31]:
pipe = Pipeline([('scale', StandardScaler()),('knn',KNeighborsClassifier())])
bag_pipe = BaggingClassifier(pipe, bootstrap = True, max_samples = 0.90)
bag_cv = GridSearchCV(bag_pipe, 
                          param_grid = {'estimator__knn__n_neighbors':[1,2,3], 
                                        'n_estimators':np.arange(1,100,10)}, 
                          scoring = 'accuracy', 
                          cv = 5)
bag_cv.fit(tornados_train[features], tornados_train.power_outage)

In [32]:
print(f"The best mean cv accuracy of {bag_cv.best_score_:.3f} was achieved using k = {bag_cv.best_estimator_.estimator['knn'].n_neighbors} and {bag_cv.best_estimator_.n_estimators} estimators")

The best mean cv accuracy of 0.973 was achieved using k = 3 and 71 estimators


In [33]:
single_pipe = Pipeline([('scale', StandardScaler()),('knn',KNeighborsClassifier(n_neighbors=3))])
single_cv = cross_validate(single_pipe, tornados_train[features], tornados_train.power_outage, cv = 5, scoring = 'accuracy')

In [34]:
print(f"The mean cv accuracy of a single kNN model with k=3 is {single_cv['test_score'].mean():.3f}")

The mean cv accuracy of a single kNN model with k=3 is 0.971


Model with `GridSearchCV`

In [35]:
model = grid_cv.best_estimator_

model.fit(tornados_train[features], tornados_train.power_outage)

In [36]:
accuracy_score(model.predict(tornados_train[features]), tornados_train.power_outage)

0.9753814737080946

In [38]:
accuracy_score(model.predict(tornados_test[features]), tornados_test.power_outage)

0.97524247064829