In [2]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams["figure.dpi"] = 150
import seaborn as sns
import os

# enable copy on write (default in pandas 3.0)
pd.options.mode.copy_on_write = True

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [4]:
merge_meso_2019 = pd.read_csv('../merged/merged_meso_2019.csv', parse_dates=['DATE'])

In [5]:
del merge_meso_2019['TVS_max']

In [6]:
outage = merge_meso_2019[merge_meso_2019['power_outage']==True]
no_outage = merge_meso_2019[merge_meso_2019['power_outage']==False]

In [7]:
merge_meso_2019['y'] = 0

merge_meso_2019.loc[merge_meso_2019.power_outage == True, 'y']=1

In [8]:
merge_meso_2019['DATE'] = pd.to_datetime(merge_meso_2019['DATE'])
merge_meso_2019['Month'] = merge_meso_2019['DATE'].dt.month

In [9]:
all_features =([merge_meso_2019.columns[3], 
                merge_meso_2019.columns[4]] +
                merge_meso_2019.columns[6:14].tolist() +
                [merge_meso_2019.columns[18]])

In [10]:
all_features

['LAT_mean',
 'LON_mean',
 'LL_ROT_VEL_max',
 'LL_DV_max',
 'LL_BASE_max',
 'DEPTH_KFT_max',
 'DPTH_STMRL_max',
 'MAX_RV_KFT_max',
 'MAX_RV_KTS_max',
 'MSI_max',
 'Month']

In [11]:
merge_meso_2019.head()

Unnamed: 0.1,Unnamed: 0,index,DATE,LAT_mean,LON_mean,STR_RANK_max,LL_ROT_VEL_max,LL_DV_max,LL_BASE_max,DEPTH_KFT_max,DPTH_STMRL_max,MAX_RV_KFT_max,MAX_RV_KTS_max,MSI_max,county,state,power_outage,y,Month
0,0,0,2019-01-01,35.30391,-106.70199,9,52,61,14,12,100,24,63,5559,Sandoval County,New Mexico,False,0,1
1,1,1,2019-01-01,35.79095,-106.68525,6L,35,44,7,12,100,11,46,3419,Sandoval County,New Mexico,False,0,1
2,2,2,2019-01-01,35.35228,-106.68135,5L,33,47,9,8,100,16,62,4740,Sandoval County,New Mexico,False,0,1
3,3,3,2019-01-01,34.96357,-107.08421,5L,31,43,9,4,100,12,46,3480,Cibola County,New Mexico,False,0,1
4,4,4,2019-01-01,35.44015,-106.72896,5L,40,45,10,3,100,10,40,2431,Sandoval County,New Mexico,False,0,1


In [12]:
meso_train, meso_test = train_test_split(merge_meso_2019.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=merge_meso_2019.y.values)

In [13]:
meso_tt, meso_val = train_test_split(meso_train.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=meso_train.y.values)

In [14]:
outage = meso_tt[meso_tt['power_outage']==True]
no_outage = meso_tt[meso_tt['power_outage']==False]
no_outage= no_outage.sample(n=len(outage), random_state=101)
meso_tt_balanced = pd.concat([outage,no_outage],axis=0)

In [15]:
n_splits = 5

kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=498)

In [16]:
all_features =([merge_meso_2019.columns[3], 
                merge_meso_2019.columns[4]] +
                merge_meso_2019.columns[6:14].tolist() +
                [merge_meso_2019.columns[18]])

In [1]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
max_depths = range(1, 11)
n_trees = [100, 500]

rf_accs = np.zeros((n_splits, len(max_depths), len(n_trees)))

rf_reccs = np.zeros((n_splits, len(max_depths), len(n_trees)))

rf_precis = np.zeros((n_splits, len(max_depths), len(n_trees)))



for i,(train_index, test_index) in enumerate(kfold.split(meso_tt_balanced, meso_tt_balanced.y)):
    print("CV Split", i)
    meso_bal_tt = meso_tt_balanced.iloc[train_index]
    meso_ho = meso_tt_balanced.iloc[test_index]

    for j, max_depth in enumerate(max_depths):
        for k, n_estimators in enumerate(n_trees):
            print(i,j,k)
            rf = RandomForestClassifier(n_estimators = n_estimators,
                                           max_depth = max_depth,
                                           max_samples = 0.8,
                                           random_state = 216)
                                           
            rf.fit(meso_bal_tt[all_features], meso_bal_tt.y)
            
            pred = rf.predict(meso_ho[all_features])
            
            rf_accs[i,j,k] = accuracy_score(meso_ho.y,  pred)

            rf_reccs[i,j,k] = accuracy_score(meso_ho.y,  pred)

            rf_precis[i,j,k] = accuracy_score(meso_ho.y,  pred)

CV Split 0
0 0 0
0 0 1
0 1 0
0 1 1
0 2 0
0 2 1
0 3 0
0 3 1
0 4 0
0 4 1
0 5 0
0 5 1
0 6 0
0 6 1
0 7 0
0 7 1
0 8 0
0 8 1
0 9 0
0 9 1
CV Split 1
1 0 0
1 0 1
1 1 0
1 1 1
1 2 0
1 2 1
1 3 0
1 3 1
1 4 0
1 4 1
1 5 0
1 5 1
1 6 0
1 6 1
1 7 0
1 7 1
1 8 0
1 8 1
1 9 0
1 9 1
CV Split 2
2 0 0
2 0 1
2 1 0
2 1 1
2 2 0
2 2 1
2 3 0
2 3 1
2 4 0
2 4 1
2 5 0
2 5 1
2 6 0
2 6 1
2 7 0
2 7 1
2 8 0
2 8 1
2 9 0
2 9 1
CV Split 3
3 0 0
3 0 1
3 1 0
3 1 1
3 2 0
3 2 1
3 3 0
3 3 1
3 4 0
3 4 1
3 5 0
3 5 1
3 6 0
3 6 1
3 7 0
3 7 1
3 8 0
3 8 1
3 9 0
3 9 1
CV Split 4
4 0 0
4 0 1
4 1 0
4 1 1
4 2 0
4 2 1
4 3 0
4 3 1
4 4 0
4 4 1
4 5 0
4 5 1
4 6 0
4 6 1
4 7 0
4 7 1
4 8 0
4 8 1
4 9 0
4 9 1


In [23]:
max_acc_index = np.unravel_index(np.argmax(np.mean(rf_accs, axis=0)), 
                                       np.mean(rf_accs, axis=0).shape)


print(max_depths[max_acc_index[0]],n_trees[max_acc_index[1]])

10 500


In [24]:
max_recall_index = np.unravel_index(np.argmax(np.mean(rf_reccs, axis=0)), 
                                       np.mean(rf_reccs, axis=0).shape)


print(max_depths[max_recall_index[0]],n_trees[max_recall_index[1]])

10 500


In [25]:
max_precis_index = np.unravel_index(np.argmax(np.mean(rf_precis, axis=0)), 
                                       np.mean(rf_precis, axis=0).shape)


print(max_depths[max_precis_index[0]],n_trees[max_precis_index[1]])

10 500


In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
grid_cv = GridSearchCV(RandomForestClassifier(),
                          param_grid = {'max_depth':max_depths, 
                                        'n_estimators':n_trees},
                          scoring = 'accuracy',
                          cv = 5)


grid_cv.fit(meso_tt_balanced[all_features], meso_tt_balanced.y)

In [28]:
grid_cv.best_params_

{'max_depth': 10, 'n_estimators': 100}

In [29]:
grid_cv.best_score_

0.8858185325892498

In [30]:
grid_cv.best_estimator_

In [31]:
grid_cv.best_estimator_.predict(meso_tt_balanced[all_features])

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [32]:
pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_},
                 index=all_features).sort_values('feature_importance_score',
                                                ascending=False)

Unnamed: 0,feature_importance_score
LAT_mean,0.321691
LON_mean,0.258581
Month,0.244038
DEPTH_KFT_max,0.057956
DPTH_STMRL_max,0.025569
MAX_RV_KFT_max,0.025172
LL_BASE_max,0.021137
MSI_max,0.013725
MAX_RV_KTS_max,0.012851
LL_DV_max,0.010355


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

In [None]:
pipe = Pipeline([('scale', StandardScaler()),('knn',KNeighborsClassifier())])
bag_pipe = BaggingClassifier(pipe, bootstrap = True, max_samples = 0.90)
bag_cv = GridSearchCV(bag_pipe, 
                          param_grid = {'estimator__knn__n_neighbors':[1,2,3], 
                                        'n_estimators':np.arange(1,100,10)}, 
                          scoring = 'accuracy', 
                          cv = 5)
bag_cv.fit(meso_tt_balanced[all_features], meso_tt_balanced.y)

In [None]:
print(f"The best mean cv accuracy of {bag_cv.best_score_:.3f} was achieved using k = {bag_cv.best_estimator_.estimator['knn'].n_neighbors} and {bag_cv.best_estimator_.n_estimators} estimators")

In [None]:
single_pipe = Pipeline([('scale', StandardScaler()),('knn',KNeighborsClassifier(n_neighbors=3))])
single_cv = cross_validate(single_pipe, meso_tt_balanced[all_features], meso_tt_balanced.y, cv = 5, scoring = 'accuracy')

In [None]:
print(f"The mean cv accuracy of a single kNN model with k=3 is {single_cv['test_score'].mean():.3f}")

In [None]:
model = grid_cv.best_estimator_

model.fit(meso_tt_balanced[all_features], meso_tt_balanced.y)

In [None]:
accuracy_score(model.predict(meso_tt_balanced[all_features]), meso_tt_balanced.y)

In [None]:
recall_score(model.predict(meso_tt_balanced[all_features]), meso_tt_balanced.y)

In [None]:
precision_score(model.predict(meso_tt_balanced[all_features]), meso_tt_balanced.y)

In [None]:
accuracy_score(model.predict(meso_val[all_features]), meso_val.y)

In [None]:
recall_score(model.predict(meso_val[all_features]), meso_val.y)

In [None]:
precision_score(model.predict(meso_val[all_features]), meso_val.y)