In [2]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams["figure.dpi"] = 150
import seaborn as sns
import os

# enable copy on write (default in pandas 3.0)
pd.options.mode.copy_on_write = True

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [5]:
merge_meso_2019 = pd.read_csv('../merged/merged_meso_2019.csv', parse_dates=['DATE'])

In [6]:
del merge_meso_2019['TVS_max']

In [7]:
outage = merge_meso_2019[merge_meso_2019['power_outage']==True]
no_outage = merge_meso_2019[merge_meso_2019['power_outage']==False]

In [8]:
merge_meso_2019['y'] = 0

merge_meso_2019.loc[merge_meso_2019.power_outage == True, 'y']=1

In [9]:
merge_meso_2019.sample(5)

Unnamed: 0.1,Unnamed: 0,index,DATE,LAT_mean,LON_mean,STR_RANK_max,LL_ROT_VEL_max,LL_DV_max,LL_BASE_max,DEPTH_KFT_max,DPTH_STMRL_max,MAX_RV_KFT_max,MAX_RV_KTS_max,MSI_max,county,state,power_outage,y
328699,328699,291105,2019-05-10,29.72423,-90.35504,4,52,84,11,10,0,11,52,4373,Saint Charles Parish,Louisiana,False,0
341511,341511,304199,2019-05-16,46.07194,-94.07616,7,56,73,14,13,100,14,56,3943,Morrison County,Minnesota,False,0
412137,412137,368099,2019-05-26,43.571104,-115.988581,8L,77,94,1,4,35,2,85,10835,Ada County,Idaho,False,0
705365,705365,649386,2019-08-30,40.16385,-95.02418,5,49,65,9,20,0,16,58,5199,Holt County,Missouri,False,0
484109,484109,438065,2019-06-16,38.90457,-86.07636,5,61,85,6,12,0,6,61,4251,Jackson County,Indiana,False,0


In [10]:
merge_meso_2019['DATE'] = pd.to_datetime(merge_meso_2019['DATE'])
merge_meso_2019['Month'] = merge_meso_2019['DATE'].dt.month

In [11]:
all_features =([merge_meso_2019.columns[3], 
                merge_meso_2019.columns[4]] +
                merge_meso_2019.columns[6:14].tolist() +
                [merge_meso_2019.columns[18]])

In [12]:
all_features

['LAT_mean',
 'LON_mean',
 'LL_ROT_VEL_max',
 'LL_DV_max',
 'LL_BASE_max',
 'DEPTH_KFT_max',
 'DPTH_STMRL_max',
 'MAX_RV_KFT_max',
 'MAX_RV_KTS_max',
 'MSI_max',
 'Month']

In [13]:
meso_train, meso_test = train_test_split(merge_meso_2019.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=merge_meso_2019.y.values)

In [14]:
meso_tt, meso_val = train_test_split(meso_train.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=meso_train.y.values)

In [15]:
outage = meso_tt[meso_tt['power_outage']==True]
no_outage = meso_tt[meso_tt['power_outage']==False]
no_outage= no_outage.sample(n=len(outage), random_state=101)
meso_tt_balanced = pd.concat([outage,no_outage],axis=0)

In [16]:
n_splits = 5

kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=498)

In [17]:
pca = Pipeline([('scale', StandardScaler()),
                   ('pca', PCA(2))])

pca_values = pca.fit_transform(meso_tt_balanced[all_features])

In [18]:
n_splits=5
kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=216)

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

In [24]:
bayes_accs = np.zeros((n_splits, 3))
bayes_reccs = np.zeros((n_splits, 3))
bayes_precis = np.zeros((n_splits, 3))

for i, (train_index, test_index) in enumerate(kfold.split(meso_tt_balanced, meso_tt_balanced.y)):
    print("CV Split", i)
    meso_bal_tt = meso_tt_balanced.iloc[train_index]
    meso_ho = meso_tt_balanced.iloc[test_index]
    
    ## Linear Discriminant Analysis
    lda = Pipeline([('scale', StandardScaler()),
                    ('lda', LinearDiscriminantAnalysis())])
    
    lda.fit(meso_bal_tt[all_features].values,
               meso_bal_tt.y.values)
    lda_pred = lda.predict(meso_ho[all_features].values)
    
    bayes_accs[i, 0] = accuracy_score(meso_ho.y.values,
                                         lda_pred)
    
    ## Quadratic Discriminant Analysis
    qda = Pipeline([('scale', StandardScaler()),
                    ('qda', QuadraticDiscriminantAnalysis())])
    
    qda.fit(meso_bal_tt[all_features].values,
               meso_bal_tt.y.values)
    
    qda_pred = qda.predict(meso_ho[all_features].values)
    
    bayes_accs[i, 1] = accuracy_score(meso_ho.y.values,
                                         qda_pred)
    
    
    ## Gaussian Naive Bayes
    nb = Pipeline([('scale', StandardScaler()),
                    ('NB', GaussianNB())])
    
    nb.fit(meso_bal_tt[all_features].values,
              meso_bal_tt.y.values)
    
    nb_pred = nb.predict(meso_ho[all_features].values)
    
    bayes_accs[i, 2] = accuracy_score(meso_ho.y.values,
                                         nb_pred)
    
    bayes_reccs[i, 2] = recall_score(meso_ho.y.values,
                                         nb_pred)
    
    bayes_precis[i, 2] = precision_score(meso_ho.y.values,
                                         nb_pred)

CV Split 0
CV Split 1
CV Split 2
CV Split 3
CV Split 4


In [25]:
np.mean(bayes_accs, axis=0)

array([0.69226683, 0.6286606 , 0.65072591])

In [26]:
np.mean(bayes_reccs, axis=0)

array([0.        , 0.        , 0.66143254])

In [27]:
np.mean(bayes_precis, axis=0)

array([0.        , 0.        , 0.64756736])