In [4]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams["figure.dpi"] = 150
import seaborn as sns
import os
import datetime as dt
# enable copy on write (default in pandas 3.0)
pd.options.mode.copy_on_write = True

In [5]:
def read_merged(weather, year):
    return pd.read_csv('../merged/merged_{}_{}.csv'.format(weather, year))

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [8]:
meso = [read_merged("meso", year) for year in range(2015, 2023)]

In [None]:
data = pd.concat(meso)
del meso

In [11]:
del data['Unnamed: 0']
del data['index']

In [12]:
data['TVS_max'] = 1*(data['TVS_max'] == 'Y')
data['DATE'] = pd.to_datetime(data['DATE'])
data['Month'] = data['DATE'].dt.month

In [13]:
data['y'] = 0

data.loc[data.power_outage == True, 'y']=1

In [14]:
all_features =([data.columns[1], 
                data.columns[2]] +
                data.columns[4:13].tolist() +
                [data.columns[16]])

In [15]:
all_features

['LAT_mean',
 'LON_mean',
 'LL_ROT_VEL_max',
 'LL_DV_max',
 'LL_BASE_max',
 'DEPTH_KFT_max',
 'DPTH_STMRL_max',
 'MAX_RV_KFT_max',
 'MAX_RV_KTS_max',
 'TVS_max',
 'MSI_max',
 'Month']

In [16]:
meso_train, meso_test = train_test_split(data.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=data.y.values)

In [17]:
meso_tt, meso_val = train_test_split(meso_train.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=meso_train.y.values)

In [18]:
outage = meso_tt[meso_tt['power_outage']==True]
no_outage = meso_tt[meso_tt['power_outage']==False]
no_outage= no_outage.sample(n=len(outage), random_state=101)
meso_tt_balanced = pd.concat([outage,no_outage],axis=0)

In [19]:
n_splits = 5

kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=498)

In [20]:
log_accs = np.zeros(5)

i = 0
for train_index, test_index in kfold.split(meso_tt_balanced, meso_tt_balanced.y):
    meso_tt_tt = meso_tt_balanced.iloc[train_index]
    meso_ho = meso_tt_balanced.iloc[test_index]
    
    log_reg = LogisticRegression(penalty=None, max_iter = 1000)
        
    log_reg.fit(meso_tt_tt[all_features].values,
                   meso_tt_tt.y.values)
        
    pred = log_reg.predict(meso_ho[all_features].values)
    
    log_accs[i] = accuracy_score(meso_ho.y.values,
                                              pred)
    
    i = i + 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [21]:
np.mean(log_accs)

0.6444063703413615

In [22]:
log_recs = np.zeros(5)
log_precis = np.zeros(5)

i = 0
for train_index, test_index in kfold.split(meso_tt_balanced, meso_tt_balanced.y):
    meso_tt_tt = meso_tt_balanced.iloc[train_index]
    meso_ho = meso_tt_balanced.iloc[test_index]
    
    log_reg = LogisticRegression(penalty=None, max_iter = 1000)
        
    log_reg.fit(meso_tt_tt[all_features].values,
                   meso_tt_tt.y.values)
        
    pred = log_reg.predict(meso_ho[all_features].values)
    
    log_recs[i] = recall_score(meso_ho.y.values,
                                              pred)
    log_precis[i] = precision_score(meso_ho.y.values,
                                              pred)
    
    i = i + 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [23]:
np.mean(log_recs)

0.7115984596305724

In [24]:
np.mean(log_precis)

0.6272998015779756

In [22]:
ks = range(1,56)


k_all_accs = np.zeros((5, len(ks)))
k_all_recs = np.zeros((5, len(ks))) 
k_all_precis = np.zeros((5, len(ks))) 

i = 0
for train_index, test_index in kfold.split(meso_tt_balanced, meso_tt_balanced.y):
    meso_bal_tt = meso_tt_balanced.iloc[train_index]
    meso_ho = meso_tt_balanced.iloc[test_index]
    
    j = 0
    for k in ks:
        print(i,j,k)
        knn = Pipeline([('scale', StandardScaler()),('knn_cls', KNeighborsClassifier(k))])
        
        knn.fit(meso_bal_tt[all_features].values,
                   meso_bal_tt.y.values)
        
        pred = knn.predict(meso_ho[all_features].values)
        
        k_all_accs[i,j] = accuracy_score(meso_ho.y.values, pred)

        k_all_recs[i] = recall_score(meso_ho.y.values,
                                              pred)
        
        k_all_precis[i] = precision_score(meso_ho.y.values,
                                              pred)
        
        j = j + 1
    i = i + 1

0 0 1
0 1 2
0 2 3
0 3 4
0 4 5
0 5 6
0 6 7
0 7 8
0 8 9
0 9 10
0 10 11
0 11 12
0 12 13
0 13 14
0 14 15
0 15 16
0 16 17
0 17 18
0 18 19
0 19 20
0 20 21
0 21 22
0 22 23
0 23 24
0 24 25
0 25 26
0 26 27
0 27 28
0 28 29
0 29 30
0 30 31
0 31 32
0 32 33
0 33 34
0 34 35
0 35 36
0 36 37
0 37 38
0 38 39
0 39 40
0 40 41
0 41 42
0 42 43
0 43 44
0 44 45
0 45 46
0 46 47
0 47 48
0 48 49
0 49 50
0 50 51
0 51 52
0 52 53
0 53 54
0 54 55
1 0 1
1 1 2
1 2 3
1 3 4
1 4 5
1 5 6
1 6 7
1 7 8
1 8 9
1 9 10
1 10 11
1 11 12
1 12 13
1 13 14
1 14 15
1 15 16
1 16 17
1 17 18
1 18 19
1 19 20
1 20 21
1 21 22
1 22 23
1 23 24
1 24 25
1 25 26
1 26 27
1 27 28
1 28 29
1 29 30
1 30 31
1 31 32
1 32 33
1 33 34
1 34 35
1 35 36
1 36 37
1 37 38
1 38 39
1 39 40
1 40 41
1 41 42
1 42 43
1 43 44
1 44 45
1 45 46
1 46 47
1 47 48
1 48 49
1 49 50
1 50 51
1 51 52
1 52 53
1 53 54
1 54 55
2 0 1
2 1 2
2 2 3
2 3 4
2 4 5
2 5 6
2 6 7
2 7 8
2 8 9
2 9 10
2 10 11
2 11 12
2 12 13
2 13 14
2 14 15
2 15 16
2 16 17
2 17 18
2 18 19
2 19 20
2 20 21
2 21 22
2

In [None]:
np.mean(k_all_accs)

0.8189735537426536

In [None]:
np.mean(k_all_recs)

0.8745816973261702

In [None]:
np.mean(k_all_precis)

0.7719033184046648

In [None]:
best_k = np.argmax(np.mean(k_all_accs, axis=0))
best_k_acc = np.mean(k_all_accs, axis=0)[best_k]

print("The optimal value of k was", best_k + 1, "which achieved a mean CV accuracy of ", best_k_acc)

The optimal value of k was 10 which achieved a mean CV accuracy of  0.8311241046820271


In [None]:
knn_best = KNeighborsClassifier(n_neighbors = best_k + 1)

In [None]:
knn_best.fit(meso_tt_balanced[all_features].values,
                   meso_tt_balanced.y.values)

pred_best = knn_best.predict(meso_val[all_features].values)

In [None]:
print("accuracy score:", accuracy_score(meso_val.y.values, pred_best))
print("recall score:", recall_score(meso_val.y.values, pred_best))
print("precision score:", precision_score(meso_val.y.values, pred_best))

accuracy score: 0.6655794722800352
recall score: 0.6207177814029364
precision score: 0.07708202499848064
