In [5]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams["figure.dpi"] = 150
import seaborn as sns
import os

# enable copy on write (default in pandas 3.0)
pd.options.mode.copy_on_write = True

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [7]:
merge_meso_2019 = pd.read_csv('../merged/merged_meso_2019.csv', parse_dates=['DATE'])

In [8]:
del merge_meso_2019['TVS_max']

In [9]:
outage = merge_meso_2019[merge_meso_2019['power_outage']==True]
no_outage = merge_meso_2019[merge_meso_2019['power_outage']==False]

In [10]:
merge_meso_2019['y'] = 0

merge_meso_2019.loc[merge_meso_2019.power_outage == True, 'y']=1

In [11]:
merge_meso_2019['DATE'] = pd.to_datetime(merge_meso_2019['DATE'])
merge_meso_2019['Month'] = merge_meso_2019['DATE'].dt.month

In [12]:
all_features =([merge_meso_2019.columns[3], 
                merge_meso_2019.columns[4]] +
                merge_meso_2019.columns[6:14].tolist() +
                [merge_meso_2019.columns[18]])

In [13]:
all_features

['LAT_mean',
 'LON_mean',
 'LL_ROT_VEL_max',
 'LL_DV_max',
 'LL_BASE_max',
 'DEPTH_KFT_max',
 'DPTH_STMRL_max',
 'MAX_RV_KFT_max',
 'MAX_RV_KTS_max',
 'MSI_max',
 'Month']

In [14]:
merge_meso_2019.head()

Unnamed: 0.1,Unnamed: 0,index,DATE,LAT_mean,LON_mean,STR_RANK_max,LL_ROT_VEL_max,LL_DV_max,LL_BASE_max,DEPTH_KFT_max,DPTH_STMRL_max,MAX_RV_KFT_max,MAX_RV_KTS_max,MSI_max,county,state,power_outage,y,Month
0,0,0,2019-01-01,35.30391,-106.70199,9,52,61,14,12,100,24,63,5559,Sandoval County,New Mexico,False,0,1
1,1,1,2019-01-01,35.79095,-106.68525,6L,35,44,7,12,100,11,46,3419,Sandoval County,New Mexico,False,0,1
2,2,2,2019-01-01,35.35228,-106.68135,5L,33,47,9,8,100,16,62,4740,Sandoval County,New Mexico,False,0,1
3,3,3,2019-01-01,34.96357,-107.08421,5L,31,43,9,4,100,12,46,3480,Cibola County,New Mexico,False,0,1
4,4,4,2019-01-01,35.44015,-106.72896,5L,40,45,10,3,100,10,40,2431,Sandoval County,New Mexico,False,0,1


In [15]:
meso_train, meso_test = train_test_split(merge_meso_2019.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=merge_meso_2019.y.values)

In [16]:
meso_tt, meso_val = train_test_split(meso_train.copy(),
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2,
                                              stratify=meso_train.y.values)

In [17]:
outage = meso_tt[meso_tt['power_outage']==True]
no_outage = meso_tt[meso_tt['power_outage']==False]
no_outage= no_outage.sample(n=len(outage), random_state=101)
meso_tt_balanced = pd.concat([outage,no_outage],axis=0)

In [18]:
n_splits = 5

kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=498)

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error

In [20]:
models = {
    'lr': LinearRegression(),
    'svr': SVR(),
    'knr': KNeighborsRegressor(n_neighbors=10),
    'rf': RandomForestRegressor(),
    'ab': AdaBoostRegressor(),
    'gb': GradientBoostingRegressor(),
    'xgb': XGBRegressor()
}

In [31]:
knr = KNeighborsRegressor(n_neighbors=10)
knr.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [32]:
mean_squared_error(meso_val.y, knr.predict(meso_val[all_features]))

0.24377135394113783

In [33]:
lr = LinearRegression()
lr.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [34]:
mean_squared_error(meso_val.y, lr.predict(meso_val[all_features]))

0.22082250323193214

In [21]:
svr = SVR()
svr.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [22]:
mean_squared_error(meso_val.y, svr.predict(meso_val[all_features]))

0.35374711258586583

In [23]:
ada = AdaBoostRegressor()
ada.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [24]:
mean_squared_error(meso_val.y, ada.predict(meso_val[all_features]))

0.16863515974938517

In [25]:
gb = GradientBoostingRegressor()
gb.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [26]:
mean_squared_error(meso_val.y, gb.predict(meso_val[all_features]))

0.13174839770231933

In [27]:
xgb = XGBRegressor()
xgb.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [28]:
mean_squared_error(meso_val.y, xgb.predict(meso_val[all_features]))

0.09460563059533747

In [29]:
rf = RandomForestRegressor()
rf.fit(meso_tt_balanced[all_features],meso_tt_balanced.y)

In [30]:
mean_squared_error(meso_val.y, rf.predict(meso_val[all_features]))

0.07947103175625231

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
param_grid = {"max_depth":    [4, 5, 6],
              "n_estimators": np.arange(100,800,100),
              "learning_rate": [0.01, 0.1, 1]}

In [37]:
xgb_reg = XGBRegressor()

In [38]:
search = GridSearchCV(xgb_reg, param_grid, cv=5).fit(meso_tt_balanced[all_features], meso_tt_balanced.y)

print("The best hyperparameters are ",search.best_params_)

The best hyperparameters are  {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 700}


In [39]:
xgb_reg = XGBRegressor(**search.best_params_)

In [40]:
xgb_reg.fit(meso_tt_balanced[all_features], meso_tt_balanced.y)

In [41]:
mean_squared_error(meso_val.y,xgb_reg.predict(meso_val[all_features]))

0.09083279918973539

In [42]:
mean_squared_error(meso_test.y,xgb_reg.predict(meso_test[all_features]))

0.09019641400835501