In [5]:
# import useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams["figure.dpi"] = 150
import seaborn as sns
import os
import joblib

# enable copy on write (default in pandas 3.0)
pd.options.mode.copy_on_write = True

In [2]:
def read_merged(weather, year):
    return pd.read_csv('../../merged/merged_{}_{}.csv.gz'.format(weather, year)).drop(columns=['Unnamed: 0'])

In [3]:
meso = pd.concat([read_merged('meso', year) for year in range(2015, 2023)])

In [4]:
meso.replace({'TVS':{'Y': True, 'N': False}}, inplace=True)

  meso.replace({'TVS':{'Y': True, 'N': False}}, inplace=True)


In [14]:
meso.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5548137 entries, 0 to 667086
Data columns (total 17 columns):
 #   Column        Dtype  
---  ------        -----  
 0   DATE          object 
 1   MONTH         int64  
 2   LAT           float64
 3   LON           float64
 4   STR_RANK      object 
 5   LL_ROT_VEL    int64  
 6   LL_DV         int64  
 7   LL_BASE       int64  
 8   DEPTH_KFT     int64  
 9   DPTH_STMRL    int64  
 10  MAX_RV_KFT    int64  
 11  MAX_RV_KTS    int64  
 12  TVS           bool   
 13  MSI           int64  
 14  COUNTY        object 
 15  STATE         object 
 16  POWER_OUTAGE  bool   
dtypes: bool(2), float64(2), int64(9), object(4)
memory usage: 687.8+ MB


In [7]:
features = ['MONTH', 'LAT', 'LON', 'LL_ROT_VEL', 'LL_DV', 'LL_BASE', 'DEPTH_KFT', 'DPTH_STMRL',
            'MAX_RV_KFT', 'MAX_RV_KTS', 'TVS', 'MSI']
X,y = meso[features], meso['POWER_OUTAGE']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y)
X_tt, X_val, y_tt, y_val = train_test_split(X_train,y_train, test_size=0.2, stratify=y_train)

In [9]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

# XGBoost

In [20]:
from xgboost import XGBClassifier
xgb_model = Pipeline([('resampler',RandomUnderSampler()), ('clf',XGBClassifier(objective = 'binary:logistic'))])

In [21]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'clf__max_depth': range (5, 17,  2),
    'clf__n_estimators': range(100, 440, 40),
    'clf__learning_rate': [0.01, 0.1, 0.3]
}

grid_search = GridSearchCV(xgb_model,
    param_grid=parameters,
    scoring='f1',
    cv = 5,
    verbose=True)
grid_search.fit(X_tt, y_tt)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [22]:
best_model = grid_search.best_estimator_

## Saving model

In [25]:
joblib.dump(best_model,'../../models/xgb_meso.pkl')

['../../models/xgb_meso.pkl']

# Random forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf_model = Pipeline([('resampler',RandomUnderSampler()), ('clf',RandomForestClassifier())])

In [None]:
parameters = {
    'clf__max_depth': range (5, 17,  2),
    'clf__n_estimators': range(100, 440, 40),
}

grid_search = GridSearchCV(rf_model,
    param_grid=parameters,
    scoring='f1',
    cv = 5,
    verbose=True)
grid_search.fit(X_tt, y_tt)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


# Examining performance 

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, accuracy_score, PrecisionRecallDisplay

In [6]:
xgb_meso = joblib.load('../../models/xgb_meso.pkl')

In [12]:
xgb_predictions = xgb_meso.predict(X_val)
print('xGBoost results')
print('precision: ', precision_score(y_val,xgb_predictions))
print('recall: ', recall_score(y_val,xgb_predictions))
print('f1: ', f1_score(y_val,xgb_predictions))
print('accuracy score: ', accuracy_score(y_val,xgb_predictions))
print('balanced accuracy score: ', balanced_accuracy_score(y_val,xgb_predictions))

xGBoost results
precision:  0.17528142724158116
recall:  0.9536850899407358
f1:  0.29613506009201274
accuracy score:  0.8043859313147881
balanced accuracy score:  0.8756692447898873
