In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

#Libraries for data pre-processing
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# For dealing w imbalance data
from collections import Counter
from imblearn.over_sampling import SMOTENC
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

# Model Hypertuning
from sklearn.model_selection import GridSearchCV

# Libraries for ROC curve
import plotly.express as px
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

from catboost import CatBoostClassifier

In [None]:
# Load the raw weather table from the CSV file shipped in the Data folder.
weather_csv_path = "../Data/weather_data_2.csv"
weather_full = pd.read_csv(weather_csv_path)

### Data pre-processing

In [None]:
# Parse the timestamp column, then derive calendar features from it.
# `infer_datetime_format` is intentionally not passed: it is deprecated in
# pandas 2.x (format inference is the default there) and only affected speed
# in older versions.
weather_full["timestamp"] = pd.to_datetime(weather_full["timestamp"])

# The vectorised `.dt` accessor replaces a per-row Python lambda; the five
# columns are added in the same order as before (year, quarter, month, day, hour).
timestamp_parts = weather_full["timestamp"].dt
weather_full["year"] = timestamp_parts.year
weather_full["quarter"] = timestamp_parts.quarter
weather_full["month"] = timestamp_parts.month
weather_full["day"] = timestamp_parts.day
weather_full["hour"] = timestamp_parts.hour

In [None]:
# Re-order the columns purely by position: everything from position 12 onward
# comes first, then positions 0-10, and finally the single column at position 11.
# NOTE(review): presumably positions 12+ are the calendar columns derived from
# the timestamp and position 11 is the `rainfall` target (moved last) — confirm
# against the CSV schema.
# Re-running this cell permutes the already re-ordered columns again.
column_positions = np.arange(weather_full.shape[1])
new_order = np.concatenate(
    [column_positions[12:], column_positions[:11], column_positions[11:12]]
)
weather_full = weather_full.iloc[:, new_order]

In [None]:
# Chronological split: build the hourly grid for 2017-2021 and treat the first
# 80% of those grid points as the training period.
runtimes = list(
    pd.date_range(
        start='2017-01-01 00:00:00',
        end='2021-12-31 23:59:59',
        freq='60T',
    )
)
training_cutoff = int(0.8 * len(runtimes))
training_runtimes = runtimes[:training_cutoff]

# A row is a training row iff its timestamp is one of the training grid points;
# every other row (including any timestamp that is not on the grid) goes to test.
in_training_period = weather_full["timestamp"].isin(training_runtimes)

# Features are every column except the last one, minus the raw timestamp.
feature_frame = weather_full.iloc[:, :-1]
X_train = feature_frame[in_training_period].drop(columns=["timestamp"])
X_test = feature_frame[~in_training_period].drop(columns=["timestamp"])

# Labels are kept as single-column DataFrames.
y_train = weather_full.loc[in_training_period, ["rainfall"]]
y_test = weather_full.loc[~in_training_period, ["rainfall"]]

### Below are several methods for dealing with the imbalanced data. Select one by running only the cells in that method's section.

#### Use SMOTE-NC 

In [None]:
# Oversample the minority class with SMOTE-NC, which needs to be told which
# feature columns are categorical.
print(Counter(y_train.rainfall))

# Look the categorical column up by name instead of hard-coding position 5, so
# a change in column order cannot silently mark the wrong feature as
# categorical. If `region` has already been one-hot encoded this raises a
# KeyError rather than resampling with a wrong column.
# NOTE(review): assumes `region` is the column that used to sit at index 5 —
# it is the only categorical feature this notebook declares; confirm.
region_position = X_train.columns.get_loc("region")
smote_nc = SMOTENC(categorical_features=[region_position], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 143549})

#### Use ENN

In [None]:
# ENN needs purely numeric features, so `region` is one-hot encoded first.
# Train and test are encoded together so both end up with the same dummy columns.
# The guard makes the cell safe to re-run: once `region` has been replaced by
# its dummy columns, calling get_dummies on it again would raise a KeyError.
if "region" in X_train.columns:
    n_train_rows = len(X_train)
    temp_df = pd.concat([X_train, X_test], axis=0)
    temp_df = pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
    X_train = temp_df.iloc[:n_train_rows, :]
    X_test = temp_df.iloc[n_train_rows:, :]
    del temp_df

In [None]:
# Under-sample with Edited Nearest Neighbours and show the class counts
# before (printed) and after (cell output).
train_labels = y_train["rainfall"]
print(Counter(train_labels))

enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(X_train, train_labels)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 125276, 1.0: 12625})

#### Use SMOTEENN

In [None]:
# SMOTEENN needs purely numeric features, so `region` is one-hot encoded first.
# Train and test are encoded together so both end up with the same dummy columns.
# The guard makes the cell safe to re-run: once `region` has been replaced by
# its dummy columns, calling get_dummies on it again would raise a KeyError.
if "region" in X_train.columns:
    n_train_rows = len(X_train)
    temp_df = pd.concat([X_train, X_test], axis=0)
    temp_df = pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
    X_train = temp_df.iloc[:n_train_rows, :]
    X_test = temp_df.iloc[n_train_rows:, :]
    del temp_df

In [None]:
# Combined over-/under-sampling with SMOTEENN; class counts are shown before
# (printed) and after (cell output).
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so
# it is presumably slow on the full training set.
train_labels = y_train["rainfall"]
print(Counter(train_labels))

smoteenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, train_labels)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


KeyboardInterrupt: 

### Scaling the data

In [None]:
# Standardise the numeric feature columns.
# The scaler is fitted on the (resampled) training data only and then applied
# to the test data. Fitting it on train and test together, as before, leaks
# test-set statistics into the features the model is trained on.
# NOTE(review): positions 6-13 are assumed to be the continuous weather
# measurements. The column layout differs between the SMOTE-NC path (`region`
# still present) and the one-hot paths (dummies appended at the end), so the
# same positions may select different columns — confirm.
scaled_columns = X_resampled.columns[6:14]

scaler = StandardScaler()

# Work on copies so that frames produced by earlier slicing are not modified
# through a view (avoids SettingWithCopyWarning).
X_resampled = X_resampled.copy()
X_test = X_test.copy()
X_resampled[scaled_columns] = scaler.fit_transform(X_resampled[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

### Using PCA (Optional)

In [None]:
# PCA needs purely numeric input. On the SMOTE-NC path `region` is still a raw
# categorical column, so it is one-hot encoded here; on the ENN / SMOTEENN
# paths it has already been encoded and the guard turns this cell into a no-op
# instead of a KeyError.
if "region" in X_resampled.columns:
    n_resampled_rows = len(X_resampled)
    temp_df = pd.concat([X_resampled, X_test], axis=0)
    temp_df = pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
    X_resampled = temp_df.iloc[:n_resampled_rows, :]
    X_test = temp_df.iloc[n_resampled_rows:, :]
    del temp_df

In [None]:
# Project the features onto the principal components that explain 95% of the
# variance.
# PCA is fitted on the training rows only and then applied to the test rows;
# fitting on train and test together leaks test-set information into the
# projection used for training.
pca = PCA(n_components=0.95)
train_components = pca.fit_transform(X_resampled)
test_components = pca.transform(X_test)

# With a variance target the number of retained components is data-dependent,
# so the column names are generated instead of hard-coding PC1..PC5.
component_names = [f"PC{i + 1}" for i in range(pca.n_components_)]

# Keep the layout later cells rely on: the first len(X_resampled) rows are the
# training rows, the remaining rows are the test rows.
pca_df = pd.DataFrame(
    np.vstack([train_components, test_components]),
    columns=component_names,
)
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-14.880876,8.462322,5.760845,-0.000820,0.863952
1,-14.878361,8.447230,5.867121,-1.229054,0.535909
2,-14.880858,8.457538,5.883434,-0.289807,0.626046
3,-14.882460,8.490851,5.871944,0.741605,0.672590
4,-14.880804,8.461879,5.827168,-0.189952,0.851750
...,...,...,...,...,...
328559,15.689674,-10.612139,-5.464108,-1.002921,1.714406
328560,15.682841,-10.592679,-5.450484,1.926322,1.286554
328561,15.684759,-10.596640,-5.435740,0.978755,1.239761
328562,15.681319,-10.562421,-5.406606,2.967383,-0.008761


### Hyper-tuning the model

In [None]:
# Grid-search tree depth and number of trees for CatBoost.
# `region` only exists as a raw categorical column on the SMOTE-NC path; on the
# one-hot paths it has been replaced by dummy columns, and passing it as a
# categorical feature would make the fit fail.
cat_features = ['region'] if 'region' in X_resampled.columns else []
fit_params = {'cat_features': cat_features} if cat_features else {}

# verbose=0 silences CatBoost's per-iteration log, which is otherwise printed
# for every parameter combination and every CV fold.
base_model = CatBoostClassifier(random_seed=8, verbose=0)
parameters = {'depth': [2, 3, 4], 'n_estimators': [25, 50]}

# NOTE(review): cross-validation runs on the already resampled data, so the
# validation folds contain resampled rows — the CV scores are presumably
# optimistic; confirm whether resampling should move inside the CV loop.
clf = GridSearchCV(base_model, parameters)
clf.fit(X_resampled, y_resampled, **fit_params)

Learning rate set to 0.5
0:	learn: 0.5180751	total: 30.3ms	remaining: 727ms
1:	learn: 0.4596773	total: 52.9ms	remaining: 608ms
2:	learn: 0.4338717	total: 74.1ms	remaining: 544ms
3:	learn: 0.4209816	total: 95.5ms	remaining: 501ms
4:	learn: 0.4123749	total: 115ms	remaining: 460ms
5:	learn: 0.3884521	total: 137ms	remaining: 434ms
6:	learn: 0.3810554	total: 157ms	remaining: 404ms
7:	learn: 0.3739659	total: 188ms	remaining: 399ms
8:	learn: 0.3611843	total: 209ms	remaining: 372ms
9:	learn: 0.3480334	total: 230ms	remaining: 344ms
10:	learn: 0.3366774	total: 250ms	remaining: 318ms
11:	learn: 0.3215110	total: 270ms	remaining: 292ms
12:	learn: 0.3055940	total: 292ms	remaining: 269ms
13:	learn: 0.2876134	total: 313ms	remaining: 246ms
14:	learn: 0.2779769	total: 333ms	remaining: 222ms
15:	learn: 0.2678926	total: 353ms	remaining: 199ms
16:	learn: 0.2630001	total: 375ms	remaining: 177ms
17:	learn: 0.2482276	total: 396ms	remaining: 154ms
18:	learn: 0.2443717	total: 416ms	remaining: 131ms
19:	learn: 0

35:	learn: 0.1744506	total: 766ms	remaining: 298ms
36:	learn: 0.1729606	total: 786ms	remaining: 276ms
37:	learn: 0.1712711	total: 807ms	remaining: 255ms
38:	learn: 0.1699303	total: 827ms	remaining: 233ms
39:	learn: 0.1692131	total: 847ms	remaining: 212ms
40:	learn: 0.1638732	total: 867ms	remaining: 190ms
41:	learn: 0.1587371	total: 887ms	remaining: 169ms
42:	learn: 0.1557221	total: 907ms	remaining: 148ms
43:	learn: 0.1528462	total: 927ms	remaining: 126ms
44:	learn: 0.1504138	total: 949ms	remaining: 105ms
45:	learn: 0.1487776	total: 969ms	remaining: 84.2ms
46:	learn: 0.1474380	total: 989ms	remaining: 63.1ms
47:	learn: 0.1449286	total: 1.01s	remaining: 42.1ms
48:	learn: 0.1443709	total: 1.03s	remaining: 21.1ms
49:	learn: 0.1403132	total: 1.05s	remaining: 0us
Learning rate set to 0.5
0:	learn: 0.5298670	total: 21.6ms	remaining: 1.06s
1:	learn: 0.4739948	total: 43.9ms	remaining: 1.05s
2:	learn: 0.4510051	total: 67.3ms	remaining: 1.05s
3:	learn: 0.4383269	total: 90.3ms	remaining: 1.04s
4:	l

Learning rate set to 0.5
0:	learn: 0.5308876	total: 22.4ms	remaining: 1.1s
1:	learn: 0.4768819	total: 44.5ms	remaining: 1.07s
2:	learn: 0.4520253	total: 66ms	remaining: 1.03s
3:	learn: 0.4399186	total: 87.9ms	remaining: 1.01s
4:	learn: 0.4259933	total: 109ms	remaining: 978ms
5:	learn: 0.4046713	total: 129ms	remaining: 947ms
6:	learn: 0.3982056	total: 150ms	remaining: 924ms
7:	learn: 0.3911562	total: 171ms	remaining: 899ms
8:	learn: 0.3845494	total: 194ms	remaining: 883ms
9:	learn: 0.3651028	total: 216ms	remaining: 863ms
10:	learn: 0.3541983	total: 237ms	remaining: 840ms
11:	learn: 0.3443715	total: 258ms	remaining: 817ms
12:	learn: 0.3400263	total: 283ms	remaining: 806ms
13:	learn: 0.3269335	total: 305ms	remaining: 784ms
14:	learn: 0.3135940	total: 327ms	remaining: 764ms
15:	learn: 0.3055597	total: 349ms	remaining: 742ms
16:	learn: 0.2977206	total: 371ms	remaining: 719ms
17:	learn: 0.2933054	total: 392ms	remaining: 697ms
18:	learn: 0.2908648	total: 418ms	remaining: 682ms
19:	learn: 0.28

12:	learn: 0.2874922	total: 382ms	remaining: 352ms
13:	learn: 0.2798552	total: 409ms	remaining: 321ms
14:	learn: 0.2730518	total: 435ms	remaining: 290ms
15:	learn: 0.2671276	total: 460ms	remaining: 259ms
16:	learn: 0.2628740	total: 485ms	remaining: 228ms
17:	learn: 0.2576832	total: 509ms	remaining: 198ms
18:	learn: 0.2544876	total: 534ms	remaining: 169ms
19:	learn: 0.2523756	total: 559ms	remaining: 140ms
20:	learn: 0.2507550	total: 584ms	remaining: 111ms
21:	learn: 0.2468320	total: 610ms	remaining: 83.1ms
22:	learn: 0.2399836	total: 635ms	remaining: 55.2ms
23:	learn: 0.2363049	total: 661ms	remaining: 27.5ms
24:	learn: 0.2335896	total: 685ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 0.4991502	total: 50.7ms	remaining: 2.48s
1:	learn: 0.4285999	total: 82.3ms	remaining: 1.97s
2:	learn: 0.3985878	total: 110ms	remaining: 1.73s
3:	learn: 0.3818602	total: 137ms	remaining: 1.58s
4:	learn: 0.3577848	total: 165ms	remaining: 1.48s
5:	learn: 0.3444500	total: 191ms	remaining: 1.4s
6:	learn: 

Learning rate set to 0.5
0:	learn: 0.5345924	total: 27.3ms	remaining: 1.34s
1:	learn: 0.4697539	total: 56.7ms	remaining: 1.36s
2:	learn: 0.4430395	total: 86.7ms	remaining: 1.36s
3:	learn: 0.4123527	total: 114ms	remaining: 1.31s
4:	learn: 0.3970059	total: 144ms	remaining: 1.3s
5:	learn: 0.3699749	total: 172ms	remaining: 1.26s
6:	learn: 0.3610723	total: 201ms	remaining: 1.23s
7:	learn: 0.3489195	total: 229ms	remaining: 1.2s
8:	learn: 0.3420042	total: 257ms	remaining: 1.17s
9:	learn: 0.3291625	total: 284ms	remaining: 1.13s
10:	learn: 0.3154491	total: 310ms	remaining: 1.1s
11:	learn: 0.3071935	total: 337ms	remaining: 1.07s
12:	learn: 0.2996939	total: 364ms	remaining: 1.03s
13:	learn: 0.2951414	total: 391ms	remaining: 1s
14:	learn: 0.2897149	total: 418ms	remaining: 976ms
15:	learn: 0.2848664	total: 446ms	remaining: 947ms
16:	learn: 0.2803560	total: 472ms	remaining: 915ms
17:	learn: 0.2778353	total: 497ms	remaining: 884ms
18:	learn: 0.2731436	total: 553ms	remaining: 903ms
19:	learn: 0.270008

11:	learn: 0.2793135	total: 405ms	remaining: 439ms
12:	learn: 0.2723015	total: 438ms	remaining: 404ms
13:	learn: 0.2631992	total: 470ms	remaining: 369ms
14:	learn: 0.2608997	total: 503ms	remaining: 335ms
15:	learn: 0.2585872	total: 535ms	remaining: 301ms
16:	learn: 0.2531133	total: 565ms	remaining: 266ms
17:	learn: 0.2478479	total: 599ms	remaining: 233ms
18:	learn: 0.2406777	total: 633ms	remaining: 200ms
19:	learn: 0.2361542	total: 667ms	remaining: 167ms
20:	learn: 0.2303664	total: 699ms	remaining: 133ms
21:	learn: 0.2252177	total: 734ms	remaining: 100ms
22:	learn: 0.2221249	total: 768ms	remaining: 66.8ms
23:	learn: 0.2185085	total: 803ms	remaining: 33.5ms
24:	learn: 0.2151034	total: 838ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 0.5308573	total: 33.8ms	remaining: 811ms
1:	learn: 0.4637824	total: 67.6ms	remaining: 778ms
2:	learn: 0.4170932	total: 104ms	remaining: 760ms
3:	learn: 0.3843470	total: 137ms	remaining: 720ms
4:	learn: 0.3647480	total: 168ms	remaining: 674ms
5:	learn:

47:	learn: 0.1598240	total: 1.61s	remaining: 67ms
48:	learn: 0.1585949	total: 1.64s	remaining: 33.6ms
49:	learn: 0.1569907	total: 1.68s	remaining: 0us
Learning rate set to 0.5
0:	learn: 0.5174035	total: 46.7ms	remaining: 2.29s
1:	learn: 0.4536936	total: 81.6ms	remaining: 1.96s
2:	learn: 0.4200307	total: 114ms	remaining: 1.78s
3:	learn: 0.3860947	total: 148ms	remaining: 1.7s
4:	learn: 0.3671263	total: 185ms	remaining: 1.66s
5:	learn: 0.3586350	total: 218ms	remaining: 1.6s
6:	learn: 0.3490717	total: 255ms	remaining: 1.56s
7:	learn: 0.3256242	total: 289ms	remaining: 1.52s
8:	learn: 0.3157688	total: 324ms	remaining: 1.48s
9:	learn: 0.3088630	total: 356ms	remaining: 1.42s
10:	learn: 0.2937090	total: 390ms	remaining: 1.38s
11:	learn: 0.2793135	total: 426ms	remaining: 1.35s
12:	learn: 0.2723015	total: 459ms	remaining: 1.31s
13:	learn: 0.2631992	total: 493ms	remaining: 1.27s
14:	learn: 0.2608997	total: 527ms	remaining: 1.23s
15:	learn: 0.2585872	total: 559ms	remaining: 1.19s
16:	learn: 0.25311

13:	learn: 0.3267697	total: 392ms	remaining: 308ms
14:	learn: 0.3137949	total: 420ms	remaining: 280ms
15:	learn: 0.3062578	total: 446ms	remaining: 251ms
16:	learn: 0.2983295	total: 472ms	remaining: 222ms
17:	learn: 0.2943350	total: 499ms	remaining: 194ms
18:	learn: 0.2871347	total: 528ms	remaining: 167ms
19:	learn: 0.2832794	total: 555ms	remaining: 139ms
20:	learn: 0.2792088	total: 582ms	remaining: 111ms
21:	learn: 0.2780312	total: 613ms	remaining: 83.6ms
22:	learn: 0.2756813	total: 640ms	remaining: 55.7ms
23:	learn: 0.2736883	total: 666ms	remaining: 27.7ms
24:	learn: 0.2723522	total: 692ms	remaining: 0us


GridSearchCV(estimator=<catboost.core.CatBoostClassifier object at 0x000002937BE20AF0>,
             param_grid={'depth': [2, 3, 4], 'n_estimators': [25, 50]})

In [None]:
# Show the parameter combination that scored best in the grid search.
best_params = clf.best_params_
print(best_params)

{'depth': 2, 'n_estimators': 25}


### Fitting data into model

#### If PCA was used

In [None]:
# `cat_features` is assigned here but is not passed to the fit in this cell:
# the model is trained on the principal components in `pca_df`.
cat_features = ['region']

# CatBoost model using depth=2 and n_estimators=25.
model = CatBoostClassifier(
    random_seed=8,
    verbose=0,
    depth=2,
    n_estimators=25,
    boosting_type='Ordered',
    auto_class_weights='Balanced',
)

# `pca_df` holds the training rows first and the test rows after them, so the
# length of the resampled training set is the split point.
n_train_rows = len(X_resampled)
model.fit(pca_df.iloc[:n_train_rows, :], y_resampled)
y_pred = model.predict(pca_df.iloc[n_train_rows:, :])

# Confusion matrix unpacked as true/false negatives/positives.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * ((precision * recall) / (precision + recall))

# Printed in order: confusion matrix, accuracy, precision, recall, F1.
for metric in (cm, accuracy, precision, recall, f1_score):
    print(metric)


[[34972  2645]
 [ 1715  2134]]
0.8948536150098876
0.44653693241263864
0.5544297220057157
0.4946685210941122


#### If PCA was not used

In [None]:
# Train CatBoost directly on the (resampled) features and evaluate on the test set.
# `region` is passed as a categorical feature only when it is still a raw
# column (SMOTE-NC path). On the one-hot paths it no longer exists, so
# cat_features is None, which is the same as not passing it. This replaces the
# manual comment/uncomment toggle that made the SMOTE-NC path fail unless the
# cell was edited by hand.
cat_features = ['region'] if 'region' in X_resampled.columns else None

model = CatBoostClassifier(random_seed = 8, verbose = 0)

# Alternative: model with the grid-searched hyper-parameters
# model = CatBoostClassifier(random_seed = 8, verbose = 0,
#                            depth = 2, n_estimators = 25,
#                            boosting_type = 'Ordered',
#                            auto_class_weights = 'Balanced',)

model.fit(X_resampled, y_resampled, cat_features = cat_features)

y_pred = model.predict(X_test)

# Confusion matrix unpacked as true/false negatives/positives.
cm = confusion_matrix(y_test, y_pred)

tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * ((precision * recall) / (precision + recall))

# Printed in order: confusion matrix, accuracy, precision, recall, F1.
print(cm)
print(accuracy)
print(precision)
print(recall)
print(f1_score)

In [None]:
# Generating ROC Curve
# A ROC curve needs a continuous score per sample. Feeding it the hard 0/1
# predictions in `y_pred` collapses the curve to a single operating point and
# makes the reported AUC misleading, so the predicted probability of the
# positive class is used instead.
# Pick the test features that match what `model` was trained on: the raw
# feature frame when its columns cover the model's feature names, otherwise
# the test rows of the PCA projection.
if set(model.feature_names_) <= set(X_test.columns):
    roc_features = X_test
else:
    roc_features = pca_df.iloc[len(X_resampled):, :]

# Column 1 of predict_proba is the probability of the second class in
# model.classes_.
# NOTE(review): assumes that class is the positive "rain" label 1.0 — confirm.
final_pred_formatted = model.predict_proba(roc_features)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, final_pred_formatted)

# Plotting the chart
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)

# Format & add the dashed diagonal (chance level)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')


fig.show()