# Adaboost

## This notebook includes the following sections:
1. Data pre-processing
2. base Decision Tree without resampling
3. SMOTE-NC resampling
4. EEN resampling
5. SMOTE-EEN resampling
6. EEN + PCA
7. EEN + hypertuned k-NN

import needed libraries

In [None]:
# Import packages
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn import metrics
from sklearn.metrics import *
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTENC
from collections import Counter

# Data pre-processing

In [None]:
weather_adjusted = pd.read_csv("../Data/weather_data_2.csv")
weather_adjusted

Unnamed: 0,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall
0,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0
1,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0
2,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0
3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0
4,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0
197636,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0
197637,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0
197638,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0


create time variables

In [None]:
weather_adjusted.timestamp=pd.to_datetime(weather_adjusted.timestamp,infer_datetime_format=True)
weather_adjusted["year"]=weather_adjusted.timestamp.apply(lambda x: x.year)
weather_adjusted["quarter"]=weather_adjusted.timestamp.apply(lambda x: x.quarter)
weather_adjusted["month"]=weather_adjusted.timestamp.apply(lambda x: x.month)
weather_adjusted["day"]=weather_adjusted.timestamp.apply(lambda x: x.day)
weather_adjusted["hour"]=weather_adjusted.timestamp.apply(lambda x: x.hour)
weather_adjusted

Unnamed: 0,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall,year,quarter,month,day,hour
0,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0,2017,1,1,1,3
1,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0,2017,1,1,1,3
2,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0,2017,1,1,1,3
3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0,2017,1,1,1,3
4,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0,2017,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0,2021,4,12,31,23
197636,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0,2021,4,12,31,23
197637,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0,2021,4,12,31,23
197638,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0,2021,4,12,31,23


In [None]:
weather_adjusted=pd.concat([weather_adjusted.iloc[:,12:],weather_adjusted.iloc[:,:11],weather_adjusted.iloc[:,11:12]],axis=1)
weather_adjusted

Unnamed: 0,year,quarter,month,day,hour,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall
0,2017,1,1,1,3,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0
1,2017,1,1,1,3,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0
2,2017,1,1,1,3,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0
3,2017,1,1,1,3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0
4,2017,1,1,1,3,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021,4,12,31,23,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0
197636,2021,4,12,31,23,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0
197637,2021,4,12,31,23,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0
197638,2021,4,12,31,23,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0


Train test split

In [None]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_adjusted[weather_adjusted["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_adjusted[~weather_adjusted["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_adjusted[weather_adjusted["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_adjusted[~weather_adjusted["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])
X_train

Unnamed: 0,year,quarter,month,day,hour,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y
0,2017,1,1,1,3,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0
1,2017,1,1,1,3,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0
2,2017,1,1,1,3,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0
3,2017,1,1,1,3,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0
4,2017,1,1,1,3,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156169,2020,4,12,31,18,central,25.960000,89.800000,1.0,0.0,-1.0,-0.080000,1.850000e+00,1.0,2.0
156170,2020,4,12,31,18,east,29.300000,66.200000,0.0,3.0,5.0,-0.200000,6.000000e-01,0.0,0.0
156171,2020,4,12,31,18,north,25.650000,83.300000,1.0,-3.0,3.0,-2.700000,1.085000e+01,-10.0,7.0
156172,2020,4,12,31,18,north-east,28.333333,74.800000,0.0,4.0,-1.0,-0.833333,4.433333e+00,0.0,-4.0


In [None]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_train = temp_df.iloc[:len(X_train),:]
X_train

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
0,2017,1,1,1,3,-0.538605,0.953032,-0.300872,-2.214278,-0.910514,-0.118713,-0.005020,0.000239,-0.000271,1,0,0,0,0
1,2017,1,1,1,3,-0.883708,0.690795,-0.300872,-0.587383,0.463936,-0.181761,-0.127890,0.000239,0.351850,0,1,0,0,0
2,2017,1,1,1,3,-0.943208,0.651154,-0.300872,-0.587383,0.738826,-0.118713,0.087132,0.000239,-0.000271,0,0,1,0,0
3,2017,1,1,1,3,-0.943208,0.856980,-0.300872,-0.587383,0.463936,0.049417,-0.039577,-0.850689,-0.352392,0,0,0,1,0
4,2017,1,1,1,3,-0.895608,0.657253,-0.300872,-1.563520,0.189046,-0.169152,0.194642,0.000239,-0.000271,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156169,2020,4,12,31,18,-0.962249,0.907293,3.323674,-0.262004,-0.085844,-0.093493,0.421183,0.425703,0.703971,1,0,0,0,0
156170,2020,4,12,31,18,0.627606,-1.251587,-0.300872,0.714133,1.563496,-0.244810,0.133208,0.000239,-0.000271,0,1,0,0,0
156171,2020,4,12,31,18,-1.109810,0.312686,3.323674,-1.238141,1.013716,-3.397241,2.494603,-4.254402,2.464576,0,0,1,0,0
156172,2020,4,12,31,18,0.167469,-0.464877,-0.300872,1.039512,-0.085844,-1.043426,1.016331,0.000239,-1.408755,0,0,0,1,0


In [None]:
X_test = temp_df.iloc[len(X_train):,:]
del temp_df
X_test

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
156174,2020,4,12,31,19,-1.685775,1.472169,3.323674,0.714133,-0.085844,-1.909294,1.417576,1.276631,-0.000271,1,0,0,0,0
156175,2020,4,12,31,19,0.246803,-0.684424,-0.300872,1.690270,1.838387,-1.001393,1.423336,1.276631,0.351850,0,1,0,0,0
156176,2020,4,12,31,19,-1.252611,0.824962,3.323674,0.714133,0.189046,-0.370907,1.285108,2.553023,-1.056634,0,0,1,0,0
156177,2020,4,12,31,19,-0.863874,0.684696,3.323674,0.063375,-0.635624,-2.724722,2.890089,-1.276153,-0.704513,0,0,0,1,0
156178,2020,4,12,31,19,-1.673082,1.328091,3.323674,1.039512,0.738826,-0.665134,0.102490,1.276631,2.816697,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021,4,12,31,23,-2.053886,1.419569,3.323674,-0.587383,-0.085844,-0.959361,-0.500338,-1.276153,0.351850,1,0,0,0,0
197636,2021,4,12,31,23,-1.514414,1.200022,3.323674,-1.238141,0.189046,0.133482,-0.074134,-0.850689,-0.704513,0,1,0,0,0
197637,2021,4,12,31,23,-1.657215,1.492752,3.323674,-0.262004,-0.085844,0.259579,-0.143249,-0.850689,-0.352392,0,0,1,0,0
197638,2021,4,12,31,23,-1.752416,1.168005,-0.300872,-0.262004,-0.085844,0.448725,-0.235401,0.000239,-1.056634,0,0,0,1,0


# base Decision Tree without resampling

In [None]:
#initialise base adaboost 
model = AdaBoostClassifier()

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

#Use the trained model to predict the test data
predictions = model.predict(X_test)

#Accuracy, Precision, Recall and F1
accuracy = accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1:",f_measure)

  y = column_or_1d(y, warn=True)


Accuracy: 0.9202720300969469
Precision: 0.6187144731088763
Recall: 0.36762795531306836
F1: 0.4612125162972621


In [None]:
#rocauc
ada_base_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, ada_base_pred, pos_label=1)
auc(fpr, tpr)

0.6722234733632625

# SMOTE-NC Resampling

In [None]:
#pre-process data again
weather_full=pd.read_csv("weather_adjusted.csv")

weather_full.timestamp=pd.to_datetime(weather_full.timestamp,infer_datetime_format=True)
weather_full["year"]=weather_full.timestamp.apply(lambda x: x.year)
weather_full["quarter"]=weather_full.timestamp.apply(lambda x: x.quarter)
weather_full["month"]=weather_full.timestamp.apply(lambda x: x.month)
weather_full["day"]=weather_full.timestamp.apply(lambda x: x.day)
weather_full["hour"]=weather_full.timestamp.apply(lambda x: x.hour)

weather_full=pd.concat([weather_full.iloc[:,12:],weather_full.iloc[:,:11],weather_full.iloc[:,11:12]],axis=1)

runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

#SMOTE-NC resampling
print(Counter(y_train.rainfall))
smote_nc = SMOTENC(categorical_features=[5], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 143549})

In [None]:
X_resampled

Unnamed: 0,year,quarter,month,day,hour,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y
0,2017,1,1,1,3,central,26.850000,90.300000,0.000000,-6.000000,-4.000000,-0.100000,-1.421085e-14,0.000000,0.000000
1,2017,1,1,1,3,east,26.125000,87.433333,0.000000,-1.000000,1.000000,-0.150000,-5.333333e-01,0.000000,1.000000
2,2017,1,1,1,3,north,26.000000,87.000000,0.000000,-1.000000,2.000000,-0.100000,4.000000e-01,0.000000,0.000000
3,2017,1,1,1,3,north-east,26.000000,89.250000,0.000000,-1.000000,1.000000,0.033333,-1.500000e-01,-2.000000,-1.000000
4,2017,1,1,1,3,west,26.100000,87.066667,0.000000,-4.000000,0.000000,-0.140000,8.666667e-01,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287093,2018,2,6,6,10,north-east,26.018087,93.458274,0.713579,2.286421,-2.286421,-1.694997,8.821806e+00,0.000000,-3.286421
287094,2017,2,6,18,9,west,28.372432,82.265984,0.000000,-5.108495,-3.108495,-0.204401,2.121119e+00,-3.847491,-1.261004
287095,2017,1,1,2,14,central,26.551796,81.207136,1.000000,6.657089,-0.328545,-0.309497,-1.882089e+00,7.342911,4.000000
287096,2019,2,6,3,19,east,28.960638,81.380277,1.000000,-4.852709,-1.803612,-2.009128,9.654007e+00,-0.901806,0.049097


In [None]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_resampled

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
0,2017,1,1,1,3,-0.368775,0.785725,-0.619154,-1.927747,-0.908897,0.076037,-0.151417,0.016280,0.017967,1,0,0,0,0
1,2017,1,1,1,3,-0.693481,0.525594,-0.619154,-0.405321,0.536957,0.025337,-0.258075,0.016280,0.354667,0,1,0,0,0
2,2017,1,1,1,3,-0.749465,0.486272,-0.619154,-0.405321,0.826128,0.076037,-0.071423,0.016280,0.017967,0,0,1,0,0
3,2017,1,1,1,3,-0.749465,0.690445,-0.619154,-0.405321,0.536957,0.211238,-0.181414,-0.730024,-0.318733,0,0,0,1,0
4,2017,1,1,1,3,-0.704678,0.492322,-0.619154,-1.318777,0.247786,0.035477,0.021903,0.016280,0.017967,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287093,2018,2,6,6,10,-0.741365,1.072317,1.108104,0.595346,-0.413380,-1.541297,1.612804,0.016280,-1.088571,0,0,0,1,0
287094,2017,2,6,18,9,0.313078,0.056692,-0.619154,-1.656297,-0.651100,-0.029826,0.272773,-1.419418,-0.406613,0,0,0,0,1
287095,2017,1,1,2,14,-0.502332,-0.039392,1.801403,1.926150,0.152781,-0.136394,-0.527804,2.756299,1.364767,1,0,0,0,0
287096,2019,2,6,3,19,0.576518,-0.023680,1.801403,-1.578414,-0.273766,-1.859828,1.779230,-0.320231,0.034498,0,1,0,0,0


In [None]:
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df
X_test

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
156174,2020,4,12,31,19,-1.448144,1.300693,1.801403,0.812620,-0.041385,-1.363849,1.083485,1.135735,0.017967,1,0,0,0,0
156175,2020,4,12,31,19,0.370212,-0.838581,-0.619154,1.726076,1.982812,-0.633766,1.088484,1.135735,0.354667,0,1,0,0,0
156176,2020,4,12,31,19,-1.040581,0.658684,1.801403,0.812620,0.247786,-0.126764,0.968494,2.255190,-0.992133,0,0,1,0,0
156177,2020,4,12,31,19,-0.674820,0.519545,1.801403,0.203650,-0.619726,-2.019573,2.361716,-1.103175,-0.655433,0,0,0,1,0
156178,2020,4,12,31,19,-1.436201,1.157773,1.801403,1.117105,0.826128,-0.363365,-0.058091,1.135735,2.711566,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021,4,12,31,23,-1.794498,1.248516,1.801403,-0.405321,-0.041385,-0.599966,-0.581382,-1.103175,0.354667,1,0,0,0,0
197636,2021,4,12,31,23,-1.286911,1.030732,1.801403,-1.014291,0.247786,0.278838,-0.211412,-0.730024,-0.655433,0,1,0,0,0
197637,2021,4,12,31,23,-1.421272,1.321111,1.801403,-0.100836,-0.041385,0.380239,-0.271407,-0.730024,-0.318733,0,0,1,0,0
197638,2021,4,12,31,23,-1.510846,0.998972,-0.619154,-0.100836,-0.041385,0.532340,-0.351401,0.016280,-0.992133,0,0,0,1,0


In [None]:
#initialise base adaboost 
model = AdaBoostClassifier()

#Fit the resampled training feature Xs and training label Ys
model.fit(X_resampled, y_resampled)

#Use the trained model to predict the test data
predictions = model.predict(X_test)

#Accuracy, Precision, Recall and F1
accuracy = accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1:",f_measure)

Accuracy: 0.9152558722809049
Precision: 0.5442770288131112
Recall: 0.5349441413354118
F1: 0.5395702306079665


In [None]:
#rocauc
ada_base_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, ada_base_pred, pos_label=1)
auc(fpr, tpr)

0.7445568993355953

# EEN resampling

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

#pre-process data again
weather_full=pd.read_csv("weather_adjusted.csv")

weather_full.timestamp=pd.to_datetime(weather_full.timestamp,infer_datetime_format=True)
weather_full["year"]=weather_full.timestamp.apply(lambda x: x.year)
weather_full["quarter"]=weather_full.timestamp.apply(lambda x: x.quarter)
weather_full["month"]=weather_full.timestamp.apply(lambda x: x.month)
weather_full["day"]=weather_full.timestamp.apply(lambda x: x.day)
weather_full["hour"]=weather_full.timestamp.apply(lambda x: x.hour)

weather_full=pd.concat([weather_full.iloc[:,12:],weather_full.iloc[:,:11],weather_full.iloc[:,11:12]],axis=1)

runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

#EEN resampling
print(Counter(y_train.rainfall))
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 125276, 1.0: 12625})

In [None]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [None]:
#initialise base adaboost 
model = AdaBoostClassifier()

#Fit the resampled training feature Xs and training label Ys
model.fit(X_resampled, y_resampled)

#Use the trained model to predict the test data
predictions = model.predict(X_test)

#Accuracy, Precision, Recall and F1
accuracy = accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1:",f_measure)

Accuracy: 0.919331500506439
Precision: 0.5778739184177998
Recall: 0.4858404780462458
F1: 0.5278757939308397


In [None]:
#rocauc
ada_base_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, ada_base_pred, pos_label=1)
auc(fpr, tpr)

0.7247635545453601

# SMOTE-ENN resampling

In [None]:
from imblearn.combine import SMOTEENN

#pre-process data again
weather_full=pd.read_csv("weather_adjusted.csv")

weather_full.timestamp=pd.to_datetime(weather_full.timestamp,infer_datetime_format=True)
weather_full["year"]=weather_full.timestamp.apply(lambda x: x.year)
weather_full["quarter"]=weather_full.timestamp.apply(lambda x: x.quarter)
weather_full["month"]=weather_full.timestamp.apply(lambda x: x.month)
weather_full["day"]=weather_full.timestamp.apply(lambda x: x.day)
weather_full["hour"]=weather_full.timestamp.apply(lambda x: x.hour)

weather_full=pd.concat([weather_full.iloc[:,12:],weather_full.iloc[:,:11],weather_full.iloc[:,11:12]],axis=1)

runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

#SMOTE-EEN resampling
print(Counter(y_train.rainfall))
smoteenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 109258, 1.0: 142925})

In [None]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [None]:
#initialise base adaboost 
model = AdaBoostClassifier()

#Fit the resampled training feature Xs and training label Ys
model.fit(X_resampled, y_resampled)

#Use the trained model to predict the test data
predictions = model.predict(X_test)

#Accuracy, Precision, Recall and F1
accuracy = accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1:",f_measure)

Accuracy: 0.9097332754545893
Precision: 0.5125177137458667
Recall: 0.5637828007274617
F1: 0.5369293579116664


In [None]:
#rocauc
ada_base_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, ada_base_pred, pos_label=1)
auc(fpr, tpr)

0.7544569957062621

# SMOTE-NC + PCA

Since resampling method of SMOTE-NC produced the best results, see if PCA helps

In [None]:
#pre-process data again
weather_full=pd.read_csv("weather_adjusted.csv")

weather_full.timestamp=pd.to_datetime(weather_full.timestamp,infer_datetime_format=True)
weather_full["year"]=weather_full.timestamp.apply(lambda x: x.year)
weather_full["quarter"]=weather_full.timestamp.apply(lambda x: x.quarter)
weather_full["month"]=weather_full.timestamp.apply(lambda x: x.month)
weather_full["day"]=weather_full.timestamp.apply(lambda x: x.day)
weather_full["hour"]=weather_full.timestamp.apply(lambda x: x.hour)

weather_full=pd.concat([weather_full.iloc[:,12:],weather_full.iloc[:,:11],weather_full.iloc[:,11:12]],axis=1)

runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

#SMOTE-NC resampling
print(Counter(y_train.rainfall))
smote_nc = SMOTENC(categorical_features=[5], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 143549})

In [None]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_resampled

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
0,2017,1,1,1,3,-0.368775,0.785725,-0.619154,-1.927747,-0.908897,0.076037,-0.151417,0.016280,0.017967,1,0,0,0,0
1,2017,1,1,1,3,-0.693481,0.525594,-0.619154,-0.405321,0.536957,0.025337,-0.258075,0.016280,0.354667,0,1,0,0,0
2,2017,1,1,1,3,-0.749465,0.486272,-0.619154,-0.405321,0.826128,0.076037,-0.071423,0.016280,0.017967,0,0,1,0,0
3,2017,1,1,1,3,-0.749465,0.690445,-0.619154,-0.405321,0.536957,0.211238,-0.181414,-0.730024,-0.318733,0,0,0,1,0
4,2017,1,1,1,3,-0.704678,0.492322,-0.619154,-1.318777,0.247786,0.035477,0.021903,0.016280,0.017967,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287093,2018,2,6,6,10,-0.741365,1.072317,1.108104,0.595346,-0.413380,-1.541297,1.612804,0.016280,-1.088571,0,0,0,1,0
287094,2017,2,6,18,9,0.313078,0.056692,-0.619154,-1.656297,-0.651100,-0.029826,0.272773,-1.419418,-0.406613,0,0,0,0,1
287095,2017,1,1,2,14,-0.502332,-0.039392,1.801403,1.926150,0.152781,-0.136394,-0.527804,2.756299,1.364767,1,0,0,0,0
287096,2019,2,6,3,19,0.576518,-0.023680,1.801403,-1.578414,-0.273766,-1.859828,1.779230,-0.320231,0.034498,0,1,0,0,0


In [None]:
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df
X_test

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
156174,2020,4,12,31,19,-1.448144,1.300693,1.801403,0.812620,-0.041385,-1.363849,1.083485,1.135735,0.017967,1,0,0,0,0
156175,2020,4,12,31,19,0.370212,-0.838581,-0.619154,1.726076,1.982812,-0.633766,1.088484,1.135735,0.354667,0,1,0,0,0
156176,2020,4,12,31,19,-1.040581,0.658684,1.801403,0.812620,0.247786,-0.126764,0.968494,2.255190,-0.992133,0,0,1,0,0
156177,2020,4,12,31,19,-0.674820,0.519545,1.801403,0.203650,-0.619726,-2.019573,2.361716,-1.103175,-0.655433,0,0,0,1,0
156178,2020,4,12,31,19,-1.436201,1.157773,1.801403,1.117105,0.826128,-0.363365,-0.058091,1.135735,2.711566,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021,4,12,31,23,-1.794498,1.248516,1.801403,-0.405321,-0.041385,-0.599966,-0.581382,-1.103175,0.354667,1,0,0,0,0
197636,2021,4,12,31,23,-1.286911,1.030732,1.801403,-1.014291,0.247786,0.278838,-0.211412,-0.730024,-0.655433,0,1,0,0,0
197637,2021,4,12,31,23,-1.421272,1.321111,1.801403,-0.100836,-0.041385,0.380239,-0.271407,-0.730024,-0.318733,0,0,1,0,0
197638,2021,4,12,31,23,-1.510846,0.998972,-0.619154,-0.100836,-0.041385,0.532340,-0.351401,0.016280,-0.992133,0,0,0,1,0


In [None]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
pca=PCA(n_components=0.95)
pca_components=pca.fit_transform(temp_df)
pca_df = pd.DataFrame(pca_components,columns=["PC1","PC2","PC3","PC4","PC5"])
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-14.880755,8.463824,5.761134,0.829468,-2.126599
1,-14.880415,8.460514,5.871510,0.559080,-1.282495
2,-14.880803,8.459678,5.884406,0.630299,-1.246453
3,-14.880274,8.481644,5.869281,0.646356,-1.466767
4,-14.880726,8.463801,5.827905,0.841523,-1.608660
...,...,...,...,...,...
328559,15.687172,-10.601566,-5.460768,1.769499,2.624000
328560,15.686689,-10.615775,-5.457656,1.271116,2.304801
328561,15.686498,-10.608580,-5.439477,1.245831,2.443421
328562,15.687299,-10.596882,-5.417433,-0.051533,2.211964


In [None]:
#initialise base adaboost 
model = AdaBoostClassifier()

#Fit the resampled training feature Xs and training label Ys
model.fit(pca_df.iloc[:len(X_resampled),:], y_resampled)

#Use the trained model to predict the test data
predictions = model.predict(pca_df.iloc[len(X_resampled):,:])

#Accuracy, Precision, Recall and F1
accuracy = accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1:",f_measure)

Accuracy: 0.8927555105387547
Precision: 0.4344585708022797
Recall: 0.5149389451805664
F1: 0.47128759957198907


In [None]:
#rocauc
ada_pca_pred = model.predict(pca_df.iloc[len(X_resampled):,:])
fpr, tpr, thresholds = roc_curve(y_test, ada_pca_pred, pos_label=1)
auc(fpr, tpr)

0.7231764667684473

# SMOTE-NC + Hypertuned Decision Tree

In [None]:
#pre-process data again
weather_full=pd.read_csv("weather_adjusted.csv")

weather_full.timestamp=pd.to_datetime(weather_full.timestamp,infer_datetime_format=True)
weather_full["year"]=weather_full.timestamp.apply(lambda x: x.year)
weather_full["quarter"]=weather_full.timestamp.apply(lambda x: x.quarter)
weather_full["month"]=weather_full.timestamp.apply(lambda x: x.month)
weather_full["day"]=weather_full.timestamp.apply(lambda x: x.day)
weather_full["hour"]=weather_full.timestamp.apply(lambda x: x.hour)

weather_full=pd.concat([weather_full.iloc[:,12:],weather_full.iloc[:,:11],weather_full.iloc[:,11:12]],axis=1)

runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

#SMOTE-NC resampling
print(Counter(y_train.rainfall))
smote_nc = SMOTENC(categorical_features=[5], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 143549})

In [None]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_resampled

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
0,2017,1,1,1,3,-0.368775,0.785725,-0.619154,-1.927747,-0.908897,0.076037,-0.151417,0.016280,0.017967,1,0,0,0,0
1,2017,1,1,1,3,-0.693481,0.525594,-0.619154,-0.405321,0.536957,0.025337,-0.258075,0.016280,0.354667,0,1,0,0,0
2,2017,1,1,1,3,-0.749465,0.486272,-0.619154,-0.405321,0.826128,0.076037,-0.071423,0.016280,0.017967,0,0,1,0,0
3,2017,1,1,1,3,-0.749465,0.690445,-0.619154,-0.405321,0.536957,0.211238,-0.181414,-0.730024,-0.318733,0,0,0,1,0
4,2017,1,1,1,3,-0.704678,0.492322,-0.619154,-1.318777,0.247786,0.035477,0.021903,0.016280,0.017967,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287093,2018,2,6,6,10,-0.741365,1.072317,1.108104,0.595346,-0.413380,-1.541297,1.612804,0.016280,-1.088571,0,0,0,1,0
287094,2017,2,6,18,9,0.313078,0.056692,-0.619154,-1.656297,-0.651100,-0.029826,0.272773,-1.419418,-0.406613,0,0,0,0,1
287095,2017,1,1,2,14,-0.502332,-0.039392,1.801403,1.926150,0.152781,-0.136394,-0.527804,2.756299,1.364767,1,0,0,0,0
287096,2019,2,6,3,19,0.576518,-0.023680,1.801403,-1.578414,-0.273766,-1.859828,1.779230,-0.320231,0.034498,0,1,0,0,0


In [None]:
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df
X_test

Unnamed: 0,year,quarter,month,day,hour,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,region_central,region_east,region_north,region_north-east,region_west
156174,2020,4,12,31,19,-1.448144,1.300693,1.801403,0.812620,-0.041385,-1.363849,1.083485,1.135735,0.017967,1,0,0,0,0
156175,2020,4,12,31,19,0.370212,-0.838581,-0.619154,1.726076,1.982812,-0.633766,1.088484,1.135735,0.354667,0,1,0,0,0
156176,2020,4,12,31,19,-1.040581,0.658684,1.801403,0.812620,0.247786,-0.126764,0.968494,2.255190,-0.992133,0,0,1,0,0
156177,2020,4,12,31,19,-0.674820,0.519545,1.801403,0.203650,-0.619726,-2.019573,2.361716,-1.103175,-0.655433,0,0,0,1,0
156178,2020,4,12,31,19,-1.436201,1.157773,1.801403,1.117105,0.826128,-0.363365,-0.058091,1.135735,2.711566,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021,4,12,31,23,-1.794498,1.248516,1.801403,-0.405321,-0.041385,-0.599966,-0.581382,-1.103175,0.354667,1,0,0,0,0
197636,2021,4,12,31,23,-1.286911,1.030732,1.801403,-1.014291,0.247786,0.278838,-0.211412,-0.730024,-0.655433,0,1,0,0,0
197637,2021,4,12,31,23,-1.421272,1.321111,1.801403,-0.100836,-0.041385,0.380239,-0.271407,-0.730024,-0.318733,0,0,1,0,0
197638,2021,4,12,31,23,-1.510846,0.998972,-0.619154,-0.100836,-0.041385,0.532340,-0.351401,0.016280,-0.992133,0,0,0,1,0


Under Hyperparameter tuning, we can attempt to tune the following features:
1. n estimators
2. learning rate


In [None]:
from sklearn.model_selection import GridSearchCV

n estimators

In [None]:
#nestimator f1
clf = AdaBoostClassifier()
params = {"n_estimators":[150,200,250]}
grid_search=GridSearchCV(estimator=clf,
                        param_grid=params,
                        scoring="f1",
                        cv=10,
                        verbose=1,
                        n_jobs=-1)
grid_search.fit(X_resampled,y_resampled)
grid_search.best_params_

Fitting 10 folds for each of 3 candidates, totalling 30 fits


{'n_estimators': 250}

learning rate

In [None]:
#learningrate f1
clf = AdaBoostClassifier(n_estimators=250)
params = {"learning_rate":[1,1.2,1.4]}
grid_search=GridSearchCV(estimator=clf,
                        param_grid=params,
                        scoring="f1",
                        cv=10,
                        verbose=1,
                        n_jobs=-1)
grid_search.fit(X_resampled,y_resampled)
grid_search.best_params_

Fitting 10 folds for each of 3 candidates, totalling 30 fits


{'learning_rate': 1.2}

Using the tuned hyperparameters here are the results:

In [None]:
#initialise tuned adaboost 
model = AdaBoostClassifier(n_estimators=250,learning_rate=1.2)

#Fit the resampled training feature Xs and training label Ys
model.fit(X_resampled, y_resampled)

#Use the trained model to predict the test data
predictions = model.predict(X_test)

#Accuracy, Precision, Recall and F1
accuracy = accuracy_score(y_test,predictions)
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1:",f_measure)

Accuracy: 0.9193073843630927
Precision: 0.5806865575874238
Recall: 0.4702520135100026
F1: 0.5196669537754809


In [None]:
#rocauc
ada_tuned_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test, ada_tuned_pred, pos_label=1)
auc(fpr, tpr)

0.7177535421778154

## <center>Summary</center>
|           |   Basic   |  SMOTE-NC |    ENN    |  SMOTEENN | With PCA  | With Tuning |
|:---------:|:---------:|:---------:|:---------:|:---------:|-----------|-------------|
|  ROC AUC  |   0.672   |   0.745   |   0.725   | **0.754** |   0.723   |    0.718    |
| F1 Score  |   0.461   | **0.540** |   0.528   |   0.537   |   0.471   |    0.520    |
|   Recall  |   0.368   |   0.535   |   0.486   | **0.564** |   0.515   |    0.470    |
| Precision | **0.619** |   0.544   |   0.578   |   0.513   |   0.434   |    0.581    |
| Accuracy  | **0.920** |   0.915   |   0.920   |   0.910   |   0.893   |    0.919    |