# Dataset Comparisons

1. Dataset 1 (weather_data_1.csv)
2. Dataset 2 (weather_data_2.csv)

Load Libraries

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import *
from collections import Counter
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import *

## Section 1: Dataset 1

In [2]:
# Load dataset 1 from its CSV file and display the resulting frame.
data_1 = pd.read_csv("weather_data_1.csv")
data_1

Unnamed: 0,timestamp,planning area,past_temperature,past_humidity,past_winddirection,past_windspeed,rainfall
0,2017-01-01 01:00:00,ang mo kio,26.10,90.5,NNE,2.00,0.0
1,2017-01-01 01:00:00,changi,26.15,92.9,NNW,1.65,0.0
2,2017-01-01 01:00:00,changi bay,26.60,87.1,N,2.10,0.0
3,2017-01-01 01:00:00,hougang,26.90,88.1,NE,2.20,0.0
4,2017-01-01 01:00:00,marina south,27.00,87.1,N,11.80,0.0
...,...,...,...,...,...,...,...
495564,2021-12-31 23:00:00,sungei kadut,24.60,95.7,SSE,0.40,0.0
495565,2021-12-31 23:00:00,tuas,25.20,87.2,ESE,1.00,0.0
495566,2021-12-31 23:00:00,woodlands,24.40,96.7,S,2.40,0.0
495567,2021-12-31 23:00:00,western water catchment,24.35,93.9,SE,4.10,0.0


In [3]:
# Parse the timestamp strings and derive calendar features from them.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent-format inference is already the default.
data_1["timestamp"] = pd.to_datetime(data_1["timestamp"])

# The `.dt` accessor extracts each component for the whole column at once,
# instead of calling a Python lambda once per row.
timestamp_parts = data_1["timestamp"].dt
data_1["year"] = timestamp_parts.year
data_1["quarter"] = timestamp_parts.quarter
data_1["month"] = timestamp_parts.month
data_1["day"] = timestamp_parts.day
data_1["hour"] = timestamp_parts.hour
data_1

Unnamed: 0,timestamp,planning area,past_temperature,past_humidity,past_winddirection,past_windspeed,rainfall,year,quarter,month,day,hour
0,2017-01-01 01:00:00,ang mo kio,26.10,90.5,NNE,2.00,0.0,2017,1,1,1,1
1,2017-01-01 01:00:00,changi,26.15,92.9,NNW,1.65,0.0,2017,1,1,1,1
2,2017-01-01 01:00:00,changi bay,26.60,87.1,N,2.10,0.0,2017,1,1,1,1
3,2017-01-01 01:00:00,hougang,26.90,88.1,NE,2.20,0.0,2017,1,1,1,1
4,2017-01-01 01:00:00,marina south,27.00,87.1,N,11.80,0.0,2017,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
495564,2021-12-31 23:00:00,sungei kadut,24.60,95.7,SSE,0.40,0.0,2021,4,12,31,23
495565,2021-12-31 23:00:00,tuas,25.20,87.2,ESE,1.00,0.0,2021,4,12,31,23
495566,2021-12-31 23:00:00,woodlands,24.40,96.7,S,2.40,0.0,2021,4,12,31,23
495567,2021-12-31 23:00:00,western water catchment,24.35,93.9,SE,4.10,0.0,2021,4,12,31,23


Train-test Split

In [5]:
# Chronological train/test split for dataset 1: rows whose timestamp falls in
# the first 80% of the hourly 2017-2021 timeline form the training set, the
# remaining rows form the test set.

# Move the calendar features to the front *by name*. Unlike a positional
# rotation, this gives the same column order no matter how many times the
# cell is re-run.
calendar_cols = ["year", "quarter", "month", "day", "hour"]
data_1 = data_1[calendar_cols + [col for col in data_1.columns if col not in calendar_cols]]

# Keep the timeline as Timestamps (no strftime) so that isin() compares
# datetimes with datetimes; matching strings against a datetime64 column
# relies on implicit casting that newer pandas deprecates.
# '60min' replaces the deprecated '60T' frequency alias.
runtimes = list(pd.date_range('2017-01-01 00:00:00',
                              '2021-12-31 23:59:59',
                              freq='60min'))
training_runtimes = runtimes[:int(0.8 * len(runtimes))]

# Compute the membership mask once and reuse it for features and target.
in_training_period = data_1["timestamp"].isin(training_runtimes)

# Features: everything except the target and the raw timestamp (dropped by
# name so the target can never slip into X if the column order changes).
X_train = data_1[in_training_period].drop(columns=["rainfall", "timestamp"])
X_test = data_1[~in_training_period].drop(columns=["rainfall", "timestamp"])

# Target kept as a single-column DataFrame, as in the rest of the notebook.
y_train = data_1.loc[in_training_period, ["rainfall"]]
y_test = data_1.loc[~in_training_period, ["rainfall"]]

In [6]:
# One-hot encode the categorical columns on the stacked train+test frame so
# both halves end up with an identical set of dummy columns, then cut the
# frame back into its two parts by row position.
train_rows = len(X_train)
stacked = pd.get_dummies(
    pd.concat([X_train, X_test], axis=0),
    columns=["planning area", "past_winddirection"],
    prefix=["area", "past_windir"],
)
X_train = stacked.iloc[:train_rows, :]
X_test = stacked.iloc[train_rows:, :]
del stacked, train_rows

In [7]:
# Fit a decision tree (only random_state is set; other hyper-parameters are
# left at their defaults) and report hold-out F1 and ROC AUC.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
positive_proba = clf.predict_proba(X_test)[:, 1]  # column for the second class
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, positive_proba))

# Mean F1 over 10 cross-validation folds of the training set (cell output).
# NOTE(review): an integer `cv` does not respect time order within the
# training period — consider TimeSeriesSplit for a chronological comparison.
cross_val_score(clf, X_train, y_train, scoring="f1", cv=10).mean()

F1: 0.0977508650519031
ROC AUC: 0.5464220903403113


0.04700365639798876

## Section 2: Dataset 2

In [9]:
# Load dataset 2 from its CSV file and display the resulting frame.
data_2 = pd.read_csv("weather_data_2.csv")
data_2

Unnamed: 0,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall
0,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0
1,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0
2,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0
3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0
4,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0
197636,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0
197637,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0
197638,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0


In [10]:
# Parse the timestamp strings and derive calendar features from them.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent-format inference is already the default.
data_2["timestamp"] = pd.to_datetime(data_2["timestamp"])

# The `.dt` accessor extracts each component for the whole column at once,
# instead of calling a Python lambda once per row.
timestamp_parts = data_2["timestamp"].dt
data_2["year"] = timestamp_parts.year
data_2["quarter"] = timestamp_parts.quarter
data_2["month"] = timestamp_parts.month
data_2["day"] = timestamp_parts.day
data_2["hour"] = timestamp_parts.hour
data_2

Unnamed: 0,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall,year,quarter,month,day,hour
0,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0,2017,1,1,1,3
1,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0,2017,1,1,1,3
2,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0,2017,1,1,1,3
3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0,2017,1,1,1,3
4,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0,2017,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0,2021,4,12,31,23
197636,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0,2021,4,12,31,23
197637,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0,2021,4,12,31,23
197638,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0,2021,4,12,31,23


In [11]:
# Chronological train/test split for dataset 2: rows whose timestamp falls in
# the first 80% of the hourly 2017-2021 timeline form the training set, the
# remaining rows form the test set.

# Move the calendar features to the front *by name*. Unlike a positional
# rotation, this gives the same column order no matter how many times the
# cell is re-run, so `rainfall` stays the last column for later cells.
calendar_cols = ["year", "quarter", "month", "day", "hour"]
data_2 = data_2[calendar_cols + [col for col in data_2.columns if col not in calendar_cols]]

# '60min' replaces the deprecated '60T' frequency alias.
runtimes = list(pd.date_range('2017-01-01 00:00:00',
                              '2021-12-31 23:59:59',
                              freq='60min'))
training_runtimes = runtimes[:int(0.8 * len(runtimes))]

# Compute the membership mask once and reuse it for features and target.
in_training_period = data_2["timestamp"].isin(training_runtimes)

# Features: everything except the target and the raw timestamp (dropped by
# name so the target can never slip into X if the column order changes).
X_train = data_2[in_training_period].drop(columns=["rainfall", "timestamp"])
X_test = data_2[~in_training_period].drop(columns=["rainfall", "timestamp"])

# Target kept as a single-column DataFrame, as in the rest of the notebook.
y_train = data_2.loc[in_training_period, ["rainfall"]]
y_test = data_2.loc[~in_training_period, ["rainfall"]]

In [12]:
# One-hot encode `region` on the stacked train+test frame so both halves get
# an identical set of dummy columns, then cut it back into its two parts.
train_rows = len(X_train)
temp_df = pd.concat([X_train, X_test], axis=0)
temp_df = pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
# .copy() so the scaled values below are written into independent frames.
X_train = temp_df.iloc[:train_rows, :].copy()
X_test = temp_df.iloc[train_rows:, :].copy()
del temp_df, train_rows

# Continuous weather features to standardise (named explicitly instead of
# relying on their positions 5..13 in the encoded frame).
scaled_cols = [
    "past_temperature", "past_humidity", "past_rainfall",
    "past_wind_x", "past_wind_y",
    "delta_temperature", "delta_humidity",
    "delta_wind_x", "delta_wind_y",
]

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; fitting on train+test together would leak test-set means and
# variances into preprocessing.
scaler = StandardScaler()
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

In [13]:
# Fit a decision tree (only random_state is set; other hyper-parameters are
# left at their defaults) and report hold-out F1 and ROC AUC.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
positive_proba = clf.predict_proba(X_test)[:, 1]  # column for the second class
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, positive_proba))

# Mean F1 over 10 cross-validation folds of the training set (cell output).
# NOTE(review): an integer `cv` does not respect time order within the
# training period — consider TimeSeriesSplit for a chronological comparison.
cross_val_score(clf, X_train, y_train, scoring="f1", cv=10).mean()

F1: 0.32171648269874736
ROC AUC: 0.6297129573035326


0.2579864727021275

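Hence, the adjustments made during feature engineering have improved the data quality: Dataset 2 scores higher than Dataset 1 on test F1 (0.32 vs 0.10), test ROC AUC (0.63 vs 0.55) and cross-validated F1 on the training set (0.26 vs 0.05).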

# Random vs Chronological Train-test Split Comparisons

1. Chronological Splitting
2. Random Splitting

## Section 2: Random Splitting

In [14]:
# Random 80/20 split of dataset 2: every column but the last is a feature and
# the last column is the target (this relies on the column order established
# by the earlier reordering cell).
X = data_2.iloc[:, :-1]
y = data_2.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# The raw timestamp is not used as a model feature.
X_train, X_test = (part.drop(columns=["timestamp"]) for part in (X_train, X_test))

In [15]:
# One-hot encode `region` on the stacked train+test frame so both halves get
# an identical set of dummy columns, then cut it back into its two parts.
train_rows = len(X_train)
temp_df = pd.concat([X_train, X_test], axis=0)
temp_df = pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
# .copy() so the scaled values below are written into independent frames.
X_train = temp_df.iloc[:train_rows, :].copy()
X_test = temp_df.iloc[train_rows:, :].copy()
del temp_df, train_rows

# Continuous weather features to standardise (named explicitly instead of
# relying on their positions 5..13 in the encoded frame).
scaled_cols = [
    "past_temperature", "past_humidity", "past_rainfall",
    "past_wind_x", "past_wind_y",
    "delta_temperature", "delta_humidity",
    "delta_wind_x", "delta_wind_y",
]

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; fitting on train+test together would leak test-set means and
# variances into preprocessing.
scaler = StandardScaler()
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

In [16]:
# Fit a decision tree (only random_state is set; other hyper-parameters are
# left at their defaults) on the randomly split data and report hold-out
# F1 and ROC AUC.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
positive_proba = clf.predict_proba(X_test)[:, 1]  # column for the second class
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, positive_proba))

# Mean F1 over 10 cross-validation folds of the training set (cell output).
cross_val_score(clf, X_train, y_train, scoring="f1", cv=10).mean()

F1: 0.3341623994147769
ROC AUC: 0.6397666847944432


0.35271945451121134

While the training set's cross-validation score is higher with random splitting (0.35 vs 0.26), the test-set metrics are comparable (F1 0.33 vs 0.32, ROC AUC 0.64 vs 0.63). We believe random splitting performs slightly better here only by coincidence: the data has been shown to have some time dependency, so splitting it chronologically makes more sense.

To illustrate, random splitting could assign the data for '2021-12-31 23:00:00' to the training set and the data for '2017-01-01 03:00:00' to the test set. This results in data leakage, since the model would be using information from events that had not yet happened in 2017 to predict 2017 data.