# Imbalanced Data Sampling Methods:
1. None
2. SMOTE-NC
3. SMOTE
4. ADASYN
5. ENN
6. TomekLinks
7. SOMTEENN
8. TabGAN

Load Libraries

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from imblearn.over_sampling import SMOTENC,ADASYN,SMOTE
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTEENN
from tabgan.sampler import GANGenerator
from collections import Counter
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import *

Read Data

In [2]:
weather_full=pd.read_csv("weather_data_2.csv")
weather_full

Unnamed: 0,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall
0,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0
1,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0
2,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0
3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0
4,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0
197636,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0
197637,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0
197638,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0


Create Time Variables

In [3]:
weather_full.timestamp=pd.to_datetime(weather_full.timestamp,infer_datetime_format=True)
weather_full["year"]=weather_full.timestamp.apply(lambda x: x.year)
weather_full["quarter"]=weather_full.timestamp.apply(lambda x: x.quarter)
weather_full["month"]=weather_full.timestamp.apply(lambda x: x.month)
weather_full["day"]=weather_full.timestamp.apply(lambda x: x.day)
weather_full["hour"]=weather_full.timestamp.apply(lambda x: x.hour)
weather_full

Unnamed: 0,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall,year,quarter,month,day,hour
0,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0,2017,1,1,1,3
1,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0,2017,1,1,1,3
2,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0,2017,1,1,1,3
3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0,2017,1,1,1,3
4,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0,2017,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0,2021,4,12,31,23
197636,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0,2021,4,12,31,23
197637,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0,2021,4,12,31,23
197638,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0,2021,4,12,31,23


In [4]:
weather_full=pd.concat([weather_full.iloc[:,12:],weather_full.iloc[:,:11],weather_full.iloc[:,11:12]],axis=1)
weather_full

Unnamed: 0,year,quarter,month,day,hour,timestamp,region,past_temperature,past_humidity,past_rainfall,past_wind_x,past_wind_y,delta_temperature,delta_humidity,delta_wind_x,delta_wind_y,rainfall
0,2017,1,1,1,3,2017-01-01 03:00:00,central,26.850000,90.300000,0.0,-6.0,-4.0,-0.100000,-1.421085e-14,0.0,0.0,0.0
1,2017,1,1,1,3,2017-01-01 03:00:00,east,26.125000,87.433333,0.0,-1.0,1.0,-0.150000,-5.333333e-01,0.0,1.0,0.0
2,2017,1,1,1,3,2017-01-01 03:00:00,north,26.000000,87.000000,0.0,-1.0,2.0,-0.100000,4.000000e-01,0.0,0.0,0.0
3,2017,1,1,1,3,2017-01-01 03:00:00,north-east,26.000000,89.250000,0.0,-1.0,1.0,0.033333,-1.500000e-01,-2.0,-1.0,0.0
4,2017,1,1,1,3,2017-01-01 03:00:00,west,26.100000,87.066667,0.0,-4.0,0.0,-0.140000,8.666667e-01,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197635,2021,4,12,31,23,2021-12-31 23:00:00,central,23.666667,95.400000,1.0,-1.0,-1.0,-0.766667,-2.150000e+00,-3.0,1.0,1.0
197636,2021,4,12,31,23,2021-12-31 23:00:00,east,24.800000,93.000000,1.0,-3.0,0.0,0.100000,-3.000000e-01,-2.0,-2.0,0.0
197637,2021,4,12,31,23,2021-12-31 23:00:00,north,24.500000,96.200000,1.0,0.0,-1.0,0.200000,-6.000000e-01,-2.0,-1.0,0.0
197638,2021,4,12,31,23,2021-12-31 23:00:00,north-east,24.300000,92.650000,0.0,0.0,-1.0,0.350000,-1.000000e+00,0.0,-3.0,0.0


Train-test Splits

In [5]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

## Section 1: No Sampling

Dummifying and Scaling

In [6]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

Vanilla Decistion Tree used as a standard

In [7]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.32171648269874736
ROC AUC: 0.6297129573035326


In [8]:
from sklearn.model_selection import *
cross_val_score(clf, X_train, y_train, scoring="f1",cv=10).mean()

0.2579864727021275

## Section 2: SMOTE-NC

Redo the steps:
1. Train-test split
2. Resampling
3. Dummyfying and Normalization
4. Modelling

In [9]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [10]:
print(Counter(y_train.rainfall))
smote_nc = SMOTENC(categorical_features=[5], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 143549})

In [11]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [12]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.3363921477523522
ROC AUC: 0.6440781560699232


In [14]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.8158130709095193

Result: Improved Slightly

## Section 3: SMOTE

Redo the steps:
1. Train-test split
2. Dummifying first since normal SMOTE cannot handle categorical
3. Resampling
3. Normalization
4. Modelling

In [15]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [16]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

In [17]:
print(Counter(y_train.rainfall))
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 143549})

In [18]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [19]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.3398565698310441
ROC AUC: 0.6419958609367404


In [20]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.8468453314116507

Result: Comparable  to (but slightly better than) SMOTE-NC

## Section 4: ADASYN

Redo the steps:
1. Train-test split
2. Dummifying first since ADASYN cannot handle categorical
3. Resampling
3. Normalization
4. Modelling

In [21]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [22]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

In [23]:
print(Counter(y_train.rainfall))
adasyn = ADASYN(random_state=0)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 143549, 1.0: 142540})

In [24]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [25]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.3342472285135296
ROC AUC: 0.6404153621112625


In [26]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.8482891122662475

Result: Comparable to (but slightly worse than) SMOTE

## Section 5: Edited Nearest Neighbors (Undersampling)

Redo the steps:
1. Train-test split
2. Dummifying first since ENN cannot handle categorical
3. Resampling
3. Normalization
4. Modelling

In [27]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [28]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

In [29]:
print(Counter(y_train.rainfall))
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 125275, 1.0: 12625})

In [30]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [31]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.3898815931108719
ROC AUC: 0.6870064489465769


In [32]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.3474141094676732

Result: Improvement over Oversampling Methods in the test results but worse than SMOTE variants in training.

## Section 6: TOMEK Links

Redo the steps:
1. Train-test split
2. Dummifying first since Tomek Links cannot handle categorical
3. Resampling
3. Normalization
4. Modelling

In [33]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [34]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

In [35]:
from imblearn.under_sampling import TomekLinks
print(Counter(y_train.rainfall))
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 140499, 1.0: 12625})

In [36]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [37]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.3443993312634344
ROC AUC: 0.6463425694063671


In [38]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.2738792047850264

Result: Comparable to (but slightly better than) SMOTE-NC in test but worse in training.

## Section 7: SMOTEENN

Redo the steps:
1. Train-test split
2. Dummifying first since SMOTEENN cannot handle categorical
3. Resampling
3. Normalization
4. Modelling

In [39]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [40]:
temp_df=pd.concat([X_train,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
X_train = temp_df.iloc[:len(X_train),:]
X_test = temp_df.iloc[len(X_train):,:]
del temp_df

In [41]:
print(Counter(y_train.rainfall))
smoteenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train.rainfall)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Counter({0.0: 109257, 1.0: 142924})

In [42]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [43]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.3974919247577427
ROC AUC: 0.710815707836445


In [44]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.8965584692511737

Combination of both does the best in both test and training.

## Section 8: TabGANs

Redo the steps:
1. Train-test split
2. Resampling
3. Dummifying and Normalization
4. Modelling

In [45]:
runtimes=list(pd.date_range('2017-01-01 00:00:00',
                            '2021-12-31 23:59:59',
                            freq='60T'))
training_runtimes=runtimes[:int(0.8*len(runtimes))]
X_train = weather_full[weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
X_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)].iloc[:,:-1]
y_train = weather_full[weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
y_test = weather_full[~weather_full["timestamp"].isin(training_runtimes)][["rainfall"]]
X_train=X_train.drop(columns=["timestamp"])
X_test=X_test.drop(columns=["timestamp"])

In [46]:
print(Counter(y_train.rainfall))
y_train.rainfall=y_train.rainfall.astype(int)
X_resampled, y_resampled = GANGenerator(cat_cols=["year","quarter","month","day","hour","region"]).generate_data_pipe(X_train, y_train, X_test)
Counter(y_resampled)

Counter({0.0: 143549, 1.0: 12625})


Fitting CTGAN transformers for each column:   0%|          | 0/16 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



Counter({0: 66773, 1: 6753})

In [47]:
temp_df=pd.concat([X_resampled,X_test],axis=0)
temp_df=pd.get_dummies(temp_df, columns=["region"], prefix=["region"])
scaler=StandardScaler()
temp_df.iloc[:,5:14]=scaler.fit_transform(temp_df.iloc[:,5:14])
X_resampled = temp_df.iloc[:len(X_resampled),:]
X_test = temp_df.iloc[len(X_resampled):,:]
del temp_df

In [48]:
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_resampled,y_resampled)
y_pred=clf.predict(X_test)
print("F1:",f1_score(y_test, y_pred))
print("ROC AUC:",roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

F1: 0.316138540899042
ROC AUC: 0.6272309428099528


In [49]:
cross_val_score(clf, X_resampled, y_resampled, scoring="f1",cv=10).mean()

0.264966196645999

Result: It could be possible that GANs do not work well for tabular data.

## <center>Summary</center>

|       |          |  None | SMOTE-NC | SMOTE | ADASYN |  ENN  | TomekLinks |  SMOTEENN | TabGAN |
|-------|:--------:|:-----:|:--------:|:-----:|:------:|:-----:|:----------:|:---------:|:------:|
| Test  |    F1    | 0.322 |   0.336  | 0.340 |  0.334 | 0.390 |    0.344   | **0.397** |  0.316 |
|       |  ROC AUC | 0.630 |   0.644  | 0.642 |  0.640 | 0.687 |    0.646   | **0.711** |  0.627 |
| Train | CV Score | 0.258 |   0.816  | 0.847 |  0.848 | 0.347 |    0.274   | **0.897** |  0.265 |

From this table, we can see that SMOTEENN performed the best, and surprisingly Tabular GANs did not perform very well with our dataset.