In [1]:
# Ref :
# https://beckernick.github.io/oversampling-modeling/
# https://machinelearningmastery.com/standard-machine-learning-datasets-for-imbalanced-classification/
# https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier  
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

# Seed passed as random_state to every split, SMOTE instance and classifier below.
random_seed = 100

### Read Data

In [3]:
# Load the credit-card dataset. The file is read without a header row, so pandas
# assigns integer column labels; prefix them to get string names col_0 ... col_30.
df = pd.read_csv('creditcard.csv', header=None)
df.columns = [f'col_{col}' for col in df.columns]
# Seed the preview so the same rows are shown on every Restart & Run All.
df.sample(10, random_state=random_seed)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30
15086,26433.0,-0.417926,1.043044,1.567393,-0.029926,-0.06117,-0.902928,0.70265,-0.068392,-0.401435,...,-0.226879,-0.591314,-0.021278,0.36748,-0.192481,0.071021,0.265376,0.121238,2.67,0
113323,73024.0,1.300105,-0.849455,0.833857,-0.793458,-1.375184,-0.221304,-1.139217,0.13126,-0.546319,...,0.421564,1.04446,-0.172465,0.043947,0.410095,-0.047777,0.023995,0.016019,39.99,0
55588,47021.0,1.286835,0.410463,-0.033514,0.918373,0.241875,-0.461275,0.303026,-0.258748,-0.160592,...,-0.006903,0.076803,-0.22695,-0.41923,0.840486,-0.281174,0.019672,0.016127,13.31,0
176116,122631.0,1.958079,-0.04271,-0.527311,1.384067,-0.337813,-0.688529,-0.120918,-0.060492,0.861065,...,-0.465249,-1.273636,0.494686,-0.15092,-0.459013,-1.153382,0.031874,-0.032433,6.9,0
180468,124554.0,1.935659,-0.868297,-0.81673,-1.118171,-0.109123,1.056691,-1.052618,0.514812,1.512231,...,0.251211,0.942057,0.194124,-1.590342,-0.535108,0.821741,0.00139,-0.082935,0.01,0
146642,87794.0,-1.801135,1.836773,-0.294232,-3.012458,1.028076,-0.741334,1.327397,-0.225957,0.955492,...,-0.459982,-0.879859,-0.154755,0.105348,0.21318,0.311494,-0.080491,-0.096973,1.0,0
58000,48184.0,-1.282103,0.156339,1.352965,0.255524,-2.36279,1.198594,0.887357,0.392508,-1.666066,...,-0.128962,0.117074,0.032143,-0.020794,0.447928,-0.084783,0.02421,-0.087214,360.0,0
131132,79506.0,1.282396,-0.908166,0.815159,-0.782122,-1.420962,-0.227897,-1.155307,0.152406,-0.484788,...,0.422271,1.00585,-0.175444,0.033091,0.392799,-0.047353,0.019591,0.0167,49.0,0
104386,69044.0,1.238181,0.328666,0.196792,0.494349,-0.110838,-0.546887,0.006199,-0.060493,-0.27163,...,-0.253041,-0.742232,0.070734,-0.030328,0.245782,0.096219,-0.024326,0.018129,1.29,0
143216,85198.0,-0.965645,0.94401,2.235519,0.758746,0.104065,0.072509,0.387714,-0.151986,-0.033736,...,-0.077646,0.148241,-0.032525,0.079586,0.225299,-0.386201,-0.200464,-0.199822,0.99,0


## Check the imbalance of the target class

In [4]:
# col_30 is used as the class label; every other column is treated as a feature.
df_target = df['col_30']
df_features = df.drop(columns=['col_30'])

# Class frequencies, to see how skewed the target is.
df_target.value_counts()

0    284315
1       492
Name: col_30, dtype: int64

## Keep 10% of the data aside as "real-world" data to validate which technique really works

In [5]:
# Hold back 10% of the original (un-resampled) rows as a stand-in for
# production data; the remaining 90% (x, y) feeds both experiments below.
x, x_real_world, y, y_real_world = train_test_split(
    df_features,
    df_target,
    test_size=0.1,
    random_state=random_seed,
)

# SMOTE Before Split

In [6]:
# Oversample the minority class on x/y before any train/test split is made.
sm = SMOTE(random_state=random_seed)
x_res, y_res = sm.fit_resample(x, y)

# Class frequencies before and after resampling.
counts_before = y.value_counts()
counts_after = y_res.value_counts()
print("\nCounts before oversampling : \n", counts_before)
print("\nCounts after oversampling : \n", counts_after)

# Recombine features and target column-wise into a single frame.
df_oversampled = pd.concat(objs=[x_res, y_res], axis=1)


Counts before oversampling : 
 0    255881
1       445
Name: col_30, dtype: int64

Counts after oversampling : 
 0    255881
1    255881
Name: col_30, dtype: int64


### Note : Not all rows are duplicated

In [7]:
# Number of rows in the oversampled frame that exactly repeat an earlier row.
n_duplicate_rows = df_oversampled.duplicated().sum()
print(n_duplicate_rows)

7047


In [8]:
# Persist the oversampled frame (features + target) without the index column.
df_oversampled.to_csv(
    path_or_buf='resample_before_split.csv',
    index=False,
)

### Best Model ?

In [9]:
# 80/20 split of the data that has already been oversampled, so both the
# train and the test portion are drawn from the SMOTE output.
x_train_res, x_test_res, y_train_res, y_test_res = train_test_split(
    x_res,
    y_res,
    test_size=0.2,
    random_state=random_seed,
)

# Fit a decision tree on the resampled training portion.
clf_dt = DecisionTreeClassifier(random_state=random_seed)
clf_dt.fit(x_train_res, y_train_res)

# F1 on the data the tree was fitted on, then on the resampled test portion.
train_f1 = f1_score(y_train_res, clf_dt.predict(x_train_res))
test_f1 = f1_score(y_test_res, clf_dt.predict(x_test_res))
print("Train set f1_score : ", train_f1)
print("Test set f1_score : ", test_f1)

Train set f1_score :  1.0
Test set f1_score :  0.9987073196808149


# What happens in production when resampling before the split?

In [10]:
# Score the tree on the 10% hold-out that never went through SMOTE.
# Predict once and reuse the result for both recall and F1.
y_real_world_pred = clf_dt.predict(x_real_world)

# Accuracy looks excellent on data this imbalanced even when the minority class
# is handled poorly, so on its own it hides the mistake; check recall and F1 too.
print("Real world data Accuracy : ", clf_dt.score(x_real_world, y_real_world))
print("Real world data Recall : ", recall_score(y_real_world, y_real_world_pred))
print("Real World f1_score : ", f1_score(y_real_world, y_real_world_pred))

Real world data Accuracy :  0.9972613321161476
Real world data Recall :  0.851063829787234
Real World f1_score :  0.5063291139240507


# SMOTE, Oversampling after split

In [11]:
# Split the original (not yet oversampled) data 80/20 first.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# Apply SMOTE to the training portion only; x_test / y_test are not resampled.
sm = SMOTE(random_state=random_seed)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

# Class frequencies of the training target before and after resampling.
counts_before = y_train.value_counts()
counts_after = y_train_res.value_counts()
print("\nCounts before oversampling : \n", counts_before)
print("\nCounts after oversampling : \n", counts_after)




Counts before oversampling : 
 0    204705
1       355
Name: col_30, dtype: int64

Counts after oversampling : 
 0    204705
1    204705
Name: col_30, dtype: int64


In [12]:
# Fit a fresh decision tree on the oversampled training data.
clf_dt = DecisionTreeClassifier(random_state=random_seed)
clf_dt.fit(x_train_res, y_train_res)

# F1 on the resampled training data, then on the test split that was not resampled.
train_f1 = f1_score(y_train_res, clf_dt.predict(x_train_res))
test_f1 = f1_score(y_test, clf_dt.predict(x_test))
print("Train set f1_score : ", train_f1)
print("Test set f1_score : ", test_f1)

Train set f1_score :  1.0
Test set f1_score :  0.549618320610687


In [13]:
# Score the current tree on the 10% hold-out that never went through SMOTE.
# Predict once and reuse the result for both recall and F1.
y_real_world_pred = clf_dt.predict(x_real_world)

# Accuracy looks excellent on data this imbalanced even when the minority class
# is handled poorly, so on its own it hides mistakes; check recall and F1 too.
print("Real world data Accuracy : ", clf_dt.score(x_real_world, y_real_world))
print("Real world data Recall : ", recall_score(y_real_world, y_real_world_pred))
print("Real World f1_score : ", f1_score(y_real_world, y_real_world_pred))


Real world data Accuracy :  0.997542221129876
Real world data Recall :  0.7872340425531915
Real World f1_score :  0.513888888888889
