In [1]:
# Ref :
# https://beckernick.github.io/oversampling-modeling/
# https://machinelearningmastery.com/standard-machine-learning-datasets-for-imbalanced-classification/
# https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier  
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

random_seed = 100

### Read Data

In [3]:
df = pd.read_csv('creditcard.csv', header=None)
df.columns = ['col_' + str(col) for col in df.columns]
df.sample(10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30
43425,41504.0,-6.797653,-4.793346,0.8535,0.271509,-0.701888,1.728763,3.228925,-2.099489,2.73483,...,-1.762661,0.303422,0.024669,-0.04565,0.269454,0.679259,-3.236207,3.264494,503.17,0
157294,109829.0,2.082359,0.048573,-1.358575,0.360051,0.321684,-0.742663,0.032101,-0.323002,2.095164,...,0.109176,0.700305,0.023825,0.556139,0.31586,-0.480242,-0.022373,-0.055645,1.0,0
102743,68341.0,1.074375,-0.451795,0.700428,0.53076,-0.49834,0.728326,-0.621554,0.250028,0.674246,...,-0.065684,-0.176245,-0.197534,-0.773088,0.36753,0.361757,0.000113,0.019505,84.9,0
261743,160164.0,-1.279342,-1.869396,-1.275823,-0.921627,-0.111758,-1.405867,0.599728,0.09497,-0.753754,...,0.510062,1.064316,0.809556,-0.129636,-1.439054,-0.458293,0.180995,0.102892,300.0,0
30387,35899.0,0.683799,-0.647512,1.593073,3.125469,-1.256558,0.83915,-0.636885,0.443054,0.618408,...,-0.151895,-0.515475,-0.090087,0.517485,0.247412,-0.103601,0.015661,0.052367,178.97,0
197430,132010.0,2.061438,-1.14565,-0.100859,-0.409397,-1.394455,-0.06263,-1.363556,0.148666,0.448967,...,-0.361926,-0.554301,0.43695,0.68493,-0.729978,0.43066,-0.003931,-0.025631,29.46,0
266372,162303.0,0.158887,0.844501,-0.27397,-0.501845,0.880159,-1.210719,1.432307,-0.399581,-0.160023,...,0.07314,0.387527,-0.238596,-0.07909,0.14372,-0.290074,0.037062,0.028897,12.99,0
117531,74709.0,1.258466,0.405979,0.321117,0.686284,-0.310995,-1.057697,0.131232,-0.241829,-0.069228,...,-0.280393,-0.774355,0.122077,0.363735,0.239144,0.092243,-0.018511,0.032116,0.91,0
125136,77546.0,-1.202942,0.939222,2.055081,-0.816046,-0.359038,-0.636781,0.062208,-0.862487,-0.020122,...,0.920108,0.041033,-0.206653,0.443779,0.17928,1.031494,-0.286651,0.106187,1.0,0
230134,146202.0,0.320465,-4.041756,-1.418161,-0.05467,-2.231265,-0.399258,0.137614,-0.337486,0.071038,...,0.971113,0.686191,-0.70014,-0.005579,-0.56782,-0.158919,-0.162298,0.119019,962.0,0


## check imblanace of target class

In [4]:
df_features = df.drop(['col_30'], axis=1)
df_target = df['col_30']
df_target.value_counts()

0    284315
1       492
Name: col_30, dtype: int64

In [5]:
print(df_features.duplicated().sum())

1081


## Keep 10% of data aside to validate which technique would really work ?

In [6]:
x, x_real_world, y, y_real_world = train_test_split(df_features, df_target, test_size = .1, random_state=random_seed)

# SMOTE Before Split

In [7]:
sm = SMOTE(random_state=random_seed)
x_res, y_res = sm.fit_resample(x, y)
print("\nCounts before oversampling : \n", y.value_counts())
print("\nCounts after oversampling : \n", y_res.value_counts())

df_oversampled = pd.concat([x_res,y_res], axis=1)


Counts before oversampling : 
 0    255881
1       445
Name: col_30, dtype: int64

Counts after oversampling : 
 0    255881
1    255881
Name: col_30, dtype: int64


### Note : Not all rows are duplicated

In [8]:
print(df_oversampled.duplicated().sum())

7047


In [9]:
df_oversampled.sample(1000).to_csv('resample_before_split.csv', index=False)

### Best Model ?

In [10]:
x_train_res, x_test_res, y_train_res, y_test_res = train_test_split(x_res, y_res, test_size = .2, random_state=random_seed)

clf_dt = DecisionTreeClassifier(random_state=random_seed)
clf_dt.fit(x_train_res, y_train_res)
print("Train set f1_score : ", f1_score(y_train_res, clf_dt.predict(x_train_res)))
print("Test set f1_score : ", f1_score(y_test_res, clf_dt.predict(x_test_res)))

Train set f1_score :  1.0
Test set f1_score :  0.9987073196808149


# What happend in Production : resampling before split ?

In [11]:
print("Real world data Accuracy : ", clf_dt.score(x_real_world, y_real_world))   # Big red, because of this you never identify your mistakes
print("Real world data Recall : ", recall_score(y_real_world, clf_dt.predict(x_real_world)))
print("Real World f1_score : ", f1_score(y_real_world, clf_dt.predict(x_real_world)))

Real world data Accuracy :  0.9972613321161476
Real world data Recall :  0.851063829787234
Real World f1_score :  0.5063291139240507


# SMOTE, Oversampling after split

In [12]:
# Train, test split the original data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state=random_seed)

# Oversample using SMOTE
sm = SMOTE(random_state=random_seed)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)
print("\nCounts before oversampling : \n", y_train.value_counts())
print("\nCounts after oversampling : \n", y_train_res.value_counts())




Counts before oversampling : 
 0    204705
1       355
Name: col_30, dtype: int64

Counts after oversampling : 
 0    204705
1    204705
Name: col_30, dtype: int64


In [13]:
clf_dt = DecisionTreeClassifier(random_state=random_seed)
clf_dt.fit(x_train_res, y_train_res)
print("Train set f1_score : ", f1_score(y_train_res, clf_dt.predict(x_train_res)))
print("Test set f1_score : ", f1_score(y_test, clf_dt.predict(x_test)))

Train set f1_score :  1.0
Test set f1_score :  0.549618320610687


In [14]:
print("Real world data Accuracy : ", clf_dt.score(x_real_world, y_real_world))   # Big red, because of this you never identify your mistakes
print("Real world data Recall : ", recall_score(y_real_world, clf_dt.predict(x_real_world)))
print("Real World f1_score : ", f1_score(y_real_world, clf_dt.predict(x_real_world)))


Real world data Accuracy :  0.997542221129876
Real world data Recall :  0.7872340425531915
Real World f1_score :  0.513888888888889
