In [9]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours,  \
                                    RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.pipeline import make_pipeline

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('./datasets/BankChurners.csv')
list = ['Attrition_Flag', 'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Contacts_Count_12_mon', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Months_on_book']
data = data[list]
data

Unnamed: 0,Attrition_Flag,Total_Trans_Ct,Total_Trans_Amt,Total_Revolving_Bal,Total_Ct_Chng_Q4_Q1,Contacts_Count_12_mon,Total_Relationship_Count,Months_Inactive_12_mon,Months_on_book
0,Existing Customer,42,1144,777,1.625,3,5,1,39
1,Existing Customer,33,1291,864,3.714,2,6,1,44
2,Existing Customer,20,1887,0,2.333,0,4,1,36
3,Existing Customer,20,1171,2517,2.333,1,3,4,34
4,Existing Customer,28,816,0,2.500,0,5,1,21
...,...,...,...,...,...,...,...,...,...
10122,Existing Customer,117,15476,1851,0.857,3,3,2,40
10123,Attrited Customer,69,8764,2186,0.683,3,4,2,25
10124,Attrited Customer,60,10291,0,0.818,4,5,3,36
10125,Attrited Customer,62,8395,0,0.722,3,4,3,36


In [4]:
object_columns = data.select_dtypes('object').columns

for i in object_columns:
    
    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])

    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']



In [8]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

rf = RandomForestClassifier()

random.seed(42)

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:

            under_ = under()
            over_ = over(sampling_strategy=strategy)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, rf)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("RandomForest Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
ADASYNTomekLinks(0.3),0.977086,0.978811,0.975367,0.9615
ADASYNOneSidedSelection(0.3),0.976826,0.979988,0.973684,0.961007
BorderlineSMOTEOneSidedSelection(0.3),0.976484,0.977634,0.975338,0.960513
SMOTETomekLinks(0.3),0.975981,0.980577,0.971429,0.959526
SMOTEOneSidedSelection(0.3),0.975967,0.979988,0.971979,0.959526
SMOTEOneSidedSelection(0.5),0.975825,0.974102,0.977555,0.959526
BorderlineSMOTETomekLinks(0.3),0.975624,0.977634,0.973623,0.959033
ADASYNOneSidedSelection(0.4),0.975466,0.97116,0.97981,0.959033
SMOTETomekLinks(0.4),0.974449,0.976457,0.97245,0.957058
SMOTEOneSidedSelection(0.4),0.974359,0.972925,0.975797,0.957058


In [10]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

svc = SVC()

random.seed(42)

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:

            under_ = under()
            over_ = over(sampling_strategy=strategy)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, svc)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("SVM Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

SVM Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
ADASYNTomekLinks(0.3),0.965619,0.967039,0.964202,0.942251
SMOTEOneSidedSelection(0.3),0.965195,0.97116,0.959302,0.941264
SMOTETomekLinks(0.3),0.965133,0.969394,0.96091,0.941264
BorderlineSMOTETomekLinks(0.3),0.964907,0.962919,0.966903,0.941264
BorderlineSMOTEOneSidedSelection(0.3),0.964317,0.962331,0.966312,0.940276
ADASYNOneSidedSelection(0.3),0.964202,0.967039,0.961381,0.939783
SMOTETomekLinks(0.4),0.963706,0.961154,0.966272,0.939289
SMOTEOneSidedSelection(0.4),0.963508,0.963508,0.963508,0.938796
SMOTETomekLinks(0.5),0.963029,0.958211,0.967895,0.938302
SMOTEOneSidedSelection(0.5),0.9627,0.957034,0.968434,0.937808


In [12]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

lr = LogisticRegression()

random.seed(42)

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:

            under_ = under()
            over_ = over(sampling_strategy=strategy)
            
            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, lr)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)
            
            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]


print("SVM Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

SVM Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomekLinks(0.3),0.939146,0.944673,0.933682,0.897335
SMOTEOneSidedSelection(0.3),0.938871,0.944673,0.93314,0.896841
ADASYNOneSidedSelection(0.3),0.937793,0.940553,0.93505,0.89536
ADASYNTomekLinks(0.3),0.937518,0.940553,0.934503,0.894867
BorderlineSMOTETomekLinks(0.3),0.935882,0.936433,0.935332,0.892399
SMOTENeighbourhoodCleaningRule(0.3),0.935791,0.926427,0.945345,0.893386
BorderlineSMOTEOneSidedSelection(0.3),0.934943,0.934667,0.935218,0.890918
SMOTETomekLinks(0.4),0.934242,0.928193,0.94037,0.890424
SMOTEOneSidedSelection(0.4),0.933017,0.926427,0.939701,0.88845
SMOTEEditedNearestNeighbours(0.3),0.932259,0.919364,0.945521,0.887957


In [15]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

xgb = XGBClassifier()

random.seed(42)

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:

            under_ = under()
            over_ = over(sampling_strategy=strategy)
            
            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, xgb)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("xgb Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

xgb Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTEOneSidedSelection(0.3),0.971174,0.961742,0.980792,0.952122
BorderlineSMOTETomekLinks(0.3),0.970816,0.959388,0.98252,0.951629
SMOTENeighbourhoodCleaningRule(0.3),0.968012,0.952913,0.983597,0.947187
SMOTETomekLinks(0.3),0.967993,0.952325,0.984185,0.947187
SMOTEEditedNearestNeighbours(0.3),0.965538,0.948205,0.983516,0.943238
SMOTEAllKNN(0.3),0.965455,0.945851,0.98589,0.943238
ADASYNOneSidedSelection(0.3),0.961597,0.935845,0.988806,0.937315
SMOTERepeatedEditedNearestNeighbours(0.3),0.959128,0.932313,0.987531,0.933366
SMOTEEditedNearestNeighbours(0.4),0.957627,0.931136,0.98567,0.930898
SMOTETomekLinks(0.4),0.956891,0.927604,0.988088,0.929911
