In [4]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours,  \
                                    RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('./datasets/BankChurners.csv')
data = data.iloc[:,1:21]
data

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.000
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.760
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,3,2,3,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462
10123,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,4,2,3,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511
10124,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,5,3,4,5409.0,0,5409.0,0.819,10291,60,0.818,0.000
10125,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,4,3,3,5281.0,0,5281.0,0.535,8395,62,0.722,0.000


In [3]:
object_columns = data.select_dtypes('object').columns
for i in object_columns:
    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']

category : [0 1]
classes : ['F' 'M']

category : [0 1 2 3 4 5 6]
classes : ['College' 'Doctorate' 'Graduate' 'High School' 'Post-Graduate'
 'Uneducated' 'Unknown']

category : [0 1 2 3]
classes : ['Divorced' 'Married' 'Single' 'Unknown']

category : [0 1 2 3 4 5]
classes : ['$120K +' '$40K - $60K' '$60K - $80K' '$80K - $120K' 'Less than $40K'
 'Unknown']

category : [0 1 2 3]
classes : ['Blue' 'Gold' 'Platinum' 'Silver']



In [36]:
from imblearn.pipeline import make_pipeline
import random
strategy_ = [0.3, 0.4, 0.5]
columns = ['accuracy', 'recall', 'precision', 'f1_score']
df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]
rf = RandomForestClassifier()

random.seed(42)

for over in over_sampling:
    for under in under_sampling:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:
            under_ = under()
            over_ = over(sampling_strategy=strategy)
            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, rf)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)
            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)
            df.loc[str(over)+str(under)+str(strategy)] = [acc, recall, precision, f1]

print("SVM Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

SVM Model's score by sampling


Unnamed: 0,accuracy,recall,precision,f1_score
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>0.3,0.964956,0.981165,0.97714,0.979148
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>0.4,0.964956,0.977634,0.980519,0.979075
<class 'imblearn.over_sampling._smote.filter.BorderlineSMOTE'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>0.3,0.963475,0.979988,0.97654,0.978261
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>0.4,0.962981,0.97528,0.980473,0.97787
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>0.3,0.9615,0.980577,0.9737,0.977126
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>0.4,0.961007,0.97528,0.978158,0.976717
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>0.5,0.961007,0.972925,0.980427,0.976662
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>0.5,0.961007,0.971748,0.98157,0.976634
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>0.3,0.960513,0.9794,0.973669,0.976526
<class 'imblearn.over_sampling._smote.filter.BorderlineSMOTE'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>0.4,0.96002,0.975868,0.976443,0.976155


In [32]:
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False).head(10)

Unnamed: 0,accuracy,recall,precision,f1_score
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>,0.964462,0.978811,0.978811,0.978811
<class 'imblearn.over_sampling._smote.filter.BorderlineSMOTE'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>,0.962488,0.979988,0.975395,0.977686
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>,0.962488,0.978811,0.976512,0.97766
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._neighbourhood_cleaning_rule.NeighbourhoodCleaningRule'>,0.961007,0.971748,0.98157,0.976634
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>,0.96002,0.98352,0.969258,0.976337
<class 'imblearn.over_sampling._smote.base.SMOTE'><class 'imblearn.under_sampling._prototype_selection._tomek_links.TomekLinks'>,0.96002,0.981165,0.971445,0.976281
<class 'imblearn.over_sampling._smote.filter.BorderlineSMOTE'><class 'imblearn.under_sampling._prototype_selection._one_sided_selection.OneSidedSelection'>,0.958045,0.977634,0.972482,0.975051
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._neighbourhood_cleaning_rule.NeighbourhoodCleaningRule'>,0.957552,0.968217,0.980918,0.974526
<class 'imblearn.over_sampling._smote.filter.BorderlineSMOTE'><class 'imblearn.under_sampling._prototype_selection._neighbourhood_cleaning_rule.NeighbourhoodCleaningRule'>,0.956071,0.965274,0.982036,0.973583
<class 'imblearn.over_sampling._adasyn.ADASYN'><class 'imblearn.under_sampling._prototype_selection._edited_nearest_neighbours.EditedNearestNeighbours'>,0.955084,0.959388,0.986683,0.972844


In [None]:
import random
columns = ['accuracy', 'recall', 'precision', 'f1_score']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]
random.seed(42)
for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)
    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    svm = SVC(random_state=42)
    svm.fit(x_train_ss, y_train)
    pred = svm.predict(x_test_ss)
    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    df.loc[sampling] = [acc, recall, precision, f1]

print("SVM Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)