In [11]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours,  \
                                    RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [12]:
data = pd.read_csv('../datasets/BankChurners.csv')
list = ['Attrition_Flag', 'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Contacts_Count_12_mon', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Months_on_book']
data = data[list]
data

Unnamed: 0,Attrition_Flag,Total_Trans_Ct,Total_Trans_Amt,Total_Revolving_Bal,Total_Ct_Chng_Q4_Q1,Contacts_Count_12_mon,Total_Relationship_Count,Months_Inactive_12_mon,Months_on_book
0,Existing Customer,42,1144,777,1.625,3,5,1,39
1,Existing Customer,33,1291,864,3.714,2,6,1,44
2,Existing Customer,20,1887,0,2.333,0,4,1,36
3,Existing Customer,20,1171,2517,2.333,1,3,4,34
4,Existing Customer,28,816,0,2.500,0,5,1,21
...,...,...,...,...,...,...,...,...,...
10122,Existing Customer,117,15476,1851,0.857,3,3,2,40
10123,Attrited Customer,69,8764,2186,0.683,3,4,2,25
10124,Attrited Customer,60,10291,0,0.818,4,5,3,36
10125,Attrited Customer,62,8395,0,0.722,3,4,3,36


In [13]:
object_columns = data.select_dtypes('object').columns

for i in object_columns:

    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']



In [4]:
## Sampling 전부

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    svm = SVC(random_state=42)
    svm.fit(x_train_ss, y_train)
    pred = svm.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("SVM Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

SVM Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection(),0.964917,0.9794,0.950857,0.940276
TomekLinks(),0.964917,0.9794,0.950857,0.940276
EditedNearestNeighbours(),0.963116,0.960565,0.96568,0.938302
NeighbourhoodCleaningRule(),0.962441,0.965274,0.959626,0.936821
AllKNN(),0.960355,0.955268,0.965497,0.93386
RepeatedEditedNearestNeighbours(),0.958073,0.948205,0.968149,0.930405
SMOTETomek(),0.949955,0.921719,0.979975,0.918559
SMOTE(),0.949348,0.92113,0.979349,0.917572
SMOTEENN(),0.936537,0.894644,0.982547,0.898322
BorderlineSMOTE(),0.93288,0.887581,0.983051,0.892892


In [14]:
## Under Sampling 만

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

random.seed(42)

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    svm = SVC(random_state=42)
    svm.fit(x_train_ss, y_train)
    pred = svm.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("SVM Model's score by Under sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

SVM Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection(),0.964917,0.9794,0.950857,0.940276
TomekLinks(),0.964917,0.9794,0.950857,0.940276
EditedNearestNeighbours(),0.963116,0.960565,0.96568,0.938302
NeighbourhoodCleaningRule(),0.962441,0.965274,0.959626,0.936821
AllKNN(),0.960355,0.955268,0.965497,0.93386
RepeatedEditedNearestNeighbours(),0.958073,0.948205,0.968149,0.930405
NearMiss(),0.844987,0.753973,0.96099,0.768016


In [15]:
## Over Sampling 만

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

random.seed(42)

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

        svm = SVC(random_state=42)
        svm.fit(x_train_ss, y_train)
        pred = svm.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("SVM Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

SVM Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTE(0.4),0.966589,0.970571,0.962639,0.943731
ADASYN(0.3),0.965639,0.967628,0.963658,0.942251
SMOTE(0.3),0.965558,0.973514,0.95773,0.941757
SMOTE(0.5),0.964539,0.960565,0.968546,0.94077
BorderlineSMOTE(0.3),0.964317,0.962331,0.966312,0.940276
SMOTE(0.6),0.963248,0.956445,0.970149,0.938796
SMOTE(0.7),0.962101,0.948793,0.975787,0.937315
ADASYN(0.4),0.960977,0.949382,0.972859,0.935341
BorderlineSMOTE(0.4),0.960024,0.947028,0.973382,0.93386
BorderlineSMOTE(0.5),0.957229,0.935256,0.980259,0.929911


In [16]:
## Combine Sampling 만

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    svm = SVC(random_state=42)
    svm.fit(x_train_ss, y_train)
    pred = svm.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("SVM Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

SVM Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomek(),0.951833,0.924662,0.980649,0.92152
SMOTEENN(),0.938863,0.899353,0.982005,0.901777
