In [1]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours,  \
                                    RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../datasets/BankChurners.csv')
list = ['Attrition_Flag', 'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Contacts_Count_12_mon', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Months_on_book']
data = data[list]
data

Unnamed: 0,Attrition_Flag,Total_Trans_Ct,Total_Trans_Amt,Total_Revolving_Bal,Total_Ct_Chng_Q4_Q1,Contacts_Count_12_mon,Total_Relationship_Count,Months_Inactive_12_mon,Months_on_book
0,Existing Customer,42,1144,777,1.625,3,5,1,39
1,Existing Customer,33,1291,864,3.714,2,6,1,44
2,Existing Customer,20,1887,0,2.333,0,4,1,36
3,Existing Customer,20,1171,2517,2.333,1,3,4,34
4,Existing Customer,28,816,0,2.500,0,5,1,21
...,...,...,...,...,...,...,...,...,...
10122,Existing Customer,117,15476,1851,0.857,3,3,2,40
10123,Attrited Customer,69,8764,2186,0.683,3,4,2,25
10124,Attrited Customer,60,10291,0,0.818,4,5,3,36
10125,Attrited Customer,62,8395,0,0.722,3,4,3,36


In [6]:
object_columns = data.select_dtypes('object').columns
for i in object_columns:
    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']



In [8]:
## Sampling 전부

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[sampling] = [f1, recall, precision, acc]

print("Logistic Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection(),0.940839,0.964097,0.918676,0.898322
TomekLinks(),0.940839,0.964097,0.918676,0.898322
NeighbourhoodCleaningRule(),0.939217,0.945851,0.932676,0.897335
EditedNearestNeighbours(),0.937665,0.942908,0.93248,0.894867
AllKNN(),0.937518,0.940553,0.934503,0.894867
RepeatedEditedNearestNeighbours(),0.936597,0.934667,0.938534,0.89388
SMOTE(),0.902317,0.848146,0.96388,0.846002
SMOTETomek(),0.902317,0.848146,0.96388,0.846002
NearMiss(),0.891101,0.842849,0.945215,0.827246
SMOTEENN(),0.887256,0.81754,0.969972,0.825765


In [9]:
## Under Sampling 만

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

random.seed(42)

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("Logistic Model's score by Under sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Logistic Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection(),0.940839,0.964097,0.918676,0.898322
TomekLinks(),0.940839,0.964097,0.918676,0.898322
NeighbourhoodCleaningRule(),0.939217,0.945851,0.932676,0.897335
EditedNearestNeighbours(),0.937665,0.942908,0.93248,0.894867
AllKNN(),0.937518,0.940553,0.934503,0.894867
RepeatedEditedNearestNeighbours(),0.936597,0.934667,0.938534,0.89388
NearMiss(),0.891101,0.842849,0.945215,0.827246


In [10]:
## Over Sampling 만

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

random.seed(42)

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

        lr = LogisticRegression(random_state=42)
        lr.fit(x_train_ss, y_train)
        pred = lr.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Logistic Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTE(0.3),0.939252,0.946439,0.932174,0.897335
ADASYN(0.3),0.937006,0.941142,0.932905,0.89388
BorderlineSMOTE(0.3),0.935995,0.938199,0.933802,0.892399
SMOTE(0.4),0.933767,0.92937,0.938206,0.889437
SMOTE(0.5),0.928614,0.911124,0.946789,0.882527
ADASYN(0.4),0.92823,0.913479,0.943465,0.88154
SMOTE(0.6),0.926329,0.902884,0.951023,0.879566
BorderlineSMOTE(0.4),0.925981,0.909358,0.943223,0.878085
ADASYN(0.5),0.919281,0.88817,0.952652,0.8692
BorderlineSMOTE(0.5),0.918672,0.887581,0.95202,0.868213


In [11]:
## Combine Sampling 만

columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i()
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("Logistic Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Logistic Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomek(),0.90363,0.849912,0.964596,0.847976
SMOTEENN(),0.887683,0.818717,0.969338,0.826259
