In [16]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import make_pipeline

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../datasets/BankChurners.csv')
list = ['Attrition_Flag', 'Total_Trans_Ct', 'Total_Trans_Amt', 'Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Contacts_Count_12_mon', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Months_on_book']
data = data[list]
data

Unnamed: 0,Attrition_Flag,Total_Trans_Ct,Total_Trans_Amt,Total_Revolving_Bal,Total_Ct_Chng_Q4_Q1,Contacts_Count_12_mon,Total_Relationship_Count,Months_Inactive_12_mon,Months_on_book
0,Existing Customer,42,1144,777,1.625,3,5,1,39
1,Existing Customer,33,1291,864,3.714,2,6,1,44
2,Existing Customer,20,1887,0,2.333,0,4,1,36
3,Existing Customer,20,1171,2517,2.333,1,3,4,34
4,Existing Customer,28,816,0,2.500,0,5,1,21
...,...,...,...,...,...,...,...,...,...
10122,Existing Customer,117,15476,1851,0.857,3,3,2,40
10123,Attrited Customer,69,8764,2186,0.683,3,4,2,25
10124,Attrited Customer,60,10291,0,0.818,4,5,3,36
10125,Attrited Customer,62,8395,0,0.722,3,4,3,36


In [3]:
object_columns = data.select_dtypes('object').columns

for i in object_columns:

    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']



## Under Sampling

In [11]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

random.seed(42)

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i == OneSidedSelection:
        sampling = i(random_state=42)
    else:
        sampling = i()

    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    rf = RandomForestClassifier(random_state=42)
    rf.fit(x_train_ss, y_train)
    pred = rf.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("RandomForest Model's score by Under sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

RandomForest Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.975752,0.982931,0.968677,0.959033
NeighbourhoodCleaningRule,0.974374,0.973514,0.975236,0.957058
TomekLinks,0.974329,0.982931,0.965876,0.956565
EditedNearestNeighbours,0.97398,0.969394,0.97861,0.956565
AllKNN,0.970362,0.963508,0.977313,0.950642
RepeatedEditedNearestNeighbours,0.966974,0.956445,0.977738,0.945212
NearMiss,0.895541,0.827546,0.975711,0.838105


## Over Sampling

In [12]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

random.seed(42)

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

        rf = RandomForestClassifier(random_state=42)
        rf.fit(x_train_ss, y_train)
        pred = rf.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("RandomForest Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
BorderlineSMOTE(0.3),0.97654,0.979988,0.973115,0.960513
SMOTE(0.6),0.975538,0.974102,0.976978,0.959033
SMOTE(0.3),0.975467,0.982931,0.968116,0.958539
SMOTE(0.4),0.974449,0.976457,0.97245,0.957058
BorderlineSMOTE(0.4),0.974329,0.971748,0.976923,0.957058
BorderlineSMOTE(0.5),0.973964,0.968805,0.979179,0.956565
ADASYN(0.3),0.973592,0.976457,0.970743,0.955577
SMOTE(0.5),0.973212,0.972925,0.973498,0.955084
SMOTE(0.7),0.972461,0.966451,0.978546,0.954097
SMOTE(0.8),0.972395,0.964097,0.980838,0.954097


## Combine Sampling

In [13]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    rf = RandomForestClassifier(random_state=42)
    rf.fit(x_train_ss, y_train)
    pred = rf.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("RandomForest Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

RandomForest Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomek(random_state=42),0.971174,0.961742,0.980792,0.952122
SMOTEENN(random_state=42),0.958232,0.931725,0.986293,0.931885


## Pipeline을 통한 Combine Sampling

In [17]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

rf = RandomForestClassifier()

random.seed(42)

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:
            
            if under == OneSidedSelection:
                under_ = under(random_state=42)
            else:
                under_ = under()

            over_ = over(sampling_strategy=strategy, random_state=42)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, rf)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("RandomForest Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
ADASYNOneSidedSelection(0.3),0.975967,0.979988,0.971979,0.959526
SMOTEOneSidedSelection(0.3),0.975766,0.98352,0.968134,0.959033
BorderlineSMOTEOneSidedSelection(0.4),0.975538,0.974102,0.976978,0.959033
SMOTETomekLinks(0.3),0.97541,0.980577,0.970297,0.958539
BorderlineSMOTETomekLinks(0.5),0.975163,0.970571,0.979798,0.958539
BorderlineSMOTETomekLinks(0.3),0.975037,0.977045,0.973036,0.958045
ADASYNTomekLinks(0.3),0.975022,0.976457,0.973592,0.958045
BorderlineSMOTEOneSidedSelection(0.3),0.975007,0.975868,0.974148,0.958045
SMOTEOneSidedSelection(0.5),0.974314,0.97116,0.977488,0.957058
SMOTETomekLinks(0.5),0.974298,0.970571,0.978055,0.957058


## Under, Over, Combine Sampling과 Pipeline을 통한 Combine Sampling 전체 결과

In [18]:
sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i in [OneSidedSelection, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]:
        sampling = i(random_state=42)
    else:
        sampling = i()
        
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    rf = RandomForestClassifier(random_state=42)
    rf.fit(x_train_ss, y_train)
    pred = rf.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[i.__name__] = [f1, recall, precision, acc]

print("RandomForest Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
ADASYNOneSidedSelection(0.3),0.975967,0.979988,0.971979,0.959526
SMOTEOneSidedSelection(0.3),0.975766,0.98352,0.968134,0.959033
OneSidedSelection,0.975752,0.982931,0.968677,0.959033
BorderlineSMOTEOneSidedSelection(0.4),0.975538,0.974102,0.976978,0.959033
SMOTETomekLinks(0.3),0.97541,0.980577,0.970297,0.958539
BorderlineSMOTETomekLinks(0.5),0.975163,0.970571,0.979798,0.958539
BorderlineSMOTETomekLinks(0.3),0.975037,0.977045,0.973036,0.958045
ADASYNTomekLinks(0.3),0.975022,0.976457,0.973592,0.958045
BorderlineSMOTEOneSidedSelection(0.3),0.975007,0.975868,0.974148,0.958045
NeighbourhoodCleaningRule,0.974374,0.973514,0.975236,0.957058


## No Sampling

In [22]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
NO_Sampling = pd.DataFrame(columns=columns)

random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.fit_transform(x_test)


rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
pred = rf.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

NO_Sampling.loc['No Sampling'] = [f1, recall, precision, acc]

print("RandomForest Model's score by sampling")
NO_Sampling.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
NO_Sampling.head(10)

RandomForest Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
No Sampling,0.973256,0.985285,0.961516,0.95459
