In [2]:
from catboost import  CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import make_pipeline

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

  from pandas import MultiIndex, Int64Index


In [15]:
data = pd.read_csv('../datasets/BankChurners.csv')
data = data.iloc[:,1:21]
np.random.seed(42)

In [9]:
# input = data.iloc[:,1:]
# target = data.iloc[:,0]

# object_col = [1,3,4,5,6]
# object_columns = input.select_dtypes('object').columns
# num_columns = input.select_dtypes('int64').columns | input.select_dtypes('float64').columns

# print(object_columns)
# print(num_columns)

# lb = LabelEncoder()
# lb.fit(target)
# target = lb.transform(target)

In [16]:
object_columns = data.select_dtypes('object').columns

for i in object_columns:

    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']

category : [0 1]
classes : ['F' 'M']

category : [0 1 2 3 4 5 6]
classes : ['College' 'Doctorate' 'Graduate' 'High School' 'Post-Graduate'
 'Uneducated' 'Unknown']

category : [0 1 2 3]
classes : ['Divorced' 'Married' 'Single' 'Unknown']

category : [0 1 2 3 4 5]
classes : ['$120K +' '$40K - $60K' '$60K - $80K' '$80K - $120K' 'Less than $40K'
 'Unknown']

category : [0 1 2 3]
classes : ['Blue' 'Gold' 'Platinum' 'Silver']



## Under Sampling

In [17]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

random.seed(42)

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i == OneSidedSelection:
        sampling = i(random_state=42)
    else:
        sampling = i()

    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    cat = CatBoostClassifier(random_state=42, verbose=0)
    cat.fit(x_train_ss, y_train)
    pred = cat.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Cat Model's score by Under sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Cat Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
TomekLinks,0.982713,0.987051,0.978413,0.970879
OneSidedSelection,0.982138,0.987051,0.977273,0.969891
EditedNearestNeighbours,0.981355,0.975868,0.986905,0.968904
NeighbourhoodCleaningRule,0.980531,0.978222,0.98285,0.967423
AllKNN,0.979278,0.973514,0.98511,0.965449
RepeatedEditedNearestNeighbours,0.978648,0.97116,0.986252,0.964462
NearMiss,0.950459,0.914656,0.989179,0.920039


## Over Sampling

In [18]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

random.seed(42)

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

        cat = CatBoostClassifier(random_state=42, verbose=0)
        cat.fit(x_train_ss, y_train)
        pred = cat.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Cat Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Cat Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
ADASYN(0.3),0.977989,0.967628,0.988575,0.963475
SMOTE(0.3),0.977778,0.97116,0.984487,0.962981
BorderlineSMOTE(0.3),0.97494,0.961742,0.988506,0.958539
SMOTE(0.4),0.973022,0.955268,0.991448,0.955577
BorderlineSMOTE(0.4),0.968675,0.946439,0.99198,0.948667
ADASYN(0.4),0.964513,0.935845,0.994994,0.942251
BorderlineSMOTE(0.5),0.962603,0.931725,0.995597,0.939289
SMOTE(0.5),0.962064,0.932902,0.993108,0.938302
SMOTE(0.6),0.958461,0.923484,0.99619,0.932873
SMOTE(0.7),0.954949,0.91701,0.996164,0.927443


In [19]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]

random.seed(42)

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    cat = CatBoostClassifier(random_state=42, verbose=0)
    cat.fit(x_train_ss, y_train)
    pred = cat.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Cat Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Cat Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTE,0.928594,0.868746,0.997297,0.887957
BorderlineSMOTE,0.92132,0.85462,0.999312,0.877591
ADASYN,0.882392,0.790465,0.998513,0.823297


## Combine Sampling

In [8]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    cat = CatBoostClassifier(random_state=42, verbose=0)
    cat.fit(x_train_ss, y_train)
    pred = cat.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[sampling] = [f1, recall, precision, acc]

print("Cat Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Cat Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTEENN(random_state=42),0.94441,0.899941,0.993502,0.911155
SMOTETomek(random_state=42),0.933583,0.881107,0.992706,0.894867


## Pipeline을 통한 Combine Sampling

In [9]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

cat = CatBoostClassifier(random_state=42, verbose=0)

random.seed(42)

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:
            
            if under == OneSidedSelection:
                under_ = under(random_state=42)
            else:
                under_ = under()

            over_ = over(sampling_strategy=strategy, random_state=42)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, cat)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Cat Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Cat Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomekLinks(0.3),0.978774,0.977045,0.980508,0.964462
SMOTEOneSidedSelection(0.3),0.978184,0.976457,0.979917,0.963475
SMOTEEditedNearestNeighbours(0.3),0.977421,0.968217,0.986803,0.962488
SMOTENeighbourhoodCleaningRule(0.3),0.976882,0.969982,0.983881,0.9615
ADASYNTomekLinks(0.3),0.975682,0.968217,0.983264,0.959526
SMOTETomekLinks(0.4),0.975639,0.966451,0.985003,0.959526
BorderlineSMOTEOneSidedSelection(0.3),0.975595,0.964685,0.986755,0.959526
ADASYNNeighbourhoodCleaningRule(0.3),0.974405,0.963508,0.985551,0.957552
BorderlineSMOTETomekLinks(0.3),0.974328,0.960565,0.988492,0.957552
SMOTEAllKNN(0.3),0.973794,0.962331,0.985533,0.956565


## Under, Over, Combine Sampling과 Pipeline을 통한 Combine Sampling 전체 결과

In [10]:
sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i in [OneSidedSelection, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]:
        sampling = i(random_state=42)
    else:
        sampling = i()
        
    x_train, y_train = sampling.fit_resample(x_train, y_train)

    cat = CatBoostClassifier(random_state=42, verbose=0)
    cat.fit(x_train, y_train)
    pred = cat.predict(x_test)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Cat Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Cat Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.980657,0.984697,0.976649,0.967423
TomekLinks,0.980657,0.984697,0.976649,0.967423
NeighbourhoodCleaningRule,0.979087,0.978222,0.979953,0.964956
SMOTETomekLinks(0.3),0.978774,0.977045,0.980508,0.964462
SMOTE,0.978673,0.972337,0.985092,0.964462
SMOTETomek,0.978673,0.972337,0.985092,0.964462
SMOTEOneSidedSelection(0.3),0.978184,0.976457,0.979917,0.963475
ADASYN,0.978068,0.97116,0.985075,0.963475
SMOTEEditedNearestNeighbours(0.3),0.977421,0.968217,0.986803,0.962488
EditedNearestNeighbours,0.977172,0.969982,0.984468,0.961994


In [20]:
## column 전체

sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

random.seed(42)

for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i in [OneSidedSelection, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]:
        sampling = i(random_state=42)
    else:
        sampling = i()
        
    x_train, y_train = sampling.fit_resample(x_train, y_train)

    cat = CatBoostClassifier(random_state=42, verbose=0)
    cat.fit(x_train, y_train)
    pred = cat.predict(x_test)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Cat Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Cat Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.981797,0.984108,0.979496,0.969398
TomekLinks,0.980323,0.982343,0.978312,0.96693
NeighbourhoodCleaningRule,0.979929,0.977045,0.98283,0.966436
SMOTETomek,0.979894,0.97528,0.984551,0.966436
ADASYN,0.979604,0.97528,0.983967,0.965943
EditedNearestNeighbours,0.979568,0.973514,0.985697,0.965943
SMOTE,0.979302,0.974691,0.983957,0.965449
BorderlineSMOTE,0.978686,0.972925,0.984515,0.964462
AllKNN,0.97709,0.966451,0.987966,0.961994
RepeatedEditedNearestNeighbours,0.972455,0.955856,0.98964,0.95459


## No Sampling

In [21]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
No_Sampling = pd.DataFrame(columns=columns)

random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.fit_transform(x_test)

## CatBoost는 cat_features에 문자열 인덱스를 넣어주면 알아서 라벨링해주는 기능이 있다.
cat = CatBoostClassifier(iterations=100, random_state=42,verbose=0)
cat.fit(x_train, y_train) #, cat_features=object_col
pred = cat.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

No_Sampling.loc['No Sampling'] = [f1, recall, precision, acc]

print("Cat Model's score by sampling")
No_Sampling.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
No_Sampling.head(10)

Cat Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
No Sampling,0.980404,0.986463,0.974419,0.96693
