In [2]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection, TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')

import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('../datasets/BankChurners_change.csv', encoding='CP949')
data = data.iloc[:,2:23]
data.drop('Customer_Age', axis=1, inplace=True)
np.random.seed(42)
random.seed(42)

In [4]:
object_columns = data.select_dtypes('object').columns
for i in object_columns:
    lb = LabelEncoder()
    lb.fit(data[i])
    data[i] = lb.transform(data[i])
    print(f'category : {np.unique(data[i])}\nclasses : {lb.classes_}\n')

input = data.iloc[:,1:]
target = data.iloc[:,0]

category : [0 1]
classes : ['Attrited Customer' 'Existing Customer']

category : [0 1]
classes : ['F' 'M']

category : [0 1 2 3 4 5 6]
classes : ['College' 'Doctorate' 'Graduate' 'High School' 'Post-Graduate'
 'Uneducated' 'Unknown']

category : [0 1 2 3]
classes : ['Divorced' 'Married' 'Single' 'Unknown']

category : [0 1 2 3 4 5]
classes : ['$120K +' '$40K - $60K' '$60K - $80K' '$80K - $120K' 'Less than $40K'
 'Unknown']

category : [0 1 2 3]
classes : ['Blue' 'Gold' 'Platinum' 'Silver']

category : [0 1 2 3 4]
classes : ['20대' '30대' '40대' '50대' '60대 이상']



## Under Sampling

In [5]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i == OneSidedSelection:
        sampling = i(random_state=42)
    else:
        sampling = i()
    
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Logistic Model's score by Under sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Logistic Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.942628,0.967039,0.919418,0.901283
TomekLinks,0.942628,0.967039,0.919418,0.901283
NeighbourhoodCleaningRule,0.941416,0.950559,0.932448,0.90079
EditedNearestNeighbours,0.937996,0.939376,0.93662,0.895854
AllKNN,0.935503,0.930547,0.940512,0.892399
RepeatedEditedNearestNeighbours,0.934718,0.927016,0.942549,0.891412
NearMiss,0.859729,0.782813,0.953405,0.785785


## Over Sampling

In [6]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

        lr = LogisticRegression(random_state=42)
        lr.fit(x_train_ss, y_train)
        pred = lr.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Logistic Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
BorderlineSMOTE(0.3),0.941796,0.947616,0.936047,0.901777
SMOTE(0.3),0.941451,0.951148,0.931949,0.90079
ADASYN(0.3),0.941451,0.951148,0.931949,0.90079
ADASYN(0.4),0.934125,0.926427,0.941951,0.890424
SMOTE(0.4),0.933571,0.926427,0.940825,0.889437
BorderlineSMOTE(0.4),0.932856,0.919953,0.946126,0.888944
SMOTE(0.5),0.930859,0.915244,0.947016,0.885982
SMOTE(0.6),0.930514,0.906416,0.955928,0.886476
ADASYN(0.5),0.929305,0.905238,0.954687,0.884501
BorderlineSMOTE(0.5),0.92911,0.906416,0.95297,0.884008


## Combine Sampling

In [7]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Logistic Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Logistic Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomek,0.910615,0.863449,0.96323,0.857848
SMOTEENN,0.879305,0.804002,0.97017,0.814906


## Pipeline을 통한 Combine Sampling

In [8]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

lr = LogisticRegression()

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

        for strategy in strategy_:
            
            if under == OneSidedSelection:
                under_ = under(random_state=42)
            else:
                under_ = under()

            over_ = over(sampling_strategy=strategy, random_state=42)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, lr)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Logistic Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTEOneSidedSelection(0.3),0.941108,0.949971,0.932409,0.900296
SMOTETomekLinks(0.3),0.941108,0.949971,0.932409,0.900296
ADASYNTomekLinks(0.3),0.940216,0.948793,0.931792,0.898815
BorderlineSMOTEOneSidedSelection(0.3),0.939936,0.944085,0.935823,0.898815
ADASYNOneSidedSelection(0.3),0.939907,0.948205,0.931752,0.898322
BorderlineSMOTETomekLinks(0.3),0.939314,0.942908,0.935748,0.897828
SMOTENeighbourhoodCleaningRule(0.3),0.934246,0.924073,0.944645,0.890918
ADASYNTomekLinks(0.4),0.934046,0.92525,0.943011,0.890424
SMOTEOneSidedSelection(0.4),0.933848,0.926427,0.941388,0.889931
SMOTETomekLinks(0.4),0.933531,0.925839,0.941352,0.889437


## Under, Over, Combine Sampling과 Pipeline을 통한 Combine Sampling 전체 결과

In [9]:
sampling = [EditedNearestNeighbours,RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]

for i in sampling:
    x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i in [OneSidedSelection, SMOTE, BorderlineSMOTE, ADASYN, SMOTETomek, SMOTEENN]:
        sampling = i(random_state=42)
    else:
        sampling = i()
        
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Logistic Model's score by sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
OneSidedSelection,0.942628,0.967039,0.919418,0.901283
TomekLinks,0.942628,0.967039,0.919418,0.901283
NeighbourhoodCleaningRule,0.941416,0.950559,0.932448,0.90079
SMOTEOneSidedSelection(0.3),0.941108,0.949971,0.932409,0.900296
SMOTETomekLinks(0.3),0.941108,0.949971,0.932409,0.900296
ADASYNTomekLinks(0.3),0.940216,0.948793,0.931792,0.898815
BorderlineSMOTEOneSidedSelection(0.3),0.939936,0.944085,0.935823,0.898815
ADASYNOneSidedSelection(0.3),0.939907,0.948205,0.931752,0.898322
BorderlineSMOTETomekLinks(0.3),0.939314,0.942908,0.935748,0.897828
EditedNearestNeighbours,0.937996,0.939376,0.93662,0.895854


## NO Sampling

In [10]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
NO_Sampling = pd.DataFrame(columns=columns)

x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=42)

ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.fit_transform(x_test)


lr = LogisticRegression(random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)

acc = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

NO_Sampling.loc['No Sampling'] = [f1, recall, precision, acc]

print("Logistic Model's score by sampling")
NO_Sampling.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
NO_Sampling.head(10)

Logistic Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
No Sampling,0.927982,0.959388,0.898567,0.875123
