In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, recall_score, precision_score
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import make_pipeline

import pandas as pd
import numpy as np

In [4]:
all = pd.read_csv('./preprocessed_data/all.csv', encoding='CP949')
feature = all.iloc[:,2:7]
target = all.iloc[:,7]

In [104]:
# data2020 = all[(all.회계년도=='2020/06')| (all.회계년도=='2020/12')]
# feature_ = data2020.iloc[:,2:7]
# target_ = data2020.iloc[:,7]

# all = all[(all.회계년도!='2020/06') & (all.회계년도!='2020/12')]
# feature = all.iloc[:,2:7]
# target = all.iloc[:,7]

### 오버샘플링 train만 샘플링

In [5]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)        

        rf = RandomForestClassifier(random_state=42)
        rf.fit(x_train_ss, y_train)
        pred = rf.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("RandomForest Model's score by Over sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
BorderlineSMOTE(0.5),0.37931,0.52381,0.297297,0.972912
BorderlineSMOTE(0.4),0.37037,0.47619,0.30303,0.974417
SMOTE(0.3),0.363636,0.47619,0.294118,0.973664
ADASYN(0.4),0.360656,0.52381,0.275,0.970655
BorderlineSMOTE(0.6),0.354839,0.52381,0.268293,0.969902
ADASYN(0.3),0.350877,0.47619,0.277778,0.97216
SMOTE(0.5),0.349206,0.52381,0.261905,0.96915
BorderlineSMOTE(0.8),0.349206,0.52381,0.261905,0.96915
SMOTE(0.7),0.347826,0.571429,0.25,0.96614
BorderlineSMOTE(0.9),0.338983,0.47619,0.263158,0.970655


In [10]:
columns = ['f1_score']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)        

        stf = StratifiedKFold(n_splits=3)
        rf = RandomForestClassifier(random_state=42)
        scores = cross_val_score(rf, x_train_ss, y_train, scoring='f1', cv=stf)
        
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [np.mean(np.round(scores,4))]

print("RandomForest Model's score by Over sampling")
df.sort_values(['f1_score'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by Over sampling


Unnamed: 0,f1_score
BorderlineSMOTE(1),0.9851
BorderlineSMOTE(0.9),0.982633
BorderlineSMOTE(0.8),0.9805
BorderlineSMOTE(0.7),0.979333
BorderlineSMOTE(0.6),0.9738
SMOTE(1),0.970733
BorderlineSMOTE(0.5),0.9706
SMOTE(0.9),0.969467
SMOTE(0.8),0.9645
BorderlineSMOTE(0.4),0.962667


### 언더샘플링 train만 샘플링

In [9]:
columns = ['f1_score']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.3)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i == OneSidedSelection:
        sampling = i(random_state=42)
    else:
        sampling = i()

    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)        

    stf = StratifiedKFold(n_splits=3)
    rf = RandomForestClassifier(random_state=42)
    scores = cross_val_score(rf, x_train_ss, y_train, scoring='f1', cv=stf)
    
    df.loc[i.__name__] = [np.mean(np.round(scores,4))]

print("RandomForest Model's score by Over sampling")
df.sort_values(['f1_score'], ascending=False, inplace=True)
df.head(10)



RandomForest Model's score by Over sampling


Unnamed: 0,f1_score
NearMiss,0.924467
RepeatedEditedNearestNeighbours,0.591067
AllKNN,0.541233
EditedNearestNeighbours,0.5169
NeighbourhoodCleaningRule,0.5077
OneSidedSelection,0.331767
TomekLinks,0.323367


### 복합샘플링 train만 샘플링

In [6]:
columns = ['f1_score']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    stf = StratifiedKFold(n_splits=3)
    rf = RandomForestClassifier(random_state=42)
    scores = cross_val_score(rf, x_train_ss, y_train, scoring='f1', cv=stf)
    
    df.loc[i.__name__] = [np.mean(np.round(scores,4))]

print("RandomForest Model's score by Combine sampling")
df.sort_values(['f1_score'], ascending=False)
df.head(10)

RandomForest Model's score by Combine sampling


Unnamed: 0,f1_score
SMOTETomek,0.971167
SMOTEENN,0.988267


In [28]:
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
columns = ['f1_score']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

rf = RandomForestClassifier()

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

        for strategy in strategy_:
            
            if under == OneSidedSelection:
                under_ = under(random_state=42)
            else:
                under_ = under()

            over_ = over(sampling_strategy=strategy, random_state=42)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, rf)   

            scores = cross_val_score(model, x_train_ss, y_train, scoring='f1', cv=5)

            df.loc[i.__name__] = [np.mean(np.round(scores,4))]


print("RandomForest Model's score by sampling")
df.sort_values(['f1_score'], ascending=False, inplace=True)
df.head(10)

RandomForest Model's score by sampling


Unnamed: 0,f1_score
SMOTEENN,0.28538


In [85]:
print(len(y_test[y_test==0]), len(y_test[y_test==1]))

1308 21
