In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, recall_score, precision_score
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import make_pipeline

import pandas as pd

In [6]:
all = pd.read_csv('./preprocessed_data/all_final.csv', encoding='CP949')
feature = all.iloc[:,2:7]
target = all.iloc[:,7]

In [104]:
# data2020 = all[(all.회계년도=='2020/06')| (all.회계년도=='2020/12')]
# feature_ = data2020.iloc[:,2:7]
# target_ = data2020.iloc[:,7]

# all = all[(all.회계년도!='2020/06') & (all.회계년도!='2020/12')]
# feature = all.iloc[:,2:7]
# target = all.iloc[:,7]

### 오버샘플링 train만 샘플링

In [9]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTE, BorderlineSMOTE, ADASYN]
strategy_ = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

for i in sampling:
    
    for strategy in strategy_:
        x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

        ss = StandardScaler()
        x_train_ss = ss.fit_transform(x_train)
        x_test_ss = ss.fit_transform(x_test)

        sampling = i(sampling_strategy=strategy, random_state=42)
        x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)        

        lr = LogisticRegression(random_state=42)
        lr.fit(x_train_ss, y_train)
        pred = lr.predict(x_test_ss)

        acc = accuracy_score(y_test, pred)
        recall = recall_score(y_test, pred)
        precision = precision_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        
        df.loc[i.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Logistic Model's score by Over sampling")
df.sort_values(['recall', 'f1_score', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by Over sampling


Unnamed: 0,f1_score,recall,precision,accuracy
ADASYN(1),0.11,0.846154,0.058824,0.854753
SMOTE(1),0.105541,0.769231,0.056657,0.861689
ADASYN(0.9),0.104712,0.769231,0.05618,0.860465
SMOTE(0.9),0.100559,0.692308,0.054217,0.868625
ADASYN(0.8),0.099174,0.692308,0.053412,0.866585
ADASYN(0.7),0.102102,0.653846,0.055375,0.878009
BorderlineSMOTE(0.8),0.122605,0.615385,0.068085,0.906569
BorderlineSMOTE(0.9),0.115523,0.615385,0.063745,0.900041
ADASYN(0.5),0.113475,0.615385,0.0625,0.898001
BorderlineSMOTE(1),0.111888,0.615385,0.061538,0.896369


### 언더샘플링 train만 샘플링

In [10]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]

for i in sampling:
    
    x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.3)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    if i == OneSidedSelection:
        sampling = i(random_state=42)
    else:
        sampling = i()
    
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Logistic Model's score by Under sampling")
df.sort_values(['recall'], ascending=False)

Logistic Model's score by Under sampling


Unnamed: 0,f1_score,recall,precision,accuracy
NearMiss,0.12749,0.744186,0.069717,0.880849
RepeatedEditedNearestNeighbours,0.235294,0.232558,0.238095,0.982318
NeighbourhoodCleaningRule,0.125,0.093023,0.190476,0.984766
AllKNN,0.09375,0.069767,0.142857,0.984222
EditedNearestNeighbours,0.072727,0.046512,0.166667,0.986126
OneSidedSelection,0.041667,0.023256,0.2,0.987486
TomekLinks,0.041667,0.023256,0.2,0.987486


### 복합샘플링 test까지 샘플링

In [66]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)
    x_test_ss, y_test = sampling.fit_resample(x_test_ss, y_test)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Logistic Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Logistic Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTEENN,0.901761,0.900704,0.902821,0.89805
SMOTETomek,0.86228,0.861621,0.86294,0.862385


In [29]:
df

Unnamed: 0,f1_score,recall,precision,accuracy
SMOTEEditedNearestNeighbours(0.4),0.843931,0.837476,0.850485,0.905484


### 복합샘플링 train만 샘플링

In [5]:
columns = ['f1_score', 'recall', 'precision', 'accuracy']
df = pd.DataFrame(columns=columns)

sampling = [SMOTETomek, SMOTEENN]

for i in sampling:

    x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.fit_transform(x_test)

    sampling = i(random_state=42)
    x_train_ss, y_train = sampling.fit_resample(x_train_ss, y_train)

    lr = LogisticRegression(random_state=42)
    lr.fit(x_train_ss, y_train)
    pred = lr.predict(x_test_ss)

    acc = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    
    df.loc[i.__name__] = [f1, recall, precision, acc]

print("Logistic Model's score by Combine sampling")
df.sort_values(['f1_score', 'recall', 'precision', 'accuracy'], ascending=False)

Logistic Model's score by Combine sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTETomek,0.083969,0.814815,0.044266,0.865622
SMOTEENN,0.079585,0.851852,0.041742,0.851064


In [8]:
strategy_ = [0.3, 0.4, 0.5]
columns = ['f1_score', 'recall', 'precision', 'accuracy']

df = pd.DataFrame(columns=columns)

under_sampling = [EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss,NeighbourhoodCleaningRule, OneSidedSelection,TomekLinks]
over_sampling = [SMOTE, BorderlineSMOTE, ADASYN]

lr = LogisticRegression()

for over in over_sampling:

    for under in under_sampling:

        x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)

        for strategy in strategy_:
            
            if under == OneSidedSelection:
                under_ = under(random_state=42)
            else:
                under_ = under()

            over_ = over(sampling_strategy=strategy, random_state=42)

            ss = StandardScaler()
            x_train_ss = ss.fit_transform(x_train)
            x_test_ss = ss.fit_transform(x_test)

            model = make_pipeline(over_, under_, lr)   
            model.fit(x_train_ss, y_train)
            pred = model.predict(x_test_ss)

            acc = accuracy_score(y_test, pred)
            recall = recall_score(y_test, pred)
            precision = precision_score(y_test, pred)
            f1 = f1_score(y_test, pred)

            df.loc[over.__name__+under.__name__+'('+str(strategy)+')'] = [f1, recall, precision, acc]

print("Logistic Model's score by sampling")
df.sort_values(['recall', 'f1_score', 'precision', 'accuracy'], ascending=False, inplace=True)
df.head(10)

Logistic Model's score by sampling


Unnamed: 0,f1_score,recall,precision,accuracy
SMOTERepeatedEditedNearestNeighbours(0.5),0.107345,0.730769,0.057927,0.871073
ADASYNAllKNN(0.5),0.105556,0.730769,0.056886,0.868625
ADASYNRepeatedEditedNearestNeighbours(0.5),0.103542,0.730769,0.055718,0.865769
ADASYNEditedNearestNeighbours(0.5),0.103152,0.692308,0.055728,0.872297
ADASYNRepeatedEditedNearestNeighbours(0.4),0.103152,0.692308,0.055728,0.872297
SMOTENearMiss(0.4),0.101408,0.692308,0.054711,0.869849
SMOTENearMiss(0.5),0.101124,0.692308,0.054545,0.869441
SMOTENearMiss(0.3),0.098901,0.692308,0.053254,0.866177
ADASYNNearMiss(0.3),0.097297,0.692308,0.052326,0.863729
ADASYNNearMiss(0.4),0.096515,0.692308,0.051873,0.862505


In [76]:
x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2, \
    shuffle=True, stratify=all['부실기업여부'])

In [84]:
print(len(y_train[y_train==0]), len(y_train[y_train==1]))

5226 89


In [85]:
print(len(y_test[y_test==0]), len(y_test[y_test==1]))

1308 21


In [83]:
x_train, x_test, y_train, y_test = train_test_split(feature, target, random_state=42, test_size=0.2)