In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv')
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv')
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv')
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv')
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv')
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv')

In [4]:
x_train_features = x_train.drop(columns=['날짜', 'CODE'], inplace=False)
x_valid_features = x_valid.drop(columns=['날짜', 'CODE'], inplace=False)
x_test_features = x_test.drop(columns=['날짜', 'CODE'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

In [4]:
y_train_bool

0        False
1        False
2        False
3        False
4         True
         ...  
76108     True
76109    False
76110    False
76111    False
76112     True
Name: Y, Length: 76113, dtype: bool

### 1. Decision Tree

In [9]:
from sklearn import tree
decisionTree = tree.DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    class_weight={True: 10, False: 1}
)
decisionTree.fit(x_train_features, y_train_bool)

In [71]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.96      0.26      0.41     63323
        risk       0.21      0.95      0.34     12790

    accuracy                           0.38     76113
   macro avg       0.58      0.61      0.37     76113
weighted avg       0.84      0.38      0.40     76113



In [72]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_valid_features)
target_names = ['no risk', 'risk']
print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.92      0.25      0.40     21102
        risk       0.20      0.89      0.32      4267

    accuracy                           0.36     25369
   macro avg       0.56      0.57      0.36     25369
weighted avg       0.80      0.36      0.38     25369



### 2. Random Forest 

In [5]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

oversample = RandomOverSampler(random_state = 42)
x_over, y_over = oversample.fit_resample(x_train_features, y_train_bool)
pd.DataFrame(y_over).value_counts()

Y    
False    63323
True     63323
dtype: int64

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200, 
    criterion='log_loss', 
    bootstrap=True,
    max_depth=10,
    class_weight={True: 10, False: 1}
    )
rf.fit(x_train_features, y_train_bool)
# rf.fit(x_over, y_over)

In [14]:
from sklearn.metrics import classification_report

y_pred = rf.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.99      0.44      0.61     63323
        risk       0.26      0.97      0.41     12790

    accuracy                           0.53     76113
   macro avg       0.62      0.71      0.51     76113
weighted avg       0.87      0.53      0.58     76113



In [15]:
y_pred = rf.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.41      0.56     21102
        risk       0.21      0.79      0.34      4267

    accuracy                           0.47     25369
   macro avg       0.56      0.60      0.45     25369
weighted avg       0.79      0.47      0.53     25369



### 2. LightGBM

In [43]:
import lightgbm as LightGBM

lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.2, 
                               n_estimators=200,
                               max_depth = 20,
                               class_weight={True: 10, False: 1}
                              ) 

evals = [(x_train_features, y_train_bool)]
lgbm.fit(x_train_features, y_train_bool, eval_metric='logloss', eval_set=evals)
y_pred = lgbm.predict(x_train_features)

[1]	training's binary_logloss: 0.627733
[2]	training's binary_logloss: 0.621718
[3]	training's binary_logloss: 0.616827
[4]	training's binary_logloss: 0.612504
[5]	training's binary_logloss: 0.609019
[6]	training's binary_logloss: 0.605886
[7]	training's binary_logloss: 0.603092
[8]	training's binary_logloss: 0.600786
[9]	training's binary_logloss: 0.59865
[10]	training's binary_logloss: 0.596648
[11]	training's binary_logloss: 0.594737
[12]	training's binary_logloss: 0.593277
[13]	training's binary_logloss: 0.591911
[14]	training's binary_logloss: 0.590619
[15]	training's binary_logloss: 0.589417
[16]	training's binary_logloss: 0.588205
[17]	training's binary_logloss: 0.587123
[18]	training's binary_logloss: 0.585994
[19]	training's binary_logloss: 0.585078
[20]	training's binary_logloss: 0.584241
[21]	training's binary_logloss: 0.58342
[22]	training's binary_logloss: 0.58264
[23]	training's binary_logloss: 0.581768
[24]	training's binary_logloss: 0.581081
[25]	training's binary_loglo

In [44]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_train_features)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.33      0.49     63323
        risk       0.23      0.97      0.37     12790

    accuracy                           0.44     76113
   macro avg       0.60      0.65      0.43     76113
weighted avg       0.85      0.44      0.47     76113



In [45]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.93      0.32      0.47     21102
        risk       0.21      0.87      0.33      4267

    accuracy                           0.41     25369
   macro avg       0.57      0.60      0.40     25369
weighted avg       0.80      0.41      0.45     25369

