In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv', na_values = null_values)
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv', na_values = null_values)
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv', na_values = null_values)
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv', na_values = null_values)
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv', na_values = null_values)
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv', na_values = null_values)

In [3]:
x_train = x_train.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_valid = x_valid.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_test = x_test.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

### 1. Tree Classifer

In [4]:
sfs_feature_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
rfe_features_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
x_train_features = x_train[sfs_feature_list]
x_valid_features = x_valid[sfs_feature_list]
x_test_features = x_test[sfs_feature_list]


In [61]:
import lightgbm as LightGBM

lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.25, 
                               n_estimators=600,
                               max_depth = 50,
                               min_data_in_leaf = 50,
                               class_weight={True: 10, False: 1},
                               learning_rate= 0.1
                              ) 

evals = [(x_train_features, y_train_bool)]
lgbm.fit(x_train_features, y_train_bool, eval_metric='logloss', eval_set=evals)
y_pred = lgbm.predict(x_train_features)

[1]	training's binary_logloss: 0.628988
[2]	training's binary_logloss: 0.623421
[3]	training's binary_logloss: 0.618595
[4]	training's binary_logloss: 0.614736
[5]	training's binary_logloss: 0.611145
[6]	training's binary_logloss: 0.608083
[7]	training's binary_logloss: 0.605599
[8]	training's binary_logloss: 0.60318
[9]	training's binary_logloss: 0.601156
[10]	training's binary_logloss: 0.599245
[11]	training's binary_logloss: 0.597674
[12]	training's binary_logloss: 0.596315
[13]	training's binary_logloss: 0.594907
[14]	training's binary_logloss: 0.593697
[15]	training's binary_logloss: 0.592459
[16]	training's binary_logloss: 0.591341
[17]	training's binary_logloss: 0.590368
[18]	training's binary_logloss: 0.589415
[19]	training's binary_logloss: 0.588542
[20]	training's binary_logloss: 0.587626
[21]	training's binary_logloss: 0.586785
[22]	training's binary_logloss: 0.585927
[23]	training's binary_logloss: 0.585217
[24]	training's binary_logloss: 0.584518
[25]	training's binary_log

In [62]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_train_features)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.99      0.42      0.59     63391
        risk       0.26      0.98      0.41     12724

    accuracy                           0.52     76115
   macro avg       0.62      0.70      0.50     76115
weighted avg       0.87      0.52      0.56     76115



In [63]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.39      0.55     21052
        risk       0.22      0.82      0.34      4344

    accuracy                           0.46     25396
   macro avg       0.56      0.60      0.45     25396
weighted avg       0.79      0.46      0.51     25396



### 2. Tree Ensemble Model

#### make feature set

In [28]:
rfecv_feature_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
sfs_feature_list = ['BPS', 'DIV', '거래량', '금리', '비유동자산', '자산총계', '부채총계', '법인세차감전 순이익', '당기순이익']
stock_info_list = ['BPS', 'PER', 'PBR', 'EPS', 'DIV', 'DPS', '거래량']
financial_info_list = ['유동자산', '비유동자산', '자산총계', '유동부채', '비유동부채', '부채총계', '이익잉여금', '자본총계', '매출액', '영업이익', '법인세차감전 순이익', '당기순이익', '자본금']


def make_feature_set(x) :
    x_whole = x
    x_rfecv = x[rfecv_feature_list]
    x_sfs = x[sfs_feature_list]
    x_f = x[financial_info_list]
    x_s = x[stock_info_list]
    return x_whole, x_rfecv, x_sfs, x_f, x_s


x_whole, x_rfecv, x_sfs, x_f, x_s= make_feature_set(x_train)

In [79]:
import lightgbm as LightGBM

feature_set = []
feature_set = make_feature_set(x_train)
model = []

## train
for x in feature_set :
    lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.25, 
                               n_estimators=600,
                               max_depth = 50,
                               min_data_in_leaf = 50,
                               class_weight={True: 10, False: 1},
                               learning_rate= 0.1
                              ) 
    evals = [(x, y_train_bool)]
    lgbm.fit(x, y_train_bool, eval_metric='logloss', eval_set=evals)
    model.append(lgbm)

## prediction
def predict_ensemble_model(x_) :
    feature_set = make_feature_set(x_)
    y_pred = []
    i = 0
    for x in feature_set :
        pred = model[i].predict(x)
        y_pred.append(pred)
        i = i+1

    y_pred_sum = y_pred[0] &(y_pred[1] | y_pred[2] | y_pred[3] | y_pred[4])
    return y_pred_sum
#y_pred = np.array([vote(xi) for xi in y_pred])

[1]	training's binary_logloss: 0.628963
[2]	training's binary_logloss: 0.623169
[3]	training's binary_logloss: 0.618132
[4]	training's binary_logloss: 0.613868
[5]	training's binary_logloss: 0.610407
[6]	training's binary_logloss: 0.607459
[7]	training's binary_logloss: 0.604669
[8]	training's binary_logloss: 0.602451
[9]	training's binary_logloss: 0.600126
[10]	training's binary_logloss: 0.598127
[11]	training's binary_logloss: 0.596193
[12]	training's binary_logloss: 0.594546
[13]	training's binary_logloss: 0.593102
[14]	training's binary_logloss: 0.591732
[15]	training's binary_logloss: 0.590413
[16]	training's binary_logloss: 0.589211
[17]	training's binary_logloss: 0.588091
[18]	training's binary_logloss: 0.587018
[19]	training's binary_logloss: 0.585988
[20]	training's binary_logloss: 0.585031
[21]	training's binary_logloss: 0.584051
[22]	training's binary_logloss: 0.58326
[23]	training's binary_logloss: 0.582493
[24]	training's binary_logloss: 0.581635
[25]	training's binary_log

In [80]:
from sklearn.metrics import classification_report

y = predict_ensemble_model(x_train)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.99      0.45      0.62     63391
        risk       0.26      0.98      0.41     12724

    accuracy                           0.54     76115
   macro avg       0.63      0.71      0.52     76115
weighted avg       0.87      0.54      0.58     76115



In [81]:
from sklearn.metrics import classification_report

y = predict_ensemble_model(x_valid)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.41      0.56     21052
        risk       0.22      0.80      0.34      4344

    accuracy                           0.47     25396
   macro avg       0.56      0.60      0.45     25396
weighted avg       0.79      0.47      0.52     25396

