In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv', na_values = null_values)
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv', na_values = null_values)
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv', na_values = null_values)
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv', na_values = null_values)
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv', na_values = null_values)
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv', na_values = null_values)

In [3]:
x_train = x_train.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_valid = x_valid.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_test = x_test.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

### 1. Tree Classifer

In [4]:
sfs_feature_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
rfe_features_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
x_train_features = x_train[sfs_feature_list]
x_valid_features = x_valid[sfs_feature_list]
x_test_features = x_test[sfs_feature_list]


In [23]:
import lightgbm as LightGBM

lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.25, 
                               n_estimators=1500,
                               max_depth = 50,
                               min_data_in_leaf = 50,
                               class_weight={True: 5, False: 1}
                              ) 

evals = [(x_train_features, y_train_bool)]
lgbm.fit(x_train_features, y_train_bool, eval_metric='logloss', eval_set=evals)
y_pred = lgbm.predict(x_train_features)

[1]	training's binary_logloss: 0.68593
[2]	training's binary_logloss: 0.680011
[3]	training's binary_logloss: 0.674916
[4]	training's binary_logloss: 0.670883
[5]	training's binary_logloss: 0.667258
[6]	training's binary_logloss: 0.664002
[7]	training's binary_logloss: 0.6611
[8]	training's binary_logloss: 0.658771
[9]	training's binary_logloss: 0.656329
[10]	training's binary_logloss: 0.654196
[11]	training's binary_logloss: 0.652471
[12]	training's binary_logloss: 0.650882
[13]	training's binary_logloss: 0.649265
[14]	training's binary_logloss: 0.64794
[15]	training's binary_logloss: 0.646684
[16]	training's binary_logloss: 0.645487
[17]	training's binary_logloss: 0.644293
[18]	training's binary_logloss: 0.643337
[19]	training's binary_logloss: 0.642276
[20]	training's binary_logloss: 0.64132
[21]	training's binary_logloss: 0.640491
[22]	training's binary_logloss: 0.639765
[23]	training's binary_logloss: 0.63902
[24]	training's binary_logloss: 0.638244
[25]	training's binary_logloss:

In [24]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_train_features)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.78      0.87     63391
        risk       0.45      0.91      0.60     12724

    accuracy                           0.80     76115
   macro avg       0.71      0.84      0.73     76115
weighted avg       0.89      0.80      0.82     76115



In [25]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.87      0.70      0.77     21052
        risk       0.25      0.48      0.33      4344

    accuracy                           0.66     25396
   macro avg       0.56      0.59      0.55     25396
weighted avg       0.76      0.66      0.70     25396



### 2. Tree Ensemble Model

#### make feature set

In [7]:
list(x_train.head())

['BPS',
 'PER',
 'PBR',
 'EPS',
 'DIV',
 'DPS',
 '거래량',
 '시가총액',
 '금리',
 '유동자산',
 '비유동자산',
 '자산총계',
 '유동부채',
 '비유동부채',
 '부채총계',
 '이익잉여금',
 '자본총계',
 '매출액',
 '영업이익',
 '법인세차감전 순이익',
 '당기순이익',
 '자본금']

In [9]:
rfecv_feature_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
sfs_feature_list = ['BPS', 'DIV', '거래량', '금리', '비유동자산', '자산총계', '부채총계', '법인세차감전 순이익', '당기순이익']
stock_info_list = ['BPS', 'PER', 'PBR', 'EPS', 'DIV', 'DPS', '거래량']
financial_info_list = ['유동자산', '비유동자산', '자산총계', '유동부채', '비유동부채', '부채총계', '이익잉여금', '자본총계', '매출액', '영업이익', '법인세차감전 순이익', '당기순이익', '자본금']


def make_feature_set(x) :
    x_whole = x
    x_rfecv = x[rfecv_feature_list]
    x_sfs = x[sfs_feature_list]
    x_f = x[financial_info_list]
    x_s = x[stock_info_list]
    return x_whole, x_rfecv, x_sfs, x_f, x_s


x_whole, x_rfecv, x_sfs, x_f, x_s= make_feature_set(x_train)

In [66]:
import lightgbm as LightGBM

feature_set = []
feature_set = make_feature_set(x_train)
model = []

## train
for x in feature_set :
    lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=50,
                               reg_lambda = 0.25, 
                               n_estimators=500,
                               max_depth = 30,
                               class_weight={True: 10, False: 1}
                              ) 
    evals = [(x, y_train_bool)]
    lgbm.fit(x, y_train_bool, eval_metric='logloss', eval_set=evals)
    model.append(lgbm)

## prediction
def predict_ensemble_model(x_) :
    feature_set = make_feature_set(x_)
    y_pred = []
    i = 0
    for x in feature_set :
        pred = model[i].predict(x)
        y_pred.append(pred)
        i = i+1

    vote = lambda t : 1 if t > 2 else 0
    y_pred_sum = y_pred[0] & (y_pred[1] | y_pred[2] | y_pred[3] | y_pred[4])
    return y_pred_sum
#y_pred = np.array([vote(xi) for xi in y_pred])

[1]	training's binary_logloss: 0.628963
[2]	training's binary_logloss: 0.623169
[3]	training's binary_logloss: 0.618132
[4]	training's binary_logloss: 0.613871
[5]	training's binary_logloss: 0.610405
[6]	training's binary_logloss: 0.607457
[7]	training's binary_logloss: 0.604673
[8]	training's binary_logloss: 0.602339
[9]	training's binary_logloss: 0.600031
[10]	training's binary_logloss: 0.598022
[11]	training's binary_logloss: 0.596078
[12]	training's binary_logloss: 0.594463
[13]	training's binary_logloss: 0.592965
[14]	training's binary_logloss: 0.591672
[15]	training's binary_logloss: 0.590438
[16]	training's binary_logloss: 0.589205
[17]	training's binary_logloss: 0.58799
[18]	training's binary_logloss: 0.586932
[19]	training's binary_logloss: 0.586027
[20]	training's binary_logloss: 0.585062
[21]	training's binary_logloss: 0.584205
[22]	training's binary_logloss: 0.583322
[23]	training's binary_logloss: 0.582515
[24]	training's binary_logloss: 0.581757
[25]	training's binary_log

In [67]:
from sklearn.metrics import classification_report

y = predict_ensemble_model(x_train)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.99      0.44      0.61     63391
        risk       0.26      0.98      0.41     12724

    accuracy                           0.53     76115
   macro avg       0.63      0.71      0.51     76115
weighted avg       0.87      0.53      0.57     76115



In [68]:
from sklearn.metrics import classification_report

y = predict_ensemble_model(x_valid)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.40      0.55     21052
        risk       0.22      0.81      0.34      4344

    accuracy                           0.47     25396
   macro avg       0.56      0.61      0.45     25396
weighted avg       0.79      0.47      0.52     25396

