### 0. Dataset Load

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv', na_values = null_values)
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv', na_values = null_values)
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv', na_values = null_values)
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv', na_values = null_values)
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv', na_values = null_values)
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv', na_values = null_values)

In [3]:
x_train_features = x_train.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_valid_features = x_valid.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_test_features = x_test.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

### 1. Tree Base Simple Classifers
Tree 기반 분류 모델인 Decision Tree와 Random Forest 를 사용하여 재무데이터로 리스크 주식 분류 모델을 만들기

##### 1-1 Decision Tree Classifier

In [13]:
from sklearn import tree
import pickle
import joblib

decisionTree = tree.DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=100,
    class_weight={True: 10, False: 1}
)
decisionTree.fit(x_train_features, y_train_bool)

joblib.dump(decisionTree, './test/models/decisionTree.pkl') 

['./test/models/decisionTree.pkl']

In [8]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.34      0.50     63391
        risk       0.22      0.96      0.36     12724

    accuracy                           0.44     76115
   macro avg       0.60      0.65      0.43     76115
weighted avg       0.85      0.44      0.48     76115



In [9]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_valid_features)
target_names = ['no risk', 'risk']
print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.90      0.31      0.46     21052
        risk       0.20      0.83      0.32      4344

    accuracy                           0.40     25396
   macro avg       0.55      0.57      0.39     25396
weighted avg       0.78      0.40      0.44     25396



In [10]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_test_features)
target_names = ['no risk', 'risk']
print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.90      0.31      0.46     21040
        risk       0.20      0.83      0.32      4311

    accuracy                           0.40     25351
   macro avg       0.55      0.57      0.39     25351
weighted avg       0.78      0.40      0.43     25351



##### 1-2 Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200, 
    criterion='entropy', 
    min_samples_split = 100,
    bootstrap=True,
    max_depth=20,
    class_weight={True: 10, False: 1}
    )
rf.fit(x_train_features, y_train_bool)

joblib.dump(rf, './test/models/randomForest.pkl')

['./test/models/randomForest.pkl']

In [18]:
from sklearn.metrics import classification_report

y_pred = rf.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.97      0.40      0.57     63391
        risk       0.24      0.95      0.38     12724

    accuracy                           0.49     76115
   macro avg       0.61      0.67      0.48     76115
weighted avg       0.85      0.49      0.54     76115



In [19]:
y_pred = rf.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.38      0.53     21052
        risk       0.22      0.83      0.34      4344

    accuracy                           0.45     25396
   macro avg       0.56      0.60      0.44     25396
weighted avg       0.79      0.45      0.50     25396



In [21]:
y_pred = rf.predict(x_test_features)
target_names = ['no risk', 'risk']

print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.38      0.54     21040
        risk       0.21      0.82      0.34      4311

    accuracy                           0.46     25351
   macro avg       0.56      0.60      0.44     25351
weighted avg       0.79      0.46      0.50     25351



### 2. Feature Selection
Forward, Backward 방식으로 Feature Selection을 시도 하였음

### 여기 채워 주면 될듯

### 3. LightGBM and Weak Bagging
gradient boosting 기반의 tree classifier인 lightGBM 모델을 사용하여 Risk 종목 분류를 하였습니다
LightGBM 모델들을 Feature selection 결과를 기반으로 Bagging의 아이디어를 활용하여 앙상블 하는 모델을 만들었습니다

In [None]:
import lightgbm as LightGBM

lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.25, 
                               n_estimators=1500,
                               max_depth = 50,
                               min_data_in_leaf = 50,
                               class_weight={True: 5, False: 1}
                              ) 

evals = [(x_train_features, y_train_bool)]
lgbm.fit(x_train_features, y_train_bool, eval_metric='logloss', eval_set=evals)
y_pred = lgbm.predict(x_train_features)

[1]	training's binary_logloss: 0.68593
[2]	training's binary_logloss: 0.680011
[3]	training's binary_logloss: 0.674916
[4]	training's binary_logloss: 0.670883
[5]	training's binary_logloss: 0.667258
[6]	training's binary_logloss: 0.664002
[7]	training's binary_logloss: 0.6611
[8]	training's binary_logloss: 0.658771
[9]	training's binary_logloss: 0.656329
[10]	training's binary_logloss: 0.654196
[11]	training's binary_logloss: 0.652471
[12]	training's binary_logloss: 0.650882
[13]	training's binary_logloss: 0.649265
[14]	training's binary_logloss: 0.64794
[15]	training's binary_logloss: 0.646684
[16]	training's binary_logloss: 0.645487
[17]	training's binary_logloss: 0.644293
[18]	training's binary_logloss: 0.643337
[19]	training's binary_logloss: 0.642276
[20]	training's binary_logloss: 0.64132
[21]	training's binary_logloss: 0.640491
[22]	training's binary_logloss: 0.639765
[23]	training's binary_logloss: 0.63902
[24]	training's binary_logloss: 0.638244
[25]	training's binary_logloss:

In [None]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_train_features)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.78      0.87     63391
        risk       0.45      0.91      0.60     12724

    accuracy                           0.80     76115
   macro avg       0.71      0.84      0.73     76115
weighted avg       0.89      0.80      0.82     76115



In [None]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.87      0.70      0.77     21052
        risk       0.25      0.48      0.33      4344

    accuracy                           0.66     25396
   macro avg       0.56      0.59      0.55     25396
weighted avg       0.76      0.66      0.70     25396

