##### Dataset Load

In [5]:
import pandas as pd
# Tokens that pandas should read as missing values in both CSV files.
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']

# Test inputs and test targets live in two separate files.
x_test = pd.read_csv('./data/x_test_normal.csv', na_values=null_values)
y_test = pd.read_csv('./data/y_test_normal.csv', na_values=null_values)

# Columns removed before the frame is handed to the models; `drop` returns a
# new frame, so `x_test` itself is left untouched.
non_feature_columns = ['날짜', 'CODE', '종가']
x_test_features = x_test.drop(columns=non_feature_columns)

# Boolean target: True where the 'Y' column is strictly below -2.0.
y_test_bool = y_test['Y'].lt(-2.0)

### 1. Tree-Based Simple Classifiers

##### 1-1 Decision Tree Model

In [4]:
import pickle
import joblib
from sklearn import tree
from sklearn.metrics import classification_report

# The original cell built an unfitted estimator with the settings below and then
# immediately replaced it with the persisted model, so the construction was dead
# code. The settings are kept here for reference only (presumably the training
# configuration - TODO confirm against the training notebook):
#   tree.DecisionTreeClassifier(max_depth=15, min_samples_split=100,
#                               class_weight={True: 10, False: 1})
#
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load model
# files that come from a trusted source.
decisionTree = joblib.load('./models/decisionTree.pkl')

# Evaluate the restored tree on the held-out test features.
y_pred = decisionTree.predict(x_test_features)

# Display names in sorted label order: False -> 'no risk', True -> 'risk'.
target_names = ['no risk', 'risk']
print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.90      0.31      0.46     21040
        risk       0.20      0.83      0.32      4311

    accuracy                           0.40     25351
   macro avg       0.55      0.57      0.39     25351
weighted avg       0.78      0.40      0.43     25351



##### 1-2 Random Forest

In [8]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# The original cell built an unfitted forest with the settings below and then
# immediately replaced it with the persisted model, so the construction was dead
# code. The settings are kept here for reference only (presumably the training
# configuration - TODO confirm against the training notebook):
#   RandomForestClassifier(n_estimators=200, criterion='entropy',
#                          min_samples_split=100, bootstrap=True, max_depth=20,
#                          class_weight={True: 10, False: 1})
#
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load model
# files that come from a trusted source.
rf = joblib.load('./models/randomForest.pkl')

# Evaluate the restored forest on the held-out test features.
y_pred = rf.predict(x_test_features)

# Display names in sorted label order: False -> 'no risk', True -> 'risk'.
target_names = ['no risk', 'risk']
print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.38      0.54     21040
        risk       0.21      0.82      0.34      4311

    accuracy                           0.46     25351
   macro avg       0.56      0.60      0.44     25351
weighted avg       0.79      0.46      0.50     25351



### 2. LightGBM and Weak Bagging

In [None]:
! pip install lightgbm 

##### 2-1 LightGBM

In [11]:
from sklearn.metrics import classification_report
import lightgbm as LightGBM

# The original cell built an unfitted classifier with the settings below, never
# used it, and loaded the persisted LightGBM model into a variable named `rf`,
# silently replacing the random forest from the previous section. The settings
# are kept here for reference only (presumably the training configuration -
# TODO confirm against the training notebook):
#   LightGBM.LGBMClassifier(early_stopping_rounds=100, reg_lambda=0.25,
#                           n_estimators=600, max_depth=50, min_data_in_leaf=50,
#                           class_weight={True: 10, False: 1}, learning_rate=0.1)
#
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load model
# files that come from a trusted source.
lgbm = joblib.load('./models/LightGBM.pkl')

# Evaluate the restored booster on the held-out test features.
y_pred = lgbm.predict(x_test_features)

# Display names in sorted label order: False -> 'no risk', True -> 'risk'.
target_names = ['no risk', 'risk']
print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.40      0.55     21040
        risk       0.21      0.80      0.34      4311

    accuracy                           0.47     25351
   macro avg       0.56      0.60      0.44     25351
weighted avg       0.79      0.47      0.52     25351



##### 2-2 LightGBM and Ensemble

In [24]:
from sklearn.metrics import classification_report
import lightgbm as LightGBM

# Column subsets used to build the per-model feature views of the LightGBM
# ensemble. The list names suggest they came from RFECV and sequential feature
# selection respectively - TODO confirm.
rfecv_feature_list = ['BPS', 'PBR', 'DIV', '거래량', '시가총액', '금리', '자산총계', '이익잉여금', '자본총계']
sfs_feature_list = ['BPS', 'DIV', '거래량', '금리', '비유동자산', '자산총계', '부채총계', '법인세차감전 순이익', '당기순이익']


def make_feature_set(x) :
    """Return the three feature views consumed by the ensemble.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame that contains at least every column named in
        ``rfecv_feature_list`` and ``sfs_feature_list``.

    Returns
    -------
    tuple
        ``(x, x[rfecv_feature_list], x[sfs_feature_list])`` - the frame
        itself (not a copy) followed by the two column subsets.
    """
    return x, x[rfecv_feature_list], x[sfs_feature_list]

# One feature view per ensemble member (whole frame, RFECV subset, SFS subset).
feature_set = make_feature_set(x_test_features)
model = []

## load the persisted ensemble members (nothing is trained in this notebook)
# The original loop also built an unfitted classifier on every iteration and
# discarded it straight away. Its settings are kept here for reference only
# (presumably the training configuration - TODO confirm):
#   LightGBM.LGBMClassifier(early_stopping_rounds=100, reg_lambda=0.25,
#                           n_estimators=600, max_depth=50, min_data_in_leaf=50,
#                           class_weight={True: 10, False: 1}, learning_rate=0.1)
#
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load model
# files that come from a trusted source.
for i in range(len(feature_set)):
    # Member i is stored as ./models/lgbm_ensembles<i>.pkl and is paired with
    # feature_set[i] at prediction time.
    lgbm = joblib.load('./models/lgbm_ensembles' + str(i) + '.pkl')
    model.append(lgbm)

## prediction
def predict_ensemble_model(x_) :
    """Combine the predictions of the three loaded ensemble members.

    Each member ``model[k]`` predicts on the k-th view returned by
    ``make_feature_set(x_)``. The final decision is
    ``pred_0 | (pred_1 & pred_2)``: positive when the first member says so,
    or when the second and third members both do. The operators assume the
    members return boolean (or integer) label arrays - TODO confirm.

    The type of every member is printed as it is used.
    """
    y_pred = []
    for idx, features in enumerate(make_feature_set(x_)):
        member = model[idx]
        print(type(member))
        y_pred.append(member.predict(features))

    first_pred, second_pred, third_pred = y_pred[0], y_pred[1], y_pred[2]
    y_pred_sum = first_pred | (second_pred & third_pred)
    return y_pred_sum

# Score the combined ensemble decision against the boolean test target.
y_pred = predict_ensemble_model(x_test_features)

# Display names in sorted label order: False -> 'no risk', True -> 'risk'.
target_names = ['no risk', 'risk']
ensemble_report = classification_report(y_test_bool, y_pred, target_names=target_names)
print(ensemble_report)

<class 'lightgbm.sklearn.LGBMClassifier'>
<class 'lightgbm.sklearn.LGBMClassifier'>
<class 'lightgbm.sklearn.LGBMClassifier'>
              precision    recall  f1-score   support

     no risk       0.91      0.37      0.52     21040
        risk       0.21      0.82      0.34      4311

    accuracy                           0.45     25351
   macro avg       0.56      0.60      0.43     25351
weighted avg       0.79      0.45      0.49     25351



### 3. Multi Layer Perceptrons

In [None]:
! pip install torch
! pip install torchmetrics

In [26]:
## data loader

import torch
from torch.utils.data import Dataset, DataLoader

class StockDataset(Dataset):
    """Map-style dataset that serves rows of two aligned pandas frames.

    Parameters
    ----------
    x : pandas.DataFrame
        Input features, one sample per row.
    y : pandas.DataFrame
        Targets, one sample per row (same row order as ``x``).

    Each item is a pair of 1-D ``float32`` tensors built from the row at the
    requested position of ``x`` and ``y``.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        # The number of samples is defined by the feature frame.
        return len(self.x)

    @staticmethod
    def _row_to_tensor(row):
        """Convert one pandas row (a Series) into a float32 tensor.

        The row is converted through NumPy explicitly instead of being handed
        to the legacy ``torch.FloatTensor`` constructor. That constructor has
        to treat a Series as a generic Python sequence, which relies on
        integer keys being interpreted as positions on a label-indexed Series
        (deprecated behaviour in recent pandas releases).
        """
        return torch.tensor(row.to_numpy(dtype="float32"))

    def __getitem__(self, idx):
        # Positional lookup, so the frames do not need a 0..n-1 index.
        x = self._row_to_tensor(self.x.iloc[idx])
        y = self._row_to_tensor(self.y.iloc[idx])
        return x, y

##### 3-1 MLP Classifier

In [33]:
import torch
import torch.nn as nn
from torch import optim

class Simple_MLP_Net(nn.Module):
    """Fully connected binary classifier over 22 input features.

    ``layer`` is a ReLU MLP with widths 22 -> 128 -> 128 -> 128 -> 128 -> 64
    -> 32; ``output_layer`` maps the 32-wide embedding to one sigmoid
    probability. Attribute names and the position of every module inside the
    two ``nn.Sequential`` containers define the state-dict keys, so they must
    stay as they are for saved checkpoints to load.
    """

    def __init__(self):
        super().__init__()

        # Consecutive pairs of these widths become Linear layers, each one
        # followed by a ReLU.
        widths = (22, 128, 128, 128, 128, 64, 32)
        hidden = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            hidden.append(nn.Linear(fan_in, fan_out, bias=True))
            hidden.append(nn.ReLU())
        self.layer = nn.Sequential(*hidden)

        head = [nn.Linear(widths[-1], 1, bias=True), nn.Sigmoid()]
        self.output_layer = nn.Sequential(*head)

    def forward(self, x):
        """Return the sigmoid output, one value per input row."""
        return self.output_layer(self.layer(x))

    def embedding_output(self, x):
        """Return the 32-wide hidden representation (output of ``layer``)."""
        return self.layer(x)


In [34]:
#from torcheval.metrics import BinaryAccuracy
from torchmetrics.classification import BinaryAccuracy


# Targets as a one-column integer frame, so every dataset item carries a
# length-1 target that lines up with the single output unit of the network.
y_test_int = pd.DataFrame()
y_test_int['y'] = y_test_bool.astype(int)
valid_dataset = StockDataset(x_test_features, y_test_int)

# shuffle=False: sample order does not matter for evaluation, and shuffling
# together with drop_last=True would discard a different random tail of
# samples on every run, making the reported numbers non-reproducible.
# NOTE(review): drop_last=True still leaves the final partial batch out of the
# metrics. It is kept so that len(valid_dataloader) does not change for other
# cells that may read it; consider evaluating the full set.
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=False, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)

# Restore the trained weights. The checkpoint is read onto the CPU and
# load_state_dict copies the values into the model's own parameters.
PATH = './models/mlp_net_checkpoint19.pth'
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

criterion = nn.BCELoss().to(device)
# A single metric object accumulates over all batches. No optimizer is created
# because this cell only evaluates.
metric = BinaryAccuracy().to(device)

model.eval()
total_loss = 0
num_batch = 0
with torch.no_grad():
    for x, y in valid_dataloader:
        x = x.to(device)
        y = y.to(device)

        outputs = model(x)
        total_loss += criterion(outputs, y).item()
        metric.update(outputs, y)
        num_batch = num_batch + 1

# Accuracy over every evaluated sample; loss averaged over the (equally sized)
# batches.
total_acc = metric.compute()
total_loss = total_loss/(num_batch)

print("acc : ", total_acc, "loss : " , total_loss)

acc :  tensor(0.8299) loss :  0.43261614170941437


##### 3-2 Encoder Decoder

In [35]:
import torch
import torch.nn as nn
from torch import optim

class Encoder_Decoder(nn.Module):
    """Auto-encoder that compresses 22 features to a 16-wide code and back.

    Encoder widths: 22 -> 128 -> 64 -> 32 -> 16, with ReLU between layers and
    a Sigmoid on the code. Decoder widths: 16 -> 32 -> 64 -> 128 -> 22, with
    ReLU between layers and no activation on the reconstruction. Attribute
    names and module positions inside the ``nn.Sequential`` containers define
    the state-dict keys, so they must stay as they are for saved checkpoints
    to load.
    """

    def __init__(self):
        super().__init__()

        encoder_widths = (22, 128, 64, 32, 16)
        encoder_modules = []
        last_step = len(encoder_widths) - 2
        for step, (fan_in, fan_out) in enumerate(zip(encoder_widths, encoder_widths[1:])):
            encoder_modules.append(nn.Linear(fan_in, fan_out, bias=True))
            # The final encoder layer squashes the code with a Sigmoid.
            encoder_modules.append(nn.Sigmoid() if step == last_step else nn.ReLU())
        self.encoder = nn.Sequential(*encoder_modules)

        decoder_widths = (16, 32, 64, 128, 22)
        decoder_modules = []
        for fan_in, fan_out in zip(decoder_widths, decoder_widths[1:]):
            # ReLU goes between Linear layers only; the last Linear is the output.
            if decoder_modules:
                decoder_modules.append(nn.ReLU())
            decoder_modules.append(nn.Linear(fan_in, fan_out, bias=True))
        self.decoder = nn.Sequential(*decoder_modules)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

    def calcEncoding(self, x):
        """Return the 16-wide code produced by the encoder."""
        return self.encoder(x)

In [38]:

# Reconstruction task: the features serve as both input and target.
test_dataset = StockDataset(x_test_features, x_test_features)
# Evaluate every sample exactly once, in a fixed order: no shuffling and no
# dropped partial batch.
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, drop_last=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Encoder_Decoder().to(device)

# Restore the trained weights. The checkpoint is read onto the CPU and
# load_state_dict copies the values into the model's own parameters.
PATH = './models/embedding_net5_150_checkpoint.pth'
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

criterion = nn.MSELoss(reduction='mean').to(device)

model.eval()
total_loss = 0.0
num_samples = 0
with torch.no_grad():
    for x, y in test_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        # The criterion returns a per-batch mean. Weight it by the batch size
        # so a shorter final batch does not skew the overall average.
        batch_size = x.size(0)
        total_loss += criterion(output, y).item() * batch_size
        num_samples += batch_size

# The average uses this cell's own sample count. The original divided by the
# length of `valid_dataloader`, a loader that belongs to a different cell.
total_loss = total_loss / num_samples
print("test loss : " + str(float(total_loss)))


test loss : 0.002253052545711398


### 4. LSTM

In [39]:
!pip install keras
!pip install tensorflow

Collecting keras
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: keras
Successfully installed keras-2.11.0
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.11.0
  Downloading tensorflow_intel-2.11.0-cp39-cp39-win_amd64.whl (266.3 MB)
Collecting termcolor>=1.1.0
  Downloading termcolor-2.1.1-py3-none-any.whl (6.2 kB)
Collecting libclang>=13.0.0
  Downloading libclang-14.0.6-py2.py3-none-win_amd64.whl (14.2 MB)
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.29.0-cp39-cp39-win_amd64.whl (1.5 MB)
Collecting tensorboard<2.12,>=2.11
  Using cached tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
Collecting absl-py>=1.0.0
  Using cached absl

In [42]:
import numpy as np
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _precision_recall_f1(hits, false_alarms, misses):
    """Return (precision, recall, f1) for one class from its confusion counts.

    hits         -- samples of the class that were predicted as the class
    false_alarms -- samples of the other class that were predicted as the class
    misses       -- samples of the class that were predicted as the other class
    Every ratio falls back to 0.0 instead of raising when its denominator is 0.
    """
    precision = _safe_ratio(hits, hits + false_alarms)
    recall = _safe_ratio(hits, hits + misses)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return precision, recall, f1


# Load the validation data. Author's note: it is the 25% of the full data set
# that was not used for training.
#   x_valid: (companies) x (samples per company) x (window size) x (features) = 299*106*10*20
#   y_valid: (companies) x (labels) = 299*106
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load files that come from a trusted source.
x = np.load('./models/valid_info/x_valid.npy', allow_pickle=True)
y = np.load('./models/valid_info/y_valid.npy', allow_pickle=True)

K.clear_session()
# Rebuild the architecture that was used for training so the weights fit.
model = Sequential()
model.add(LSTM(20, input_shape=(10, 20)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()
# Load the trained weights. The original cell saved them straight back to the
# same path; evaluation should not rewrite the checkpoint, so that call is gone.
model.load_weights('./models/valid_info/v3')

results = {'TP' : 0, 'TN' : 0, 'FP' : 0, 'FN' : 0}

for i in range(len(y)):
    # One predict call per entry of x; the model emits one sigmoid score per sample.
    scores = model.predict(x[i], verbose=0)
    # Threshold at 0.5 to obtain the predicted class.
    predicted = np.asarray(scores).reshape(-1) > 0.5
    actual = np.asarray(y[i])[:len(predicted)].astype(bool)

    results['TP'] += int(np.sum(actual & predicted))      # true positive
    results['FN'] += int(np.sum(actual & ~predicted))     # false negative
    results['FP'] += int(np.sum(~actual & predicted))     # false positive
    results['TN'] += int(np.sum(~actual & ~predicted))    # true negative

# Report: confusion counts, then precision / recall / f1 / support per class,
# all computed directly from the counts.
print('\nRESULTS')
print('True Positive\tFalse Negative\tFalse Positive\tTrue Negative')
print('{}\t\t\t\t{}\t\t\t\t{}\t\t\t\t{}'.format(results['TP'], results['FN'], results['FP'],  results['TN']))
print('------------------------------------------------------------')
print('\t\t precision\t\trecall\t\t\tf1 score\t\tsupport')
precision, recall, f1 = _precision_recall_f1(results['TP'], results['FP'], results['FN'])
print('risk    :{:.2f}\t\t\t{:.2f}\t\t\t{:.2f}\t\t\t{}'.format(precision, recall, f1, results['TP'] + results['FN']))
# For the negative class the roles of the counts are mirrored.
precision, recall, f1 = _precision_recall_f1(results['TN'], results['FN'], results['FP'])
print('no risk :{:.2f}\t\t\t{:.2f}\t\t\t{:.2f}\t\t\t{}'.format(precision, recall, f1, results['FP'] + results['TN']))
total = results['TP'] + results['TN'] + results['FP'] + results['FN']
print('accuracy: {}'.format(_safe_ratio(results['TP'] + results['TN'], total)))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 20)                3280      
                                                                 
 dense (Dense)               (None, 128)               2688      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 16)                528       
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 16,849
Trainable params: 16,849
Non-traina