In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv', na_values = null_values)
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv', na_values = null_values)
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv', na_values = null_values)
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv', na_values = null_values)
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv', na_values = null_values)
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv', na_values = null_values)

In [13]:
x_train_features = x_train.drop(columns=['날짜', 'CODE'], inplace=False)
x_valid_features = x_valid.drop(columns=['날짜', 'CODE'], inplace=False)
x_test_features = x_test.drop(columns=['날짜', 'CODE'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

In [15]:
x_train_features['시가총액'].mean()

-0.012279785211688932

### 1. Decision Tree

In [7]:
from sklearn import tree
decisionTree = tree.DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    class_weight={True: 10, False: 1}
)
decisionTree.fit(x_train_features, y_train_bool)

In [8]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.96      0.28      0.43     63323
        risk       0.21      0.94      0.34     12790

    accuracy                           0.39     76113
   macro avg       0.58      0.61      0.39     76113
weighted avg       0.83      0.39      0.41     76113



In [9]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_valid_features)
target_names = ['no risk', 'risk']
print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.91      0.26      0.41     21025
        risk       0.20      0.88      0.32      4329

    accuracy                           0.37     25354
   macro avg       0.55      0.57      0.37     25354
weighted avg       0.79      0.37      0.39     25354



### 2. Random Forest 

In [5]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

oversample = RandomOverSampler(random_state = 42)
x_over, y_over = oversample.fit_resample(x_train_features, y_train_bool)
pd.DataFrame(y_over).value_counts()

Y    
False    63323
True     63323
dtype: int64

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200, 
    criterion='log_loss', 
    min_samples_split = 10,
    bootstrap=True,
    max_depth=10,
    class_weight={True: 10, False: 1}
    )
rf.fit(x_train_features, y_train_bool)
# rf.fit(x_over, y_over)

In [23]:
from sklearn.metrics import classification_report

y_pred = rf.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.21      0.34     63323
        risk       0.20      0.98      0.33     12790

    accuracy                           0.33     76113
   macro avg       0.59      0.59      0.33     76113
weighted avg       0.85      0.33      0.34     76113



In [24]:
y_pred = rf.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.93      0.20      0.33     21025
        risk       0.19      0.93      0.32      4329

    accuracy                           0.32     25354
   macro avg       0.56      0.56      0.32     25354
weighted avg       0.81      0.32      0.33     25354



In [25]:
y_pred = rf.predict(x_test_features)
target_names = ['no risk', 'risk']

print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.94      0.20      0.33     21093
        risk       0.19      0.93      0.32      4266

    accuracy                           0.32     25359
   macro avg       0.56      0.57      0.32     25359
weighted avg       0.81      0.32      0.33     25359



### 2. LightGBM

In [26]:
import lightgbm as LightGBM

lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.2, 
                               n_estimators=200,
                               max_depth = 20,
                               class_weight={True: 10, False: 1}
                              ) 

evals = [(x_train_features, y_train_bool)]
lgbm.fit(x_train_features, y_train_bool, eval_metric='logloss', eval_set=evals)
y_pred = lgbm.predict(x_train_features)

[1]	training's binary_logloss: 0.627573
[2]	training's binary_logloss: 0.62186
[3]	training's binary_logloss: 0.616436
[4]	training's binary_logloss: 0.611959
[5]	training's binary_logloss: 0.608186
[6]	training's binary_logloss: 0.604946
[7]	training's binary_logloss: 0.601911
[8]	training's binary_logloss: 0.599425
[9]	training's binary_logloss: 0.597111
[10]	training's binary_logloss: 0.595081
[11]	training's binary_logloss: 0.593307
[12]	training's binary_logloss: 0.591474
[13]	training's binary_logloss: 0.589972
[14]	training's binary_logloss: 0.588516
[15]	training's binary_logloss: 0.587134
[16]	training's binary_logloss: 0.585925
[17]	training's binary_logloss: 0.584636
[18]	training's binary_logloss: 0.583419
[19]	training's binary_logloss: 0.582342
[20]	training's binary_logloss: 0.581361
[21]	training's binary_logloss: 0.580444
[22]	training's binary_logloss: 0.579447
[23]	training's binary_logloss: 0.578487
[24]	training's binary_logloss: 0.577626
[25]	training's binary_log

In [27]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_train_features)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.34      0.51     63323
        risk       0.23      0.97      0.37     12790

    accuracy                           0.45     76113
   macro avg       0.61      0.66      0.44     76113
weighted avg       0.85      0.45      0.49     76113



In [28]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.92      0.33      0.48     21025
        risk       0.21      0.86      0.34      4329

    accuracy                           0.42     25354
   macro avg       0.57      0.60      0.41     25354
weighted avg       0.80      0.42      0.46     25354



### 3. Neural Network

In [4]:
len(list(x_train_features.head()))

23

#### 3.1 Data Loader

In [58]:
import torch
from torch.utils.data import Dataset, DataLoader

class StockDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x.iloc[idx])
        y = torch.FloatTensor(self.y.iloc[idx])
        return x, y



In [61]:
import torch
import torch.nn as nn
from torch import optim

class Simple_MLP_Net(nn.Module):
    def __init__(self):
        super(Simple_MLP_Net, self).__init__()

        self.layer = nn.Sequential(
            nn.Linear(23, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 16, bias=True),
            nn.ReLU(),
            nn.Linear(16, 1, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.layer(x)
        return x


In [59]:
## test code
y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
print(len(train_dataloader))

for epoch in range(100):
    cost = 0.0

    for x, y in train_dataloader:
        print(y)
        break;

594


In [65]:
y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)
criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

model.train()
for epoch in range(100):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
        torch.save(model.state_dict(), './history/mlp_net_checkpoint' + str(epoch) +  '.pth')
    



Epoch :   10, Cost : 0.453
Epoch :   20, Cost : 0.453
Epoch :   30, Cost : 0.453
Epoch :   40, Cost : 0.453
Epoch :   50, Cost : 0.453


KeyboardInterrupt: 

In [None]:
from torcheval.metrics import BinaryAccuracy

model.eval()
total_acc = 0
total_loss = 0
num_batch = 0
for x, y in train_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        outputs = model(x)
        loss = criterion(outputs, y)
        metric = BinaryAccuracy()
        print(outputs)
        metric(outputs, y)
        acc = metric.compute()
        total_acc += acc
        total_loss += loss.cpu().item()
        num_batch = num_batch + 1
        
total_acc = total_acc/(num_batch) 
total_loss = total_loss/(num_batch)

print(total_acc, total_loss)

In [None]:
torch.save(model.state_dict(), './simple_mlp_net_epoch100')

In [None]:
with torch.no_grad():
    model.eval()
    inputs = torch.FloatTensor(
        [[89, 92, 75], [75, 64, 50], [38, 58, 63], [33, 42, 39], [23, 15, 32]]
    ).to(device)
    outputs = model(inputs)

    print("---------")
    print(outputs)
    print(outputs >= torch.FloatTensor([0.5]).to(device))


### Embedding Features for Timeseries Prediction
주가, 금리, 재무정보간의 관계를 포괄적으로 나타내는 피처를 뽑는 것이 목표

#### 1 Encoder-Decoder Model

In [None]:
import torch
import torch.nn as nn
from torch import optim

class Encoder_Decoder(nn.Module):
    def __init__(self):
        super(Simple_MLP_Net, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(23, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 32, bias=True),
            nn.ReLU(),
            nn.Linear(16, 8, bias=True),
            nn.Sigmoid(),
            
        )

        self.decoder = nn.Sequential(
            nn.Linear(8, 16, bias=True),
            nn.ReLU(),
            nn.Linear(32, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 23, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [None]:
train_dataset = StockDataset(x_train_features, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)

criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

for epoch in range(100):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
        torch.save(model.state_dict(), './history/mlp_net_checkpoint' + str(epoch) +  '.pth')