In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv', na_values = null_values)
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv', na_values = null_values)
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv', na_values = null_values)
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv', na_values = null_values)
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv', na_values = null_values)
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv', na_values = null_values)

In [3]:
x_train_features = x_train.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_valid_features = x_valid.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_test_features = x_test.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

In [4]:
y_train_bool.value_counts()

False    63391
True     12724
Name: Y, dtype: int64

In [4]:
x_train_features['시가총액'].mean()

-0.01272906236005884

### 1. Decision Tree

In [6]:
from sklearn import tree
decisionTree = tree.DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=100,
    class_weight={True: 10, False: 1}
)
decisionTree.fit(x_train_features, y_train_bool)

In [24]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.34      0.51     63391
        risk       0.23      0.96      0.37     12724

    accuracy                           0.45     76115
   macro avg       0.60      0.65      0.44     76115
weighted avg       0.85      0.45      0.48     76115



In [25]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_valid_features)
target_names = ['no risk', 'risk']
print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.90      0.32      0.47     21052
        risk       0.20      0.82      0.32      4344

    accuracy                           0.40     25396
   macro avg       0.55      0.57      0.39     25396
weighted avg       0.78      0.40      0.44     25396



In [7]:
from sklearn.metrics import classification_report

y_pred = decisionTree.predict(x_test_features)
target_names = ['no risk', 'risk']
print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.90      0.31      0.46     21040
        risk       0.20      0.83      0.32      4311

    accuracy                           0.40     25351
   macro avg       0.55      0.57      0.39     25351
weighted avg       0.78      0.40      0.43     25351



### 2. Random Forest 

In [26]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

oversample = RandomOverSampler(random_state = 42)
x_over, y_over = oversample.fit_resample(x_train_features, y_train_bool)
pd.DataFrame(y_over).value_counts()

Y    
False    63391
True     63391
dtype: int64

In [27]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200, 
    criterion='log_loss', 
    min_samples_split = 10,
    bootstrap=True,
    max_depth=10,
    class_weight={True: 10, False: 1}
    )
rf.fit(x_train_features, y_train_bool)
# rf.fit(x_over, y_over)

In [28]:
from sklearn.metrics import classification_report

y_pred = rf.predict(x_train_features)
target_names = ['no risk', 'risk']
print(classification_report(y_train_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.19      0.32     63391
        risk       0.20      0.98      0.33     12724

    accuracy                           0.32     76115
   macro avg       0.59      0.59      0.32     76115
weighted avg       0.85      0.32      0.32     76115



In [29]:
y_pred = rf.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.95      0.19      0.31     21052
        risk       0.19      0.95      0.32      4344

    accuracy                           0.32     25396
   macro avg       0.57      0.57      0.32     25396
weighted avg       0.82      0.32      0.31     25396



In [30]:
y_pred = rf.predict(x_test_features)
target_names = ['no risk', 'risk']

print(classification_report(y_test_bool, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     no risk       0.94      0.18      0.31     21040
        risk       0.19      0.95      0.32      4311

    accuracy                           0.31     25351
   macro avg       0.57      0.56      0.31     25351
weighted avg       0.81      0.31      0.31     25351



### 2. LightGBM

In [8]:
import lightgbm as LightGBM

lgbm = LightGBM.LGBMClassifier(early_stopping_rounds=100,
                               reg_lambda = 0.2, 
                               n_estimators=200,
                               max_depth = 20,
                               class_weight={True: 10, False: 1}
                              ) 

evals = [(x_train_features, y_train_bool)]
lgbm.fit(x_train_features, y_train_bool, eval_metric='logloss', eval_set=evals)
y_pred = lgbm.predict(x_train_features)

[1]	training's binary_logloss: 0.628962
[2]	training's binary_logloss: 0.623168
[3]	training's binary_logloss: 0.61813
[4]	training's binary_logloss: 0.613869
[5]	training's binary_logloss: 0.610403
[6]	training's binary_logloss: 0.607455
[7]	training's binary_logloss: 0.604671
[8]	training's binary_logloss: 0.602336
[9]	training's binary_logloss: 0.600028
[10]	training's binary_logloss: 0.598019
[11]	training's binary_logloss: 0.596075
[12]	training's binary_logloss: 0.59446
[13]	training's binary_logloss: 0.592961
[14]	training's binary_logloss: 0.591668
[15]	training's binary_logloss: 0.590434
[16]	training's binary_logloss: 0.589201
[17]	training's binary_logloss: 0.587986
[18]	training's binary_logloss: 0.586928
[19]	training's binary_logloss: 0.586022
[20]	training's binary_logloss: 0.585056
[21]	training's binary_logloss: 0.5842
[22]	training's binary_logloss: 0.583317
[23]	training's binary_logloss: 0.582509
[24]	training's binary_logloss: 0.581751
[25]	training's binary_loglos

In [9]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_train_features)
target_names = ['no risk', 'risk']

print(classification_report(y_train_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.98      0.33      0.50     63391
        risk       0.23      0.97      0.37     12724

    accuracy                           0.44     76115
   macro avg       0.60      0.65      0.43     76115
weighted avg       0.86      0.44      0.48     76115



In [10]:
from sklearn.metrics import classification_report

y = lgbm.predict(x_valid_features)
target_names = ['no risk', 'risk']

print(classification_report(y_valid_bool, y, target_names=target_names))

              precision    recall  f1-score   support

     no risk       0.93      0.32      0.47     21052
        risk       0.21      0.88      0.34      4344

    accuracy                           0.41     25396
   macro avg       0.57      0.60      0.40     25396
weighted avg       0.80      0.41      0.45     25396



### 3. Neural Network

#### 3.1 Data Loader

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class StockDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x.iloc[idx])
        y = torch.FloatTensor(self.y.iloc[idx])
        return x, y



In [27]:
import torch
import torch.nn as nn
from torch import optim

class Simple_MLP_Net(nn.Module):
    def __init__(self):
        super(Simple_MLP_Net, self).__init__()

        self.layer = nn.Sequential(
            nn.Linear(23, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 32, bias=True),
            nn.ReLU()
        )
        self.output_layer = nn.Sequential(
            nn.Linear(32, 1, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.layer(x)
        out = self.output_layer(x)
        return out

    def embedding_output(self, x):
        x = self.layer(x)
        return x


In [7]:
## test code
y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
print(len(train_dataloader))

for epoch in range(100):
    cost = 0.0

    for x, y in train_dataloader:
        print(y)
        break;

594
tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
  

  self.handle: torch.Tensor = torch.zeros(1)


tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
      

In [28]:
y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)

#PATH = './history/mlp_net_checkpoint99.pth'
#checkpoint = torch.load(PATH)
#model.load_state_dict(checkpoint)

criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

model.train()
for epoch in range(20):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
        
torch.save(model.state_dict(), './history/mlp_net_checkpoint' + str(epoch) +  '.pth')
    



Epoch :   10, Cost : 0.427
Epoch :   20, Cost : 0.422


In [38]:
y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False, drop_last=False) ## not suffle\

model.eval()
total_output = torch.tensor

for x, y in train_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        outputs = model.embedding_output(x)
        #total_output.append(outputs)
        torch.cat([total_output, outputs])

TypeError: expected Tensor as element 0 in argument 0, but got builtin_function_or_method

In [34]:
print(total_output)

[tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.8578, 1.3903],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.6837, 1.0559],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.8812, 1.4364],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.6861, 1.0608],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.3573, 2.3382],
        [0.0045, 0.0000, 0.0000,  ..., 0.0000, 0.5503, 0.8188]]), tensor([[0.0296, 0.0000, 0.0000,  ..., 0.0000, 0.4906, 0.6783],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.6037, 0.9061],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0515, 1.7587],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.7799, 1.2386],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.7444, 1.1715],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5760, 0.8524]]), tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.7136, 1.1133],
        [0.0386, 0.0000, 0.0000,  ..., 0.0000, 0.4628, 0.6354],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.1907, 2.0284],
        .

In [None]:
from torcheval.metrics import BinaryAccuracy

model.eval()
total_acc = 0
total_loss = 0
num_batch = 0
for x, y in train_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        outputs = model(x)
        loss = criterion(outputs, y)
        metric = BinaryAccuracy()
        metric(outputs, y)
        acc = metric.compute()
        total_acc += acc
        total_loss += loss.cpu().item()
        num_batch = num_batch + 1
        
total_acc = total_acc/(num_batch) 
total_loss = total_loss/(num_batch)

print(total_acc, total_loss)

In [None]:
torch.save(model.state_dict(), './simple_mlp_net_epoch100')

In [None]:
with torch.no_grad():
    model.eval()
    inputs = torch.FloatTensor(
        [[89, 92, 75], [75, 64, 50], [38, 58, 63], [33, 42, 39], [23, 15, 32]]
    ).to(device)
    outputs = model(inputs)

    print("---------")
    print(outputs)
    print(outputs >= torch.FloatTensor([0.5]).to(device))


### Embedding Features for Timeseries Prediction
주가, 금리, 재무정보간의 관계를 포괄적으로 나타내는 피처를 뽑는 것이 목표

#### 1 Encoder-Decoder Model

In [1]:
import torch
import torch.nn as nn
from torch import optim

class Encoder_Decoder(nn.Module):
    def __init__(self):
        super(Encoder_Decoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(23, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 32, bias=True),
            nn.ReLU(),
            nn.Linear(32, 16, bias=True),
            nn.Sigmoid(),
            #nn.Linear(16, 8, bias=True),
            #nn.Sigmoid()
        )

        self.decoder = nn.Sequential(
            #nn.Linear(8, 16, bias=True),
            #nn.ReLU(),
            nn.Linear(16, 32, bias=True),
            nn.ReLU(),
            nn.Linear(32, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 23, bias=True),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [None]:
train_dataset = StockDataset(x_train_features, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)

criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

for epoch in range(100):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
        torch.save(model.state_dict(), './history/mlp_net_checkpoint' + str(epoch) +  '.pth')

#### encoder decoder validation

In [15]:

valid_dataset = StockDataset(x_valid_features, x_valid_features)
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=True, drop_last=True)


train_dataset = StockDataset(x_train_features, x_train_features)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Encoder_Decoder().to(device)

PATH = './history/embedding_net5_30_checkpoint.pth'
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

criterion = nn.MSELoss(reduction='mean').to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

model.eval()
total_loss = 0
num_batch = 0
for x, y in train_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        total_loss += loss

total_loss = total_loss / len(valid_dataloader)
print(total_loss)


tensor(3.1429)
