In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
null_values = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null', '-']
x_train = pd.read_csv('./data/track1/features/x_train_normal.csv', na_values = null_values)
x_valid = pd.read_csv('./data/track1/features/x_valid_normal.csv', na_values = null_values)
x_test = pd.read_csv('./data/track1/features/x_test_normal.csv', na_values = null_values)
y_train = pd.read_csv('./data/track1/features/y_train_normal.csv', na_values = null_values)
y_valid = pd.read_csv('./data/track1/features/y_valid_normal.csv', na_values = null_values)
y_test = pd.read_csv('./data/track1/features/y_test_normal.csv', na_values = null_values)

x_train_features = x_train.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_valid_features = x_valid.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
x_test_features = x_test.drop(columns=['날짜', 'CODE', '종가'], inplace=False)
y_train_bool = y_train['Y'] <-2.0
y_valid_bool = y_valid['Y'] <-2.0
y_test_bool = y_test['Y'] <-2.0

In [3]:
len(list(x_train_features.head()))

22

### 1. Data Loader

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class StockDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x.iloc[idx])
        y = torch.FloatTensor(self.y.iloc[idx])
        return x, y

### 2. MLP Classifier

In [None]:
import torch
import torch.nn as nn
from torch import optim

class Simple_MLP_Net(nn.Module):
    def __init__(self):
        super(Simple_MLP_Net, self).__init__()

        self.layer = nn.Sequential(
            nn.Linear(22, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 32, bias=True),
            nn.ReLU(),
        )
        self.output_layet = nn.Sequential(
            nn.Linear(32, 1, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.layer(x)
        x = self.output_layer(x)
        return x

    def embedding_output(self, x):
        x = self.layer(x)
        return x


In [None]:
y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)
criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)


for epoch in range(100):
    cost = 0.0
    model.train()
    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
        torch.save(model.state_dict(), './history/mlp_net_checkpoint' + str(epoch) +  '.pth')

Epoch :   10, Cost : 0.429
Epoch :   20, Cost : 0.425
Epoch :   30, Cost : 0.421
Epoch :   40, Cost : 0.417
Epoch :   50, Cost : 0.414
Epoch :   60, Cost : 0.410


In [None]:
#from torcheval.metrics import BinaryAccuracy
from torchmetrics.classification import BinaryAccuracy


y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

y_valid_int = pd.DataFrame()
y_valid_int['y'] = y_valid_bool.astype(int)
valid_dataset = StockDataset(x_valid_features, y_valid_int)
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Simple_MLP_Net().to(device)

PATH = '/content/drive/MyDrive/data/mlp_net_checkpoint99.pth'
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint)

criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

model.eval()
total_acc = 0
total_loss = 0
num_batch = 0
for x, y in valid_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        outputs = model(x)
        #pred = (outputs > 0.5).float
        loss = criterion(outputs, y)
        metric = BinaryAccuracy().to(device)
        metric(outputs, y)
        acc = metric.compute()
        total_acc += acc
        total_loss += loss.cpu().item()
        num_batch = num_batch + 1
        
total_acc = total_acc/(num_batch) 
total_loss = total_loss/(num_batch)

print("acc : ", total_acc, "loss : " , total_loss)

### 3. Encoder Decoder

In [5]:
import torch
import torch.nn as nn
from torch import optim

class Encoder_Decoder(nn.Module):
    def __init__(self):
        super(Encoder_Decoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(22, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 32, bias=True),
            nn.ReLU(),
            nn.Linear(32, 16, bias=True),
            nn.Sigmoid(),
            
        )

        self.decoder = nn.Sequential(
            nn.Linear(16, 32, bias=True),
            nn.ReLU(),
            nn.Linear(32, 64, bias=True),
            nn.ReLU(),
            nn.Linear(64, 128, bias=True),
            nn.ReLU(),
            nn.Linear(128, 22, bias=True),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def calcEncoding(self, x):
        return self.encoder(x)

In [7]:
train_dataset = StockDataset(x_train_features, x_train_features)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Encoder_Decoder().to(device)

PATH = './history/embedding_net5_150_checkpoint.pth'
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

criterion = nn.MSELoss(reduction='mean').to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)

for epoch in range(1):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch) % 10 == 0:
        print(f"Epoch : {epoch+1:4d}, Cost : {cost:.3f}")
        #torch.save(model.state_dict(), '/content/drive/MyDrive/data/embedding_net5_' + str(epoch) + '_checkpoint.pth')

#torch.save(model.state_dict(), '/content/drive/MyDrive/data/embedding_net5_100_checkpoint.pth')

Epoch :    1, Cost : 0.003


In [8]:

valid_dataset = StockDataset(x_valid_features, x_valid_features)
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=True, drop_last=True)

y_train_int = pd.DataFrame()
y_train_int['y'] = y_train_bool.astype(int)
train_dataset = StockDataset(x_train_features, y_train_int)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Encoder_Decoder().to(device)

PATH = './history/embedding_net5_150_checkpoint.pth'
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

criterion = nn.MSELoss(reduction='mean').to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)


#model.eval()
total_loss = 0
num_batch = 0
for x, y in valid_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        total_loss += loss
        
total_loss = total_loss / len(valid_dataloader)
print(total_loss)


tensor(0.0023)


In [12]:

import numpy
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Encoder_Decoder().to(device)

PATH = './history/embedding_net5_150_checkpoint.pth'
checkpoint = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint)

x_valid_embedding = x_valid.drop(columns=['종가'], inplace=False)
feature_list = ['BPS', 'PER', 'PBR', 'EPS', 'DIV', 'DPS', '거래량', '시가총액', '금리', '유동자산', 
'비유동자산', '자산총계', '유동부채', '비유동부채', '부채총계', '이익잉여금', '자본총계', '매출액', '영업이익', 
'법인세차감전 순이익', '당기순이익', '자본금']

df = pd.DataFrame()
for row in x_valid_embedding.iterrows():
    with torch.no_grad():
        torch_tensor = torch.tensor(row[1][feature_list].values)
        x = torch_tensor.to(device)
        y = torch_tensor.to(device)

        output = model(x)
        loss = criterion(output, y)

        total_loss += loss

        embedding = model.calcEncoding(x)
        print(type(embedding.numpy()))
        print(type(row[1][['날짜', 'CODE']]))

        break
    
total_loss = total_loss / len(x_valid_embedding)
print(total_loss)

RuntimeError: Numpy is not available

In [16]:
x_valid_embedding = x_valid.drop(columns=['종가'], inplace=False)
print(list(x_valid_embedding.head()))

['BPS', 'PER', 'PBR', 'EPS', 'DIV', 'DPS', '날짜', '거래량', '시가총액', '금리', 'CODE', '유동자산', '비유동자산', '자산총계', '유동부채', '비유동부채', '부채총계', '이익잉여금', '자본총계', '매출액', '영업이익', '법인세차감전 순이익', '당기순이익', '자본금']


In [None]:
from torcheval.metrics import BinaryAccuracy

model.eval()
total_acc = 0
total_loss = 0
num_batch = 0
for x, y in train_dataloader:
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)

        outputs = model(x)
        loss = criterion(outputs, y)
        metric = BinaryAccuracy()
        print(outputs)
        metric(outputs, y)
        acc = metric.compute()
        total_acc += acc
        total_loss += loss.cpu().item()
        num_batch = num_batch + 1
        
total_acc = total_acc/(num_batch) 
total_loss = total_loss/(num_batch)

print(total_acc, total_loss)

tensor([[6.0196e-01, 2.5275e-09, 2.0017e-15,  ..., 1.3709e-01, 3.6216e-03,
         1.5622e-19],
        [3.6322e-05, 1.2717e-12, 4.3412e-24,  ..., 1.3926e-01, 1.4928e-05,
         1.7391e-24],
        [7.4225e-13, 3.2759e-17, 4.1410e-37,  ..., 1.3002e-01, 6.7597e-15,
         5.5540e-30],
        ...,
        [2.7887e-01, 4.4763e-12, 6.3600e-22,  ..., 1.2250e-01, 2.2595e-07,
         6.1145e-27],
        [2.3583e-03, 1.6181e-11, 7.9250e-13,  ..., 3.3921e-04, 4.5342e-04,
         1.0000e+00],
        [3.8308e-01, 4.5908e-10, 3.4905e-14,  ..., 1.1343e-01, 1.0971e-06,
         4.8243e-11]], device='cuda:0')


TypeError: ignored