In [1]:
import os
import tqdm
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import Set
import Utility
import SQLSentence

%load_ext autoreload
%autoreload 1
%aimport Set
%aimport Indicators

In [2]:
# Constants

# Directory that holds the saved model checkpoints.
PATH_CHECKPOINT = 'checkpoint'

# Table identifiers handed to SQLSentence.QuerySQL.
TABLE_TYPE = 'stock.stock_type'
TABLE_CODE = 'stock.stock_code'
TABLE_CODE_TYPE = 'stock.stock_code_type'

In [3]:
# Build Basic Data
connect = Utility.connect_to_database()
main_data = Utility.GetAllData(connect)

# Stock-type ids that appear in the code/type mapping table.
df_stock_type_id = SQLSentence.QuerySQL(TABLE_CODE_TYPE, connect, ['distinct stock_type_id'])
list_stock_type = df_stock_type_id['stock_type_id'].tolist()

# Lookup: stock code -> stock name.
df_code_name = SQLSentence.QuerySQL(TABLE_CODE, connect)
dict_code_name = dict(zip(df_code_name['code'], df_code_name['name']))

# Lookup: stock-type id -> type name.
# The mapping is keyed by the table's `id` column because it is looked up with
# `stock_type_id` values. Dropping `id` and calling `.to_dict()['name']` would key
# the names by the frame's row index instead, which only matches when ids happen
# to equal row positions.
df_type_name = SQLSentence.QuerySQL(TABLE_TYPE, connect)
dict_type_name = dict(zip(df_type_name['id'], df_type_name['name']))

# The intermediate frames are not needed once the lookups exist.
del df_stock_type_id, df_code_name, df_type_name

# Build Environment
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(PATH_CHECKPOINT, exist_ok=True)

myseed = 42069  # set a random seed for reproducibility
# Ask cuDNN for deterministic kernels and disable its auto-tuner so repeated runs
# pick the same algorithms.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

In [5]:
# Dataset
class StockDataset(Dataset):
    """Sliding-window dataset over the feature table of a single stock.

    A sample is ``window_length_train`` consecutive rows of every column to the
    right of ``volume``, flattened into one float32 vector. Its label is taken
    from the ``body_pct`` value of the row that immediately follows the window.

    Args:
        data: DataFrame-like table holding at least ``code`` and ``date``
            columns; after ``Set.Feature`` it must expose ``volume`` and
            ``body_pct`` columns.
        target: Threshold compared against ``body_pct``. When ``target >= 0``
            the label is 1.0 if ``body_pct >= target``; when negative, the
            label is 1.0 if ``body_pct <= target``. Otherwise the label is 0.0.
        window_length_train: Number of consecutive rows in one sample.

    Attributes set by ``__init__``:
        data: 2-D numpy array of the cleaned feature table.
        dim: Length of one flattened sample (feature columns x window length).
    """

    def __init__(self, data, target, window_length_train):
        self.target = target
        self.window_length_train = window_length_train

        # Identifier columns are not model inputs.
        features = data.drop(['code', 'date'], axis=1)
        features = Set.Feature(features)
        # Drop rows with missing values and renumber so windows are contiguous.
        features = features.dropna().reset_index(drop=True)

        self.index_col_volume = features.columns.get_loc("volume") # Get Feature Start Index
        self.index_col_body_pct = features.columns.get_loc("body_pct") # Get Target Index

        self.data = features.to_numpy()
        feature_count = self.data.shape[1] - (self.index_col_volume + 1)
        self.dim = feature_count * self.window_length_train

    def __getitem__(self, index):
        """Return ``(flattened_window, label)`` for the window starting at ``index``.

        Negative indices count from the end, as for Python sequences.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        sample_count = len(self)
        if index < 0:
            index += sample_count
        if not 0 <= index < sample_count:
            raise IndexError(f'index out of range for StockDataset with {sample_count} samples')

        window_end = index + self.window_length_train
        window = self.data[index:window_end, self.index_col_volume+1:]
        data = torch.tensor(window, dtype=torch.float32).flatten()

        # The label comes from the first row after the window.
        next_body_pct = self.data[window_end, self.index_col_body_pct]
        if self.target >= 0:
            reached = next_body_pct >= self.target
        else:
            reached = next_body_pct <= self.target
        label = torch.tensor(int(reached), dtype=torch.float32)
        return data, label

    def __len__(self):
        """Number of complete windows that still have a following label row.

        Clamped at zero: a table shorter than the window would otherwise give a
        negative value, which makes ``len()`` raise ``ValueError``.
        """
        return max(0, len(self.data) - self.window_length_train)

In [6]:
# DataLoader
BATCH_SIZE = 128
SAMPLE_STOCK_CODE = 2316  # stock whose rows build the reference loaders below
TRAIN_RATIO = 0.9  # leading share of the rows used for training

target_pct = 0.02*100 # Target rise percentage value
window_length_train = 20

# Filter the sample stock once and reuse the result for the size and both splits.
data_sample_stock = main_data[main_data['code'] == SAMPLE_STOCK_CODE]

dataset_size = len(data_sample_stock)
train_size = int(TRAIN_RATIO * dataset_size)

# Chronological split: the first `train_size` rows train, the remainder tests.
dataset_train = StockDataset(data_sample_stock[:train_size].copy().reset_index(drop=True), target_pct, window_length_train)
dataset_test = StockDataset(data_sample_stock[train_size:].copy().reset_index(drop=True), target_pct, window_length_train)
datalaoder_train = DataLoader(dataset_train, BATCH_SIZE, shuffle=True)
datalaoder_test = DataLoader(dataset_test, BATCH_SIZE, shuffle=False)

# The loaders keep their own reference to the datasets.
del data_sample_stock, dataset_train, dataset_test

In [7]:
# Model
class Model_1(nn.Module):
    """Fully connected binary classifier that outputs one sigmoid score per sample.

    The layer widths grow as ``input_dim -> 2x -> 4x -> 8x`` and then collapse to a
    single output unit.

    NOTE(review): there is no activation between the ``nn.Linear`` layers, so the
    stack composes to a single affine map followed by the sigmoid. Confirm whether
    that is intended before adding non-linearities, since doing so changes the
    outputs of already-saved checkpoints.

    Args:
        input_dim: Length of one flattened input vector.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Consecutive width pairs define the Linear layers, built in order.
        widths = [input_dim * factor for factor in (1, 2, 4, 8)] + [1]
        linear_layers = [
            nn.Linear(n_in, n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        self.net = nn.Sequential(*linear_layers, nn.Sigmoid())

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` scores in [0, 1]."""
        return self.net(x)

In [None]:
# Train one model per stock type, evaluate it on held-out data, then checkpoint it.
BATCH_SIZE = 128
TRAIN_SPLIT_RATIO = 0.9  # share of each stock's windows used for training

lr = 1e-4
# Size every model from the loader built in the DataLoader cell.
# NOTE(review): assumes Set.Feature yields the same columns for every stock, so
# `dim` is identical across stocks -- confirm.
input_dim = datalaoder_train.dataset.dim
criterion = nn.BCELoss()

for type_id in list_stock_type:
    print(f'Train {dict_type_name[type_id]}')

    list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()

    # Each stock type gets a fresh model and optimizer.
    model = Model_1(input_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.5, 0.999))

    best_test_loss = 10.
    list_dataloader_test = []  # held-out loaders of every stock trained for this type

    for code in tqdm.tqdm(list_unique_code_from_data):
        data_code = main_data[main_data['code'] == code]

        # The split point is derived from the number of windows the full series yields.
        dataset = StockDataset(data_code.copy().reset_index(drop=True), target_pct, window_length_train)
        window_count = len(dataset.data) - window_length_train
        del dataset
        if window_count <= 0:
            continue  # series too short to form a single window
        train_size = int(TRAIN_SPLIT_RATIO * window_count)

        dataset_train = StockDataset(data_code[:train_size].copy().reset_index(drop=True), target_pct, window_length_train)
        dataset_test = StockDataset(data_code[train_size:].copy().reset_index(drop=True), target_pct, window_length_train)

        # A split needs more rows than the window length to yield any sample.
        if len(dataset_train.data) <= window_length_train:
            del dataset_train, dataset_test
            continue
        datalaoder_train = DataLoader(dataset_train, BATCH_SIZE, shuffle=True)
        if len(dataset_test.data) > window_length_train:
            datalaoder_test = DataLoader(dataset_test, BATCH_SIZE, shuffle=False)
            list_dataloader_test.append(datalaoder_test)

        del dataset_train, dataset_test

        # Train: one pass over this stock's training windows.
        model.train()

        for batch_input, batch_label in datalaoder_train:
            optimizer.zero_grad()
            batch_input, batch_label = batch_input.to(device), batch_label.to(device)
            prob = model(batch_input)
            loss = criterion(prob.squeeze(1), batch_label)
            loss.backward()
            optimizer.step()

    # Test: sample-weighted mean loss over the held-out split of every trained stock.
    model.eval()
    loss_sum = 0.
    sample_count = 0
    with torch.no_grad():  # evaluation needs no gradients
        for dataloader_test in list_dataloader_test:
            for batch_input, batch_label in dataloader_test:
                batch_input, batch_label = batch_input.to(device), batch_label.to(device)
                prob = model(batch_input)
                loss = criterion(prob.squeeze(1), batch_label)
                loss_sum += loss.item() * batch_label.size(0)
                sample_count += batch_label.size(0)

    if sample_count == 0:
        print(f'No test data for type {type_id}, model not saved')
        continue

    test_loss = loss_sum / sample_count
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        dir_checkpoint = f'{PATH_CHECKPOINT}/type_{type_id}'
        path_checkpoint = f'{dir_checkpoint}/type_{type_id}_{model.__class__.__name__}_Target_{target_pct}'
        # torch.save does not create missing parent directories.
        os.makedirs(dir_checkpoint, exist_ok=True)
        torch.save(model.state_dict(), path_checkpoint)
        print(f'Loss: {test_loss}, Model has saved in {path_checkpoint}')