In [1]:
import os
import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import Set
import Utility
import SQLSentence

%load_ext autoreload
%autoreload 1
%aimport Set
%aimport Indicators

In [2]:
# Constants
TABLE_CODE = 'stock.stock_code'
TABLE_TYPE = 'stock.stock_type'
TABLE_CODE_TYPE = 'stock.stock_code_type'

PATH_CHECKPOINT = 'checkpoint'

In [3]:
# Build Basic Data
connect = Utility.connect_to_database()
main_data = Utility.GetAllData(connect)
list_stock_type = (SQLSentence.QuerySQL(TABLE_CODE_TYPE, connect, ['distinct stock_type_id']))['stock_type_id'].tolist()
df_code_name = SQLSentence.QuerySQL(TABLE_CODE, connect)
dict_code_name = dict(zip(df_code_name.code, df_code_name.name))
dict_type_name = ((SQLSentence.QuerySQL(TABLE_TYPE, connect)).drop(['id'], axis=1)).to_dict()['name']

del df_code_name

# Build Environment
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if not os.path.exists(PATH_CHECKPOINT):
    os.makedirs(PATH_CHECKPOINT)

myseed = 42069  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

In [4]:
# Dataset
class StockDataset(Dataset):
    def __init__(self, data, target, window_length_train):
        self.data = data
        self.target = target
        self.window_length_train = window_length_train

        self.data = self.data.drop(['code', 'date'], axis=1)

        self.data = Set.Feature(self.data)

        self.data = self.data.dropna().reset_index(drop=True)

        self.index_col_open = self.data.columns.get_loc("open")
        self.index_col_high = self.data.columns.get_loc("high")
        self.index_col_volume = self.data.columns.get_loc("volume") # Get Feature Start Index

        self.data = self.data.to_numpy()
        self.dim = (self.data.shape[1]-(self.index_col_volume+1))*self.window_length_train

    def __getitem__(self, index):
        data = torch.tensor(self.data[index:index+self.window_length_train, self.index_col_volume+1:], dtype=torch.float32).flatten()
        target = (self.data[index + self.window_length_train, self.index_col_high]  - \
                  self.data[index + self.window_length_train, self.index_col_open]) / \
                  self.data[index + self.window_length_train, self.index_col_open]
        if self.target >= 0:
            label = torch.tensor(int(target >= self.target), dtype=torch.float32)
        else:
            label = torch.tensor(int(target <= self.target), dtype=torch.float32)
        return data, label
    
    def __len__(self):
        return len(self.data) - self.window_length_train

In [5]:
# Model
class Model_1(nn.Module):
    def __init__(self, input_dim):
        super(Model_1, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_dim, input_dim*2),
            nn.Linear(input_dim*2, input_dim*4),
            nn.Linear(input_dim*4, input_dim*8),
            nn.Linear(input_dim*8, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.net(x)
        return x

In [10]:
# Some Functions
def ConcatDataSet(main_data, code_list):
    all_train_data = []
    all_test_data = []

    for code in code_list:
        stock_data = main_data[main_data['code'] == code].copy()
        train_size = int(0.9 * len(stock_data))

        all_train_data.append(stock_data.iloc[:train_size])
        all_test_data.append(stock_data.iloc[train_size:])

    train_data = pd.concat(all_train_data, ignore_index=True)  # 使用 ignore_index=True 来避免重新索引
    test_data = pd.concat(all_test_data, ignore_index=True)
    
    return train_data, test_data

def build_dataloader(main_data, code_list, target, batch_size, window_length_train):
    train_data, test_data = ConcatDataSet(main_data, code_list)
    dataset_train = StockDataset(train_data, target, window_length_train)
    dataset_test = StockDataset(test_data, target, window_length_train)
    datalaoder_train = DataLoader(dataset_train, batch_size, shuffle=True)
    datalaoder_test = DataLoader(dataset_test, batch_size, shuffle=False)

    return datalaoder_train, datalaoder_test

def calculate_error_percentage(outputs, label, threshold=0.5):
    type1_error = 0
    type2_error = 0
    type1_count = 0
    type2_count = 0

    predicted = (outputs.squeeze(1) > threshold).float()

    for i in range(label.size(0)):
        # Type1
        if label[i] == 1:
            type1_count += 1
            if predicted[i] == 0:
                type1_error += 1

        # Type2
        elif label[i] == 0:
            type2_count += 1
            if predicted[i] == 1:
                type2_error += 1
                
    type1_error_percentage = round((type1_error/type1_count)*100, 2) if type1_count > 0 else None
    type2_error_percentage = round((type2_error/type2_count)*100, 2) if type2_count > 0 else None

    return type1_error_percentage, type2_error_percentage

In [7]:
# Get data from single Type
BATCH_SIZE = 128
N_EPOCHS = 10

lr = 1e-4
target_pct = 0.02 # Target rise percentage value
window_length_train = 20

type_id=1

list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()
datalaoder_train, datalaoder_test = build_dataloader(main_data, list_unique_code_from_data, target_pct, BATCH_SIZE, window_length_train)

In [14]:
# Select single Type to adjust the model and the optimizer
model = Model_1(datalaoder_train.dataset.dim).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.5, 0.999))

threshold=0.8
best_test_loss = 1000.

epoch = 0
while epoch < 5:
    epoch+=1
    print(f'Train {dict_type_name[type_id]} Epoch: {epoch}')

    train_loss = 0
    test_loss = 0

    train_correct = 0
    test_correct = 0
    train_total = 0
    test_total = 0

    #accuracy sum
    num_batches = 0
    type1_error_percentage_sum = 0
    type2_error_percentage_sum = 0

    # Train
    model.train()
    for input, label in tqdm.tqdm(datalaoder_train):
        optimizer.zero_grad()

        input, label = input.to(device), label.to(device)
        outputs = model(input)
        loss = criterion(outputs.squeeze(1), label)

        predicted = (outputs.squeeze(1) > threshold).float()
        train_loss+=loss.detach().cpu().item()

        train_correct += (predicted == label).sum().item()
        train_total += label.size(0)

        type1_error_percentage, type2_error_percentage = calculate_error_percentage(outputs, label, threshold)
        if type1_error_percentage is not None:
            type1_error_percentage_sum += type1_error_percentage
        if type2_error_percentage is not None:
            type2_error_percentage_sum += type2_error_percentage
        num_batches += 1

        loss.backward()
        optimizer.step()

    print(f'\nTrain Loss: {round(train_loss/len(datalaoder_train), 6)}')
    print(f"Accuracy: {round((train_correct/train_total)*100 , 2)}%")
    print(f"Type1 Error Ratio: {round(type1_error_percentage_sum/num_batches, 2)}%")
    print(f"Type2 Error Ratio: {round(type2_error_percentage_sum/num_batches, 2)}%\n")

    #accuracy sum
    num_batches = 0
    type1_error_percentage_sum = 0
    type2_error_percentage_sum = 0

    # Test
    model.eval()
    for input, label in datalaoder_test:
        input, label = input.to(device), label.to(device)
        outputs = model(input)
        predicted = (outputs.squeeze(1) > 0.5).float()
        loss = criterion(outputs.squeeze(1), label)
        test_loss+=loss.detach().cpu().item()
        test_total += label.size(0)
        test_correct += (predicted == label).sum().item()

        type1_error_percentage, type2_error_percentage = calculate_error_percentage(outputs, label, threshold)
        if type1_error_percentage is not None:
            type1_error_percentage_sum += type1_error_percentage
        if type2_error_percentage is not None:
            type2_error_percentage_sum += type2_error_percentage
        num_batches += 1

    print(f'Test Loss: {round(test_loss/len(datalaoder_test), 6)}')
    print(f"Accuracy: {round((test_correct/test_total)*100 , 2)}%")
    print(f"Type1 Error Ratio: {round(type1_error_percentage_sum/num_batches, 2)}%")
    print(f"Type2 Error Ratio: {round(type2_error_percentage_sum/num_batches, 2)}%\n")
    
    if test_loss/len(datalaoder_test) < best_test_loss:
        best_test_loss = test_loss/len(datalaoder_test)
        print(f'Model has saved in {PATH_CHECKPOINT}/type_{type_id}/type_{type_id}_{model.__class__.__name__}_Target_{target_pct}\n')
        torch.save(model.state_dict(), f'{PATH_CHECKPOINT}/type_{type_id}/type_{type_id}_{model.__class__.__name__}_Target_{target_pct}')

Train 紡織 Epoch: 1


100%|██████████| 1572/1572 [01:32<00:00, 16.96it/s]



Train Loss: 24.909193
Accuracy: 74.99%
Type1 Error Ratio: 99.99%
Type2 Error Ratio: 0.02%

Test Loss: 12.31049
Accuracy: 87.73%
Type1 Error Ratio: 97.14%
Type2 Error Ratio: 0.0%

Model has saved in checkpoint/type_1/type_1_Model_1_Target_0.02

Train 紡織 Epoch: 2


100%|██████████| 1572/1572 [01:27<00:00, 17.95it/s]



Train Loss: 24.937996
Accuracy: 75.0%
Type1 Error Ratio: 100.0%
Type2 Error Ratio: 0.0%

Test Loss: 12.310434
Accuracy: 87.73%
Type1 Error Ratio: 97.14%
Type2 Error Ratio: 0.0%

Model has saved in checkpoint/type_1/type_1_Model_1_Target_0.02

Train 紡織 Epoch: 3


100%|██████████| 1572/1572 [01:26<00:00, 18.17it/s]



Train Loss: 24.938185
Accuracy: 75.0%
Type1 Error Ratio: 100.0%
Type2 Error Ratio: 0.0%

Test Loss: 12.309627
Accuracy: 87.73%
Type1 Error Ratio: 97.14%
Type2 Error Ratio: 0.0%

Model has saved in checkpoint/type_1/type_1_Model_1_Target_0.02

Train 紡織 Epoch: 4


100%|██████████| 1572/1572 [01:35<00:00, 16.52it/s]



Train Loss: 24.972682
Accuracy: 74.96%
Type1 Error Ratio: 99.89%
Type2 Error Ratio: 0.09%

Test Loss: 12.341837
Accuracy: 87.73%
Type1 Error Ratio: 97.14%
Type2 Error Ratio: 0.0%

Train 紡織 Epoch: 5


100%|██████████| 1572/1572 [01:20<00:00, 19.64it/s]



Train Loss: 24.994447
Accuracy: 75.0%
Type1 Error Ratio: 100.0%
Type2 Error Ratio: 0.0%

Test Loss: 12.341837
Accuracy: 87.73%
Type1 Error Ratio: 97.14%
Type2 Error Ratio: 0.0%



In [None]:
BATCH_SIZE = 128
N_EPOCHS = 10

target_pct = 0.02 # Target rise percentage value
window_length_train = 20

for type_id in list_stock_type[1:2]:
    print(f'Train {dict_type_name[type_id]}')

    list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()
    train_data, test_data = ConcatDataSet(main_data, list_unique_code_from_data)
    datalaoder_train, datalaoder_test = build_dataloader(main_data, list_unique_code_from_data, target_pct, BATCH_SIZE, window_length_train)
    
    lr = 1e-4
    model = Model_1(datalaoder_train.dataset.dim).to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.5, 0.999))

    best_test_loss = 1000.
    epoch = 0
    
    while epoch < N_EPOCHS:
        train_loss = 0
        test_loss = 0

        # Train
        model.train()
        for input, label in tqdm.tqdm(datalaoder_train):
            optimizer.zero_grad()
            input, label = input.to(device), label.to(device)
            logit = model(input)
            loss = criterion(logit.squeeze(1), label)
            loss.backward()
            optimizer.step()
            train_loss+=loss.detach().cpu().item()

        print(f'Train Loss: {train_loss/len(datalaoder_train)}, Epoch: {epoch+1}')

        # Test
        model.eval()
        for input, label in datalaoder_test:
            input, label = input.to(device), label.to(device)
            logit = model(input)
            loss = criterion(logit.squeeze(1), label)
            test_loss+=loss.detach().cpu().item()
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                print(f'Test Loss: {best_test_loss}, Model has saved in {PATH_CHECKPOINT}/type_{type_id}/type_{type_id}_{model.__class__.__name__}_Target_{target_pct}')
                torch.save(model.state_dict(), f'{PATH_CHECKPOINT}/type_{type_id}/type_{type_id}_{model.__class__.__name__}_Target_{target_pct}')

        epoch+=1