In [None]:
import os
import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import Set
import Utility
import SQLSentence
import Model

%load_ext autoreload
%autoreload 1
%aimport Model
%aimport Set

In [5]:
# Constants
# Table identifiers handed to SQLSentence.QuerySQL in the data-loading cell.
TABLE_CODE = 'stock.stock_code'
TABLE_TYPE = 'stock.stock_type'
TABLE_CODE_TYPE = 'stock.stock_code_type'

# Directory under which model state_dicts are written by the training cells.
PATH_CHECKPOINT = 'checkpoint'

In [6]:
# Build Basic Data
connect = Utility.connect_to_database()
main_data = Utility.GetAllData(connect)
list_stock_type = (SQLSentence.QuerySQL(TABLE_CODE_TYPE, connect, ['distinct stock_type_id']))['stock_type_id'].tolist()
df_code_name = SQLSentence.QuerySQL(TABLE_CODE, connect)
# Lookup: stock code -> stock name (used when printing prediction results).
dict_code_name = dict(zip(df_code_name.code, df_code_name.name))
# Lookup: DataFrame row index -> type name. Later cells index this with type_id.
# NOTE(review): that only lines up if the row index of the query result equals the
# 'id' column that is dropped here — confirm against the stock_type table.
dict_type_name = ((SQLSentence.QuerySQL(TABLE_TYPE, connect)).drop(['id'], axis=1)).to_dict()['name']

# The code/name frame is only needed to build dict_code_name.
del df_code_name

# Build Environment
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# exist_ok=True replaces the separate exists() check: no check-then-create race,
# and the cell can be re-run safely.
os.makedirs(PATH_CHECKPOINT, exist_ok=True)

myseed = 42069  # set a random seed for reproducibility
# Trade cuDNN autotuning for deterministic kernels so runs are repeatable.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

In [7]:
# Dataset
class StockDataset(Dataset):
    """Map-style dataset over flat sample rows whose last element is the label.

    Args:
        data: Indexable collection of rows. Each row is a sequence laid out as
            ``[feature_0, ..., feature_{dim-1}, label]``.

    Attributes:
        data: The rows as passed in (not copied).
        dim: Number of feature values per row (row length minus the label);
            0 when ``data`` is empty.
    """

    def __init__(self, data):
        self.data = data
        # Guard the empty case: data[0] would raise IndexError when there are no samples.
        self.dim = len(self.data[0]) - 1 if len(self.data) > 0 else 0

    def __getitem__(self, index):
        """Return ``(features, label)`` for one row as float32 tensors."""
        row = self.data[index]
        features = torch.tensor(row[:-1], dtype=torch.float32)
        label = torch.tensor(row[-1], dtype=torch.float32)
        return features, label

    def __len__(self):
        """Return the number of sample rows."""
        return len(self.data)

In [8]:
# Some Functions
def process_stock_data(stock_data, target, window_length, predict = False):
    """Turn one stock's rows into flattened feature windows.

    The 'code' and 'date' columns are dropped, ``Set.Feature`` is applied, and
    rows containing NaN are removed. Only the columns located after 'volume'
    are used as model features.

    Args:
        stock_data: DataFrame for a single stock; must contain 'code', 'date',
            'open', 'high' and 'volume' columns.
        target: Threshold on ``(high - open) / open`` of the row that follows a
            window. For ``target >= 0`` the label is 1 when the ratio is at
            least ``target``; for a negative ``target`` the label is 1 when the
            ratio is at most ``target``.
        window_length: Number of consecutive rows flattened into one sample.
        predict: When True, return only the flattened last ``window_length``
            rows, without a label.

    Returns:
        In predict mode, a 1-D array with the features of the trailing window.
        Otherwise a list of 1-D arrays, each holding one flattened window
        followed by its 0/1 label.
    """
    features = Set.Feature(
        stock_data.drop(['code', 'date'], axis=1).reset_index(drop=True)
    ).dropna()

    col_open = features.columns.get_loc("open")
    col_high = features.columns.get_loc("high")
    # Model inputs start at the first column after 'volume'.
    first_feature_col = features.columns.get_loc("volume") + 1

    values = features.to_numpy()

    if predict:
        return values[-window_length:, first_feature_col:].flatten()

    samples = []
    for start in range(len(values) - window_length):
        stop = start + window_length
        flat_window = values[start:stop, first_feature_col:].flatten()

        # The label is derived from the row immediately after the window.
        next_open = values[stop, col_open]
        rise_ratio = (values[stop, col_high] - next_open) / next_open
        reached = rise_ratio >= target if target >= 0 else rise_ratio <= target

        samples.append(np.append(flat_window, int(reached)))

    return samples

def build_dataset(main_data, code_list, target, window_length, train_ratio=0.9):
    """Build pooled train/test sample lists from several stocks.

    Each stock is processed independently with ``process_stock_data`` and its
    samples are split in order: the first ``train_ratio`` share goes to the
    training list and the remainder to the test list.

    Args:
        main_data: DataFrame holding all stocks, with a 'code' column.
        code_list: Iterable of stock codes to include.
        target: Label threshold forwarded to ``process_stock_data``.
        window_length: Window size forwarded to ``process_stock_data``.
        train_ratio: Share of each stock's samples used for training
            (default 0.9).

    Returns:
        ``(all_train_data, all_test_data)``, two lists of sample rows.
    """
    all_train_data = []
    all_test_data = []

    for code in code_list:
        stock_data = main_data[main_data['code'] == code].copy()
        process_data = process_stock_data(stock_data, target, window_length)

        # Split on the number of generated samples, not on the raw row count:
        # processing drops NaN rows and yields len(rows) - window_length samples,
        # so a cut point based on raw rows leaves a test share smaller than
        # intended (possibly empty).
        train_size = int(train_ratio * len(process_data))

        # Note: consecutive windows share rows, so samples next to the cut point
        # still overlap between the two lists.
        all_train_data.extend(process_data[:train_size])
        all_test_data.extend(process_data[train_size:])

    return all_train_data, all_test_data

def build_dataloader(main_data, code_list, target, batch_size, window_length):
    """Create the train and test DataLoaders for a list of stock codes.

    Samples come from ``build_dataset`` and are wrapped in ``StockDataset``.
    Only the training loader shuffles.

    Returns:
        ``(train_loader, test_loader)``.
    """
    train_set, test_set = (
        StockDataset(split)
        for split in build_dataset(main_data, code_list, target, window_length)
    )

    loader_train = DataLoader(train_set, batch_size, shuffle=True)
    loader_test = DataLoader(test_set, batch_size, shuffle=False)
    return loader_train, loader_test

def calculate_correct_count(predicted, label):
    """Count per-class totals and per-class correct predictions for one batch.

    "Type1" refers to samples whose label is 1 and "Type2" to samples whose
    label is 0. Labels with any other value are ignored.

    Args:
        predicted: 1-D tensor of predicted classes (0 or 1).
        label: 1-D tensor of true classes, same length as ``predicted``.

    Returns:
        ``(type1_correct, type2_correct, type1_count, type2_count)`` as
        Python ints.
    """
    # Boolean masks replace the per-element Python loop, which indexed the
    # tensors one value at a time.
    is_type1 = label == 1
    is_type2 = label == 0

    type1_count = int(is_type1.sum().item())
    type2_count = int(is_type2.sum().item())
    type1_correct = int((is_type1 & (predicted == 1)).sum().item())
    type2_correct = int((is_type2 & (predicted == 0)).sum().item())

    return type1_correct, type2_correct, type1_count, type2_count

def predict_result(main_data, code, model, device, target, window_length, threshold, min_period=100):
    """Predict whether one stock's latest window is classified as investable.

    Args:
        main_data: DataFrame holding all stocks, with a 'code' column.
        code: Stock code to evaluate.
        model: Trained ``nn.Module``. It is trained with ``BCEWithLogitsLoss``,
            so its output is treated as a logit here.
        device: Device the input tensor is moved to.
        target: Forwarded to ``process_stock_data`` (unused in predict mode
            apart from being passed through).
        window_length: Number of trailing rows used as the input window.
        threshold: Probability cut-off in [0, 1]; 0.5 corresponds to the
            ``logit > 0`` rule used during training/evaluation.
        min_period: Number of most recent rows of the stock handed to feature
            generation.

    Returns:
        True when the predicted probability exceeds ``threshold``.
    """
    stock_data = main_data[main_data['code'] == code].tail(min_period).copy()
    process_data = process_stock_data(stock_data, target, window_length, predict=True)
    model_input = torch.tensor(process_data, dtype=torch.float32).reshape(1, process_data.shape[0]).to(device)

    # Inference must run in eval mode and without autograd; the previous
    # train/eval mode is restored afterwards so the caller's model is unchanged.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logit = model(model_input).reshape(-1)[0]
    finally:
        model.train(was_training)

    # The raw output is a logit; convert it before comparing with a
    # probability threshold and before reporting it as a probability.
    probability = torch.sigmoid(logit).item()
    is_investable = probability > threshold

    if is_investable:
        print(f'{code} Probability: {probability}')
    return is_investable

In [9]:
# Invoke Model
def invoke_model(mode, model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and print/return summary metrics.

    Args:
        mode: ``'train'`` puts the model in training mode; any other value
            puts it in eval mode. The value is also echoed in the loss line.
        model: Module whose output has shape ``(batch, 1)`` and is interpreted
            as a logit.
        dataloader: Iterable of ``(input, label)`` batches.
        criterion: Loss taking ``(logits, labels)``.
        device: Device each batch is moved to.
        optimizer: When given and ``mode == 'train'``, a parameter update is
            performed for every batch.

    Returns:
        ``(avg_loss, type1_correct_ratio, type2_correct_ratio)``: the mean of
        the per-batch losses, and the percentage of label-1 / label-0 samples
        that were predicted correctly. A ratio is 0.0 when its class did not
        occur; ``avg_loss`` is ``inf`` when the dataloader produced no batches.
    """
    def _percent(numerator, denominator):
        # A class that never occurs in this pass yields 0.0 instead of
        # raising ZeroDivisionError.
        if denominator == 0:
            return 0.0
        return round((numerator/denominator)*100, 2)

    do_step = mode == 'train' and optimizer is not None

    model.train() if mode == 'train' else model.eval()

    total_loss = 0
    correct, accuracy_count = 0, 0
    investable_count, not_investable_count = 0, 0
    actual_investable_count, actual_not_investable_count = 0, 0
    type1_correct, type2_correct = 0, 0
    type1_count, type2_count = 0, 0

    for batch_input, label in tqdm.tqdm(dataloader):
        batch_input, label = batch_input.to(device), label.to(device)

        if do_step:
            optimizer.zero_grad()

        # Only build the autograd graph when a backward pass will follow.
        # This can only switch gradients off; an enclosing no_grad() is respected.
        with torch.set_grad_enabled(do_step and torch.is_grad_enabled()):
            outputs = model(batch_input)
            loss = criterion(outputs.squeeze(1), label)
        total_loss += loss.item()

        # logit > 0 is equivalent to sigmoid(logit) > 0.5.
        predicted = (outputs.squeeze(1) > 0).float()

        if do_step:
            loss.backward()
            optimizer.step()

        correct += (predicted == label).sum().item()
        accuracy_count += label.size(0)

        investable_count += (predicted == 1).sum().item()
        not_investable_count += (predicted == 0).sum().item()

        actual_investable_count += (label == 1).sum().item()
        actual_not_investable_count += (label == 0).sum().item()

        t1_correct, t2_correct, t1_cnt, t2_cnt = calculate_correct_count(predicted, label)
        type1_correct += t1_correct
        type2_correct += t2_correct
        type1_count += t1_cnt
        type2_count += t2_cnt

    # Predicted-investable count / actual investable count.
    print(f'\n預測可投資的次數: {investable_count} / 實際上預測可投資的次數: {actual_investable_count}')
    # Predicted-not-investable count / actual not-investable count.
    print(f'不可投資的次數: {not_investable_count} / 實際上不可投資的次數: {actual_not_investable_count}')

    num_batches = len(dataloader)
    # With no batches the loss is reported as inf so it can never be mistaken
    # for a new best (lowest) loss by the caller.
    avg_loss = round(total_loss/num_batches, 6) if num_batches else float('inf')
    accuracy = _percent(correct, accuracy_count)
    type1_correct_ratio = _percent(type1_correct, type1_count)
    type2_correct_ratio = _percent(type2_correct, type2_count)

    print(f'\n{mode} Loss: {avg_loss}')
    print(f"Accuracy: {accuracy}%")
    print(f"\nType1 Correct Ratio: {type1_correct_ratio}%")
    print(f"Type2 Correct Ratio: {type2_correct_ratio}%\n")

    return avg_loss, type1_correct_ratio, type2_correct_ratio

In [31]:
# Get data from single Type
BATCH_SIZE = 256
N_EPOCHS = 20

# Stock type whose member codes are loaded for this run.
type_id=32

# Label threshold as a fraction (0.02 = 2%), compared against (high - open) / open
# of the row that follows each window.
target_pct = 0.02 # Target rise percentage value
# Number of consecutive rows flattened into one model input sample.
window_length = 20

list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()
# These two loaders are read by the training and prediction cells below.
datalaoder_train, datalaoder_test = build_dataloader(main_data, list_unique_code_from_data, target_pct, BATCH_SIZE, window_length)

In [None]:
# Select single Type to adjust the model and the optimizer
lr = Set.GetInfo('learning_rate')

model = Model.Model_CNN_LSTM(datalaoder_train.dataset.dim).to(device)
type_id_pos_weight = (Set.GetInfo('pos_weight'))[dict_type_name[type_id]]
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([type_id_pos_weight]).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

threshold=0.5
best_test_loss = 1000.
train_avg_loss = 0
test_avg_loss = 0

good_model_predict_count = 1

# Only PATH_CHECKPOINT itself is created earlier in the notebook; the per-type
# sub-directory must exist before torch.save can write into it.
checkpoint_dir = f'{PATH_CHECKPOINT}/type_{type_id}'
os.makedirs(checkpoint_dir, exist_ok=True)
# Common path prefix of every file this cell reads or writes.
checkpoint_stem = f'{checkpoint_dir}/type_{type_id}_{model.__class__.__name__}_Target_{int(target_pct*100)}'

use_checkpoint = False
if use_checkpoint:
    # NOTE(review): nothing in this notebook writes a file with the '_save' suffix;
    # presumably a saved model is renamed by hand before resuming — confirm.
    try:
        model.load_state_dict(torch.load(f"{checkpoint_stem}_save", weights_only=True))
        best_test_loss = Utility.load_best_model_record_txt(type_id, model.__class__.__name__)
    except Exception as err:
        # Resuming is best-effort: fall back to a fresh model, but report why
        # loading failed instead of hiding the cause.
        print(f'No preserved model: {model.__class__.__name__}')
        print(f'Reason: {err}')

for epoch in range(1, N_EPOCHS + 1):
    print(f'Train {dict_type_name[type_id]} Epoch: {epoch}')

    # Train
    train_avg_loss, type1_correct_ratio, type2_correct_ratio = invoke_model('train', model, datalaoder_train, criterion, device, optimizer)

    # Snapshot every epoch whose per-class ratios on the training pass clear both
    # bars; each snapshot gets its own running number.
    if type1_correct_ratio>25.0 and type2_correct_ratio>85.0:
        torch.save(model.state_dict(), f'{checkpoint_stem}_selected{good_model_predict_count}')
        print(f'Model has saved in {checkpoint_stem}_selected{good_model_predict_count}\n')
        good_model_predict_count+=1

    print('--------------------------------------------------------------------------------------------------------------------------------------------')

    # Test
    with torch.no_grad():
        test_avg_loss, _, _ = invoke_model('test', model, datalaoder_test, criterion, device)

    # Keep the weights with the lowest test loss seen so far and record that loss.
    if test_avg_loss < best_test_loss:
        best_test_loss = test_avg_loss
        print(f'Model has saved in {checkpoint_stem}\n')
        torch.save(model.state_dict(), checkpoint_stem)
        Utility.update_model_record_txt(type_id, model.__class__.__name__, best_test_loss)
        print('============================================================================================================================================')

In [None]:
# Predict
type_id=32

target_pct = 0.02
window_length = 20
threshold=0.5

list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()
current_date = (SQLSentence.GetLatestDate(connect)).strftime("%Y/%m/%d")

# The input dimension is read from datalaoder_test, so a cell that builds the
# dataloaders must have been run before this one.
model = Model.Model_CNN_LSTM(datalaoder_test.dataset.dim).to(device)
list_code_investable = []

# NOTE(review): the training cells write files ending in '_selected<N>'; a file
# ending in plain '_selected' is presumably a hand-picked copy — confirm.
model_loaded = False
try:
    model.load_state_dict(torch.load(f"{PATH_CHECKPOINT}/type_{type_id}/type_{type_id}_{model.__class__.__name__}_Target_{int(target_pct*100)}_selected", weights_only=True))
    model_loaded = True
except Exception as err:
    print(f'No preserved model: {model.__class__.__name__}')
    print(f'Reason: {err}')

# Without loaded weights the model is randomly initialised, so its output
# would be meaningless; skip prediction entirely in that case.
if model_loaded:
    model.eval()
    with torch.no_grad():
        for code in list_unique_code_from_data:

            if predict_result(main_data, code, model, device, target_pct, window_length, threshold):
                list_code_investable.append(code)

    if len(list_code_investable) != 0:
        print(f'Current Date: {current_date}')
        for code in list_code_investable:
            print(f'{code} {dict_code_name[code]}')
    else:
        print('No Match stock')
    

In [None]:
# Train all data
BATCH_SIZE = 256
N_EPOCHS = 20

lr = Set.GetInfo('learning_rate')
target_pct = 0.02 # Target rise percentage value
window_length = 20

list_selected_type_id = [0, 4, 7, 8, 13, 17, 18, 24, 25, 26, 27, 30, 31, 32, 35, 37]

# NOTE(review): the [:2] slice limits training to the first two listed types;
# presumably a leftover from a quick trial run — confirm before a full run.
for type_id in list_selected_type_id[:2]:
    print('////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////')
    list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()
    datalaoder_train, datalaoder_test = build_dataloader(main_data, list_unique_code_from_data, target_pct, BATCH_SIZE, window_length)

    model = Model.Model_1(datalaoder_train.dataset.dim).to(device)
    type_id_pos_weight = (Set.GetInfo('pos_weight'))[dict_type_name[type_id]]
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([type_id_pos_weight]).to(device))
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.5, 0.999))

    threshold=0.5
    best_test_loss = 1000.
    train_avg_loss = 0
    test_avg_loss = 0

    good_model_predict_count = 1

    # Only PATH_CHECKPOINT itself is created earlier in the notebook; the per-type
    # sub-directory must exist before torch.save can write into it.
    checkpoint_dir = f'{PATH_CHECKPOINT}/type_{type_id}'
    os.makedirs(checkpoint_dir, exist_ok=True)
    # Common path prefix of every file written for this type.
    checkpoint_stem = f'{checkpoint_dir}/type_{type_id}_{model.__class__.__name__}_Target_{int(target_pct*100)}'

    for epoch in range(1, N_EPOCHS + 1):
        print(f'Train {dict_type_name[type_id]} Epoch: {epoch}')

        # Train
        train_avg_loss, type1_correct_ratio, type2_correct_ratio = invoke_model('train', model, datalaoder_train, criterion, device, optimizer)

        # Snapshot every epoch whose per-class ratios on the training pass clear
        # both bars; each snapshot gets its own running number.
        if type1_correct_ratio>25.0 and type2_correct_ratio>85.0:
            torch.save(model.state_dict(), f'{checkpoint_stem}_selected{good_model_predict_count}')
            print(f'Model has saved in {checkpoint_stem}_selected{good_model_predict_count}\n')
            good_model_predict_count+=1

        print('--------------------------------------------------------------------------------------------------------------------------------------------')

        # Test
        with torch.no_grad():
            test_avg_loss, _, _ = invoke_model('test', model, datalaoder_test, criterion, device)

        # Keep the weights with the lowest test loss seen so far and record that loss.
        if test_avg_loss < best_test_loss:
            best_test_loss = test_avg_loss
            print(f'Model has saved in {checkpoint_stem}\n')
            torch.save(model.state_dict(), checkpoint_stem)
            Utility.update_model_record_txt(type_id, model.__class__.__name__, best_test_loss)
            print('======================================================================================================================================================')
