In [3]:
import os
import tqdm
import numpy as np
import math
import gc

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import Set
import Utility
import SQLSentence
import Model
from models import FullyConnected, TCN, CNNLSTM

In [2]:
# Build Environment
# Open the database connection and load the working tables into memory.
connect = Utility.connect_to_database()
(
    main_data,
    list_stock_type,
    dict_code_name,
    dict_type_name,
) = Utility.fetch_data_from_db()

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Constants
PATH_CHECKPOINT = 'checkpoint'

# Make sure the checkpoint directory exists before anything tries to use it.
if not os.path.exists(PATH_CHECKPOINT):
    os.makedirs(PATH_CHECKPOINT)

# Fixed seed so repeated runs start from the same random state.
myseed = 42069

# Trade cuDNN autotuning for deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

torch.manual_seed(myseed)
np.random.seed(myseed)
# `device` is 'cuda' exactly when CUDA was reported available above.
if device == 'cuda':
    torch.cuda.manual_seed_all(myseed)

In [3]:
# Some Functions

# Dataset
class StockDataset(Dataset):
    """Map-style dataset that pairs each sample with its label.

    `data` and `label` are indexed with the same integer index and must have
    the same length. Items are converted to float32 tensors on access, not
    up front.
    """

    def __init__(self, data, label):
        self.data, self.label = data, label
        # Kept as an attribute because callers in this notebook pass
        # `dataset.shape` to the model constructor.
        self.shape = data.shape

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = torch.tensor(self.data[index], dtype=torch.float32)
        target = torch.tensor(self.label[index], dtype=torch.float32)
        return sample, target
    
# Invoke Model
def invoke_model(mode, model, dataloader, criterion, device, optimizer=None, threshold=0.0, show_progress=False):
    """Run one pass of `model` over `dataloader` and report loss and binary metrics.

    Args:
        mode: 'train' switches the model to training mode and, when an
            optimizer is given, updates the weights after every batch. Any
            other value switches the model to eval mode and disables autograd.
        model: module called as `model(batch)`. Its output is squeezed on
            dim 1 before it is compared with the labels.
        dataloader: iterable of `(input, label)` batches with 0/1 labels.
        criterion: loss called as `criterion(output.squeeze(1), label)`.
        device: device the batches (and outputs) are moved to.
        optimizer: optimizer stepped in 'train' mode; with None no weights
            are updated even in 'train' mode.
        threshold: an output strictly greater than this value counts as a
            positive prediction. It is applied to the raw model output.
        show_progress: wrap the batches in a tqdm progress bar.

    Returns:
        `(avg_loss, accuracy, precision, recall)`. `avg_loss` is the mean of
        the per-batch losses rounded to 6 decimals; the other three are
        percentages rounded to 2 decimals. Precision and recall are 0.0 when
        their denominator is zero. If the dataloader yields no batch the
        result is `(nan, 0.0, 0.0, 0.0)`, so a `loss < best` comparison in
        the caller is False.
    """
    is_training = mode == 'train'
    if is_training:
        model.train()
    else:
        model.eval()

    do_update = is_training and optimizer is not None
    batches = tqdm.tqdm(dataloader) if show_progress else dataloader

    total_loss = 0.0
    batch_count = 0
    total_count, correct_count = 0, 0
    true_positive, false_positive, false_negative = 0, 0, 0

    # Evaluation never needs an autograd graph. In training the caller's grad
    # mode is respected, so wrapping a train call in no_grad() still behaves
    # as it did before.
    with torch.set_grad_enabled(is_training and torch.is_grad_enabled()):
        for inputs, label in batches:
            inputs, label = inputs.to(device), label.to(device)

            if do_update:
                optimizer.zero_grad()

            outputs = model(inputs).to(device).squeeze(1)
            loss = criterion(outputs, label)
            total_loss += loss.item()
            batch_count += 1

            if do_update:
                loss.backward()
                optimizer.step()

            # Metrics use the outputs computed before this batch's weight update.
            predicted = outputs.detach() > threshold
            label = label.to(torch.bool)

            total_count += label.size(0)
            correct_count += (label == predicted).sum().item()

            # TP: label 1, predicted 1
            true_positive += (label & predicted).sum().item()
            # FP: label 0, predicted 1
            false_positive += (~label & predicted).sum().item()
            # FN: label 1, predicted 0
            false_negative += (label & ~predicted).sum().item()

    # An empty loader used to fail with ZeroDivisionError.
    if batch_count == 0 or total_count == 0:
        return float('nan'), 0.0, 0.0, 0.0

    avg_loss = round(total_loss / batch_count, 6)
    accuracy = round((correct_count / total_count) * 100, 2)

    # Precision: TP / (TP + FP)
    predicted_positive = true_positive + false_positive
    precision = 0.0 if predicted_positive == 0 else round((true_positive / predicted_positive) * 100, 2)

    # Recall: TP / (TP + FN)
    actual_positive = true_positive + false_negative
    recall = 0.0 if actual_positive == 0 else round((true_positive / actual_positive) * 100, 2)

    return avg_loss, accuracy, precision, recall

# Invoke Model with single stock data
def invoke_model_single_stock(mode, model, dataloader, criterion, device, optimizer=None, threshold=0.0):
    """Same pass as `invoke_model`, with a tqdm progress bar over the batches.

    Takes the same arguments and returns the same
    `(avg_loss, accuracy, precision, recall)` tuple.
    """
    return invoke_model(mode, model, dataloader, criterion, device, optimizer, threshold, show_progress=True)

def process_stock_data(stock_data, target, window_length, predict = False):
    """Turn one stock's rows into model windows (and labels when training).

    The 'code' and 'date' columns are dropped, `Set.Feature` is applied and
    rows containing NaN are removed. Only the columns to the right of
    'volume' are used as model input.

    With `predict=True` the last `window_length` rows of those columns are
    returned as a single 2-D array and `target` is not used.

    Otherwise returns `(windows, labels)`. Each window covers `window_length`
    consecutive rows and its label is taken from the row immediately after it:
      * target >= 0: 1 when (high - open) / open >= target, else 0
      * target <  0: 1 when (low  - open) / open <= target, else 0
    """
    frame = stock_data.drop(['code', 'date'], axis=1).reset_index(drop=True)

    # Positions are looked up before feature engineering.
    # NOTE(review): this assumes Set.Feature keeps these columns in place and
    # adds its features to the right of 'volume' - confirm.
    col_open = frame.columns.get_loc("open")
    col_high = frame.columns.get_loc("high")
    col_low = frame.columns.get_loc("low")
    col_volume = frame.columns.get_loc("volume")

    values = Set.Feature(frame).dropna().to_numpy()
    first_feature_col = col_volume + 1

    if predict:
        return values[-window_length:, first_feature_col:]

    # A non-negative target looks at the high of the next row, a negative one
    # at the low.
    upside = target >= 0
    extreme_col = col_high if upside else col_low

    windows, labels = [], []
    for start in range(len(values) - window_length):
        next_row = start + window_length
        open_price = values[next_row, col_open]
        move = (values[next_row, extreme_col] - open_price) / open_price

        if upside:
            hit = move >= target
        else:
            hit = move <= target

        windows.append(values[start:next_row, first_feature_col:])
        labels.append(int(hit))

    return np.array(windows), np.array(labels)

def build_dataset(main_data, code_list, target, window_length):
    """Build pooled train/test arrays from the stocks in `code_list`.

    Each stock is windowed on its own with `process_stock_data`. The first
    90% of its windows (in row order) go to the training set and the rest to
    the test set, then the per-stock pieces are pooled.

    Returns:
        `(train_data, test_data, train_label, test_label)` as numpy arrays.
    """
    pooled_train_x, pooled_test_x = [], []
    pooled_train_y, pooled_test_y = [], []

    for code in code_list:
        rows_for_code = main_data[main_data['code'] == code].copy()
        windows, labels = process_stock_data(rows_for_code, target, window_length)

        # Split point for this stock.
        cut = int(0.9 * len(windows))

        pooled_train_x.extend(windows[:cut])
        pooled_train_y.extend(labels[:cut])

        pooled_test_x.extend(windows[cut:])
        pooled_test_y.extend(labels[cut:])

    pooled = (pooled_train_x, pooled_test_x, pooled_train_y, pooled_test_y)
    return tuple(np.array(part) for part in pooled)

def build_dataloader(main_data, code_list, config):
    """Create the train and test DataLoaders for the given stock codes.

    Reads 'target_pct', 'batch_size' and 'window_length' from `config`.
    The training loader shuffles its samples; the test loader keeps them in
    order.

    Returns:
        `(train_loader, test_loader)`.
    """
    target, batch_size, window_length = (
        config[key] for key in ('target_pct', 'batch_size', 'window_length')
    )

    train_x, test_x, train_y, test_y = build_dataset(main_data, code_list, target, window_length)

    train_loader = DataLoader(StockDataset(train_x, train_y), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(StockDataset(test_x, test_y), batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

def predict_result(main_data, code, model, device, target, window_length, min_period=100):
    """Score the most recent window of one stock with `model`.

    Args:
        main_data: frame with a 'code' column holding every stock's rows.
        code: stock code to score.
        model: module called on a batch of one window. This function does not
            change its train/eval mode, so the caller should set it.
        device: device the input window is moved to.
        target: passed through to `process_stock_data`, which does not use it
            in predict mode.
        window_length: number of rows in the input window.
        min_period: how many of the stock's latest rows are handed to feature
            engineering before the window is cut.

    Returns:
        The first model output, rounded to 4 decimals.
        NOTE(review): this is the raw output; whether it is a probability or
        a logit depends on the model - confirm before thresholding.
    """
    stock_data = main_data[main_data['code'] == code].tail(min_period).copy()
    window = process_stock_data(stock_data, target, window_length, predict=True)
    batch = torch.tensor(window, dtype=torch.float32).unsqueeze(0).to(device)

    # Inference only: skip building an autograd graph for the forward pass.
    with torch.no_grad():
        output = model(batch)

    return round(output[0].cpu().item(), 4)

def GetDatasetShape(main_data, window_length, code=1101, min_period=100):
    """Return the shape of a single-sample model input batch.

    The latest `min_period` rows of stock `code` are run through the same
    steps used for prediction: drop 'code' and 'date', apply `Set.Feature`,
    drop NaN rows, keep the columns to the right of 'volume' and take the
    last `window_length` rows. The result is that window's shape with a
    leading batch dimension of 1.
    """
    recent_rows = main_data[main_data['code'] == code].tail(min_period).copy()
    recent_rows = recent_rows.drop(['code', 'date'], axis=1).reset_index(drop=True)

    features = Set.Feature(recent_rows).dropna()

    # Unlike process_stock_data, 'volume' is located after feature engineering.
    first_feature_col = features.columns.get_loc("volume") + 1

    window = features.to_numpy()[-window_length:, first_feature_col:]
    return (1, *window.shape)

In [None]:
# Select a single industry type to tune the model and the optimizer
dict_model = {
    0: CNNLSTM,
    1: TCN,
    2: FullyConnected,
}

model_type_id = 1
stock_industry_type_id = 0
config_model = {
    'n_epochs': 1,
    'batch_size': 256,
    'version': 4,
    'threshold': 0.0,
    'target_pct': 0.02,
    'window_length':20,
    'scale_weight': 0.8,
    'learning_rate': 1e-4,
    'weight_decay': 1e-5,
    'use_checkpoint': False,
}

list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(stock_industry_type_id, connect))['code'].tolist()
datalaoder_train, datalaoder_test = build_dataloader(main_data, list_unique_code_from_data, config_model)

model = dict_model[model_type_id](datalaoder_train.dataset.shape).to(device)

# Positive-class weight: (ln(w) + 1) * scale_weight, where w is the value
# stored under 'pos_weight' for this target and industry type.
current_pos_weight = (math.log((Set.GetInfo('pos_weight'))[config_model['target_pct']][dict_type_name[stock_industry_type_id]])+1)*config_model['scale_weight']
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([current_pos_weight]).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=config_model['learning_rate'], weight_decay=config_model['weight_decay'])

path_model_records = 'models/model_records/model_records.csv'
path_model_checkpoint = f'models/checkpoint/type_{stock_industry_type_id}/type_{stock_industry_type_id}_{model.get_model_name()}_Target_{int(config_model["target_pct"]*100)}_v{str(config_model["version"])}'

# Only a run that beats the loss already on record overwrites the "best" checkpoint.
# NOTE(review): presumably the helper returns the best recorded test loss for
# this type/target/model/version - confirm what it returns when no record exists.
best_test_loss = Utility.load_best_model_record_txt(stock_industry_type_id, config_model['target_pct'], model.get_model_name(), config_model['version'], path_model_records)

if config_model['use_checkpoint']:
    try:
        postfix='save'
        model.load(f"{path_model_checkpoint}_{postfix}")
    except Exception as err:
        # Training continues from scratch, but the reason is shown, and
        # Ctrl-C is no longer swallowed as it was by the bare `except:`.
        print(f'No preserved model: {model.get_model_name()} ({err})')

for epoch in range(1, config_model['n_epochs'] + 1):
    print(f'Train {dict_type_name[stock_industry_type_id]} Epoch: {epoch}')

    # Train
    train_loss, train_acc, train_precision, train_recall = invoke_model_single_stock('train', model, datalaoder_train, criterion, device, optimizer, config_model['threshold'])
    print(f'Loss: {train_loss}, Accuracy: {train_acc}%, Precision: {train_precision}%, Recall: {train_recall}%\n')

    # Test
    with torch.no_grad():
        test_loss, test_acc, test_precision, test_recall = invoke_model_single_stock('test', model, datalaoder_test, criterion, device, threshold=config_model['threshold'])
    print(f'Loss: {test_loss}, Accuracy: {test_acc}%, Precision: {test_precision}%, Recall: {test_recall}%\n')

    # Save the model with the best test loss using postfix: best
    if test_loss < best_test_loss:
        best_test_loss = test_loss

        record = {
            'type_id': stock_industry_type_id,
            'target_pct': config_model['target_pct'],
            'model_name': model.get_model_name(),
            'version': config_model['version'],
            'test_loss': test_loss,
            'test_acc': test_acc,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'train_precision': train_precision,
            'train_recall': train_recall,
        }

        # Report success only after the checkpoint and record are written.
        model.save(f'{path_model_checkpoint}_best')
        Utility.update_model_record_txt(record, path_model_records)
        print(f'Model has saved in {path_model_checkpoint}_best\n')
        print(f'Loss: {test_loss}, Accuracy: {test_acc}%, Precision: {test_precision}%, Recall: {test_recall}%\n')

# Save the last epoch model using postfix: save
model.save(f'{path_model_checkpoint}_save')
print(f'Model has saved in {path_model_checkpoint}_save\n')

# del datalaoder_train, datalaoder_test

In [None]:
# Train one model per selected industry type.
dict_model = {
    0: CNNLSTM,
    1: TCN,
    2: FullyConnected,
}

model_type_id = 2
config_model = {
    'n_epochs': 30,
    'batch_size': 256,
    'version': 4,
    'threshold': 0.0,
    'target_pct': -0.02,
    'window_length':20,
    'scale_weight': 0.8,
    'learning_rate': 1e-4,
    'weight_decay': 1e-5,
    'use_checkpoint': False,
}

list_selected_type_id = Set.GetInfo('select_stock_list')

# Same records file for every industry type, so set it once.
path_model_records = 'models/model_records/model_records.csv'

for stock_industry_type_id in list_selected_type_id:
    list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(stock_industry_type_id, connect))['code'].tolist()
    datalaoder_train, datalaoder_test = build_dataloader(main_data, list_unique_code_from_data, config_model)

    model = dict_model[model_type_id](datalaoder_train.dataset.shape).to(device)

    # Positive-class weight: (ln(w) + 1) * scale_weight, where w is the value
    # stored under 'pos_weight' for this target and industry type.
    current_pos_weight = (math.log((Set.GetInfo('pos_weight'))[config_model['target_pct']][dict_type_name[stock_industry_type_id]])+1)*config_model['scale_weight']
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([current_pos_weight]).to(device))
    optimizer = torch.optim.Adam(model.parameters(), lr=config_model['learning_rate'], weight_decay=config_model['weight_decay'])

    path_model_checkpoint = f'models/checkpoint/type_{stock_industry_type_id}/type_{stock_industry_type_id}_{model.get_model_name()}_Target_{int(config_model["target_pct"]*100)}_v{str(config_model["version"])}'

    # Only a run that beats the loss already on record overwrites the "best" checkpoint.
    # NOTE(review): presumably the helper returns the best recorded test loss -
    # confirm what it returns when no record exists.
    best_test_loss = Utility.load_best_model_record_txt(stock_industry_type_id, config_model['target_pct'], model.get_model_name(), config_model['version'], path_model_records)

    if config_model['use_checkpoint']:
        try:
            postfix='save'
            model.load(f"{path_model_checkpoint}_{postfix}")
        except Exception as err:
            # Training continues from scratch, but the reason is shown, and
            # Ctrl-C is no longer swallowed as it was by the bare `except:`.
            print(f'No preserved model: {model.get_model_name()} ({err})')

    with tqdm.tqdm(total=config_model['n_epochs'], desc=f'Train {dict_type_name[stock_industry_type_id]}') as pbar:
        for epoch in range(1, config_model['n_epochs'] + 1):
            # Train
            train_loss, train_acc, train_precision, train_recall = invoke_model('train', model, datalaoder_train, criterion, device, optimizer, config_model['threshold'])

            # Test
            with torch.no_grad():
                test_loss, test_acc, test_precision, test_recall = invoke_model('test', model, datalaoder_test, criterion, device, threshold=config_model['threshold'])

            # Save the model with the best test loss using postfix: best
            if test_loss < best_test_loss:
                best_test_loss = test_loss

                record = {
                    'type_id': stock_industry_type_id,
                    'target_pct': config_model['target_pct'],
                    'model_name': model.get_model_name(),
                    'version': config_model['version'],
                    'test_loss': test_loss,
                    'test_acc': test_acc,
                    'test_precision': test_precision,
                    'test_recall': test_recall,
                    'train_loss': train_loss,
                    'train_acc': train_acc,
                    'train_precision': train_precision,
                    'train_recall': train_recall,
                }

                model.save(f'{path_model_checkpoint}_best')
                Utility.update_model_record_txt(record, path_model_records)
                print(f'Loss: {test_loss}, Accuracy: {test_acc}%, Precision: {test_precision}%, Recall: {test_recall}%\n')
                print(f'Model has saved in {path_model_checkpoint}_best\n')

            pbar.update(1)

    model.save(f'{path_model_checkpoint}_save')
    print(f'Model has saved in {path_model_checkpoint}_save\n')

    # The optimizer references the model's parameters, so it must be dropped
    # too or `del model` frees nothing. Collect garbage first so the memory
    # is actually free by the time the CUDA cache is emptied.
    del datalaoder_train, datalaoder_test, model, optimizer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Predict
target_pct = 0.05
window_length = 20
# NOTE(review): the training cells threshold raw outputs at 0.0 under
# BCEWithLogitsLoss. Using 0.5 here, and labelling the value "Prob" below,
# assumes the classes in `Model` output probabilities - confirm.
threshold=0.5

list_selected_type_id = Set.GetInfo('select_stock_list')
current_date = (SQLSentence.GetLatestDate(connect)).strftime("%Y-%m-%d")

model_name = 'CNN_LSTM'
ModelClass = getattr(Model, model_name)
postfix='best'

file_name = f'predict_tmp_{current_date}_{int(target_pct*100)}%_{model_name}_{postfix}.txt'

# The input shape is the same for every industry type, so compute it once.
input_shape = GetDatasetShape(main_data, window_length)

# Append mode creates the file when it is missing, so no pre-creation is needed.
with open(file_name, 'a', encoding='utf-8') as f:
    # The trailing newline keeps the header on its own line instead of fusing
    # with the first separator.
    f.write(f"----------------------------------Model: {model_name}_{postfix}----------------------------\n")

    for type_id in tqdm.tqdm(list_selected_type_id):

        list_unique_code_from_data = (SQLSentence.GetCodeByTypeId(type_id, connect))['code'].tolist()

        model = ModelClass(input_shape).to(device)
        model.eval()
        list_code_investable = []

        # NOTE(review): the training cells save under 'models/checkpoint/...'
        # with a '_v<version>' suffix; this path has neither - confirm it
        # points at the intended files.
        checkpoint_file = f"{PATH_CHECKPOINT}/type_{type_id}/type_{type_id}_{model.__class__.__name__}_Target_{int(target_pct*100)}_{postfix}"
        try:
            # map_location lets weights saved on a GPU load on a CPU-only host.
            model.load_state_dict(torch.load(checkpoint_file, map_location=device, weights_only=True))
        except Exception as err:
            # Skip this industry type, but show why the load failed.
            print(f'{dict_type_name[type_id]} No preserved model: {model.__class__.__name__} ({err})')
            continue

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for code in list_unique_code_from_data:
                output = predict_result(main_data, code, model, device, target_pct, window_length)
                if output > threshold:
                    list_code_investable.append((code, output))

        if list_code_investable:
            f.write("------------------------------------------------------------------------")
            f.write(f"\n產業 {type_id}: {dict_type_name[type_id]}\n")
            for code, probability in list_code_investable:
                f.write(f'{code} {dict_code_name[code]:<10} Prob: {probability}\n')
    f.write("\n\n")