In [1]:
import numpy as np
from numpy import load
import torch
import torch.nn as nn
import pickle
import argparse
import pandas as pd
import numpy as np
import torch
import os
from torch.utils.data import Dataset, DataLoader
import yaml

from argparse import Namespace
from timeit import default_timer as timer
from tensorboardX import SummaryWriter
from datetime import datetime
from source.transformer import Seq2SeqTransformer
from source.train import train_epoch, evaluate
from source.Attention_LSTM import RNNModel
from source.train import train_epoch_lstm, evaluate_lstm

In [2]:
def load_config(file_path):
    with open(file_path, 'r') as file:
        config = yaml.safe_load(file)
    return Namespace(**config)

config_file = "Train_info.yaml"
args = load_config(config_file)

In [3]:

modeltype = args.model
if modeltype not in ['Trans', 'ALSTM', 'LSTM']:
    raise ValueError

datasubfix = args.data

if args.trainname:
    trainname = args.trainname
else:
    now = datetime.now()
    now.strftime("%m/%d/%Y, %H:%M:%S")
    date_time = now.strftime("%m_%d_%Y")
    trainname = date_time
device = args.device


num_epochs = args.epoch
seq_length = 39
TGT_VOCAB_SIZE = 3
EMB_SIZE = args.emb_dim
NHEAD = args.num_head
FFN_HID_DIM = args.hid_dim
BATCH_SIZE = args.batch_size
lr_init = args.lr_initial
NUM_ENCODER_LAYERS = args.num_layers // 2
NUM_DECODER_LAYERS = args.num_layers // 2
NUM_LAYERS = args.num_layers
train_data_path = os.path.join('/Data/LOBData/TrainData',args.data)
train_data_info_path = os.path.join(train_data_path,"config.yaml")
train_data_config = load_config(train_data_info_path)
mbrnlist = train_data_config.MBRNlist

In [7]:
train_data_config.feat_cols

['ASK_STEP1_BSTORD_RQTY',
 'ASK_STEP2_BSTORD_RQTY',
 'ASK_STEP3_BSTORD_RQTY',
 'ASK_STEP4_BSTORD_RQTY',
 'ASK_STEP5_BSTORD_RQTY',
 'BID_STEP1_BSTORD_RQTY',
 'BID_STEP2_BSTORD_RQTY',
 'BID_STEP3_BSTORD_RQTY',
 'BID_STEP4_BSTORD_RQTY',
 'BID_STEP5_BSTORD_RQTY',
 '매도5단계호가합계잔량',
 '매수5단계호가합계잔량',
 '매도10단계호가합계잔량',
 '매수10단계호가합계잔량',
 '매도총호가잔량',
 '매수총호가잔량',
 '고가',
 '저가',
 '시가',
 '직전체결가격',
 'NEW_ASK_QTY',
 'CCL_ASK_QTY',
 'NEW_BID_QTY',
 'CCL_BID_QTY']

In [4]:
def transform(x,y):
    x = x[:,:-4]
    x = torch.FloatTensor(x)
    y = torch.FloatTensor(y)
    y[-1]=0
    return x,y

class TimeSeriesNpyDataset(Dataset):
    def __init__(self, x_data, y_data, seq_length, transform=None):
        self.x_data = x_data
        self.y_data = y_data
        self.seq_length = seq_length
        self.transform = transform
    def __len__(self):
        return len(self.x_data) - self.seq_length

    def __getitem__(self, idx):
        x_sample = self.x_data[idx:idx + self.seq_length]
        y_sample = self.y_data[idx + 1:idx + self.seq_length + 1]

        # Set the last value of y_sample to 0
        y_sample[-1] = 0

        if self.transform:
            x_sample,y_sample = self.transform(x_sample,y_sample)

        return x_sample, y_sample

In [5]:
def split_data(XData, YData,seq_length, train_ratio=0.9):
    data_len = len(XData) // seq_length
    split_idx = int(data_len * train_ratio)
    X_train = XData[:split_idx * seq_length]
    Y_train = YData[:split_idx * seq_length]
    X_test = XData[split_idx * seq_length:]
    Y_test = YData[split_idx * seq_length:]
    return X_train, Y_train, X_test, Y_test
def check_inf(data, idx, mbr=None, brn=None, dataname=None):
    if np.isinf(data[seq_length * idx:seq_length * (idx + 1)].tolist()).any():
        print(np.isinf(data[seq_length * idx:seq_length * (idx + 1)].tolist()).any())
        print(mbr, brn)
        print(dataname)
        print(data[seq_length * idx:seq_length * (idx + 1)].tolist())
        raise RuntimeError

def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
def generate_square_subsequent_mask3(sz, device):
    mask = (torch.triu(torch.ones((sz, sz - 1), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
def create_mask(src, tgt, device):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.double)
    src_mask = generate_square_subsequent_mask(src_seq_len, device)
    mmr_mask = generate_square_subsequent_mask3(src_seq_len, device)
    return src_mask, tgt_mask, mmr_mask

In [6]:
mbr,brn=mbrnlist[0]

# Paths to your .npy files
x_data_file = os.path.join(train_data_path,f"Input_{mbr}_{brn}.npy")
y_data_file = os.path.join(train_data_path,f"Label_{mbr}_{brn}.npy")

XData = np.load(x_data_file)
YData = np.load(y_data_file)
X_train, Y_train, X_test, Y_test = split_data(XData, YData,seq_length)

# Create the train and test datasets and data loaders
train_dataset = TimeSeriesNpyDataset(X_train, Y_train, seq_length, transform=transform)
test_dataset = TimeSeriesNpyDataset(X_test, Y_test, seq_length, transform=transform)

train_dataloader = DataLoader(train_dataset, batch_size=2,shuffle=False, num_workers=2)

SRC_VOCAB_SIZE = X_train.shape[1]
print(SRC_VOCAB_SIZE)

model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                   NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
model = model.to(device)
loss_fn = torch.nn.CrossEntropyLoss()
summary = SummaryWriter()
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
optimizer = torch.optim.Adam(model.parameters(), lr=lr_init, betas=(0.9, 0.98), eps=1e-9)
# Iterate over the data loader to get sequences
for batch_idx, (x_sequence, y_sequence) in enumerate(train_dataloader):
    x_sequence ,y_sequence = x_sequence.to(device), y_sequence.to(device)
    
    src_input=x_sequence
    tgt_input=y_sequence
    print(src_input.shape)
    print(tgt_input.shape)
    src_mask, tgt_mask, mmr_mask = create_mask(src_input, tgt_input, device)
    print(f"Tensor data type: {src_mask.dtype}")
    logits = model(src_input, tgt_input, src_mask, tgt_mask,src_mask)

    optimizer.zero_grad()

    tgt_out = tgt[1:]
    loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
    loss.backward()

    optimizer.step()
    losses += loss.item()

    break

24
torch.Size([2, 39, 24])
torch.Size([2, 39, 4])
Tensor data type: torch.float32


RuntimeError: The size of tensor a (39) must match the size of tensor b (2) at non-singleton dimension 1

In [None]:
for mbr, brn in mbrnlist:
    print(mbr,brn)

    # Paths to your .npy files
    x_data_file = os.path.join(train_data_path,f"Input_{mbr}_{brn}.npy")
    y_data_file = os.path.join(train_data_path,f"Label_{mbr}_{brn}.npy")

    XData = np.load(x_data_file)
    YData = np.load(y_data_file)
    X_train, Y_train, X_test, Y_test = split_data(XData, YData)

    # Create the train and test datasets and data loaders
    train_dataset = TimeSeriesNpyDataset(X_train, Y_train, seq_length)
    test_dataset = TimeSeriesNpyDataset(X_test, Y_test, seq_length)
    
    
    # Iterate over the data loader to get sequences
    for batch_idx, (x_sequence, y_sequence) in enumerate(train_dataloader):
        # Perform your machine learning tasks here
        if device == None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        SRC_VOCAB_SIZE = Xtrain_data.shape[1]

        if modeltype == 'Trans':
            model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                       NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
        if modeltype == "ALSTM":
            model = RNNModel(rnn_type='LSTM', ntoken=SRC_VOCAB_SIZE, ninp=EMB_SIZE, nhid=FFN_HID_DIM, nlayers=NUM_LAYERS,
                             proj_size=TGT_VOCAB_SIZE,
                             attention_width=39)
        if modeltype == "LSTM":
            model = RNNModel(rnn_type='LSTM', ntoken=SRC_VOCAB_SIZE, ninp=EMB_SIZE, nhid=FFN_HID_DIM, nlayers=NUM_LAYERS,
                             proj_size=TGT_VOCAB_SIZE,
                             attention=False)
        
        summary = SummaryWriter()
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr_init, betas=(0.9, 0.98), eps=1e-9)
        
        
        print(x_sequence)
        break

In [None]:
x_sequence.shape