In [86]:
import torch
import pandas as pd
import numpy as np
import math, copy, time
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset 
import tqdm


In [87]:
# Read the price table from the CSV stored next to the notebook and preview
# its last rows.
csv_path = './total.csv'
df = pd.read_csv(csv_path)
df.tail()

Unnamed: 0,Date,S&P,currency,gold,kospi
5320,2024-04-22,5010.600098,1373.930054,2332.199951,2629.439941
5321,2024-04-23,5070.549805,1378.839966,2327.699951,2623.02002
5322,2024-04-24,5071.629883,1370.47998,2324.5,2675.75
5323,2024-04-25,5048.419922,1376.969971,2345.600098,2628.620117
5324,2024-04-26,5048.419922,1376.969971,2345.600098,2654.820068


In [88]:
# Column labels of the loaded frame. Later cells slice this with
# `col_names[1:]` to skip the leading 'Date' label and iterate over the
# price columns only.
col_names = df.columns.values
print(col_names)

['Date' 'S&P' 'currency' 'gold' 'kospi']


In [89]:
# Global configuration and hyperparameters.

# --- data preparation ---
TRAIN_TEST_SPLIT = 0.9   # leading fraction of rows that forms the training split
SCALER = 'MINMAX'        # scaling mode read by the scaling cell: 'MINMAX' or 'NORMAL'
IN_DIM = 128             # number of consecutive values in one input window
DAY_INT = 1              # NOTE(review): unused in the visible cells; presumably a day interval -- confirm

# --- training ---
BATCH_SIZE = 64          # NOTE(review): presumably the DataLoader batch size -- unused in the visible cells
LR = 1e-4                # NOTE(review): presumably the optimiser learning rate -- unused in the visible cells

In [90]:
# train test split
# Positional split: the first TRAIN_TEST_SPLIT fraction of rows becomes the
# training frame and the remaining rows the test frame. 'Date' is dropped from
# both so only the numeric price columns remain.
split_row = int(len(df) * TRAIN_TEST_SPLIT)
df_train = df.iloc[:split_row].drop(columns='Date')
df_test = df.iloc[split_row:].drop(columns='Date')

# Per-column statistics of the TRAINING frame only (rows: mean, var, max, min).
stat_names = ['mean', 'var', 'max', 'min']
scale_params = pd.DataFrame(
    {
        name: [getattr(df_train[name], stat)() for stat in stat_names]
        for name in col_names[1:]
    },
    index=stat_names,
)
scale_params

Unnamed: 0,S&P,currency,gold,kospi
mean,1946.146297,1109.0951,1175.979382,1876.733674
var,867715.913074,9353.777468,185116.837397,283489.405685
max,4796.560059,1571.400024,2051.5,3305.209961
min,676.530029,886.679993,374.799988,719.590027


In [91]:
# scaling
# Both splits are scaled with the statistics stored in `scale_params`, which
# are computed from the training frame, so test values may fall outside the
# training range (e.g. above 1 under 'MINMAX').
feature_cols = list(col_names[1:])
if SCALER == 'MINMAX':
    # (x - min) / (max - min), column by column.
    col_min = scale_params.loc['min', feature_cols]
    col_range = scale_params.loc['max', feature_cols] - col_min
    train_data = (df_train[feature_cols] - col_min) / col_range
    test_data = (df_test[feature_cols] - col_min) / col_range
elif SCALER == 'NORMAL':
    # z-score: (x - mean) / std. `scale_params` stores the variance, so the
    # standard deviation is its square root (dividing by the variance itself
    # would not produce unit-variance features).
    col_mean = scale_params.loc['mean', feature_cols]
    col_std = scale_params.loc['var', feature_cols] ** 0.5
    train_data = (df_train[feature_cols] - col_mean) / col_std
    test_data = (df_test[feature_cols] - col_mean) / col_std
else:
    # Fail here instead of leaving empty frames that break later cells.
    raise ValueError(f"Unknown SCALER {SCALER!r}; expected 'MINMAX' or 'NORMAL'")
test_data.tail()

Unnamed: 0,S&P,currency,gold,kospi
5320,1.051951,0.711605,1.167412,0.738643
5321,1.066502,0.718775,1.164728,0.73616
5322,1.066764,0.706566,1.16282,0.756554
5323,1.061131,0.716044,1.175404,0.738326
5324,1.061131,0.716044,1.175404,0.748459


In [95]:
class stockDataset(Dataset):
    """Sliding-window dataset: a window of one series paired with the next-step label.

    Sample ``i`` consists of ``input_dim`` consecutive values of ``data`` and
    the label found at the position immediately after that window. Windows are
    ``stride`` positions apart and are anchored so that the last window's label
    is the final element of the series.

    Args:
        data: One-dimensional series of input values (a pandas Series in the
            notebook).
        input_dim: Number of values per input window.
        output_dim: Number of rows in the label array; the scalar label is
            broadcast to every row.
        stride: Distance, in positions, between the starts of two windows.
        target: Optional one-dimensional sequence of labels, positionally
            aligned with ``data`` (same length). When omitted, labels are read
            from the 'kospi' column of the module-level ``df`` at the index
            labels of ``data``; that fallback yields unscaled values, so pass
            ``target`` explicitly to train against scaled labels.

    Raises:
        ValueError: If ``input_dim`` or ``stride`` is smaller than 1, if
            ``data`` is not one-dimensional, if it holds fewer than
            ``input_dim + 1`` values, or if ``target`` and ``data`` differ in
            shape.
    """

    def __init__(self, data, input_dim=128, output_dim=1, stride=1, target=None):
        if input_dim < 1 or stride < 1:
            raise ValueError("input_dim and stride must both be >= 1")

        series = np.asarray(data, dtype=np.float32)
        if series.ndim != 1:
            raise ValueError("data must be one-dimensional")
        n_points = len(series)
        if n_points <= input_dim:
            raise ValueError(
                f"need at least input_dim + 1 = {input_dim + 1} values, got {n_points}"
            )

        if target is None:
            # Legacy fallback: labels come from the module-level raw frame.
            # Looking rows up by the index labels of `data` keeps every label
            # inside the same split as its window, whatever the first index
            # label of that split is.
            if not isinstance(data, pd.Series):
                raise ValueError("target is required when data is not a pandas Series")
            labels = np.asarray(df.loc[data.index, 'kospi'], dtype=np.float32)
        else:
            labels = np.asarray(target, dtype=np.float32)
        if labels.shape != series.shape:
            raise ValueError("target must have the same shape as data")

        # Positions available for window starts; the remainder is skipped at
        # the front so the final label is the last element of the series.
        span = n_points - input_dim - 1
        start_pos = span % stride
        iter_times = span // stride + 1

        X = np.zeros([iter_times, input_dim], dtype=np.float32)
        Y = np.zeros([output_dim, iter_times], dtype=np.float32)
        for i in range(iter_times):
            start = start_pos + i * stride
            end = start + input_dim  # exclusive end of the window
            X[i] = series[start:end]
            # `end` is the first position after the window, i.e. the next step.
            Y[:, i] = labels[end]

        self.data = data
        self.x = X  # shape (iter_times, input_dim)
        self.y = Y  # shape (output_dim, iter_times)
        # Number of samples, not the window width.
        self.len = iter_times
        print(self.x.shape, self.y.shape)

    def __len__(self):
        """Return the number of (window, label) samples."""
        return self.len

    def __getitem__(self, index):
        """Return the ``index``-th window and its label column."""
        return self.x[index], self.y[:, index]

In [96]:
# One windowed dataset per price column, first for the training frame and then
# for the test frame. Each constructor call prints the shapes it produced.
snp_col, currency_col, gold_col, kospi_col = col_names[1:5]
window_kwargs = dict(input_dim=IN_DIM, output_dim=1, stride=1)

train_snp = stockDataset(train_data[snp_col], **window_kwargs)
train_currency = stockDataset(train_data[currency_col], **window_kwargs)
train_gold = stockDataset(train_data[gold_col], **window_kwargs)
train_kospi = stockDataset(train_data[kospi_col], **window_kwargs)

test_snp = stockDataset(test_data[snp_col], **window_kwargs)
test_currency = stockDataset(test_data[currency_col], **window_kwargs)
test_gold = stockDataset(test_data[gold_col], **window_kwargs)
test_kospi = stockDataset(test_data[kospi_col], **window_kwargs)

(4664, 128) (1, 4664)
(4664, 128) (1, 4664)
(4664, 128) (1, 4664)
(4664, 128) (1, 4664)
(405, 128) (1, 405)
(405, 128) (1, 405)
(405, 128) (1, 405)
(405, 128) (1, 405)


In [97]:
# Show the first (window, label) pair of the S&P training dataset.
train_snp[0]

(array([0.09553085, 0.09468134, 0.09422261, 0.09543376, 0.09343863,
        0.09533183, 0.09311826, 0.09284399, 0.0957954 , 0.09650657,
        0.09502601, 0.09674686, 0.09707452, 0.10015704, 0.10003082,
        0.10106963, 0.10181721, 0.10133664, 0.10133664, 0.10178566,
        0.10508417, 0.10512301, 0.10567642, 0.10567642, 0.10484146,
        0.10817638, 0.10852834, 0.10917395, 0.11053075, 0.10808901,
        0.1093924 , 0.10793366, 0.11019094, 0.1105623 , 0.11245062,
        0.11245062, 0.11219335, 0.11434139, 0.11344818, 0.11286812,
        0.11622245, 0.11347491, 0.10969579, 0.11106229, 0.11033899,
        0.11134142, 0.11152831, 0.10922007, 0.10972249, 0.11316179,
        0.11244579, 0.11383655, 0.11680254, 0.11543119, 0.11390209,
        0.11390209, 0.11661565, 0.11536079, 0.11420549, 0.11348946,
        0.11273218, 0.11227101, 0.11338267, 0.11368364, 0.1136909 ,
        0.11636807, 0.11470061, 0.11516906, 0.11610109, 0.11658409,
        0.11423944, 0.11263265, 0.10858173, 0.10