# Kospi Prediction using transformer

In [653]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset 

import tqdm
import torch.nn.functional as F
import math

In [654]:
df = pd.read_csv('./total.csv')
df.tail()

Unnamed: 0,Date,S&P,currency,gold,kospi
5317,2024-04-17,5022.209961,1387.339966,2371.699951,2584.179932
5318,2024-04-18,5011.120117,1379.540039,2382.300049,2634.699951
5319,2024-04-19,4967.22998,1379.400024,2398.399902,2591.860107
5320,2024-04-22,5010.600098,1373.930054,2347.0,2591.860107
5321,2024-04-23,5010.600098,1373.930054,2347.0,2634.830078


In [655]:
col_names = df.columns[1:].values
col_names


array(['S&P', 'currency', 'gold', 'kospi'], dtype=object)

In [656]:
# global variables and hyperparameter
IN_DIM = 64
DAY_INT = 1
BATCH_SIZE = 64
SCALER = 'MINMAX'   # 'NORMAL'
# SCALER = 'NORMAL'   # 'NORMAL'
TRAIN_TEST_SPLIT = 0.9
LR = 1e-4

In [657]:
df_train = df.iloc[:int(len(df) * TRAIN_TEST_SPLIT), :].drop('Date', axis=1)
df_test = df.iloc[int(len(df) * TRAIN_TEST_SPLIT): , :].drop('Date', axis=1)
# print(df_train.tail())
# print(df_test.head())
scale_params = pd.DataFrame(index=['mean', 'var', 'max', 'min'])
for i in col_names:
    scale_params[i] = [df_train[i].mean(), df_train[i].var(), df_train[i].max(), df_train[i].min()]
scale_params

Unnamed: 0,S&P,currency,gold,kospi
mean,1944.588598,1109.020207,1175.492837,1876.226251
var,864381.983793,9350.660785,184854.497651,283255.535559
max,4796.560059,1571.400024,2051.5,3305.209961
min,676.530029,886.679993,374.799988,719.590027


In [658]:
df_train.tail()

Unnamed: 0,S&P,currency,gold,kospi
4784,4545.859863,1214.5,1919.099976,2739.850098
4785,4582.640137,1218.780029,1929.199951,2757.899902
4786,4525.120117,1213.900024,1922.900024,2759.199951
4787,4481.149902,1218.75,1918.400024,2735.030029
4788,4500.209961,1217.599976,1933.800049,2695.860107


In [659]:
train_data = pd.DataFrame()
test_data = pd.DataFrame()
if SCALER == 'MINMAX':
    for i in col_names:
        train_data[i] = df_train[i].apply(lambda x: (x - scale_params.loc['min', i])/(scale_params.loc['max', i] - scale_params.loc['min', i]))
        test_data[i] = df_test[i].apply(lambda x: (x - scale_params.loc['min', i])/(scale_params.loc['max', i] - scale_params.loc['min', i]))
elif SCALER == 'NORMAL':
    for i in col_names:
        train_data[i] = df_train[i].apply(lambda x: (x - scale_params.loc['mean', i])/scale_params.loc['var', i])
        test_data[i] = df_test[i].apply(lambda x: (x - scale_params.loc['mean', i])/scale_params.loc['var', i])
test_data.tail()

Unnamed: 0,S&P,currency,gold,kospi
5317,1.054769,0.731189,1.19097,0.721138
5318,1.052077,0.719798,1.197292,0.740677
5319,1.041424,0.719593,1.206894,0.724109
5320,1.051951,0.711605,1.176239,0.724109
5321,1.051951,0.711605,1.176239,0.740728


In [660]:
class Stockdataset(Dataset):
    def __init__(self, data, input_size=128, day_interval=2):
        self.data = data
        self.len = len(data)
        start_pos = (self.len - input_size -1) % day_interval
        iter_times = (self.len - input_size -1) // day_interval + 1
        print(self.len, start_pos, iter_times)
        X = []
        Y = []
        for i in range(iter_times):  # check iteration number 
            start = start_pos + i * day_interval
            end = start + input_size    # -1 if use loc
            x_num = data.iloc[start:end, :].to_numpy()
            # x_num = x_num.astype(float)
            X.append(x_num.T)
            Y.append(df.loc[end + 1, 'kospi'])
        self.x = X
        self.y = Y
        self.len = len(X)
        print(start, end)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [673]:
train_dataset = Stockdataset(train_data, input_size=IN_DIM, day_interval=DAY_INT)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = Stockdataset(test_data, input_size=IN_DIM, day_interval=DAY_INT)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)
print(next(iter(test_dataset[-1])).dtype)

4789 0 4724
4724 4788
533 0 468
468 532
float64


In [662]:
# train_dataloader = train_dataloader.type(torch.float)

In [663]:
nn.Linear(32, 32)

Linear(in_features=32, out_features=32, bias=True)

In [664]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_dim, num_heads, dropout=0.0, bias=False, encoder_decoder_attention=False, causal=False):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = emb_dim // num_heads
        assert self.head_dim * num_heads == self.emb_dim, 'emb_dim must be divisible by num_heads'

        self.encoder_decoder_attention = encoder_decoder_attention
        self.causal = causal
        self.q_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        self.k_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        self.v_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        self.out_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        # print(self.q_proj.grad)
    
    def transpose_for_scores(self, x):
        """
        To-Do : Reshape input
          Args : batch_size X sequence_length X embedding dimension
          Return : batch_size X # attention head X sequence_length X head dimension
        """
        new_x_hape = x.size()[:-1] + (
            self.num_heads,
            self.head_dim
        )
        x = x.view(*new_x_hape)
        return x.permute(0, 2, 1, 3)
    
    def scaled_dot_product(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.BoolTensor):
        """
        To-Do : Implement scaled dot product
          Args:
            Query (Tensor): shape `(batch, seq_len, emb_dim)`
            Key (Tensor): shape `(batch, seq_len, emb_dim)`
            Value (Tensor): shape `(batch, seq_len, emb_dim)`
            attention_mask: binary BoolTensor of shape `(batch, seq_len)` or `(seq_len, seq_len)`

          Returns:
            attn_output : attended output (result of attention mechanism)
            attn_weights: value of each attention
        """
        attn_weights = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.emb_dim)
        if attention_mask is not None:
            attn_weights = attn_weights.maked_fill(attention_mask.unsqueeze(1), float('-inf'))
        
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_probs, value)

        return attn_output, attn_probs
    
    def MultiHead_scaled_dot_product(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.BoolTensor):
        """
        To-Do : Implement Multi-head version of scaled dot product, please also take the causal masking into account.
          Args:
            Query (Tensor): shape `(batch,# attention head, seq_len, head_dim)`
            Key (Tensor): shape `(batch,# attention head, seq_len, head_dim)`
            Value (Tensor): shape `(batch,# attention head, seq_len, head_dim)`
            attention_mask: binary BoolTensor of shape `(batch, src_len)` or `(seq_len, seq_len)`

          Returns:
            attn_output : attended output (result of attention mechanism)
            attn_weights: value of each attention
        """
        attn_weights = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            if self.causal:
                #(seq_len x seq_len)
                attn_weights = attn_weights.masked_fill(attention_mask.unsqueeze(0).unsqueeze(1), float('-inf'))
            else:
                #(batch_size x seq_len)
                attn_weights = attn_weights.masked_fill(attention_mask.unsqueeze(1).unsqueeze(2), float('-inf'))
        
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.matmul(attn_probs, value)
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
        concat_attn_output_shape = attn_output.size()[:-2] + (self.emb_dim,) # batch_size, seq_len, emb_dim
        attn_output = attn_output.view(*concat_attn_output_shape)
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
    
    def forward(self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None):
        query = query.type('torch.FloatTensor').to('cuda')
        print('-------------')
        print(query[-1])
        q = self.q_proj(query)

        if self.encoder_decoder_attention:
            k = self.k_proj(key)
            v = self.v_proj(key)
        else: # self attention
            k = self.k_proj(query)
            v = self.v_proj(query)
        
        q = self.transpose_for_scores(q)
        k = self.transpose_for_scores(k)
        v = self.transpose_for_scores(v)

        attn_output, attn_weights = self.MultiHead_scaled_dot_product(q, k, v, attention_mask)
        return attn_output, attn_weights


In [665]:
class PositionWiseFeedForward(nn.Module):

    def __init__(self, emb_dim: int, d_ff: int, droptout: float = 0.1):
        super(PositionWiseFeedForward, self).__init__()

        self.activation = nn.ReLU()
        self.w_1 = nn.Linear(emb_dim, d_ff)
        self.w_2 = nn.Linear(d_ff, emb_dim)
        self.dropout = droptout
    
    def forward(self, x):
        residual = x
        x = self.activation(self.w_1(x))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.w_2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        return x + residual

In [666]:
class SinusoidalPositionalEmbedding(nn.Embedding):
    def __init__(self, num_positions, embedding_dim, padding_idx=None):
        super().__init__(num_positions, embedding_dim)
        self.weight = self._init_weight(self.weight)
    
    @staticmethod
    def _init_weight(out: nn.Parameter):
        n_pos, embed_dim = out.shape
        pe = nn.Parameter(torch.zeros(out.shape))
        for pos in range(n_pos):
            for i in range(0, embed_dim, 2):
                pe[pos, i].data.copy_(torch.tensor(np.sin(pos / (10000**(i/embed_dim)))))
                pe[pos, i+1].data.copy_(torch.tensor(np.cos(pos/(10000**(i+1)/embed_dim))))
        pe.detach_()
        return pe
    
    @torch.no_grad()
    def forward(self, input_ids):
        bsz, seq_len =input_ids.shape[:2]
        positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device)
        return super().forward(positions)

In [667]:
class EncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.emb_dim = config.emb_dim
        self.ffn_dim = config.ffn_dim
        self.self_attn = MultiHeadAttention(
            emb_dim=self.emb_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout)
        self.self_att_layer_norm = nn.LayerNorm(self.emb_dim)
        self.dropout = config.dropout
        self.activation_fn = nn.ReLU()
        self.PositionWiseFeedForward = PositionWiseFeedForward(self.emb_dim, self.ffn_dim, self.dropout)
        self.final_layer_norm = nn.LayerNorm(self.emb_dim)
    
    def forward(self, x, encoder_padding_mask):
        """
        To-Do : Implement transformer encoder layer
          Args:
            x (Tensor): input to the layer of shape `(batch, seq_len, emb_dim)`
            encoder_padding_mask: binary BoolTensor of shape `(batch, src_len)`

          Returns:
            x : encoded output of shape `(batch, seq_len, emb_dim)`
            self_attn_weights: self attention score
        """
        residual = x
        x, attn_weights = self.self_attn(query=x, key=x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = x.type('torch.DoubleTensor').to('cuda')
        x = self.self_att_layer_norm(x)
        x = self.PositionWiseFeedForward(x)
        x = self.final_layer_norm(x)
        if torch.isinf(x).any() or torch.isnan(x).any():
          clamp_value = torch.finfo(x.type).max - 1000
          x = torch.clamp(x, min=-clamp_value, max=clamp_value)
        return x, attn_weights

In [668]:
class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.dropout = config.dropout
        self.max_source_positions = config.max_position_embeddings
        
        self.embed_positions = SinusoidalPositionalEmbedding(
            config.max_position_embeddings, config.emb_dim)
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])

    def forward(self, input_ids, attention_mask=None):
        """
        To-Do : Implement the transformer encoder
          Args:
            input_ids (Tensor): input to the layer of shape `(batch, seq_len)`
            attention_mask: binary BoolTensor of shape `(batch, src_len)`

          Returns:
            x: encoded output of shape `(batch, seq_len, emb_dim)`
            self_attn_scores: a list of self attention score of each layer
        """
        embed_pos = self.embed_positions(input_ids)
        # print(embed_pos)
        x = input_ids + embed_pos
        x = F.dropout(x, p=self.dropout, training=self.training)

        self_attn_scores = []
        for encoder_layer in self.layers:
            x, attn = encoder_layer(x, attention_mask)
            self_attn_scores.append(attn.detach())
            
        return x, self_attn_scores

In [669]:
class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()

        # self.SRC_vocab = SRC_vocab
        # self.TRG_vocab = TRG_vocab

        # self.enc_embedding = nn.Embedding(len(SRC_vocab.itos), config.emb_dim, padding_idx=SRC_vocab.stoi['<pad>'])
        # self.dec_embedding = nn.Embedding(len(TRG_vocab.itos), config.emb_dim, padding_idx=TRG_vocab.stoi['<pad>'])

        self.encoder = Encoder(config)
        # self.decoder = Decoder(config, self.dec_embedding)

        self.prediction_head = nn.Linear(config.emb_dim, 1)

        self.init_weights()

    # def generate_mask(self, src, trg):
    #     # Mask encoder attention to ignore padding
    #     enc_attention_mask = src.eq(self.SRC_vocab.stoi['<pad>']).to(device)
    #     # Mask decoder attention for causality
    #     tmp = torch.ones(trg.size(1), trg.size(1), dtype=torch.bool, device=device)
    #     mask = torch.arange(tmp.size(-1), device=device)
    #     dec_attention_mask = tmp.masked_fill_(mask < (mask + 1).view(tmp.size(-1), 1), False).to(device)

    #     return enc_attention_mask, dec_attention_mask

    def init_weights(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                if 'weight' in name:
                    nn.init.normal_(param.data, mean=0, std=0.01)
                else:
                    nn.init.constant_(param.data, 0)

    def forward(self, src):
        # enc_attention_mask, dec_causal_mask = self.generate_mask(src, trg)
        encoder_output, encoder_attention_scores = self.encoder(
            input_ids=src,
            attention_mask=None
            # attention_mask=enc_attention_mask
        )

        # decoder_output, decoder_attention_scores = self.decoder(
        #     trg,
        #     encoder_output,
        #     encoder_attention_mask=enc_attention_mask,
        #     decoder_causal_mask=dec_causal_mask,
        # )
        # decoder_output = self.prediction_head(decoder_output)
        encoder_output = self.prediction_head(encoder_output)

        return encoder_output, encoder_attention_scores
        # return decoder_output, encoder_attention_scores, decoder_attention_scores

In [670]:
import easydict
import torch.nn as nn
import torch.optim as optim

# Create the configuration for the transformer model
config = easydict.EasyDict({
    "emb_dim": IN_DIM,
    "ffn_dim": 256,
    "attention_heads": 4,
    "attention_dropout": 0.0,
    "dropout": 0.2,
    "max_position_embeddings": BATCH_SIZE,
    "encoder_layers": 3,
    "decoder_layers": 3,
})

# Constants for training
N_EPOCHS = 100
learning_rate = 5e-4

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Instantiate the model using the new Vocab instances instead of the Fields
model = TransformerEncoder(config)
model.to(device)

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the loss function, ignoring the index of the padding token
criterion = nn.MSELoss()

In [671]:
def train(model: nn.Module,
          iterator: DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module):

    model.train()
    epoch_loss = 0

    for inputs, labels in iterator:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        output, enc_attention_scores, _ = model(inputs)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: DataLoader,
             criterion: nn.Module):

    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for inputs, labels in iterator:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Assuming src and trg are already tensorized and padded
            # If not, you should perform those steps here

            output, attention_score, _ = model(inputs)

            loss = criterion(output, labels)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Training loop
for epoch in tqdm.tqdm(range(N_EPOCHS), total=N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion)

    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

# Evaluation on test set
test_loss = evaluate(model, test_dataloader, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

-------------
tensor([[0.1595, 0.0000, 0.1374, 1.3895, 0.1476, 1.4013, 0.1440, 1.3968, 0.1380,
         1.3782, 0.1357, 1.3918, 0.1482, 1.3953, 0.1573, 1.4075, 0.1610, 1.4099,
         0.1663, 1.4091, 0.1665, 1.4118, 0.1635, 0.0000, 0.1753, 0.0000, 0.1716,
         0.0000, 0.1846, 1.4250, 0.1644, 0.0000, 0.1774, 1.4250, 0.1774, 1.4319,
         0.1676, 1.4209, 0.0000, 1.4245, 0.0000, 1.4200, 0.1637, 0.0000, 0.1567,
         1.4052, 0.1472, 1.3972, 0.1463, 1.4066, 0.1574, 0.0000, 0.1723, 1.4223,
         0.1761, 1.4266, 0.1773, 1.4192, 0.1756, 1.4199, 0.1666, 1.4124, 0.1636,
         1.4148],
        [1.5174, 1.7130, 1.3582, 1.7784, 1.1683, 1.7866, 1.0225, 1.7849, 0.9187,
         1.7993, 0.8423, 0.0000, 0.7655, 1.7858, 0.6721, 1.7689, 0.6453, 0.0000,
         0.5875, 1.7376, 0.5517, 1.7261, 0.5035, 1.7075, 0.0000, 1.6993, 0.4758,
         1.6955, 0.4232, 1.6467, 0.4369, 1.6827, 0.4472, 1.6832, 0.4105, 1.6681,
         0.0000, 1.6823, 0.4500, 1.6849, 0.4399, 0.0000, 0.4636, 0.0000, 0.45




RuntimeError: expected scalar type Double but found Float