# Predicting NIFTY with Transformer

The following is a PyTorch implementation of a transformer model to predict NIFTY indices.

## Imports

In [None]:
import os
import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

## Data

### Load Data

In [None]:
# sheet_name=None makes pandas read every worksheet and return a dict of
# DataFrames keyed by sheet name (one entry per sheet of the workbook)
data = pd.read_excel('NIFTY50.xlsx', sheet_name=None)

def copy_dict_without_keys(d, keys):
    """Return a new dict with the entries of ``d`` whose key is not in ``keys``.

    ``d`` itself is left untouched and the original key order is preserved.
    """
    kept = {}
    for name, value in d.items():
        if name in keys:
            continue
        kept[name] = value
    return kept

# Every sheet except the listed ones is kept as input data (one dataframe per sheet)
input_data = copy_dict_without_keys(data, ['Companies List', 'in', 'Reverse REPO Rate', 'Repo Rate'])

# The 'in' sheet is the output (target) data. It must be a DataFrame here:
# a one-entry dict {'in': df} has neither .dropna nor a 'Date' column.
# Drop rows with missing prices and convert the date to a standard format.
output_data = data['in'].dropna(subset=['Open', 'High', 'Low', 'Close', 'Adj Close'])
output_data['Date'] = pd.to_datetime(output_data['Date'], dayfirst=True)

# prepare time index: a Series that maps each date to its chronological position
# (duplicates are removed so that every date maps to exactly one position)
time_index = output_data['Date'].drop_duplicates().sort_values(ignore_index=True)
time_index = pd.Series(time_index.index, time_index.values)

# add time index to output data
# Series.map keeps the row labels that survived dropna; assigning a freshly built
# Series (labelled 0..n-1) would be aligned on those labels and shift/lose values
output_data['time_idx'] = output_data['Date'].map(time_index)
output_data = output_data.drop(['Date'], axis=1)

# prepare input data before merge
for sheet in input_data:
    # 1. Replace '##...' by actual dates
    # ('##...' are placeholders for wider text)
    # NOTE: no code implements this step yet

    # 2. Drop rows with missing price or volume values
    #    (.copy() gives an independent frame for the column assignments below)
    frame = data[sheet].dropna(subset=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']).copy()

    # 3. Add a categorical column for company symbol
    frame['Symbol'] = sheet

    # 4. Convert date to standard format
    frame['Date'] = pd.to_datetime(frame['Date'], dayfirst=True)

    # 5. Add time index as per time origin (of whole dataset)
    #    Series.map keeps the row labels left by dropna, so every row gets the
    #    index of its own date; dates absent from time_index become NaN
    frame['time_idx'] = frame['Date'].map(time_index)

    # Rows whose date has no entry in time_index cannot be matched to a target
    # row, so they are removed; the remaining indices are whole numbers again
    frame = frame.dropna(subset=['time_idx']).astype({'time_idx': 'int64'})

    input_data[sheet] = frame.drop(['Date'], axis=1)

# map every timestep to its list of companies
# for sheet in input_data:
#     output_data[sheet] = output_data['time_idx'].isin(input_data[sheet]['time_idx'])

# store list of company symbols
# (input_data is still a dict keyed by sheet name here, so its keys are the
# symbols; a dict has no `.columns` attribute)
companies = list(input_data)

### Merge Data

Each target value is predicted from a 2D array (one row of price features per company), so the training data needs a column that holds this 2D array for every time step.

In [None]:
# 1. Concatenate the per-sheet dataframes into a single dataframe
input_data = pd.concat(input_data)

# 2. Collect event data from separate columns into vectors
# Several columns are selected with a *list* of labels; a bare comma-separated
# subscript is a single tuple key and raises KeyError
event_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
input_data['events'] = input_data[event_columns].values.tolist()
input_data['events'] = input_data['events'].apply(np.array)
input_data = input_data[['time_idx', 'events', 'Symbol']]

# 3. Collapse to one row per time step (every time step sample has to map to a
#    single target value): the event vectors of a step are stacked into one
#    array and its company symbols are gathered into a list
rows_by_step = input_data.groupby('time_idx')
merged_data = rows_by_step['events'].apply(list).reset_index()
merged_data['events'] = merged_data['events'].apply(np.array)
company_categories = rows_by_step['Symbol'].apply(list).reset_index()
merged_data = merged_data.merge(company_categories, how='inner', on='time_idx')

# 4. Order the rows by time step before joining with the NIFTY data
merged_data = merged_data.sort_values(by=['time_idx'], ignore_index=True)

# 5. Add the NIFTY index columns (inner join: only time steps present on both sides remain)
merged_data = merged_data.merge(output_data, how='inner', on='time_idx')

## Create Dataset and dataloaders

Convert the dataframe of raw data into a PyTorch Forecasting ``TimeSeriesDataSet``. 

In [None]:
# NOTE(review): after the merge step 'Symbol' holds a *list* of symbols per row,
# yet it is used below both as group id and as a time-varying categorical;
# confirm this is what TimeSeriesDataSet should receive.
training = TimeSeriesDataSet(
    data=merged_data,
    time_idx='time_idx',
    target='Close',
    group_ids=['Symbol'],
    min_encoder_length=16,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=64,
    static_categoricals=[], # entity embedding is used for categorical variables, instead of one-hot
    time_varying_known_categoricals=[],
    time_varying_known_reals=['time_idx'],
    time_varying_unknown_reals=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
    time_varying_unknown_categoricals=['Symbol'],
    target_normalizer=GroupNormalizer(
        groups=[], transformation='softplus'
    )  # softplus transformation; `groups` is empty, so no group column is used for scaling
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
# The second argument must be the merged dataframe the training set was built
# from; `data` is the raw dict of workbook sheets, not a dataframe.
# NOTE(review): `training` is built from all of merged_data, so these validation
# windows are also part of the training data - consider a time cutoff.
validation = TimeSeriesDataSet.from_dataset(training, merged_data, predict=True, stop_randomization=True)

# create dataloaders for model
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

## Transformer

Initialize and train a time-series forecasting transformer.

### Optimize learning rate

The optimal learning rate is identified using PyTorch Lightning learning rate finder.

In [None]:
# Set up a throw-away trainer and network for the learning-rate search
pl.seed_everything(42)

# Gradient clipping is a hyperparameter and important to prevent divergence
# of the gradient for recurrent neural networks
trainer = pl.Trainer(gpus=0, gradient_clip_val=0.1)

lr_search_hparams = dict(
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=0.03,
    # most important hyperparameter apart from learning rate
    hidden_size=16,
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=2,
    # between 0.1 and 0.3 are good values
    dropout=0.1,
    # set to <= hidden_size
    hidden_continuous_size=8,
    # 7 quantiles by default
    output_size=7,
    loss=QuantileLoss(),
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=4,
)
tft = TemporalFusionTransformer.from_dataset(training, **lr_search_hparams)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Run the learning-rate finder over the range [min_lr, max_lr]
res = trainer.tuner.lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    min_lr=1e-6,
    max_lr=10.0,
)

# Report the suggested rate and plot the result with the suggestion marked
print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(suggest=True, show=True)
fig.show()

For the TemporalFusionTransformer, the optimal learning rate seems to be slightly lower than the suggested one. Further, we do not directly want to use the suggested learning rate because PyTorch Lightning sometimes can get confused by the noise at lower learning rates and suggests rates far too low. Manual control is essential.

## Train Model

In [None]:
# Callbacks and logger for the actual training run
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=1e-4,
    patience=10,
    verbose=False,
)
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

trainer = pl.Trainer(
    gpus=0,
    max_epochs=30,
    gradient_clip_val=0.1,
    # limit each epoch to 30 training batches, so validation runs after every 30 batches
    limit_train_batches=30,
    # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
    weights_summary="top",
    logger=logger,
    callbacks=[lr_logger, early_stop_callback],
)

training_hparams = dict(
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    # 7 quantiles by default
    output_size=7,
    loss=QuantileLoss(),
    # log every 10 batches
    log_interval=10,
    reduce_on_plateau_patience=4,
)
tft = TemporalFusionTransformer.from_dataset(training, **training_hparams)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

In [None]:
# Fit the network.
# The device is selected when the `pl.Trainer` is built (its `gpus` argument),
# not here: a TensorFlow `tf.device` context has no effect on a PyTorch model,
# so no TensorFlow import or context manager is used.
# To train on a GPU, create the trainer with `gpus=1` instead of `gpus=0`.
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

In [None]:
# Reload the model from the checkpoint path that the trainer's checkpoint
# callback reports as best
checkpoint_callback = trainer.checkpoint_callback
best_model_path = checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

## Evaluate performance

In [None]:
# calculate mean absolute error on validation set
# (each batch is an (x, y) pair; the first element of y is concatenated as the actual values)
actuals = torch.cat([targets[0] for _, targets in val_dataloader])
predictions = best_tft.predict(val_dataloader)
absolute_errors = (actuals - predictions).abs()
absolute_errors.mean()

In [None]:
# raw predictions are a dictionary from which all kind of information including quantiles can be extracted
raw_predictions, x = best_tft.predict(val_dataloader, mode="raw", return_x=True)

# plot every predicted validation sample
# `idx` selects a sample inside `x` / `raw_predictions`, so it has to range over
# the number of predicted samples - not over `time_index`, which holds one
# position per time step of the whole dataset
# assumes `x` is the network input dict with a "decoder_lengths" tensor holding
# one entry per sample - TODO confirm for the installed pytorch-forecasting version
n_samples = x["decoder_lengths"].size(0)
for idx in range(n_samples):
    best_tft.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True)