Before starting, the "FinalDatasets" folder created by "DataPrepration" notebook should be copied to project directory. In case both Notebooks are in the same directory, nothing needs to be done.

## Setup and Imports

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
import random
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary

In [2]:
STOCKS = ['BOM500875','BOM532939','BOM524715','BOM532215','BOM532648','BOM532500','BOM500470','BOM500570','BOM500112','BOM532540','BOM500209','BOM500295','BOM532174','BOM500180','BOM500696','BOM500510','BOM500247','BOM500520','BOM532921','BOM532977','BOM500182','BOM507685','BOM500010','BOM532454','BOM500820','BOM500312','BOM532898','BOM532187','BOM533278','BOM532555','BOM570001']

## Data Prepration and LSTM Model creation

In [3]:
# Helper function for creating input to the model given full dataframe
def create_dataset(stock_data_full, lookback=30):
  # Initialize empty lists for storing sequences and labels
  sequences = []
  labels = []

  # Loop over the data and create 30-day sequences with corresponding labels
  for i in range(len(stock_data_full) - lookback):
      # Get the 30-day sequence for this row, including all features
      sequence = stock_data_full.iloc[i:i+lookback].values
      # Append the sequence to the list
      sequences.append(np.float32(sequence))
      # Get the label for this sequence, which is the stock price on the next trading day
      label = stock_data_full.iloc[i+lookback]['Label']
      # Append the label to the list
      labels.append(label)

  # Convert the sequences and labels to numpy arrays
  X = np.array(sequences)
  y = np.array(labels)

  # Print the shapes of X and y to verify they are the correct dimensions
  print(X.shape)  # should be (num_sequences, window_size, num_features)
  print(y.shape)  # should be (num_sequences,)

  return torch.from_numpy(X), torch.tensor(y)

In [4]:
# Definition of LSTM model to be used for our traning 
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm1 = nn.LSTM(input_size, hidden_size[0], batch_first=True, dropout=0.2)
        self.lstm2 = nn.LSTM(hidden_size[0], hidden_size[1], batch_first=True, dropout=0.2)
        self.lstm3 = nn.LSTM(hidden_size[1], hidden_size[2], batch_first=True, dropout=0.2)
        self.linear = nn.Linear(hidden_size[2], output_size)
        
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), self.hidden_size[0]).to(device)
        c0 = torch.zeros(1, x.size(0), self.hidden_size[0]).to(device)
        
        out, _ = self.lstm1(x, (h0, c0))
        out, _ = self.lstm2(out)
        out, _ = self.lstm3(out)
        out = self.linear(out[:, -1, :])
        
        return out

## Model Traning on 1 stock to check correctness

In [5]:
# Load one stocks dataset into a dataframe
df = pd.read_csv('./FinalDatasets/BOM500010Stock.csv')
df.set_index('Date', inplace=True)
df.sort_index(inplace=True)

In [6]:
# train-test split for time series
train_size = int(len(df) * 0.80)
test_size = len(df) - train_size
train, test = df[:train_size], df[train_size:]

In [7]:
# Convert DF to Tensors to be used as LSTM Input
lookback = 30
batch_size =32
X_train, y_train = create_dataset(train, lookback=lookback)
X_test, y_test = create_dataset(test, lookback=lookback)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# Create training and testing datasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Create data loaders for batching the datasets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

(2566, 30, 26)
(2566,)
(619, 30, 26)
(619,)
torch.Size([2566, 30, 26]) torch.Size([2566])
torch.Size([619, 30, 26]) torch.Size([619])


In [8]:
from torch.types import Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model hyperparameters
input_size = 26
hidden_size = [32, 16, 10]
output_size = 1
dropout_prob = 0.2

# Initialize the model
model = LSTMModel(input_size, hidden_size, output_size).to(device)



In [9]:
# setting up hyperparameters
learning_rate = 0.001

# Initialize the model and the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))

# Define the loss function
criterion = nn.MSELoss()

In [10]:
num_epochs=20
# Train the LSTM model
for epoch in range(num_epochs):
    for i, (sequences, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(sequences.float()) # cast input tensor to Float
        loss = criterion(outputs, labels.float().unsqueeze(1)) # cast label tensor to Float
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    # Print the loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [10/20], Loss: 0.6062
Epoch [20/20], Loss: 1.1472


In [11]:
correct = 0
total = 0
with torch.no_grad():
    for sequences, labels in test_loader:
        outputs = model(sequences)
        predicted = torch.round(outputs).flatten().int()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy on test set: {100 * correct / total:.2f}%")


Accuracy on test set: 26.33%


## Traning pipeline for all 31 stocks

In [12]:
traningAccuracies = dict()

In [13]:
def trainAndTestModelOnStock(stock):
  fileName = stock + 'Stock.csv'
  df = pd.read_csv('./FinalDatasets/' + fileName)
  df.set_index('Date', inplace=True)
  df.sort_index(inplace=True)
  
  # train-test split for time series
  train_size = int(len(df) * 0.80)
  test_size = len(df) - train_size
  train, test = df[:train_size], df[train_size:]

  # Convert dataframe to 3-D tensors
  lookback = 30
  batch_size =32
  X_train, y_train = create_dataset(train, lookback=lookback)
  X_test, y_test = create_dataset(test, lookback=lookback)
  
  # Create training and testing datasets
  train_dataset = TensorDataset(X_train, y_train)
  test_dataset = TensorDataset(X_test, y_test)

  # Create data loaders for batching the datasets
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  from torch.types import Device
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  # Define the model hyperparameters
  input_size = 26
  hidden_size = [32, 16, 10]
  output_size = 1
  dropout_prob = 0.2
  learning_rate = 0.001

  # Initialize the model
  model = LSTMModel(input_size, hidden_size, output_size).to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))
  criterion = nn.MSELoss()

  # Train the LSTM model
  num_epochs=20
  for epoch in range(num_epochs):
      for i, (sequences, labels) in enumerate(train_loader):
          optimizer.zero_grad()
          outputs = model(sequences.float()) # cast input tensor to Float
          loss = criterion(outputs, labels.float().unsqueeze(1)) # cast label tensor to Float
          loss.backward()
          nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
          optimizer.step()

      # Print the loss every 10 epochs
      if (epoch + 1) % 10 == 0:
          print(f"{stock} Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
  
  # Test the Model and save accuracy
  correct = 0
  total = 0
  with torch.no_grad():
      for sequences, labels in test_loader:
          outputs = model(sequences)
          predicted = torch.round(outputs).flatten().int()
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

  print(f"{stock} Accuracy on test set: {100 * correct / total:.2f}%")
  traningAccuracies[stock] = correct/total

In [14]:
for stock in STOCKS:
  trainAndTestModelOnStock(stock)

(2922, 30, 26)
(2922,)
(708, 30, 26)
(708,)




BOM500875 Epoch [10/20], Loss: 0.7194
BOM500875 Epoch [20/20], Loss: 0.2344
BOM500875 Accuracy on test set: 59.18%
(2134, 30, 26)
(2134,)
(512, 30, 26)
(512,)




BOM532939 Epoch [10/20], Loss: 0.3047
BOM532939 Epoch [20/20], Loss: 0.6245
BOM532939 Accuracy on test set: 49.02%
(2922, 30, 26)
(2922,)
(708, 30, 26)
(708,)




BOM524715 Epoch [10/20], Loss: 0.4392
BOM524715 Epoch [20/20], Loss: 0.6087
BOM524715 Accuracy on test set: 50.99%
(2926, 30, 26)
(2926,)
(709, 30, 26)
(709,)




BOM532215 Epoch [10/20], Loss: 0.3712
BOM532215 Epoch [20/20], Loss: 0.4027
BOM532215 Accuracy on test set: 79.27%
(2650, 30, 26)
(2650,)
(641, 30, 26)
(641,)




BOM532648 Epoch [10/20], Loss: 0.3648
BOM532648 Epoch [20/20], Loss: 0.1556
BOM532648 Accuracy on test set: 50.23%
(2925, 30, 26)
(2925,)
(709, 30, 26)
(709,)




BOM532500 Epoch [10/20], Loss: 0.4414
BOM532500 Epoch [20/20], Loss: 0.6374
BOM532500 Accuracy on test set: 36.25%
(2922, 30, 26)
(2922,)
(708, 30, 26)
(708,)




BOM500470 Epoch [10/20], Loss: 0.0648
BOM500470 Epoch [20/20], Loss: 0.2876
BOM500470 Accuracy on test set: 46.47%
(2922, 30, 26)
(2922,)
(708, 30, 26)
(708,)




BOM500570 Epoch [10/20], Loss: 0.2179
BOM500570 Epoch [20/20], Loss: 0.3555
BOM500570 Accuracy on test set: 63.70%
(2922, 30, 26)
(2922,)
(709, 30, 26)
(709,)




BOM500112 Epoch [10/20], Loss: 0.5293
BOM500112 Epoch [20/20], Loss: 0.6293
BOM500112 Accuracy on test set: 44.01%
(2828, 30, 26)
(2828,)
(685, 30, 26)
(685,)




BOM532540 Epoch [10/20], Loss: 0.3378
BOM532540 Epoch [20/20], Loss: 0.3044
BOM532540 Accuracy on test set: 62.19%
(2925, 30, 26)
(2925,)
(709, 30, 26)
(709,)




BOM500209 Epoch [10/20], Loss: 0.6569
BOM500209 Epoch [20/20], Loss: 0.3401
BOM500209 Accuracy on test set: 45.56%
(2922, 30, 26)
(2922,)
(709, 30, 26)
(709,)




BOM500295 Epoch [10/20], Loss: 0.3217
BOM500295 Epoch [20/20], Loss: 0.0595
BOM500295 Accuracy on test set: 42.45%
(2926, 30, 26)
(2926,)
(709, 30, 26)
(709,)




BOM532174 Epoch [10/20], Loss: 0.0560
BOM532174 Epoch [20/20], Loss: 0.4904
BOM532174 Accuracy on test set: 52.61%
(2925, 30, 26)
(2925,)
(709, 30, 26)
(709,)




BOM500180 Epoch [10/20], Loss: 0.2432
BOM500180 Epoch [20/20], Loss: 0.5783
BOM500180 Accuracy on test set: 23.55%
(2926, 30, 26)
(2926,)
(709, 30, 26)
(709,)




BOM500696 Epoch [10/20], Loss: 0.4322
BOM500696 Epoch [20/20], Loss: 0.4377
BOM500696 Accuracy on test set: 51.34%
(2906, 30, 26)
(2906,)
(705, 30, 26)
(705,)




BOM500510 Epoch [10/20], Loss: 0.5864
BOM500510 Epoch [20/20], Loss: 0.2707
BOM500510 Accuracy on test set: 55.89%
(2926, 30, 26)
(2926,)
(709, 30, 26)
(709,)




BOM500247 Epoch [10/20], Loss: 0.3994
BOM500247 Epoch [20/20], Loss: 0.6267
BOM500247 Accuracy on test set: 60.51%
(2926, 30, 26)
(2926,)
(710, 30, 26)
(710,)




BOM500520 Epoch [10/20], Loss: 0.6483
BOM500520 Epoch [20/20], Loss: 0.1048
BOM500520 Accuracy on test set: 54.65%
(2175, 30, 26)
(2175,)
(522, 30, 26)
(522,)




BOM532921 Epoch [10/20], Loss: 0.4138
BOM532921 Epoch [20/20], Loss: 0.7351
BOM532921 Accuracy on test set: 41.00%
(2079, 30, 26)
(2079,)
(498, 30, 26)
(498,)




BOM532977 Epoch [10/20], Loss: 0.6895
BOM532977 Epoch [20/20], Loss: 0.5558
BOM532977 Accuracy on test set: 68.07%
(2925, 30, 26)
(2925,)
(709, 30, 26)
(709,)




BOM500182 Epoch [10/20], Loss: 0.5804
BOM500182 Epoch [20/20], Loss: 0.0766
BOM500182 Accuracy on test set: 39.21%
(2926, 30, 26)
(2926,)
(709, 30, 26)
(709,)




BOM507685 Epoch [10/20], Loss: 0.7717
BOM507685 Epoch [20/20], Loss: 0.2991
BOM507685 Accuracy on test set: 33.29%
(2566, 30, 26)
(2566,)
(619, 30, 26)
(619,)




BOM500010 Epoch [10/20], Loss: 0.9698
BOM500010 Epoch [20/20], Loss: 1.2235
BOM500010 Accuracy on test set: 28.59%
(2922, 30, 26)
(2922,)
(708, 30, 26)
(708,)




BOM532454 Epoch [10/20], Loss: 0.0859
BOM532454 Epoch [20/20], Loss: 0.0489
BOM532454 Accuracy on test set: 52.12%
(2926, 30, 26)
(2926,)
(709, 30, 26)
(709,)




BOM500820 Epoch [10/20], Loss: 0.0293
BOM500820 Epoch [20/20], Loss: 0.2545
BOM500820 Accuracy on test set: 62.48%
(2926, 30, 26)
(2926,)
(710, 30, 26)
(710,)




BOM500312 Epoch [10/20], Loss: 0.1916
BOM500312 Epoch [20/20], Loss: 0.1708
BOM500312 Accuracy on test set: 50.85%
(2205, 30, 26)
(2205,)
(529, 30, 26)
(529,)




BOM532898 Epoch [10/20], Loss: 0.4311
BOM532898 Epoch [20/20], Loss: 0.6547
BOM532898 Accuracy on test set: 29.87%
(2926, 30, 26)
(2926,)
(710, 30, 26)
(710,)




BOM532187 Epoch [10/20], Loss: 0.5828
BOM532187 Epoch [20/20], Loss: 0.4495
BOM532187 Accuracy on test set: 26.34%
(1595, 30, 26)
(1595,)
(377, 30, 26)
(377,)




BOM533278 Epoch [10/20], Loss: 0.2533
BOM533278 Epoch [20/20], Loss: 0.2514
BOM533278 Accuracy on test set: 31.83%
(2789, 30, 26)
(2789,)
(675, 30, 26)
(675,)




BOM532555 Epoch [10/20], Loss: 0.1303
BOM532555 Epoch [20/20], Loss: 0.2972
BOM532555 Accuracy on test set: 59.70%
(1910, 30, 26)
(1910,)
(456, 30, 26)
(456,)




BOM570001 Epoch [10/20], Loss: 0.1596
BOM570001 Epoch [20/20], Loss: 0.3176
BOM570001 Accuracy on test set: 35.31%


In [15]:
print('Classification Accuracies for each of 31 stock on Testing set is as follows: ')

print ("{:<10} {:<5}".format('Stock','Accuracy'))
for k, v in traningAccuracies.items():
        accuracy = v
        print ("{:<8} {:<15}".format(k, accuracy))

Classification Accuracies for each of 31 stock on Testing set is as follows: 
Stock      Accuracy
BOM500875 0.5918079096045198
BOM532939 0.490234375    
BOM524715 0.5098870056497176
BOM532215 0.7926657263751763
BOM532648 0.5023400936037441
BOM532500 0.3624823695345557
BOM500470 0.4646892655367232
BOM500570 0.6370056497175142
BOM500112 0.4400564174894217
BOM532540 0.621897810218978
BOM500209 0.45557122708039494
BOM500295 0.4245416078984485
BOM532174 0.5260930888575458
BOM500180 0.23554301833568406
BOM500696 0.5133991537376587
BOM500510 0.5588652482269504
BOM500247 0.6050775740479548
BOM500520 0.5464788732394367
BOM532921 0.4099616858237548
BOM532977 0.6807228915662651
BOM500182 0.3921015514809591
BOM507685 0.3328631875881523
BOM500010 0.2859450726978998
BOM532454 0.5211864406779662
BOM500820 0.6248236953455572
BOM500312 0.5084507042253521
BOM532898 0.29867674858223064
BOM532187 0.2633802816901408
BOM533278 0.3183023872679045
BOM532555 0.597037037037037
BOM570001 0.3530701754385965


## Portfolio Optimization

In [None]:
# TO BE DONE

In [None]:
!pip freeze

absl-py==1.4.0
alabaster==0.7.13
albumentations==1.2.1
altair==4.2.2
anyio==3.6.2
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
array-record==0.2.0
arviz==0.15.1
astropy==5.2.2
astunparse==1.6.3
attrs==23.1.0
audioread==3.0.0
autograd==1.5
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.11.2
bleach==6.0.0
blis==0.7.9
blosc2==2.0.0
bokeh==2.4.3
branca==0.6.0
build==0.10.0
CacheControl==0.12.11
cached-property==1.5.2
cachetools==5.3.0
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
chardet==4.0.0
charset-normalizer==2.0.12
chex==0.1.7
click==8.1.3
cloudpickle==2.2.1
cmake==3.25.2
cmdstanpy==1.1.0
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.0.4
cons==0.4.5
contextlib2==0.6.0.post1
contourpy==1.0.7
convertdate==2.4.0
cryptography==40.0.2
cufflinks==0.17.3
cvxopt==1.3.0
cvxpy==1.3.1
cycler==0.11.0
cymem==2.0.7
Cython==0.29.34
dask==2022.12.1
datascience==0.17.6
db-dtypes==1.1.1
dbus-python==1.2.16
debugpy==1.6.6
decorator==4.4.2
defusedxml==0.7.1
di