<a href="https://colab.research.google.com/github/ayush-96/msc-data-science/blob/master/deep_learning/3043532a_stock_market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# !pip3 install optuna

In [7]:
# !pip3 install captum

In [8]:
%matplotlib inline

# Common imports
import os
from dateutil.parser import parse

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import optuna
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from captum.attr import IntegratedGradients

use_cuda = False
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")

plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120})

In [9]:
# Load Data - train and submission files
train = pd.read_csv("./train.csv")
submission = pd.read_csv("./sample_submission.csv")

# Drop ID column and transpose for time series format
train_data = train.drop(columns=['ID']).T
train_data.columns = [f"company_{i}" for i in range(442)]
dates = train_data.index
dates = pd.to_datetime(train_data.index, dayfirst=True)
train_data.index = dates

# Convert to numpy array
data = train_data.values.astype(float)

# Normalize data
scaler = StandardScaler()
data = scaler.fit_transform(data)

# train.head()
train_data.tail()

Unnamed: 0,company_0,company_1,company_2,company_3,company_4,company_5,company_6,company_7,company_8,company_9,...,company_432,company_433,company_434,company_435,company_436,company_437,company_438,company_439,company_440,company_441
2022-03-25,0.86,0.93,0.1,2.02,-1.53,2.08,1.54,0.17,1.96,-0.05,...,0.66,-1.4,2.2,0.26,0.49,0.18,-0.25,1.93,0.37,0.94
2022-03-28,0.31,1.55,0.9,2.74,-0.64,-0.13,-3.39,-0.21,-2.97,-0.42,...,0.23,1.97,0.18,0.56,-0.24,-0.05,0.45,-0.5,1.18,0.65
2022-03-29,0.95,3.96,3.05,2.33,2.32,4.38,0.1,1.72,-1.16,2.91,...,2.51,2.72,-0.59,-0.3,1.88,3.77,2.04,2.67,3.02,2.36
2022-03-30,0.68,0.41,-1.24,0.8,-1.64,-2.53,0.08,-1.17,1.96,0.57,...,-1.35,-2.95,0.84,-1.61,-1.58,-1.16,-3.71,-2.61,0.53,0.91
2022-03-31,-0.52,1.08,-1.26,-0.91,-4.58,-5.44,-2.42,-1.24,-1.55,-1.78,...,-2.35,-1.23,-2.79,-3.63,-1.9,-1.85,-2.04,-3.83,-1.73,-0.49


In [10]:
# Convert to PyTorch tensors
def create_sequences(data, seq_length=30):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

seq_length = 30
X, y = create_sequences(data, seq_length)

# Split into train and validation (last 30 days as validation)
X_train, y_train = X[:-30], y[:-30]
X_val, y_val = X[-30:], y[-30:]

print(f"Number of training sequences: {len(X_train)}, Validation sequences: {len(X_val)}")

Number of training sequences: 2961, Validation sequences: 30


  return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [11]:
# Create DataLoader
batch_size = 32

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)

val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

# Define LTSM Model

In [12]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Use last timestep output
        return out

# Optuna Hyperparameter Optimization


# Hyperparameter Optimization with Optuna


In [None]:
def objective(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 16, 128)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)

    model = LSTMModel(input_dim=442, hidden_dim=hidden_dim, num_layers=num_layers, output_dim=442).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(20):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

    # Validation loss
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            val_loss += criterion(y_pred, y_batch).item()
    return val_loss / len(val_loader)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=25)
best_params = study.best_params

[I 2025-03-26 12:00:13,057] A new study created in memory with name: no-name-69227be7-0f33-480f-afda-9bc309c9ee00
  lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
[I 2025-03-26 12:01:07,074] Trial 0 finished with value: 1.65523362159729 and parameters: {'hidden_dim': 73, 'num_layers': 1, 'lr': 0.0006103288895257674}. Best is trial 0 with value: 1.65523362159729.
  lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
[I 2025-03-26 12:02:09,858] Trial 1 finished with value: 2.3564414978027344 and parameters: {'hidden_dim': 52, 'num_layers': 3, 'lr': 0.0003420822327366322}. Best is trial 0 with value: 1.65523362159729.
[I 2025-03-26 12:03:47,005] Trial 2 finished with value: 1.7853686809539795 and parameters: {'hidden_dim': 83, 'num_layers': 3, 'lr': 0.0008080417253035436}. Best is trial 0 with value: 1.65523362159729.
[I 2025-03-26 12:04:15,468] Trial 3 finished with value: 2.0576539039611816 and parameters: {'hidden_dim': 21, 'num_layers': 2, 'lr': 0.0003386726989914762}. Best is trial 0 wi

# Train Model with best hyperparameters:

In [None]:
model = LSTMModel(input_dim=442, hidden_dim=best_params['hidden_dim'], num_layers=best_params['num_layers'], output_dim=442).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=best_params['lr'])

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

# Prediction for 1st April, 2022

In [None]:
X_input = torch.tensor(data[-seq_length:], dtype=torch.float32).unsqueeze(0).to(device)
model.eval()
with torch.no_grad():
    pred = model(X_input).cpu().numpy()

In [None]:
print("Prediction shape:", pred.shape)
print("Submission target shape:", submission.iloc[:, 1:].shape)

# Ensure pred is reshaped correctly
pred = pred.reshape(-1, 1)  # Convert (1, 442) to (442, 1)

# Assign correctly
submission.iloc[:, 1:] = pred
submission.to_csv("submission.csv", index=False)

print("Submission file saved successfully!")

# Model Interpretation with Captum

In [None]:
# Captum for model interpretation
ig = IntegratedGradients(model)
X_input.requires_grad_()
attr = ig.attribute(X_input, target=0)
attr = attr.squeeze().detach().cpu().numpy()

# Plot feature importance
plt.figure(figsize=(10, 5))
plt.imshow(attr, cmap="viridis", aspect="auto")
plt.colorbar(label="Attribution Score")
plt.title("Feature Importance Analysis with Captum")
plt.xlabel("Company Index")
plt.ylabel("Time Steps")
plt.show()