In [1]:
!pip install optuna



# Imports, Data Loading & Processing

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import optuna

# Read the raw inputs: the price table (its first CSV column becomes the index)
# and the submission template.
train_df = pd.read_csv("train.csv", index_col=0)
sample_submission = pd.read_csv("sample_submission.csv")

# Transpose so that the former column labels become the row index, name that
# index "Date", and expose it as a regular column.
train_df = train_df.T.rename_axis("Date").reset_index()

# Parse the day/month/year strings into real datetime values
train_df["Date"] = pd.to_datetime(train_df["Date"], format="%d/%m/%Y")

# Adding Moving Average Features

Compute moving averages and add them as new features. Moving averages provide smoother trends, since raw stock prices can be noisy.

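Create `ma7` for a 7-day (weekly) window, `ma14` for a 14-day window, and likewise `ma30` and `ma60`.
Each window captures trends at a different timescale. NaN values in the first rows of the dataset (where a full window is not yet available) are replaced with the raw value.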

Finally, the dataset is standardized so that training converges faster.

In [3]:
def add_moving_averages(df, windows=(7, 14, 30, 60)):
    """Append rolling-mean features for every numeric column of ``df``.

    Args:
        df: DataFrame with a ``'Date'`` column plus one numeric column per
            series. It is not modified.
        windows: Iterable of rolling-window lengths (in rows). For each
            window ``w`` and each column ``col`` a feature ``"{col}_ma{w}"``
            is added. Defaults to the 7/14/30/60-row windows.

    Returns:
        A new DataFrame laid out as ``Date``, the original numeric columns,
        then one group of moving-average columns per window (in the order
        given by ``windows``). Wherever a rolling mean is NaN (e.g. the first
        ``w - 1`` rows) the raw value of the same row is used instead.
    """
    date_col = df['Date'].copy()
    df_numeric = df.drop('Date', axis=1)

    frames = [df_numeric]
    for window in windows:
        ma = df_numeric.rolling(window=window).mean()
        # Fall back to the raw value where no rolling mean exists yet; done
        # before renaming so both frames still share column labels.
        ma = ma.where(ma.notna(), df_numeric)
        ma.columns = [f"{col}_ma{window}" for col in df_numeric.columns]
        frames.append(ma)

    # A single concat avoids growing the frame column by column
    result = pd.concat(frames, axis=1)
    result.insert(0, 'Date', date_col)
    return result

# Build the feature table: raw columns plus their moving averages
enhanced_df = add_moving_averages(train_df)
print(f"Original DataFrame shape: {train_df.shape}")
print(f"Enhanced DataFrame shape: {enhanced_df.shape}")

# Move the dates into the index so that only numeric columns reach the scaler
enhanced_df = enhanced_df.set_index('Date')

# Standardize every column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, including the rows that
# later land in the validation split — confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(enhanced_df)
df_scaled = pd.DataFrame(
    scaled_values,
    index=enhanced_df.index,
    columns=enhanced_df.columns,
)

print(f"Scaled DataFrame shape: {df_scaled.shape}")
print(f"Number of features: {df_scaled.shape[1]}")

Original DataFrame shape: (3021, 443)
Enhanced DataFrame shape: (3021, 2211)
Scaled DataFrame shape: (3021, 2210)
Number of features: 2210


# Time-Series Sequences for Model Input

Transforming the scaled dataset into fixed-length sequences using a 30-day window.

The sequences are then split in order (first 80% / last 20%) into training and validation sets for model learning.

In [4]:
# Number of consecutive rows fed to the model for each prediction
window_size = 30


def create_sequences(data, window_size=30):
    """Slice ``data`` into overlapping windows and their next-row targets.

    Args:
        data: Array-like whose first axis is time (rows in chronological
            order); converted with ``np.asarray``.
        window_size: Number of consecutive rows per input window (>= 1).

    Returns:
        ``(X, y)`` where ``X[i]`` is ``data[i:i + window_size]`` and ``y[i]``
        is ``data[i + window_size]``. ``X`` has shape
        ``(n, window_size, *data.shape[1:])`` and ``y`` has shape
        ``(n, *data.shape[1:])`` with ``n = max(len(data) - window_size, 0)``,
        so the trailing axes are kept even when no window fits.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    data = np.asarray(data)
    n_sequences = max(len(data) - window_size, 0)

    # Row i of `window_idx` holds the positions i, i+1, ..., i+window_size-1,
    # so one fancy-indexing step materialises every window at once.
    window_idx = np.arange(n_sequences)[:, None] + np.arange(window_size)[None, :]
    X = data[window_idx]
    # The target of window i is the row right after it; copy so that `y` does
    # not alias the caller's array.
    y = data[window_size:].copy()
    return X, y

# Turn the scaled feature matrix into (window, next-row) pairs
X, y = create_sequences(df_scaled.values, window_size)

# Split in order (no shuffling): the first 80% of the sequences are used for
# training, the remaining 20% for validation.
split_index = int(0.8 * len(X))

X_train = torch.tensor(X[:split_index], dtype=torch.float32)
y_train = torch.tensor(y[:split_index], dtype=torch.float32)
X_val = torch.tensor(X[split_index:], dtype=torch.float32)
y_val = torch.tensor(y[split_index:], dtype=torch.float32)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")

X_train shape: torch.Size([2392, 30, 2210]), y_train shape: torch.Size([2392, 2210])
X_val shape: torch.Size([599, 30, 2210]), y_val shape: torch.Size([599, 2210])


# Custom Dataset and DataLoaders
Defining a custom PyTorch dataset class StockDataset to handle input features and targets.

The training and validation data are then wrapped in `DataLoader` objects for efficient mini-batching during training.

In [5]:
# Defining a custom dataset class for stock prediction
class StockDataset(Dataset):
    """Pairs each feature window with its target row.

    Args:
        features: Indexable collection of input windows.
        targets: Indexable collection of targets, one per entry of
            ``features``.

    Raises:
        ValueError: If ``features`` and ``targets`` differ in length.
    """

    def __init__(self, features, targets):
        # Fail early: a length mismatch would otherwise surface only as an
        # IndexError mid-epoch, or silently ignore the surplus targets.
        if len(features) != len(targets):
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {len(features)} and {len(targets)}"
            )
        self.features = features
        self.targets = targets

    def __len__(self):
        """Return the number of (features, target) pairs."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        return self.features[idx], self.targets[idx]


# Wrap the training / validation tensors in Dataset objects
train_dataset = StockDataset(X_train, y_train)
val_dataset = StockDataset(X_val, y_val)

batch_size = 32
# Only the training loader shuffles; validation batches keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Sanity check: pull one training batch (if any) and show its shapes
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    features, targets = first_batch
    print(f"Batch features shape: {features.shape}, targets shape: {targets.shape}")

Batch features shape: torch.Size([32, 30, 2210]), targets shape: torch.Size([32, 2210])


# Defining the LSTM Model

Defining an `LSTMModel` in PyTorch with two stacked LSTM layers (128 and 64 hidden units by default), each followed by dropout (0.2 by default) for regularization.

The model processes 30-day input sequences and predicts the next day's stock values using a fully connected layer.



In [6]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """Two stacked LSTM layers with dropout, plus a linear head on the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim1: Hidden size of the first LSTM layer.
        hidden_dim2: Hidden size of the second LSTM layer.
        dropout: Dropout probability applied to the output of each LSTM layer.
        output_dim: Width of the prediction vector. Defaults to ``input_dim``:
            the sequences in this notebook use the next row of the same
            feature matrix as the target, so target and input widths match.
            (Previously this default was read from the global ``y_train``.)
    """

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, dropout=0.2, output_dim=None):
        super().__init__()

        # First LSTM layer: input features -> hidden states of size hidden_dim1
        self.lstm1 = nn.LSTM(input_dim, hidden_dim1, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)

        # Second LSTM layer: consumes the (dropped-out) output of the first one
        self.lstm2 = nn.LSTM(hidden_dim1, hidden_dim2, batch_first=True)
        self.dropout2 = nn.Dropout(dropout)

        if output_dim is None:
            # Self-contained default: predict one value per input feature
            output_dim = input_dim

        # Fully connected layer mapping the last hidden state to the prediction
        self.fc = nn.Linear(hidden_dim2, output_dim)

    def forward(self, x):
        """Map a batch of sequences ``(batch, seq_len, input_dim)`` to ``(batch, output_dim)``."""
        # First LSTM layer followed by dropout
        out, _ = self.lstm1(x)
        x = self.dropout1(out)

        # Second LSTM layer followed by dropout
        out, _ = self.lstm2(x)
        x = self.dropout2(out)

        # Only the last time step feeds the fully connected layer
        x = self.fc(x[:, -1, :])
        return x

#

# Hyperparameter Optimization with Optuna

Defining an Optuna objective function to optimize hyperparameters of an LSTMModel by minimizing validation loss.

Optuna tests different combinations of hidden layer sizes, dropout rate, and learning rate across 30 trials.

For each trial, it builds the model, trains it for 5 epochs using `MSELoss`, and evaluates performance on the validation set.

In [7]:
# Define Optuna objective function
def objective(trial):
    """Optuna objective: train an ``LSTMModel`` briefly and return its validation MSE.

    Hidden sizes, dropout and learning rate are sampled from ``trial``. The
    model is trained for 5 epochs on the notebook-level ``train_loader`` and
    evaluated on ``val_loader``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean validation loss per sample (each batch's mean loss is weighted by
        its size, so a smaller final batch is not over-weighted).

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Hyperparameters to tune
    hidden_dim1 = trial.suggest_int('hidden_dim1', 64, 256)  # units in the first LSTM layer
    hidden_dim2 = trial.suggest_int('hidden_dim2', 32, 128)  # units in the second LSTM layer
    dropout = trial.suggest_float('dropout', 0.1, 0.5)  # dropout rate after each LSTM layer
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    # Build the model with the sampled hyperparameters
    model = LSTMModel(
        input_dim=X_train.shape[2],   # number of input features
        hidden_dim1=hidden_dim1,
        hidden_dim2=hidden_dim2,
        dropout=dropout,
        output_dim=y_train.shape[1],  # one output per target column
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Short training run: few epochs keep the hyperparameter search cheap
    model.train()
    for _ in range(5):
        for features, targets in train_loader:
            features, targets = features.to(device), targets.to(device)

            outputs = model(features)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()  # clear gradients from the previous step
            loss.backward()
            optimizer.step()

    # Evaluate on the validation set
    model.eval()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for features, targets in val_loader:
            features, targets = features.to(device), targets.to(device)
            outputs = model(features)
            # MSELoss returns the batch mean; weight it by the batch size so
            # the result is a true per-sample mean over the validation set.
            batch_len = features.size(0)
            total_loss += criterion(outputs, targets).item() * batch_len
            n_samples += batch_len

    if n_samples == 0:
        raise ValueError("val_loader yielded no samples; cannot compute a validation loss")
    return total_loss / n_samples

# Run the Optuna study to find the best hyperparameters
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
print("Best hyperparameters:", study.best_params)

[I 2025-03-27 15:59:00,545] A new study created in memory with name: no-name-2c7575ab-b9ad-4a04-bd15-e56f3af9a12a
[I 2025-03-27 15:59:22,570] Trial 0 finished with value: 1.3677880685580404 and parameters: {'hidden_dim1': 206, 'hidden_dim2': 122, 'dropout': 0.45085887540821323, 'learning_rate': 2.5511519221885295e-05}. Best is trial 0 with value: 1.3677880685580404.
[I 2025-03-27 15:59:29,405] Trial 1 finished with value: 1.3842658369164718 and parameters: {'hidden_dim1': 171, 'hidden_dim2': 69, 'dropout': 0.31170321380093285, 'learning_rate': 2.9810593752292393e-05}. Best is trial 0 with value: 1.3677880685580404.
[I 2025-03-27 15:59:34,913] Trial 2 finished with value: 1.3014904621400332 and parameters: {'hidden_dim1': 144, 'hidden_dim2': 36, 'dropout': 0.32648710860554675, 'learning_rate': 0.0005639394048912729}. Best is trial 2 with value: 1.3014904621400332.
[I 2025-03-27 15:59:41,735] Trial 3 finished with value: 1.3257429615447396 and parameters: {'hidden_dim1': 180, 'hidden_dim

Best hyperparameters: {'hidden_dim1': 211, 'hidden_dim2': 85, 'dropout': 0.168478052365363, 'learning_rate': 0.0006009227007955891}


# LSTM Model with Best Hyperparameters

Training the final LSTMModel using the best hyperparameters found by Optuna.

Running for 30 epochs, optimizing with the Adam optimizer and evaluating performance on the validation set after each epoch to monitor training progress.

In [8]:
# Train the final model with the best hyperparameters found by Optuna
best_params = study.best_params
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
final_model = LSTMModel(
    input_dim=X_train.shape[2],
    hidden_dim1=best_params['hidden_dim1'],  # units in the first LSTM layer
    hidden_dim2=best_params['hidden_dim2'],  # units in the second LSTM layer
    dropout=best_params['dropout'],
    output_dim=y_train.shape[1],             # one output per target column
).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params['learning_rate'])

num_epochs = 30

# Track the epoch with the lowest validation loss so that its weights (rather
# than whatever the last epoch produced) are the ones kept in `final_model`.
best_val_loss = float("inf")
best_epoch = 0
best_state = None

for epoch in range(num_epochs):
    # ---- training pass ----
    final_model.train()
    train_loss = 0.0
    train_samples = 0
    for features, targets in train_loader:
        features, targets = features.to(device), targets.to(device)

        outputs = final_model(features)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so the epoch figure is a true
        # per-sample mean even when the last batch is smaller.
        batch_len = features.size(0)
        train_loss += loss.item() * batch_len
        train_samples += batch_len

    train_loss /= train_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}")

    # ---- validation pass ----
    final_model.eval()
    val_loss = 0.0
    val_samples = 0
    with torch.no_grad():
        for features, targets in val_loader:
            features, targets = features.to(device), targets.to(device)
            outputs = final_model(features)
            batch_len = features.size(0)
            val_loss += criterion(outputs, targets).item() * batch_len
            val_samples += batch_len

    val_loss /= val_samples
    print(f"Validation Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch + 1
        # Clone the tensors: state_dict() returns references that later
        # optimizer steps would otherwise overwrite.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in final_model.state_dict().items()
        }

# Keep the weights from the best validation epoch
if best_state is not None:
    final_model.load_state_dict(best_state)
    print(f"Restored weights from epoch {best_epoch} (validation loss {best_val_loss:.4f})")

Epoch 1/30, Loss: 0.6530
Validation Loss: 1.3436
Epoch 2/30, Loss: 0.5889
Validation Loss: 1.2928
Epoch 3/30, Loss: 0.5648
Validation Loss: 1.2953
Epoch 4/30, Loss: 0.5500
Validation Loss: 1.2859
Epoch 5/30, Loss: 0.5323
Validation Loss: 1.2720
Epoch 6/30, Loss: 0.5183
Validation Loss: 1.2600
Epoch 7/30, Loss: 0.5026
Validation Loss: 1.2884
Epoch 8/30, Loss: 0.4894
Validation Loss: 1.3120
Epoch 9/30, Loss: 0.4747
Validation Loss: 1.3066
Epoch 10/30, Loss: 0.4628
Validation Loss: 1.2878
Epoch 11/30, Loss: 0.4544
Validation Loss: 1.2824
Epoch 12/30, Loss: 0.4437
Validation Loss: 1.2963
Epoch 13/30, Loss: 0.4336
Validation Loss: 1.2917
Epoch 14/30, Loss: 0.4257
Validation Loss: 1.2746
Epoch 15/30, Loss: 0.4187
Validation Loss: 1.3208
Epoch 16/30, Loss: 0.4097
Validation Loss: 1.3064
Epoch 17/30, Loss: 0.4016
Validation Loss: 1.3136
Epoch 18/30, Loss: 0.3946
Validation Loss: 1.2812
Epoch 19/30, Loss: 0.3907
Validation Loss: 1.3376
Epoch 20/30, Loss: 0.3873
Validation Loss: 1.2972
Epoch 21/

# Final Predictions

Using the trained LSTM model to make a prediction for the next day based on the most recent 30 days of data.

The predicted values are inverse-transformed back to their original scale.

In [9]:
# Number of original (non-engineered) series: train_df holds one 'Date'
# column plus one column per company.
n_companies = train_df.shape[1] - 1
n_features = df_scaled.shape[1]

final_model.eval()
with torch.no_grad():
    # Most recent `window_size` rows of the full scaled dataset
    last_sequence = df_scaled.values[-window_size:]
    last_sequence = torch.tensor(last_sequence).unsqueeze(0).float().to(device)  # add batch dimension

    # Predict the next row (in scaled units)
    prediction = final_model(last_sequence).cpu().numpy()[0]
    print(f"Raw prediction shape: {prediction.shape}")

# The model may predict every feature (raw columns + moving averages); the raw
# company columns come first in df_scaled, so keep only that leading slice.
if len(prediction) == n_features:
    prediction = prediction[:n_companies]
elif len(prediction) != n_companies:
    raise ValueError(
        f"Unexpected prediction shape: {prediction.shape}. "
        f"Expected either {n_companies} or {n_features}"
    )

# The scaler was fitted on all features, so embed the company predictions in a
# full-width row before inverting; the padding columns are discarded afterwards.
dummy = np.zeros((1, n_features))
dummy[0, :n_companies] = prediction
original_scale_data = scaler.inverse_transform(dummy)
prediction_original_scale = original_scale_data[0, :n_companies]

# Write the submission file
# NOTE(review): assumes the rows of sample_submission are in the same order as
# the company columns of train_df — confirm against the data.
if len(sample_submission) != n_companies:
    raise ValueError(
        f"sample_submission has {len(sample_submission)} rows but "
        f"{n_companies} predictions were produced"
    )
sample_submission['value'] = prediction_original_scale
sample_submission.to_csv('submission.csv', index=False)

print("Prediction completed and saved to submission.csv")
print(sample_submission.head())

Raw prediction shape: (2210,)
Prediction completed and saved to submission.csv
          ID     value
0  company_0  0.370681
1  company_1  0.499594
2  company_2  0.766118
3  company_3  0.521492
4  company_4  0.277550
