In [None]:
# Mount Google Drive so files under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
from pathlib import Path
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
# Directory on the mounted Drive that every later cell reads data from and writes artifacts to.
data_path = Path("/content/drive/MyDrive/MLDS/IEMS-490/Assignment5/data")

In [None]:
# List the data directory to check which files are available.
os.listdir(data_path)

['test.csv',
 'train.csv',
 'val.csv',
 'xgboost_model.pkl',
 'test_pred.csv',
 'lstm']

In [None]:
# Read the train / validation / test splits from the data directory in one pass.
train_df, valid_df, test_df = (
    pd.read_csv(data_path / csv_name)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,...,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,y
0,1619.5,1620.0,1621.0,,,,,,,,...,20.0,27.0,11.0,14.0,35.0,10.0,1.0,10.0,13.0,-0.5
1,1619.5,1620.0,1621.0,1621.5,,,,,,,...,20.0,27.0,11.0,14.0,35.0,10.0,1.0,10.0,13.0,-0.5
2,1619.5,1620.0,1621.0,1621.5,1622.0,,,,,,...,20.0,27.0,11.0,14.0,35.0,10.0,1.0,10.0,13.0,-0.5
3,1619.5,1620.0,1621.0,1621.5,1622.0,,,,,,...,20.0,27.0,11.0,14.0,35.0,10.0,1.0,10.0,13.0,-0.5
4,1619.5,1620.0,1621.0,1621.5,1622.0,,,,,,...,20.0,27.0,11.0,14.0,35.0,10.0,1.0,10.0,13.0,-0.5


In [None]:
def extract_features(data):
    """Return a zero-filled copy of an order-book frame with engineered features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``askRate{i}``, ``bidRate{i}``, ``askSize{i}`` and
        ``bidSize{i}`` for ``i`` in ``0..14``. Any other column (for example
        ``y``) is passed through. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with NaN replaced by 0 and these columns appended, in
        order: ``bid_ask_spread``, ``mid_price``, ``total_ask_size``,
        ``total_bid_size``, ``order_imbalance``, ``ask_price_range``,
        ``bid_price_range``, ``bid_ask_spread_pct``, ``liquidity_imbalance``,
        ``relative_size_diff``, ``volume_weighted_mid_price``.

    Notes
    -----
    ``order_imbalance``, ``liquidity_imbalance`` and ``relative_size_diff``
    are the same quantity; all three names are kept so the output schema stays
    unchanged for downstream cells.
    """
    levels = range(15)
    ask_rate_cols = [f'askRate{i}' for i in levels]
    bid_rate_cols = [f'bidRate{i}' for i in levels]
    ask_size_cols = [f'askSize{i}' for i in levels]
    bid_size_cols = [f'bidSize{i}' for i in levels]

    # Guards the ratio features against division by zero.
    epsilon = 1e-10

    # Price ranges are measured on the raw frame: max/min skip NaN, so a book
    # level that is absent does not count as a price of 0 (which would turn
    # the "range" into the maximum price itself).
    ask_price_range = data[ask_rate_cols].max(axis=1) - data[ask_rate_cols].min(axis=1)
    bid_price_range = data[bid_rate_cols].max(axis=1) - data[bid_rate_cols].min(axis=1)

    # fillna returns a new frame, so the caller's data is left untouched.
    features = data.fillna(0)

    best_ask = features['askRate0']
    best_bid = features['bidRate0']
    best_ask_size = features['askSize0']
    best_bid_size = features['bidSize0']

    features['bid_ask_spread'] = best_ask - best_bid
    features['mid_price'] = (best_ask + best_bid) / 2

    features['total_ask_size'] = features[ask_size_cols].sum(axis=1)
    features['total_bid_size'] = features[bid_size_cols].sum(axis=1)

    # Signed share of resting size on the bid side, in [-1, 1].
    size_imbalance = (features['total_bid_size'] - features['total_ask_size']) / (
        features['total_bid_size'] + features['total_ask_size'] + epsilon)
    features['order_imbalance'] = size_imbalance

    # A side with no quotes at all has no range; report 0 for it.
    features['ask_price_range'] = ask_price_range.fillna(0)
    features['bid_price_range'] = bid_price_range.fillna(0)

    features['bid_ask_spread_pct'] = features['bid_ask_spread'] / (features['mid_price'] + epsilon)

    features['liquidity_imbalance'] = size_imbalance
    features['relative_size_diff'] = size_imbalance

    # Top-of-book prices weighted by the size quoted at each of them.
    features['volume_weighted_mid_price'] = (
        best_ask * best_ask_size + best_bid * best_bid_size
    ) / (best_ask_size + best_bid_size + epsilon)

    return features

In [None]:
# Run the same feature engineering over every split.
train_df_cleaned, valid_df_cleaned, test_df_cleaned = map(
    extract_features, (train_df, valid_df, test_df)
)

## XGBOOST

In [None]:
# Separate the regression target from the feature matrix used to fit XGBoost.
y_train = train_df_cleaned['y']
X_train = train_df_cleaned.drop(columns='y')

In [None]:
# XGBoost regressor settings (squared-error objective).
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 150,
    'learning_rate': 0.05,
    'max_depth': 6,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split.
model.fit(X_train, y_train)

In [None]:
# Split the validation frame into target and features.
y_val = valid_df_cleaned['y']
X_val = valid_df_cleaned.drop(columns='y')

# Score the fitted model on the validation features.
y_pred = model.predict(X_val)

# Report the validation mean squared error.
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse:.4f}')

Mean Squared Error: 0.3697


In [None]:
# Persist the fitted XGBoost model next to the data files.
import joblib
joblib.dump(model, data_path / "xgboost_model.pkl")

['/content/drive/MyDrive/MLDS/IEMS-490/Assignment5/data/xgboost_model.pkl']

In [None]:
# Predict on the engineered test features.
# NOTE(review): no column is dropped here, so this presumably relies on
# test.csv having no 'y' column - confirm against the file.
X_test = test_df_cleaned
y_pred_test = model.predict(X_test)

In [None]:
# Store the test predictions as a single "pred" column and write them to CSV
# without the row index.
y_pred_test_df = pd.Series(y_pred_test, name="pred").to_frame()
y_pred_test_df.to_csv(data_path / 'test_pred.csv', index=False)

In [None]:
# Preview the engineered validation frame, including the appended feature columns.
valid_df_cleaned.head()

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,...,mid_price,total_ask_size,total_bid_size,order_imbalance,ask_price_range,bid_price_range,bid_ask_spread_pct,liquidity_imbalance,relative_size_diff,volume_weighted_mid_price
0,1660.5,1661.0,1661.5,1662.0,1662.5,1663.0,1663.5,1664.0,1664.5,1665.0,...,1660.25,292.0,254.0,-0.069597,7.5,7.0,0.000301,-0.069597,-0.069597,1660.076923
1,1660.5,1661.0,1661.5,1662.0,1662.5,1663.0,1663.5,1664.0,1664.5,1665.0,...,1660.25,292.0,256.0,-0.065693,7.5,7.0,0.000301,-0.065693,-0.065693,1660.076923
2,1660.5,1661.0,1661.5,1662.0,1662.5,1663.0,1663.5,1664.0,1664.5,1665.0,...,1660.25,291.0,256.0,-0.063985,7.5,7.0,0.000301,-0.063985,-0.063985,1660.041667
3,1660.5,1661.0,1661.5,1662.0,1662.5,1663.0,1663.5,1664.0,1664.5,1665.0,...,1660.25,292.0,256.0,-0.065693,7.5,7.0,0.000301,-0.065693,-0.065693,1660.041667
4,1660.5,1661.0,1661.5,1662.0,1662.5,1663.0,1663.5,1664.0,1664.5,1665.0,...,1660.25,292.0,254.0,-0.069597,7.5,7.0,0.000301,-0.069597,-0.069597,1660.041667


In [None]:
# Smallest and largest target value in the validation split.
(valid_df['y'].min(), valid_df['y'].max())

(-4.0, 5.0)

## LSTM


In [None]:
# Split each engineered frame into a NumPy target vector and feature matrix.
y_train = train_df_cleaned['y'].to_numpy()
X_train = train_df_cleaned.drop(columns=['y']).to_numpy()

y_val = valid_df_cleaned['y'].to_numpy()
X_val = valid_df_cleaned.drop(columns=['y']).to_numpy()

# Standardise the features with statistics learned from the training split only,
# then apply the same transform to the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Build float32 tensors. Features get a length-1 sequence axis, giving
# (batch_size, 1, num_features), i.e. each row is a one-step sequence.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).unsqueeze(1)
X_valid_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_val, dtype=torch.float32)

# Pair features with targets and batch them; only training batches are shuffled.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [None]:
def train_model(model, train_loader):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``device``, ``optimizer`` and ``criterion``.

    Args:
        model: network to train; it is switched to training mode.
        train_loader: iterable of ``(X_batch, y_batch)`` tensor pairs.

    Returns:
        ``(model, epoch_loss)`` where ``epoch_loss`` is the per-sample mean of
        the batch losses over the whole pass.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    n_samples = 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        # Forward pass. Only the trailing output dimension is squeezed so a
        # batch holding a single sample keeps its batch axis and still lines
        # up with y_batch.
        outputs = model(X_batch)
        loss = criterion(outputs.squeeze(-1), y_batch)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Weight each batch by its size: a plain mean of batch means would
        # over-weight a short final batch. This assumes `criterion` returns a
        # per-batch mean, as nn.MSELoss() does by default.
        batch_size = y_batch.size(0)
        total_loss += loss.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError("train_loader yielded no samples")

    return model, total_loss / n_samples

def evaluate_model(model, valid_loader):
    """Compute the mean loss of ``model`` over ``valid_loader`` without training.

    Uses the notebook-level ``device`` and ``criterion``.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        valid_loader: iterable of ``(X_batch, y_batch)`` tensor pairs.

    Returns:
        ``(model, epoch_loss)`` where ``epoch_loss`` is the per-sample mean of
        the batch losses over the whole loader.

    Raises:
        ValueError: if ``valid_loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for X_batch, y_batch in valid_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Squeeze only the trailing output dimension so a one-sample
            # batch keeps its batch axis and still lines up with y_batch.
            outputs = model(X_batch)
            loss = criterion(outputs.squeeze(-1), y_batch)

            # Weight each batch by its size so the result is the mean over
            # samples rather than a mean of batch means. This assumes
            # `criterion` returns a per-batch mean, as nn.MSELoss() does by
            # default.
            batch_size = y_batch.size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("valid_loader yielded no samples")

    return model, total_loss / n_samples

In [None]:
import copy

# Directory for LSTM checkpoints; created together with any missing parents,
# and left alone if it already exists.
save_dir = data_path / 'lstm/single_sequence'
save_dir.mkdir(exist_ok=True, parents=True)

class LSTMModel(nn.Module):
    """Stacked LSTM whose final time step feeds a linear output layer.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer's hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a batch-first input ``x`` to one output vector per sequence."""
        sequence_out, _state = self.lstm(x)
        # Keep only the LSTM output at the last time step of every sequence.
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

# Model hyperparameters
input_size = X_train.shape[1]  # Number of features
hidden_size = 64  # Number of LSTM units
num_layers = 2  # Number of LSTM layers
output_size = 1  # Predicting a single value per input row

# Seed PyTorch's global RNG before the model is built so weight initialisation
# and the shuffled batch order of later training repeat from run to run.
# (GPU kernels may still add small run-to-run differences.)
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Initialize model, loss function, and optimizer
model = LSTMModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training setup: epoch budget, move the model to the chosen device, and put
# it in training mode.
num_epochs = 10
model.to(device)
model.train()

cuda


In [None]:
def main(model, num_epochs, train_loader, valid_loader):
    """Train for up to ``num_epochs`` and return the best model seen.

    After every epoch the validation loss is compared with the best so far;
    on improvement a deep copy of the model is kept and its ``state_dict`` is
    saved to ``save_dir / 'lstm_epoch_<n>.pth'``.

    Uses the notebook-level ``device``, ``save_dir``, ``train_model`` and
    ``evaluate_model``.

    Args:
        model: network to train.
        num_epochs: maximum number of epochs to run.
        train_loader: batches used for optimisation.
        valid_loader: batches used for model selection.

    Returns:
        Deep copy of the model at its lowest validation loss, or ``None`` if
        no epoch completed with a loss below infinity. If training is
        interrupted from the keyboard, the best model found up to that point
        is returned instead of being lost.
    """
    best_loss = float('inf')
    best_model = None
    model.to(device)
    try:
        for epoch in range(num_epochs):
            model, train_loss = train_model(model, train_loader)

            model, valid_loss = evaluate_model(model, valid_loader)

            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

            if valid_loss < best_loss:
                best_loss = valid_loss
                # Snapshot now: later epochs keep updating `model` in place.
                best_model = copy.deepcopy(model)
                print(f"Best Model Found at {epoch+1} epoch.")

                model_name = f'lstm_epoch_{epoch+1}.pth'
                torch.save(best_model.state_dict(), save_dir / model_name)
                print(f"Model saved at {save_dir / model_name}")
    except KeyboardInterrupt:
        # Stopping a long run by hand should not throw away the best snapshot.
        print("Training interrupted; returning the best model found so far.")
    return best_model

In [None]:
# Train for up to num_epochs and keep the model with the lowest validation loss.
best_model = main(model, num_epochs, train_loader, valid_loader)

Epoch [1/10], Train Loss: 0.5013, Valid Loss: 0.4330
Best Model Found at 1 epoch.
Model saved at /content/drive/MyDrive/MLDS/IEMS-490/Assignment5/data/lstm/single_sequence/lstm_epoch_1.pth
Epoch [2/10], Train Loss: 0.4602, Valid Loss: 0.4710
Epoch [3/10], Train Loss: 0.4293, Valid Loss: 0.5198
Epoch [4/10], Train Loss: 0.4041, Valid Loss: 0.5294
Epoch [5/10], Train Loss: 0.3857, Valid Loss: 0.5515
Epoch [6/10], Train Loss: 0.3724, Valid Loss: 0.5619


KeyboardInterrupt: 

In [None]:
# Evaluate the network on the validation split.
# NOTE: this scores `model` (the weights after the last completed epoch), not
# `best_model`. The `test_*` variable names are kept from the original cell
# even though the data comes from `valid_loader`.
model.eval()
test_predictions = []
test_targets = []
with torch.no_grad():
    for X_batch, y_batch in valid_loader:
        outputs = model(X_batch.to(device))
        # Tensor.numpy() only works on CPU tensors, so bring results back to
        # the host first. squeeze(-1) drops only the output dimension, so a
        # one-sample batch stays 1-D and can still be concatenated.
        test_predictions.append(outputs.squeeze(-1).cpu().numpy())
        test_targets.append(y_batch.cpu().numpy())

# Convert predictions and targets back to arrays
test_predictions = np.concatenate(test_predictions)
test_targets = np.concatenate(test_targets)

# Mean squared error over every validation sample
test_mse = np.mean((test_predictions - test_targets) ** 2)
print(f'Validation MSE: {test_mse:.4f}')