## Optiver Trading at the Close: GRU-based Time Series Forecasting

This notebook implements a full pipeline to forecast auction price movements using a GRU-based neural network model. The pipeline includes data cleaning, feature engineering, sequence preparation, model definition, hyperparameter tuning with Optuna, and final live predictions using the optiver2023 test API.

## 1. Environment Setup & Data Loading

We initialize the optiver2023 test environment used for live predictions. The training data itself is loaded with Polars in the next section.

In [None]:
# Competition module; it supplies the environment object used by the
# prediction loop at the end of this notebook.
import optiver2023
# Environment handle; predictions are handed back through `env.predict(...)`.
env = optiver2023.make_env()
# Iterator over test batches; each item is unpacked later as
# (test, revealed_targets, sample_prediction).
iter_test = env.iter_test()

## 2. Data Exploration

We summarize missing values in each column and visualize the distribution of the target variable to understand its range and behavior.

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 

# Read the full training table into a Polars DataFrame
df = pl.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")

# ---- Null summary: one row per column, worst offenders first ----
null_counts = df.null_count().row(0)
nulls = pl.DataFrame({"column": df.columns, "null_count": null_counts})
# Share of rows that are null, expressed as a percentage
null_pct = (pl.col("null_count") / df.height * 100).alias("null_pct")
nulls = nulls.with_columns(null_pct).sort("null_count", descending=True)

print("📊 Null Summary:")
print(nulls)

# ---- Histogram (with KDE overlay) of the target column ----
target_values = df["target"].to_pandas()
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(target_values, bins=100, kde=True, ax=ax)
ax.set_title("Target Distribution")
ax.set_xlabel("Target Value")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()

## 3. Data Cleaning

We:
	•	Remove unusable columns (far_price, near_price)
	•	Impute missing values with the mean of the same (stock_id, date_id) group, falling back to the column-wise mean when the whole group is missing
	•	Replace infinite values with 0

Two versions of the cleaning function are defined for train and test sets.

In [None]:
def _sanitize_and_impute(df: pl.DataFrame, skip_cols: tuple = ()) -> pl.DataFrame:
    """Zero out infinities, then mean-impute nulls in numeric columns.

    Shared implementation behind `clean_data` and `clean_data_test`.

    Args:
        df: Frame that contains at least `stock_id` and `date_id`.
        skip_cols: Column names that must NOT be imputed (they still get
            their infinities replaced if they are Float64).

    Returns:
        A new frame in which
        * every infinite value of a Float64 column is 0.0, and
        * every null of a Float64/Int64 column (except `skip_cols`) is
          replaced by the mean of its (stock_id, date_id) group, or by the
          column mean when the group mean is null, or by 0.0 when the whole
          column is null.
    """
    schema = df.schema
    float_cols = [col for col, dtype in schema.items() if dtype == pl.Float64]
    numeric_cols = [
        col
        for col, dtype in schema.items()
        if dtype in [pl.Float64, pl.Int64] and col not in skip_cols
    ]

    # Infinities are removed BEFORE any mean is taken; otherwise a single
    # +/-inf turns the group/global mean into inf (or NaN when both signs
    # are present) and that value is then written into the null slots.
    if float_cols:
        df = df.with_columns([
            pl.when(pl.col(col).is_infinite())
            .then(0.0)
            .otherwise(pl.col(col))
            .alias(col)
            for col in float_cols
        ])

    if not numeric_cols:
        return df

    # One pass over the data for all column means.
    mean_row = df.select([pl.col(col).mean() for col in numeric_cols]).row(0)
    # A column that is entirely null has a null mean; fall back to 0.0 so
    # that `fill_null` always receives a concrete value.
    global_means = {
        col: 0.0 if mean_value is None else mean_value
        for col, mean_value in zip(numeric_cols, mean_row)
    }

    return df.with_columns([
        pl.when(pl.col(col).is_null())
        .then(
            pl.col(col)
            .mean()
            .over(["stock_id", "date_id"])
            .fill_null(global_means[col])
        )
        .otherwise(pl.col(col))
        .alias(col)
        for col in numeric_cols
    ])


def clean_data(df: pl.DataFrame) -> pl.DataFrame:
    """Clean the training frame.

    Rows whose `target` is null or NaN are dropped, `far_price` and
    `near_price` are removed, infinities in Float64 columns become 0.0 and
    nulls in every numeric column except `target` are mean-imputed
    (group mean first, column mean as fallback).
    """
    df = df.filter(pl.col("target").is_not_null() & ~pl.col("target").is_nan())

    # Drop unused columns
    df = df.drop(["far_price", "near_price"])

    # `target` is the label: never overwrite it with an imputed value.
    return _sanitize_and_impute(df, skip_cols=("target",))


df_clean = clean_data(df)


def clean_data_test(df: pl.DataFrame) -> pl.DataFrame:
    """Clean a test frame (no `target` column is required).

    Same steps as `clean_data` minus the target filter: `far_price` and
    `near_price` are removed, infinities become 0.0 and nulls in all
    Float64/Int64 columns are mean-imputed.
    """
    df = df.drop(["far_price", "near_price"])
    return _sanitize_and_impute(df)

## 4. Sequence Preparation for GRU Input

In this step, we reshape the tabular time-series data into 3D padded tensors that are compatible with GRU input requirements:
	•	Each sequence is grouped by (stock_id, date_id) and sorted by seconds_in_bucket.
	•	Features (X) and targets (y) are extracted and padded to the same maximum sequence length across batches using a placeholder value (-9999).
	•	A companion tensor of sequence lengths is returned to handle variable-length sequences during training using pack_padded_sequence.

We define two utility functions:
	•	prepare_gru_inputs for training and validation (returns X, y, and lengths).
	•	prepare_gru_inputs_test for test-time inference (also returns row IDs to match predictions back to the original rows).

This step ensures that the data is correctly structured for temporal modeling using GRUs.

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd

# Columns fed to the GRU as per-timestep input features. The order of this
# list fixes the order of the last axis of every feature tensor built from it,
# and its length is used as the model's `input_dim`.
feature_cols = [
    "imbalance_size", "matched_size", "bid_size", "ask_size",
    "reference_price", "bid_price", "ask_price", "wap", "imbalance_buy_sell_flag"
]

def prepare_gru_inputs(df, feature_cols, target_col="target", pad_value=-9999):
    """Turn a flat pandas frame into padded per-(stock_id, date_id) sequences.

    Args:
        df: pandas DataFrame with `stock_id`, `date_id`, `seconds_in_bucket`,
            the feature columns and the target column.
        feature_cols: Names of the feature columns, in output order.
        target_col: Name of the target column.
        pad_value: Filler written into positions past a sequence's end.

    Returns:
        Tuple `(X, y, lengths)` of tensors:
        X is float32 of shape (n_sequences, max_len, n_features),
        y is float32 of shape (n_sequences, max_len),
        lengths holds the true (unpadded) length of every sequence.

    Raises:
        ValueError: if `df` yields no group (max() of an empty list).
    """
    sequences = []
    for _, session in df.groupby(["stock_id", "date_id"]):
        # Chronological order inside one stock/day
        ordered = session.sort_values("seconds_in_bucket")
        sequences.append((
            ordered[feature_cols].to_numpy(dtype=np.float32),
            ordered[target_col].to_numpy(dtype=np.float32),
        ))

    lengths = [len(features) for features, _ in sequences]
    n_seq, n_steps, n_feat = len(sequences), max(lengths), len(feature_cols)

    # Start from buffers full of the pad marker, then copy the real data in.
    X_padded = np.full((n_seq, n_steps, n_feat), pad_value, dtype=np.float32)
    y_padded = np.full((n_seq, n_steps), pad_value, dtype=np.float32)
    for row, (features, targets) in enumerate(sequences):
        X_padded[row, :len(features), :] = features
        y_padded[row, :len(targets)] = targets

    return torch.tensor(X_padded), torch.tensor(y_padded), torch.tensor(lengths)



def prepare_gru_inputs_test(df, feature_cols, pad_value=-9999):
    """Build padded feature sequences for inference, keeping row ids.

    Args:
        df: pandas DataFrame with `stock_id`, `date_id`,
            `seconds_in_bucket`, `row_id` and the feature columns.
        feature_cols: Names of the feature columns, in output order.
        pad_value: Filler written into positions past a sequence's end.

    Returns:
        Tuple `(X, lengths, ids)`: X is a float32 tensor of shape
        (n_sequences, max_len, n_features), lengths is a tensor with the true
        length of every sequence, and ids is a list of lists holding the
        `row_id` values of each sequence in the same (time-sorted) order
        as the rows of X.

    Raises:
        ValueError: if `df` yields no group (max() of an empty list).
    """
    feature_blocks = []
    ids = []
    for _, session in df.groupby(["stock_id", "date_id"]):
        ordered = session.sort_values("seconds_in_bucket")
        feature_blocks.append(ordered[feature_cols].to_numpy(dtype=np.float32))
        # Remember which original row produced each timestep
        ids.append(ordered["row_id"].tolist())

    lengths = [len(block) for block in feature_blocks]
    padded_shape = (len(feature_blocks), max(lengths), len(feature_cols))

    X_padded = np.full(padded_shape, pad_value, dtype=np.float32)
    for row, block in enumerate(feature_blocks):
        X_padded[row, :len(block), :] = block

    return torch.tensor(X_padded), torch.tensor(lengths), ids

## 5. Time Series-Aware Train/Validation Split

We perform a date-based split to ensure that the validation set only contains future unseen dates.

In [None]:
def time_series_split_df(df, split_ratio=0.85):
    """Split a pandas frame into train/validation parts by `date_id`.

    The distinct `date_id` values are sorted; the first
    `int(n_dates * split_ratio)` of them go to the training part and the
    remaining ones to the validation part, so no date appears in both.

    Args:
        df: pandas DataFrame with a `date_id` column.
        split_ratio: Fraction of the distinct dates assigned to training.

    Returns:
        Tuple `(train_df, valid_df)` of independent copies.
    """
    ordered_dates = sorted(df["date_id"].unique())
    cutoff = int(len(ordered_dates) * split_ratio)

    in_train = df["date_id"].isin(ordered_dates[:cutoff])
    in_valid = df["date_id"].isin(ordered_dates[cutoff:])

    return df[in_train].copy(), df[in_valid].copy()

## 6. GRU Model Definition

We implement a configurable GRU model that supports:
	•	Variable GRU layers
	•	LayerNorm and Dropout
	•	Multiple Dense layers with GELU activation
	•	Final regression output

In [None]:
import optuna
import time
from tqdm import tqdm
from optuna.exceptions import TrialPruned
from sklearn.preprocessing import StandardScaler
import joblib

class GRURegressor(nn.Module):
    """GRU encoder followed by a small MLP head, one prediction per timestep.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: Size of the GRU hidden state.
        dropout: Dropout probability applied after the LayerNorm and after
            every hidden dense layer.
        num_dense_layers: Number of (Linear -> GELU -> Dropout) stages in the
            head; each stage halves the width. 0 means a single Linear.
        gru_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_dim, hidden_dim, dropout, num_dense_layers,gru_layers):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=gru_layers, batch_first=True, bidirectional=False)
        self.norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)

        layers = []
        dim = hidden_dim
        for _ in range(num_dense_layers):
            layers += [
                nn.Linear(dim, dim // 2),
                nn.GELU(),
                nn.Dropout(dropout)
            ]
            dim = dim // 2
        # Final projection to a single regression value per timestep
        layers.append(nn.Linear(dim, 1))
        self.head = nn.Sequential(*layers)

    def forward(self, x, lengths):
        """Run the network on a padded batch.

        Args:
            x: Float tensor of shape (batch, seq_len, input_dim); positions
                past each sequence's length may contain any filler value.
            lengths: Integer tensor of shape (batch,) with the true length
                of every sequence (moved to CPU internally for packing).

        Returns:
            Float tensor of shape (batch, seq_len) — the same `seq_len` as
            `x`. Values at padded positions are not meaningful and must be
            masked out by the caller.
        """
        # Packing makes the GRU skip the padded timesteps entirely.
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.gru(packed)
        # Without `total_length` the output is only as long as the longest
        # sequence in THIS batch, which can be shorter than x.size(1) and then
        # no longer lines up with target/mask tensors padded to x's length.
        out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )
        out = self.norm(out)
        out = self.dropout(out)
        return self.head(out).squeeze(-1)

def objective(trial):
    """Optuna objective: train one GRURegressor and return its validation MAE.

    The model is trained with SmoothL1 loss (whose `beta` is itself tuned),
    but the value reported to Optuna is the mean absolute error over all
    non-padded validation targets. Using a beta-independent metric keeps
    trials comparable; the raw SmoothL1 value shrinks as beta grows, which
    would bias the search towards large beta.

    Relies on the module-level `feature_cols`, `device`, `train_loader`
    and `val_loader`.

    Args:
        trial: The `optuna` trial used to sample hyperparameters and to
            report intermediate values.

    Returns:
        Validation MAE after the last completed epoch.

    Raises:
        TrialPruned: when the pruner decides to stop the trial early.
    """
    pad_value = -9999  # marker written into padded target positions

    hidden_dim = trial.suggest_categorical("hidden_dim", [128, 160, 192,256])
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-3, log=True)
    beta = trial.suggest_float("beta", 0.7, 1.0)
    num_dense_layers = trial.suggest_int("num_dense_layers", 0, 4)
    num_epochs = trial.suggest_int("num_epochs", 5, 50)
    gru_layers = trial.suggest_int("gru_layers", 1, 3)

    model = GRURegressor(len(feature_cols), hidden_dim, dropout, num_dense_layers,gru_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.SmoothL1Loss(beta=beta)

    for epoch in range(num_epochs):
        model.train()
        for Xb, yb, lb in train_loader:
            Xb, yb, lb = Xb.to(device), yb.to(device), lb.to(device)
            pred = model(Xb, lb)
            # The model may return fewer timesteps than yb is padded to;
            # trim yb so mask and prediction always share one shape. The
            # trimmed tail holds padding only, so no real target is lost.
            yb = yb[:, :pred.size(1)]
            mask = yb != pad_value
            loss = loss_fn(pred[mask], yb[mask])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation: accumulate absolute error per element, not per batch,
        # so a short last batch does not get extra weight.
        model.eval()
        abs_err_sum = 0.0
        n_valid = 0
        with torch.no_grad():
            for Xb, yb, lb in val_loader:
                Xb, yb, lb = Xb.to(device), yb.to(device), lb.to(device)
                pred = model(Xb, lb)
                yb = yb[:, :pred.size(1)]
                mask = yb != pad_value
                abs_err_sum += (pred[mask] - yb[mask]).abs().sum().item()
                n_valid += int(mask.sum().item())
        val_mae = abs_err_sum / max(n_valid, 1)

        trial.report(val_mae, step=epoch)
        if trial.should_prune():
            raise TrialPruned()

    return val_mae


## 7. Hyperparameter Tuning with Optuna

We define an objective function that trains the model and evaluates MAE on the validation set. Optuna explores combinations of:
	•	Hidden size
	•	Dropout rate
	•	Learning rate
	•	Number of dense/GRU layers
	•	Epochs
	•	Smooth L1 loss beta parameter

In [None]:
# Step 1: Convert to pandas
# Step 1: Convert to pandas
# (the split and sequence helpers below operate on pandas DataFrames)
df_clean_pd = df_clean.to_pandas()

# Step 2: Time Series Split (returns train_df and val_df)
train_df, val_df = time_series_split_df(df_clean_pd)

# Step 3: Normalize features based on train set statistics
# The scaler is fitted on the training part only and merely applied to the
# validation part, so no validation statistics leak into training.
scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
val_df[feature_cols] = scaler.transform(val_df[feature_cols])

# Optional: Save the scaler if you need it at test time

joblib.dump(scaler, "scaler.pkl")

# Step 4: Prepare GRU inputs
# Each call returns (padded features, padded targets, true sequence lengths).
(train_X, train_y, train_len) = prepare_gru_inputs(train_df, feature_cols)
(val_X, val_y, val_len) = prepare_gru_inputs(val_df, feature_cols)




# Wrap the tensors so that one dataset item is one whole sequence.
train_ds = TensorDataset(train_X, train_y, train_len)
val_ds = TensorDataset(val_X, val_y, val_len)
# Only the training loader is shuffled.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# `objective` reads `device`, `train_loader` and `val_loader` as globals,
# so they must exist before the first trial starts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run optimization with progress bar
# One trial per `optimize` call so that tqdm advances once per trial.
study = optuna.create_study(direction="minimize")
print("🧪 Starting hyperparameter tuning...")
start = time.time()
for _ in tqdm(range(50), desc="Optuna Trials"):
    study.optimize(objective, n_trials=1, catch=(TrialPruned,))
end = time.time()

print("✅ Best trial:")
print(study.best_trial.params)
# NOTE(review): the label says "MAE", but the number printed is whatever
# `objective` returns — confirm the two agree.
print(f"Best validation MAE: {study.best_trial.value:.5f}")
print(f"⏱ Total tuning time: {(end - start) / 60:.2f} min")

## 8. Final Model Training

Using the best hyperparameters found, we retrain the GRU model on the full dataset and save the weights for prediction.

In [None]:
# Hyperparameters of the best Optuna trial
best_params = study.best_trial.params

# Prepare full dataset
df_full = df_clean.to_pandas()

# Normalize using StandardScaler
# NOTE: this rebinds the global `scaler`; any later cell that calls
# `scaler.transform` uses these full-data statistics.
scaler = StandardScaler()
df_full[feature_cols] = scaler.fit_transform(df_full[feature_cols])
joblib.dump(scaler, "scaler_full.pkl")

# Prepare GRU inputs
full_X, full_y, full_len = prepare_gru_inputs(df_full, feature_cols)
full_ds = TensorDataset(full_X, full_y, full_len)
# Shuffle, exactly like the training loader used during tuning. The sequences
# come out of `prepare_gru_inputs` in groupby (stock_id, date_id) order, so an
# unshuffled loader would feed long runs of batches from the same stock.
full_loader = DataLoader(full_ds, batch_size=64, shuffle=True)

# Initialize model with ALL best Optuna params (including gru_layers)
model = GRURegressor(
    input_dim=len(feature_cols),
    hidden_dim=best_params["hidden_dim"],
    dropout=best_params["dropout"],
    num_dense_layers=best_params["num_dense_layers"],
    gru_layers=best_params["gru_layers"]
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=best_params["lr"])
loss_fn = nn.SmoothL1Loss(beta=best_params["beta"])

# Final Training Loop
for epoch in range(best_params["num_epochs"]):
    model.train()
    for Xb, yb, lb in full_loader:
        Xb, yb, lb = Xb.to(device), yb.to(device), lb.to(device)
        pred = model(Xb, lb)
        # The model may return fewer timesteps than yb is padded to; trim yb
        # so mask and prediction share one shape (the trimmed tail is padding).
        yb = yb[:, :pred.size(1)]
        # -9999 is the pad marker written by prepare_gru_inputs
        mask = yb != -9999
        loss = loss_fn(pred[mask], yb[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Persist the learned parameters (the state_dict) to disk
model_path = "best_gru_model.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)
print(f"✅ Model saved to {model_path}")

## 9. Live Predictions with optiver2023 API

For each batch of test data:
	•	We clean and transform the input
	•	Prepare padded sequences
	•	Predict with the trained GRU model
	•	Fill predictions in sample_prediction and submit

In [None]:
# Restore the saved weights and switch to inference mode (disables dropout).
model.load_state_dict(torch.load("best_gru_model.pth", map_location=device))
model.eval()

counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    if counter == 0:
        print("📦 Test preview:")
        print(test.head(3))
        print(revealed_targets.head(3))
        print(sample_prediction.head(3))

    # 1. Clean the test set and scale it with the fitted `scaler`
    test_pl = pl.from_pandas(test)
    test_pl = clean_data_test(test_pl)
    test_df = test_pl.to_pandas()
    test_df[feature_cols] = scaler.transform(test_df[feature_cols])

    # 2. GRU input preparation (sequence format)
    # NOTE(review): sequences are built from this batch alone, so the GRU
    # only sees the timesteps contained in `test` — confirm that this is the
    # intended amount of history.
    X_test, len_test, row_ids = prepare_gru_inputs_test(test_df, feature_cols)

    # 3. Predict
    with torch.no_grad():
        out = model(X_test.to(device), len_test.to(device)).cpu()

    # `out` is (n_sequences, n_steps) and includes padded positions whenever
    # sequences differ in length. Keep only the real timesteps; boolean
    # indexing walks row by row, which is the same order as `row_ids`.
    step_idx = torch.arange(out.size(1)).unsqueeze(0)
    valid = step_idx < len_test.unsqueeze(1)
    preds = out[valid].numpy()

    # 4. Fill predictions into sample_prediction
    flat_row_ids = [rid for sublist in row_ids for rid in sublist]
    pred_df = pd.DataFrame({"row_id": flat_row_ids, "target": preds})
    sample_prediction = sample_prediction.drop(columns=["target"]).merge(pred_df, on="row_id", how="left")
    # Rows without a matching prediction fall back to 0.0
    sample_prediction["target"] = sample_prediction["target"].fillna(0.0)

    # 5. Submit
    env.predict(sample_prediction)
    counter += 1

## 10. Save Submission File

Finally, we export the sample_prediction.csv file and move it to the working directory for submission.

In [None]:
# Save sample_prediction to CSV
# Save sample_prediction to CSV
# At this point `sample_prediction` is the frame left over from the last
# iteration of the prediction loop, so only that final batch is written.
sample_prediction.to_csv("sample_prediction.csv", index=False)

# Create a download link (Kaggle will show it in the sidebar under 'Output')
# The CSV is written relative to the current directory and then moved to an
# absolute path under /kaggle/working.
import shutil
shutil.move("sample_prediction.csv", "/kaggle/working/sample_prediction.csv")