# CSE251B Project Final Code

## Imports/Initializations

In [None]:
!pip install torch-geometric

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data, Batch
import tqdm

## Load Dataset

In [None]:
# DO NOT FORGET TO UPLOAD kaggle.json FILE

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! kaggle competitions download cse-251-b-2025

!unzip cse-251-b-2025.zip

# Open both competition archives, then pull the array stored under the 'data' key of each
train_npz = np.load('./train.npz')
test_npz = np.load('./test_input.npz')
train_data, test_data = train_npz['data'], test_npz['data']

# Quick sanity check of the array shapes
print(train_data.shape, test_data.shape)

# Split once for later use:
#   X_train keeps the first 50 steps along the second-to-last axis for every agent,
#   Y_train keeps steps 50 onward of agent 0, restricted to the first two features.
Y_train = train_data[:, 0, 50:, :2]
X_train = train_data[..., :50, :]

## Dataset Initializations

In [None]:
class DatasetTrain(Dataset):
    """
    Training/validation dataset that turns one scene into an ego-centred
    torch_geometric ``Data`` object (history as ``x``, ego future as ``y``).
    """

    def __init__(self, data, scale=10.0, augment=True):
        """
        Input(s):
        data (np.ndarray: N, 50, 110, 6): training data
        scale (float): scale factor for norm
        augment (bool): Toggle augmentation

        Output(s):
        self (struct): Internal struct with data, scale, and augmentation value
        """
        self.data = data
        self.scale = scale
        self.augment = augment

    def __len__(self):
        """
        Get length of dataset

        Output:
        length (int): length of dataset
        """
        return len(self.data)

    @staticmethod
    def _valid_agents(raw_history):
        """
        Flag agents that carry any non-zero value in their raw history

        Input(s):
        raw_history (np.ndarray: agents, steps, features): un-normalized history

        Output(s):
        mask (np.ndarray of bool: agents): True where the agent is not all-zero
        """
        return np.any(raw_history != 0, axis=(1, 2))

    def __getitem__(self, idx):
        """
        Get an item from the dataset

        Input(s):
        idx (int): index of dataset item

        Output(s):
        datapoint (torch_geometric.data.Data): dataset item with fields
            x, y, pos_origin, scale, heading_origin, velocity_origin, validity_mask
        """
        # Extract scene and get the history and future timesteps (50 historical, 60 future).
        # Everything stays in NumPy until the final conversion so no step relies on
        # implicit tensor/ndarray interop.
        scene = self.data[idx]
        historical_traj = scene[:, :50, :].copy()
        future_traj = scene[0, 50:, :2].copy()

        # The validity mask must come from the RAW history: once the ego origin is
        # subtracted below, all-zero (padded) agents become non-zero and would be
        # indistinguishable from real ones. Rotation/flip keep zeros at zero, so
        # computing it before augmentation is equivalent.
        validity_mask = self._valid_agents(historical_traj)

        # If the augmentation toggle is True, rotate and flip the data
        if self.augment:
            if np.random.rand() < 0.75:
                # Random heading
                theta = np.random.uniform(-np.pi, np.pi)
                cos_t, sin_t = np.cos(theta), np.sin(theta)
                # Built in the data's dtype so the rotation does not lose precision
                R = np.array([[cos_t, -sin_t], [sin_t, cos_t]], dtype=historical_traj.dtype)

                # Rotate positions, velocities and the ego future with the same matrix
                historical_traj[..., :2] = historical_traj[..., :2] @ R
                historical_traj[..., 2:4] = historical_traj[..., 2:4] @ R
                future_traj = future_traj @ R
            if np.random.rand() < 0.5:
                # Flip trajectory (mirror the x axis for positions and velocities)
                historical_traj[..., 0] *= -1
                historical_traj[..., 2] *= -1
                future_traj[:, 0] *= -1
                # NOTE(review): feature 4 (heading) is not mirrored here; if it is an
                # angle measured from the x axis it presumably should be - confirm.

        # Isolate ego position, heading, and velocity in 50th time step and use as origin
        pos_origin = historical_traj[0, 49, :2].copy()
        heading_origin = historical_traj[0, 49, 4].copy()
        velocity_origin = historical_traj[0, 49, 2:4].copy()

        # Normalize the trajectory position
        historical_traj[..., :2] = historical_traj[..., :2] - pos_origin
        future_traj = future_traj - pos_origin

        # Normalize heading
        historical_traj[..., 4] = historical_traj[..., 4] - heading_origin

        # Normalize velocity
        historical_traj[..., 2:4] = historical_traj[..., 2:4] - velocity_origin

        # Apply scale to trajectory data
        historical_traj[..., :4] = historical_traj[..., :4] / self.scale
        future_traj = future_traj / self.scale

        # Output the data item in form (x, y, pos_origin, scale, heading_origin, velocity_origin, validity_mask)
        data_item = Data(
            x=torch.tensor(historical_traj, dtype=torch.float32),
            y=torch.tensor(future_traj, dtype=torch.float32),
            pos_origin=torch.tensor(pos_origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
            heading_origin=torch.tensor(heading_origin, dtype=torch.float32),
            velocity_origin=torch.tensor(velocity_origin, dtype=torch.float32),
            validity_mask=torch.tensor(validity_mask, dtype=torch.float32),
        )

        return data_item


class DatasetTest(Dataset):
    """
    Test dataset that turns one history-only scene into an ego-centred
    torch_geometric ``Data`` object (no ``y`` field).
    """

    def __init__(self, data, scale=10.0):
        """
        Input(s):
        data (np.ndarray: N, 50, 50, 6): testing data
        scale (float): scale factor for norm

        Output(s):
        self (struct): Internal struct with data and scale value
        """
        self.data = data
        self.scale = scale

    def __len__(self):
        """
        Get length of dataset

        Output:
        length (int): length of dataset
        """
        return len(self.data)

    @staticmethod
    def _valid_agents(raw_history):
        """
        Flag agents that carry any non-zero value in their raw history

        Input(s):
        raw_history (np.ndarray: agents, steps, features): un-normalized history

        Output(s):
        mask (np.ndarray of bool: agents): True where the agent is not all-zero
        """
        return np.any(raw_history != 0, axis=(1, 2))

    def __getitem__(self, idx):
        """
        Get an item from the test dataset

        Input(s):
        idx (int): index of dataset item

        Output(s):
        datapoint (torch_geometric.data.Data): test dataset item with fields
            x, pos_origin, scale, heading_origin, velocity_origin, validity_mask
        """
        # Extract scene (only has historical data so 50 timesteps)
        scene = self.data[idx]
        historical_traj = scene.copy()

        # The validity mask must come from the RAW history: once the ego origin is
        # subtracted below, all-zero (padded) agents become non-zero and would be
        # indistinguishable from real ones.
        validity_mask = self._valid_agents(historical_traj)

        # Isolate ego position, heading, and velocity in 50th time step and use as origin
        pos_origin = historical_traj[0, 49, :2].copy()
        heading_origin = historical_traj[0, 49, 4].copy()
        velocity_origin = historical_traj[0, 49, 2:4].copy()

        # Normalize position
        historical_traj[..., :2] = historical_traj[..., :2] - pos_origin

        # Normalize heading
        historical_traj[..., 4] = historical_traj[..., 4] - heading_origin

        # Normalize velocity
        historical_traj[..., 2:4] = historical_traj[..., 2:4] - velocity_origin

        # Scale trajectory
        historical_traj[..., :4] = historical_traj[..., :4] / self.scale

        # Output the data item in form (x, pos_origin, scale, heading_origin, velocity_origin, validity mask)
        data_item = Data(
            x=torch.tensor(historical_traj, dtype=torch.float32),
            pos_origin=torch.tensor(pos_origin, dtype=torch.float32).unsqueeze(0),
            scale=torch.tensor(self.scale, dtype=torch.float32),
            heading_origin=torch.tensor(heading_origin, dtype=torch.float32),
            velocity_origin=torch.tensor(velocity_origin, dtype=torch.float32),
            validity_mask=torch.tensor(validity_mask, dtype=torch.float32),
        )

        return data_item

## Model Initialization

In [None]:
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding that is added to a
    (batch, sequence, d_model) tensor. Works for even and odd d_model.
    """

    def __init__(self, d_model, max_len=50):
        """
        Initialize positional encoder

        Input(s):
        d_model (int): embedding dimension
        max_len (int): max sequence length
        """
        super().__init__()

        # Init positional encoder struct
        pos_enc = torch.zeros(max_len, d_model)

        # Positional vector, shape (max_len, 1)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Scale factor: one frequency per even column, i.e. ceil(d_model / 2) values
        scale_val = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * scale_val

        # Even columns have sin values, odd columns have cos values for positions.
        # There are only floor(d_model / 2) odd columns, so the cosine block is
        # truncated; without this an odd d_model raises a shape-mismatch error.
        pos_enc[:, 0::2] = torch.sin(angles)
        pos_enc[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Positional encoding stored as non-param value in struct, shape (1, max_len, d_model)
        self.register_buffer('pos_enc', pos_enc.unsqueeze(0))

    def forward(self, x):
        """
        Forward pass of positional encoder

        Input(s):
        x (torch.Tensor): input tensor whose dim 1 is the sequence axis and whose
            last dim is d_model; the sequence length must not exceed max_len

        Output(s):
        x (torch.Tensor): input with the first x.size(1) encodings added
        """
        return x + self.pos_enc[:, :x.size(1), :]

class TrajTransformer(nn.Module):
  """
  Transformer over per-agent tokens that predicts the future (x, y) path of
  the agent stored at index 0 of every scene.
  """

  def __init__(self,
               num_agents=50,
               time_steps=50,
               feature_dim=6,
               d_model=192,
               nhead=8,
               num_layers=3,
               dropout=0.05,
               future_steps=60):
    """
    Build the model

    Input(s):
    num_agents (int): agents per scene (token sequence length)
    time_steps (int): history length per agent
    feature_dim (int): number of features per agent per time step
    d_model (int): token embedding width
    nhead (int): attention heads per encoder layer
    num_layers (int): stacked encoder layers
    dropout (float): dropout probability
    future_steps (int): number of future (x, y) points to predict

    Output(s):
    self (struct): module holding the hyper-parameters and sub-layers
    """
    super().__init__()

    # Keep the shape hyper-parameters; forward() needs them for reshaping
    self.num_agents = num_agents
    self.time_steps = time_steps
    self.feature_dim = feature_dim
    self.d_model = d_model
    self.future_steps = future_steps

    # Per-agent encoder: the flattened history becomes one d_model token
    flat_history = time_steps * feature_dim
    self.temporal_encoder = nn.Sequential(
        nn.Linear(flat_history, d_model),
        nn.GELU(),
        nn.LayerNorm(d_model),
        nn.Dropout(dropout),
    )

    # Sinusoidal encoding indexed by the agent's slot in the scene
    self.pos_encoder = PositionalEncoding(d_model, max_len=num_agents)

    # Pre-norm, batch-first encoder stack with a 2x feed-forward width
    layer_kwargs = dict(
        d_model=d_model,
        nhead=nhead,
        dim_feedforward=d_model * 2,
        dropout=dropout,
        activation='gelu',
        batch_first=True,
        norm_first=True,
    )
    self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(**layer_kwargs), num_layers)

    # Head mapping the agent-0 token to future_steps (x, y) pairs
    hidden_width = d_model // 2
    self.output_projection = nn.Sequential(
        nn.Linear(d_model, hidden_width),
        nn.GELU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_width, future_steps * 2),
    )

    # Re-initialize every Linear / LayerNorm in the tree
    self.apply(self._init_weights)

  def _init_weights(self, module):
    """
    Per-module initializer used with nn.Module.apply

    Input(s):
    module (torch.nn.Module): sub-module being visited; Linear layers get
        Xavier-uniform weights and zero bias, LayerNorm gets unit weight and
        zero bias, anything else is left untouched
    """
    if isinstance(module, nn.Linear):
      nn.init.xavier_uniform_(module.weight)
      if module.bias is not None:
        nn.init.zeros_(module.bias)
      return
    if isinstance(module, nn.LayerNorm):
      nn.init.ones_(module.weight)
      nn.init.zeros_(module.bias)

  def forward(self, data_batch):
    """
    Run the model on a scene or a batch of scenes

    Input(s):
    data_batch (torch_geometric.data.Data): object exposing `x` (agent
        histories) and `validity_mask` (non-zero marks a usable agent)

    Output(s):
    output (torch.Tensor: batch_size, future_steps, 2): predicted path
    """
    history = data_batch.x
    agent_mask = data_batch.validity_mask

    # A 3-D x has agents of all scenes stacked on dim 0; unfold it to
    # (batch_size, num_agents, time_steps, feature_dim)
    if history.dim() == 3:
      history = history.view(history.size(0) // self.num_agents,
                             self.num_agents, self.time_steps, self.feature_dim)
    n_scenes = history.size(0)

    # One token per agent: flatten time x features, embed, add slot encoding
    agent_tokens = self.temporal_encoder(history.view(n_scenes, self.num_agents, -1))
    agent_tokens = self.pos_encoder(agent_tokens)

    # A flat mask is unfolded the same way as x
    if agent_mask.dim() == 1:
      agent_mask = agent_mask.view(n_scenes, self.num_agents)

    # The encoder expects True where a token must be ignored, hence the negation
    ignore_tokens = torch.logical_not(agent_mask.bool())
    encoded = self.transformer(agent_tokens, src_key_padding_mask=ignore_tokens)

    # Decode only the token of agent 0 into the future path
    ego_token = encoded[:, 0, :]
    return self.output_projection(ego_token).view(n_scenes, self.future_steps, 2)

## Training

In [None]:
# Set device for training speedup (Apple MPS first, then CUDA, otherwise CPU)
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using Apple Silicon GPU")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA GPU")
else:
    device = torch.device('cpu')

# Set scale (divisor applied to positions/velocities inside the datasets)
scale = 3.0

# Set seeds
torch.manual_seed(251)
np.random.seed(42)

# Split data into training and validation dataset (last 10% is held out)
N = len(train_data)
val_size = int(0.1 * N)
train_size = N - val_size

train_dataset = DatasetTrain(train_data[:train_size], scale=scale, augment=True)
val_dataset = DatasetTrain(train_data[train_size:], scale=scale, augment=False)

# Batch.from_data_list is passed directly; a wrapping lambda adds nothing and
# cannot be pickled if worker processes are ever enabled
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=Batch.from_data_list)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)

# Initialize model
model = TrajTransformer().to(device)

# Initialize training objects
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.25)
early_stopping_patience = 15
best_val_loss = float("inf")
no_improvement = 0
criterion = nn.MSELoss()
mae_metric = nn.L1Loss()  # built once instead of once per validation batch

# Loop for 166 epochs
for epoch in tqdm.tqdm(range(166), desc="Epoch", unit="epoch"):
    # Put model in training mode
    model.train()

    # Initialize loss
    train_loss = 0

    # Predict on the batch and calculate loss
    for batch in train_dataloader:
        batch = batch.to(device)
        pred = model(batch)
        # batch.y is the per-scene futures concatenated on dim 0; unfold per scene
        y = batch.y.view(batch.num_graphs, 60, 2)
        loss = criterion(pred, y)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        train_loss += loss.item()

    # Put model in evaluation mode
    model.eval()

    # Initialize validation metrics
    val_loss = 0
    val_mae = 0
    val_mse = 0

    # Predict on validation set and evaluate validation MSE and MAE
    with torch.no_grad():
        for batch in val_dataloader:
            batch = batch.to(device)
            pred = model(batch)
            y = batch.y.view(batch.num_graphs, 60, 2)
            val_loss += criterion(pred, y).item()

            # Denormalize: undo the scale division and the ego-origin shift
            batch_scale = batch.scale.view(-1, 1, 1)
            batch_origin = batch.pos_origin.unsqueeze(1)
            pred = pred * batch_scale + batch_origin
            y = y * batch_scale + batch_origin
            val_mae += mae_metric(pred, y).item()
            val_mse += criterion(pred, y).item()

    # Average loss and metrics over the number of batches
    train_loss /= len(train_dataloader)
    val_loss /= len(val_dataloader)
    val_mae /= len(val_dataloader)
    val_mse /= len(val_dataloader)

    # Call scheduler to potentially adjust learning rate
    scheduler.step()

    # Console output for visibility
    tqdm.tqdm.write(f"Epoch {epoch:03d} | LR {optimizer.param_groups[0]['lr']:.6f} | Train MSE {train_loss:.4f} | Val MSE {val_loss:.4f} | MAE {val_mae:.4f} | MSE {val_mse:.4f}")

    # Check if model needs to be saved and if early stopping should be triggered;
    # an epoch only counts as an improvement when it beats the best by more than 1e-3
    if val_loss < best_val_loss - 1e-3:
        best_val_loss = val_loss
        no_improvement = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        no_improvement += 1
        if no_improvement >= early_stopping_patience:
            print("Early stopping triggered.")
            break

## Testing / Validation

In [None]:
def plot_trajectory(ax, pred, gt, title=None):
    """
    Draw the first predicted and first ground-truth trajectory on one axis

    Input(s):
    ax: axis-like object to draw on (cleared first)
    pred: array indexed as [sample, step, xy]; only sample 0, steps 0-59 are drawn
    gt: array indexed like pred
    title (str or None): axis title, skipped when empty/None
    """
    ax.cla()

    # Predicted path in red, ground truth in blue (first sample, up to 60 steps)
    first_sixty = slice(None, 60)
    curves = (
        (pred, 'red', 'Predicted Future Trajectory'),
        (gt, 'blue', 'Ground Truth Future Trajectory'),
    )
    for series, colour, legend_text in curves:
        ax.plot(series[0, first_sixty, 0], series[0, first_sixty, 1], color=colour, label=legend_text)

    def _extent(coord):
        # Smallest and largest value of one coordinate across both arrays
        lowest = min(pred[..., coord].min(), gt[..., coord].min())
        highest = max(pred[..., coord].max(), gt[..., coord].max())
        return lowest, highest

    # Fit the view to the data, then label the axes
    ax.set_xlim(*_extent(0))
    ax.set_ylim(*_extent(1))
    ax.set_xlabel('X-axis')
    ax.set_ylabel('Y-axis')

    if title:
        ax.set_title(title)

    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)

In [None]:
# Restore the best checkpoint from the same relative path the training loop saved it to
# (an absolute /content/ path only exists on Colab); map_location keeps the
# load working when the checkpoint was written on a different device.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()

# randomly select 4 samples from the validation set
random_indices = random.sample(range(len(val_dataset)), 4)
fig, axes = plt.subplots(2, 2, figsize=(20, 10))
axes = axes.flatten()  # Flatten the array to iterate single axes objects

for i, idx in enumerate(random_indices):
    # A single Data item (not a Batch); the model treats it as a batch of one scene
    batch = val_dataset[idx]
    batch = batch.to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        pred = model(batch)

    # Denormalize with the fields the dataset actually stores: `scale` and
    # `pos_origin` (there is no `origin` attribute on the data item)
    sample_scale = batch.scale.view(-1, 1, 1)
    sample_origin = batch.pos_origin.unsqueeze(1)
    pred = pred * sample_scale + sample_origin
    gt = batch.y.view(-1, 60, 2) * sample_scale + sample_origin

    pred = pred.detach().cpu().numpy()
    gt = gt.detach().cpu().numpy()

    # Plot the trajectory using the i-th axis
    plot_trajectory(axes[i], pred, gt, title=f"Sample {idx}")

plt.show()

## Export Submission

In [None]:
# Build the test loader; order must be preserved so rows line up with the submission index
test_dataset = DatasetTest(test_data, scale=scale)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False,
                         collate_fn=Batch.from_data_list)

# Load the best checkpoint from the same relative path the training loop saved it to
# (an absolute /content/ path only exists on Colab); map_location keeps the
# load working when the checkpoint was written on a different device.
best_model = torch.load("best_model.pt", map_location=device)
model = TrajTransformer().to(device)

model.load_state_dict(best_model)
model.eval()

pred_list = []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        pred_norm = model(batch)

        # Denormalize to world coordinates using the stored `scale` and `pos_origin`
        # (the data items have no `origin` attribute)
        pred = pred_norm * batch.scale.view(-1, 1, 1) + batch.pos_origin.unsqueeze(1)
        pred_list.append(pred.cpu().numpy())
pred_list = np.concatenate(pred_list, axis=0)  # (N,60,2)
pred_output = pred_list.reshape(-1, 2)  # (N*60, 2)

# One row per predicted point, with a running integer index column named 'index'
output_df = pd.DataFrame(pred_output, columns=['x', 'y'])
output_df.index.name = 'index'
output_df.to_csv('submission.csv', index=True)