In [1]:
!pip3 install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [2]:
import warnings
from functools import partial
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import RGATConv
from torch_geometric.utils import subgraph
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from google.colab import drive

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# pandas display: never wrap wide frames, show all columns, and truncate
# long frames to 10 rows.
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_columns", None)
pd.set_option("display.min_rows", 10)
pd.set_option("display.max_rows", 10)
pd.set_option("display.width", None)

sns.set_theme(style="whitegrid")

# Mount Google Drive (Colab only) and derive the input directories from it.
drive_path = "/content/drive"
drive.mount(drive_path)
base_path = f"{drive_path}/My Drive/Colab Notebooks/gnn/input"
data_path = f"{base_path}/march-machine-learning-mania-2025"
# Directory holding the nodes_*.csv and edges.csv files read below.
gnn_path = f"{base_path}/gnn"

# Device used for every tensor created by tensor()/long_tensor().
device = "cuda" if torch.cuda.is_available() else "cpu"

Mounted at /content/drive


In [3]:
def print_df(name, df, info=False):
  """Print a labelled text dump of a DataFrame.

  Writes "<name> <shape>", the frame itself and a blank line to stdout.
  When `info` is true, `df.info()` and another blank line follow.
  """
  print(f"{name} {df.shape}", df, "", sep="\n")
  if not info:
    return
  df.info()
  print()

In [4]:
def load_nodes(asc_or_des):
  """Load one node table and normalise its column order and dtypes.

  Reads `{gnn_path}/nodes_{asc_or_des}.csv`, parses "Date" as datetime,
  and returns a frame whose columns are laid out as: identifying columns,
  the "Le_Margin" target, unscaled flag features, then everything from
  "SeasonsAgo" onwards. Numeric columns are cast to int32, as is the row
  index.
  """
  raw = pd.read_csv(f"{gnn_path}/nodes_{asc_or_des}.csv")
  raw["Date"] = pd.to_datetime(raw["Date"])

  def as_int32(frame):
    return frame.astype("int32")

  column_groups = [
    # identifying info, not passed to model
    as_int32(raw[["Index"]]),
    raw[["Key"]],
    as_int32(raw[["Season"]]),
    raw[["Date"]],
    as_int32(raw[["Le_TeamID", "Ri_TeamID"]]),
    raw[["Le_TeamName", "Ri_TeamName"]],

    # target (scaled later into Le_y)
    as_int32(raw[["Le_Margin"]]),

    # features (not scaled)
    as_int32(raw[["Men", "NCAATourney", "Le_Loc"]]),

    # features (scaled)
    as_int32(raw[["SeasonsAgo", "DayNum", "NumOT"]]),
    as_int32(raw.loc[:, "Le_Score":]),
  ]

  nodes = pd.concat(column_groups, axis=1)
  nodes.index = nodes.index.astype("int32")
  return nodes

In [5]:
def scale(scaler, df, cols=None):
  """Apply an already-fitted scaler to `df` and return a float32 DataFrame.

  The result keeps `df`'s index. Its column labels are `cols` when given,
  otherwise `df`'s own columns.
  """
  scaled = scaler.transform(df).astype("float32")
  out_columns = cols if cols is not None else df.columns
  return pd.DataFrame(scaled, index=df.index, columns=out_columns)


# Module-level scalers shared by scale_values() and calculate_score():
# scaler_x standardises the feature columns, scaler_y the Le_Margin target.
# Both must be fitted before scale_values() is called.
scaler_x = StandardScaler()
scaler_y = StandardScaler()


def scale_values(nodes):
  """Return `nodes` with a scaled target and scaled feature columns.

  Column layout of the result:
    * everything up to and including "Le_Margin", unchanged;
    * "Le_y": "Le_Margin" transformed by `scaler_y`;
    * "Men".."Le_Loc" cast to float32 without scaling;
    * "SeasonsAgo" onwards transformed by `scaler_x`.
  """
  identifiers = nodes.loc[:, :"Le_Margin"]
  target = scale(scaler_y, nodes[["Le_Margin"]], ["Le_y"])
  flags = nodes.loc[:, "Men":"Le_Loc"].astype("float32")
  features = scale(scaler_x, nodes.loc[:, "SeasonsAgo":])
  return pd.concat([identifiers, target, flags, features], axis=1)

In [6]:
# Load both node tables ("asc" and "des" — presumably the same games with
# the left/right team order swapped; confirm against the CSV generator).
nodes = [load_nodes(d) for d in ("asc", "des")]
# Fit the scalers on the rows of both tables stacked together.
# NOTE(review): this happens before the train/test split in a later cell,
# so held-out rows contribute to the scaling statistics — confirm intended.
nodes_doubled = pd.concat(nodes)
scaler_x.fit(nodes_doubled.loc[:, "SeasonsAgo":])
scaler_y.fit(nodes_doubled[["Le_Margin"]])
# Rebind `nodes` to the scaled frames; every later cell uses this version.
nodes = [scale_values(n) for n in nodes]
print_df("nodes[0]", nodes[0])

nodes[0] (202866, 44)
         Index                 Key  Season       Date  Le_TeamID  Ri_TeamID   Le_TeamName     Ri_TeamName  Le_Margin      Le_y  Men  NCAATourney  Le_Loc  SeasonsAgo    DayNum     NumOT  Le_Score    Le_FGM    Le_FGA   Le_FGM3   Le_FGA3    Le_FTM    Le_FTA     Le_OR     Le_DR    Le_Ast     Le_TO    Le_Stl    Le_Blk     Le_PF  Ri_Score    Ri_FGM    Ri_FGA   Ri_FGM3   Ri_FGA3    Ri_FTM    Ri_FTA     Ri_OR     Ri_DR    Ri_Ast     Ri_TO    Ri_Stl    Ri_Blk     Ri_PF
0            0  2003_010_1104_1328    2003 2002-11-14       1104       1328       Alabama        Oklahoma          6  0.364316  1.0          0.0     0.0    2.069352 -1.669631 -0.214915  0.021993  0.566312  0.102891 -1.084275 -0.794976 -0.364918 -0.108382  0.691001  0.002141 -0.003511  1.793540 -0.013958 -1.005587  0.935628 -0.435726 -0.414578 -0.539242 -1.411433 -1.427122  0.475545  0.406779 -0.208418 -0.373919 -1.106682  0.755983  0.592462 -0.567787  0.494275
1            1  2003_010_1272_1393    2003 2002-

In [7]:
# Read the edge list with every column as int32, then store the two edge
# attribute columns as float32 so they can be used as edge features.
edges = pd.read_csv(f"{gnn_path}/edges.csv", dtype="int32")
edges[["Direction", "Delta"]] = edges[["Direction", "Delta"]].astype("float32")
edges.index = edges.index.astype("int32")
print_df("edges", edges)

edges (183746250, 9)
           SourceIndex  SourceSeason  SourceNCAATourney  TargetIndex  TargetSeason  TargetNCAATourney  Type  Direction  Delta
0                    0          2003                  0            3          2003                  0     4        1.0    1.0
1                    0          2003                  0           10          2003                  0     4        1.0    3.0
2                    0          2003                  0           14          2003                  0     4        1.0    4.0
3                    0          2003                  0           24          2003                  0     5        1.0    4.0
4                    0          2003                  0           27          2003                  0     4        1.0    5.0
...                ...           ...                ...          ...           ...                ...   ...        ...    ...
183746245       202865          2025                  0       202758          2025               

In [8]:
# Held-out set: index labels of NCAA tournament rows from seasons 2021-2024,
# selected on nodes[0] only. The same labels are later used to index both
# node tables.
test = nodes[0][
  (2021 <= nodes[0]["Season"]) &
  (nodes[0]["Season"] <= 2024) &
  (nodes[0]["NCAATourney"] == 1)
].index

# Training set: every remaining index label.
train = nodes[0][~nodes[0].index.isin(test)].index

print(test)
print()
print(train)

Index([158848, 158849, 158850, 158851, 158852, 158853, 158854, 158855, 158856, 158857,
       ...
       191771, 191772, 191773, 191774, 191775, 191776, 191777, 191778, 191779, 191780],
      dtype='int32', length=531)

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,      9,
       ...
       202856, 202857, 202858, 202859, 202860, 202861, 202862, 202863, 202864, 202865],
      dtype='int32', length=202335)


In [9]:
def tensor(data):
  """Copy a pandas object's values into a float32 tensor on `device`."""
  values = data.values
  return torch.tensor(values, dtype=torch.float32, device=device)


def long_tensor(data):
  """Copy a pandas object's values into an int64 tensor on `device`."""
  values = data.values
  return torch.tensor(values, dtype=torch.long, device=device)


# One feature matrix and one target column per node table, all on `device`.
# NOTE(review): the feature slice starts at "Men" and so includes each row's
# own Le_Score/Ri_Score columns, while the target Le_y is derived from
# Le_Margin — confirm this is intended and not target leakage.
xs = [tensor(n.loc[:, "Men":]) for n in nodes]
y_trues = [tensor(n[["Le_y"]]) for n in nodes]
# Graph structure shared by both node tables: a 2 x E source/target index,
# the relation id per edge, and two float attributes per edge.
edge_index = long_tensor(edges[["SourceIndex", "TargetIndex"]].T)
edge_type = long_tensor(edges["Type"])
edge_attr = tensor(edges[["Direction", "Delta"]])


class Model(nn.Module):
  """Stack of layer transforms applied to a node-induced subgraph.

  `layers` are registered in an `nn.ModuleList` so their parameters are
  tracked. `transforms` are the callables actually invoked in `forward`,
  each with the signature `(x, edge_index, edge_type, edge_attr)`.
  """

  def __init__(self, layers, transforms):
    super().__init__()
    self.layers = nn.ModuleList(layers)
    self.transforms = transforms

  def forward(self, node_indices, x):
    """Run all transforms on the rows of `x` selected by `node_indices`.

    Only edges of the module-level `edge_index` whose endpoints are both in
    `node_indices` are kept; they are relabelled to index into the selected
    rows. Returns the output of the last transform.
    """
    y_pred = x[node_indices]

    ei, _, mask = subgraph(
        node_indices,
        edge_index,
        relabel_nodes=True,
        return_edge_mask=True,
    )

    # The masked edge tensors are the same for every layer, so gather them
    # once instead of once per transform.
    et = edge_type[mask]
    ea = edge_attr[mask]

    for transform in self.transforms:
      y_pred = transform(y_pred, ei, et, ea)

    return y_pred


def transform_rgat(layer, x, *edge_args):
  """Apply `layer` to `x` and the edge arguments, then leaky ReLU and dropout.

  Dropout is active only while `layer.training` is true.
  """
  activated = F.leaky_relu(layer(x, *edge_args))
  return F.dropout(activated, training=layer.training)


def transform_linear(layer, x, *edge_args):
  """Apply `layer` to `x` alone; the edge arguments are accepted but unused."""
  projected = layer(x)
  return projected


def initialize_model(layer_sizes, heads=4):
  """Build a Model of RGATConv layers followed by one linear output layer.

  Args:
    layer_sizes: sizes `[input, hidden..., output]`. Every transition but
      the last becomes an RGATConv, and the last becomes an nn.Linear.
    heads: number of attention heads per RGATConv. Each RGATConv emits
      `heads * out` channels, so the next layer's input is widened to match.

  Returns:
    A Model already moved to `device`.
  """
  # edge_type values index RGATConv's per-relation weights, so the table has
  # to span 0..max(Type). This equals the unique count whenever the ids are
  # contiguous from 0, and avoids an out-of-range lookup when they are not.
  num_relations = int(edges["Type"].max()) + 1 if len(edges) else 0
  last = len(layer_sizes) - 2
  layers = []
  transforms = []

  for i in range(len(layer_sizes) - 1):
    inp = layer_sizes[i] * (heads if i > 0 else 1)
    out = layer_sizes[i + 1]

    if i < last:
      layer = RGATConv(
        inp,
        out,
        num_relations=num_relations,
        heads=heads,
        edge_dim=len(["Direction", "Delta"]),
      )

      transform = partial(transform_rgat, layer)

    else:
      layer = nn.Linear(inp, out)
      transform = partial(transform_linear, layer)

    layers.append(layer)
    transforms.append(transform)

  # Inputs and edge tensors live on `device`, so the parameters must too.
  # Module.to() moves parameters in place, so the partials above stay valid.
  return Model(layers, transforms).to(device)


def brier_score(margin_pred, margin_true, slope=0.175):
  """Brier score of win probabilities derived from predicted margins.

  Args:
    margin_pred: array-like of predicted margins (must support unary minus
      and multiplication, e.g. a NumPy array).
    margin_true: array-like of true margins; a win is `margin_true > 0`,
      so a margin of exactly 0 counts as a loss.
    slope: logistic slope mapping a margin to a win probability,
      `p = 1 / (1 + exp(-margin * slope))`. Defaults to the 0.175 this
      notebook previously hard-coded.

  Returns:
    Mean squared difference between predicted win probability and outcome.
  """
  win_prob_pred = 1 / (1 + np.exp(-margin_pred * slope))
  win_true = (margin_true > 0).astype("int32")
  return np.mean((win_prob_pred - win_true) ** 2)


def calculate_score(y_preds, train_or_test):
  """Average Brier score over the node tables for the given rows.

  Each tensor in `y_preds` holds scaled predictions for every node of the
  matching table in `nodes`. Predictions are mapped back to margins with
  `scaler_y`, restricted to `train_or_test`, and scored against the true
  "Le_Margin" values.
  """
  per_view = []

  for view_pred, view_nodes in zip(y_preds, nodes):
    column = view_pred.cpu().numpy().reshape(-1, 1)
    margins = scaler_y.inverse_transform(column).flatten()
    truth = view_nodes.loc[train_or_test, "Le_Margin"]
    per_view.append(brier_score(margins[train_or_test], truth))

  return sum(per_view) / len(nodes)

In [10]:
def _clone_state_dict(model):
  """Return a detached copy of the model's current state dict."""
  # state_dict() returns references to the live parameter tensors, so a
  # snapshot must be cloned or later optimizer steps overwrite it.
  return {k: v.detach().clone() for k, v in model.state_dict().items()}


def train_models(
    hidden_layer_sizes=[64, 32, 16],
    n_epochs=10_000,
    patience=60,
  ):
  """Train one model per K-fold split of `train`, with early stopping.

  For each fold the model is trained full-batch on the fold's nodes, using
  both node tables in `xs`. It is evaluated each epoch on the out-of-fold
  nodes. The weights with the lowest out-of-fold MSE are restored once
  more than `patience` consecutive epochs pass without improvement, or
  after `n_epochs`.

  Args:
    hidden_layer_sizes: hidden sizes placed between the input and output
      sizes. Not mutated.
    n_epochs: maximum number of epochs per fold.
    patience: allowed number of consecutive non-improving epochs.

  Returns:
    `(layer_sizes, state_dicts)`: the full layer size list and the best
    state dict of each fold.

  NOTE(review): every epoch runs on the whole fold's induced subgraph. The
  recorded run of this notebook hit a CUDA out-of-memory error in fold 1,
  so mini-batching or neighbour sampling may be needed.
  """
  layer_sizes = (
    [xs[0].shape[1]] + list(hidden_layer_sizes) + [y_trues[0].shape[1]]
  )
  kfold = KFold(shuffle=True, random_state=42)
  # Log roughly 100 times per fold; max() avoids a modulo by zero when
  # n_epochs < 100.
  log_every = max(1, n_epochs // 100)

  y_pred_oofs = [
    torch.zeros(y_true.shape[0], device=device, dtype=torch.float32)
    for y_true in y_trues
  ]

  state_dicts = []

  for fold_n, (i_fold, i_oof) in enumerate(kfold.split(train), 1):
    print(f"  fold {fold_n}")
    start = datetime.now()
    i_fold = long_tensor(train[i_fold])
    i_oof = long_tensor(train[i_oof])
    model = initialize_model(layer_sizes).to(device)
    adam = torch.optim.Adam(model.parameters())

    # Defined up front so the restore below also works when n_epochs == 0.
    best_mse = float("inf")
    best_state = _clone_state_dict(model)
    stale_epochs = 0

    for epoch_n in range(1, n_epochs + 1):
      model.train()
      adam.zero_grad()
      mse_fold_sum = 0.0

      # Forward and backward one node table at a time: gradients accumulate
      # to the same sum, but only one autograd graph is alive at once.
      for x, y_true in zip(xs, y_trues):
        mse = F.mse_loss(model(i_fold, x), y_true[i_fold])
        mse.backward()
        mse_fold_sum += mse.item()

      mse_epoch_fold = mse_fold_sum / len(xs)
      adam.step()
      model.eval()

      with torch.no_grad():
        mse_epoch_oof = sum(
          F.mse_loss(model(i_oof, x), y_true[i_oof]).item()
          for x, y_true in zip(xs, y_trues)
        ) / len(y_trues)

      if epoch_n == 1 or best_mse > mse_epoch_oof:
        best_mse = mse_epoch_oof
        best_state = _clone_state_dict(model)
        stale_epochs = 0
      else:
        stale_epochs += 1

      out_of_patience = stale_epochs > patience

      if (epoch_n % log_every == 0
          or epoch_n > n_epochs - 3
          or out_of_patience):
        print(
          f"    epoch {epoch_n:>6}: "
          f"fold={mse_epoch_fold:.4f} "
          f"oof={mse_epoch_oof:.4f}"
        )

      if out_of_patience:
        print(f"    out of patience: oof={best_mse:.4f}")
        break

    # Restore the best weights before producing out-of-fold predictions.
    model.load_state_dict(best_state)
    model.eval()

    with torch.no_grad():
      for x, y_pred_oof in zip(xs, y_pred_oofs):
        y_pred_oof[i_oof] = model(i_oof, x).flatten()

    state_dicts.append(best_state)
    t = (datetime.now() - start).total_seconds()
    print(f"  done fold {fold_n} {t} seconds")

  score = calculate_score(y_pred_oofs, train)
  print(f"oof brier score: {score:.4f}")
  return layer_sizes, state_dicts

In [11]:
def test_models(layer_sizes, state_dicts):
  """Score the fold models' averaged predictions on the held-out `test` rows.

  Rebuilds a model for every state dict, predicts the test nodes for both
  node tables, averages the predictions over the models, and prints the
  resulting Brier score.

  Args:
    layer_sizes: layer size list returned by `train_models`.
    state_dicts: per-fold state dicts returned by `train_models`.
  """
  i_test = long_tensor(test)

  y_preds = [
    torch.zeros(y_true.shape[0], device=device, dtype=torch.float32)
    for y_true in y_trues
  ]

  for state_dict in state_dicts:
    model = initialize_model(layer_sizes).to(device)
    model.load_state_dict(state_dict)
    model.eval()

    with torch.no_grad():
      for x, y_pred in zip(xs, y_preds):
        # The model returns one value per test node, while y_pred spans all
        # nodes, so accumulate into the test rows only.
        y_pred[i_test] += model(i_test, x).flatten()

  for y_pred in y_preds:
    y_pred /= len(state_dicts)

  score = calculate_score(y_preds, test)
  print(f"test brier score: {score:.4f}")

In [12]:
# Train the per-fold models with the default hyperparameters.
# NOTE(review): the recorded run below stopped in fold 1 with a CUDA
# out-of-memory error, so `layer_sizes`/`state_dicts` were never assigned.
layer_sizes, state_dicts = train_models()

  fold 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.82 GiB. GPU 0 has a total capacity of 14.74 GiB of which 4.54 GiB is free. Process 29429 has 10.20 GiB memory in use. Of the allocated memory 9.21 GiB is allocated by PyTorch, and 899.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Evaluate the fold ensemble on the held-out rows; requires the training
# cell above to have completed successfully.
test_models(layer_sizes, state_dicts)