In [None]:
!pip3 install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import RGATConv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from google.colab import drive

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# pandas display settings, applied in the order listed.
for option_name, option_value in (
  ("display.expand_frame_repr", False),
  ("display.max_columns", None),
  ("display.min_rows", 10),
  ("display.max_rows", 10),
  ("display.width", None),
):
  pd.set_option(option_name, option_value)

sns.set_theme(style="whitegrid")

# Mount Google Drive and derive the input / output folders beneath it.
drive_path = "/content/drive"
drive.mount(drive_path)
base_path = f"{drive_path}/My Drive/Colab Notebooks/gyat/input"
data_path = f"{base_path}/march-machine-learning-mania-2025"
gyat_path = f"{base_path}/gyat-dataset"

# Use the GPU whenever torch reports one as available.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

Mounted at /content/drive


In [None]:
# One frame per gender: Season plus that gender's DayZero parsed as a datetime
# and renamed to MDayZero / WDayZero.
day_zero_frames = [
  pd.read_csv(f"{data_path}/{gender}Seasons.csv", usecols=["Season", "DayZero"])
    .assign(DayZero=lambda frame: pd.to_datetime(frame["DayZero"]))
    .rename(columns={"DayZero": f"{gender}DayZero"})
  for gender in ["M", "W"]
]
men_day_zero, women_day_zero = day_zero_frames

# Outer merge keeps seasons that exist for only one gender (the other DayZero is NaT).
sea = (
  men_day_zero
  .merge(women_day_zero, on="Season", how="outer")
  .sort_values("Season")
  .reset_index(drop=True)
)

print(f"sea {sea.shape}", sea, "", sep="\n")
sea.info()

sea (41, 3)
    Season   MDayZero   WDayZero
0     1985 1984-10-29        NaT
1     1986 1985-10-28        NaT
2     1987 1986-10-27        NaT
3     1988 1987-11-02        NaT
4     1989 1988-10-31        NaT
..     ...        ...        ...
36    2021 2020-11-02 2020-11-02
37    2022 2021-11-01 2021-11-01
38    2023 2022-10-31 2022-10-31
39    2024 2023-11-06 2023-11-06
40    2025 2024-11-04 2024-11-04

[41 rows x 3 columns]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Season    41 non-null     int64         
 1   MDayZero  41 non-null     datetime64[ns]
 2   WDayZero  28 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int64(1)
memory usage: 1.1 KB


In [None]:
# Read both team files first and concatenate once. Starting from an empty DataFrame
# and concatenating inside the loop relies on pandas ignoring empty entries when it
# infers result dtypes, which is deprecated behaviour, and re-copies the data each pass.
tea = pd.concat([
  pd.read_csv(f"{data_path}/{gender}Teams.csv", usecols=["TeamID", "TeamName"])
  for gender in ["M", "W"]
])

tea = tea.sort_values("TeamID").reset_index(drop=True)

print(f"tea {tea.shape}")

tea (758, 2)


In [None]:
# Load the four detailed-results files, tagging every row with two flags:
# Men (file prefix is "M") and NCAATourney (file is the tournament file).
result_frames = [
  pd.read_csv(f"{data_path}/{gender}{part}DetailedResults.csv").assign(
    Men=gender == "M",
    NCAATourney=part == "NCAATourney",
  )
  for gender in ["M", "W"]
  for part in ["RegularSeason", "NCAATourney"]
]
nodes = pd.concat(result_frames)

# Encode the winner's location as A -> -1, N -> 0, H -> 1; the loser's is the negation.
winner_loc = nodes["WLoc"].map({"A": -1, "N": 0, "H": 1})
nodes = nodes.assign(WLoc=winner_loc, LLoc=winner_loc * -1)

# Every column (including the two boolean flags) is stored as int32.
nodes = nodes.astype("int32")

# Columns that belong to the game rather than to the winner ("W...") or loser ("L...").
both = nodes[[column for column in nodes.columns if not column.startswith(("W", "L"))]]

def extract(W_or_L, Le_or_Ri):
  """Return the columns of the global `nodes` whose name starts with `W_or_L`.

  The leading character of each selected column is replaced by the prefix
  f"{Le_or_Ri}_" (for example "WScore" becomes "Le_Score"). `nodes` itself is
  not modified.
  """
  renamed = {
    column: f"{Le_or_Ri}_{column[1:]}"
    for column in nodes
    if column[0] == W_or_L
  }
  return nodes[list(renamed)].rename(columns=renamed)

# Each game becomes two rows: one with the winner as Le and the loser as Ri,
# and one with the roles swapped.
winner_as_le = pd.concat([both, extract("W", "Le"), extract("L", "Ri")], axis=1)
loser_as_le = pd.concat([both, extract("L", "Le"), extract("W", "Ri")], axis=1)
nodes = pd.concat([winner_as_le, loser_as_le])

# Date = the season's DayZero for the row's gender + DayNum days.
nodes = pd.merge(nodes, sea, on="Season")
daynum = pd.to_timedelta(nodes["DayNum"], unit="D")
# "Men" holds 0/1 as int32 at this point, so it must be converted to a boolean mask
# explicitly: handing an integer Series to .loc selects the rows *labelled* 0 and 1
# instead of the men's rows, leaving every other row on the women's DayZero.
is_men = nodes["Men"].astype(bool)
day_zero = nodes["MDayZero"].where(is_men, nodes["WDayZero"])
nodes["Date"] = day_zero + daynum
nodes = nodes.drop(columns=["MDayZero", "WDayZero"])

# TeamName
def add_team_name(Le_or_Ri):
  """Return the global `nodes` with a f"{Le_or_Ri}_TeamName" column added.

  The name is looked up in the global `tea` frame by matching
  f"{Le_or_Ri}_TeamID" against tea's "TeamID" (pandas' default inner merge);
  the helper "TeamID" column brought in by the merge is dropped again.
  """
  with_name = nodes.merge(tea, left_on=f"{Le_or_Ri}_TeamID", right_on="TeamID")
  with_name = with_name.drop(columns=["TeamID"])
  return with_name.rename(columns={"TeamName": f"{Le_or_Ri}_TeamName"})
# add_team_name reads the global `nodes`, so the result of the first call must be
# assigned before the second call runs.
nodes = add_team_name("Le")
nodes = add_team_name("Ri")

# Le_Margin: Le score minus Ri score.
# SeasonsAgo: distance of the row's season from 2025.
# Ri_Loc is dropped, leaving Le_Loc as the only location column.
nodes = nodes.assign(
  Le_Margin=nodes["Le_Score"] - nodes["Ri_Score"],
  SeasonsAgo=2025 - nodes["Season"],
).drop(columns=["Ri_Loc"])

# Split ascending and descending TeamIDs
#   so model doesn't learn from noise in arbitrary order
ascending = nodes["Le_TeamID"] < nodes["Ri_TeamID"]
nodes_asc, nodes_des = nodes[ascending], nodes[~ascending]
del nodes

# Key
def key(ascending=True):
  """Add a "Key" column to the global `nodes_asc` (or `nodes_des`) in place.

  The key is "<Season>_<DayNum zero-padded to 3>_<first TeamID>_<second TeamID>".
  For ascending=True the Le team id comes first and the frame is `nodes_asc`;
  for ascending=False the Ri team id comes first and the frame is `nodes_des`.
  Returns None.
  """
  frame = nodes_asc if ascending else nodes_des
  first, second = ("Le", "Ri") if ascending else ("Ri", "Le")

  season = frame["Season"].astype(str)
  day = frame["DayNum"].astype(str).str.zfill(3)
  first_id = frame[f"{first}_TeamID"].astype(str)
  second_id = frame[f"{second}_TeamID"].astype(str)

  frame["Key"] = season + "_" + day + "_" + first_id + "_" + second_id
# Add "Key" to nodes_asc, then to nodes_des. The descending call lists the Ri team id
# before the Le team id, so both frames put the smaller team id first in the key.
key()
key(ascending=False)


def order_columns(df):
  """Return `df` with a fixed set of leading columns followed by all the others.

  The leading columns are, in order: identifiers, Le_Margin, the features that
  are not scaled, and the features that are scaled. Every remaining column
  keeps its original relative order after them.
  """
  identifiers = ["Key", "Season", "Date", "Le_TeamID", "Ri_TeamID", "Le_TeamName", "Ri_TeamName"]
  margin = ["Le_Margin"]
  unscaled_features = ["Men", "NCAATourney", "Le_Loc"]
  scaled_features = ["SeasonsAgo", "DayNum", "NumOT"]

  leading = identifiers + margin + unscaled_features + scaled_features
  trailing = [column for column in df.columns if column not in leading]
  return df[leading + trailing]


# Put both frames into the canonical column order.
nodes_asc = order_columns(nodes_asc)
nodes_des = order_columns(nodes_des)

# Shape line, then the frame; a blank line separates the two frames.
print(f"nodes_asc {nodes_asc.shape}", nodes_asc, "", sep="\n")
print(f"nodes_des {nodes_des.shape}", nodes_des, sep="\n")

nodes_asc (202866, 42)
                       Key  Season       Date  Le_TeamID  Ri_TeamID  Le_TeamName     Ri_TeamName  Le_Margin  Men  NCAATourney  Le_Loc  SeasonsAgo  DayNum  NumOT  Le_Score  Le_FGM  Le_FGA  Le_FGM3  Le_FGA3  Le_FTM  Le_FTA  Le_OR  Le_DR  Le_Ast  Le_TO  Le_Stl  Le_Blk  Le_PF  Ri_Score  Ri_FGM  Ri_FGA  Ri_FGM3  Ri_FGA3  Ri_FTM  Ri_FTA  Ri_OR  Ri_DR  Ri_Ast  Ri_TO  Ri_Stl  Ri_Blk  Ri_PF
0       2003_010_1104_1328    2003 2002-11-14       1104       1328      Alabama        Oklahoma          6    1            0       0          22      10      0        68      27      58        3       14      11      18     14     24      13     23       7       1     22        62      22      53        2       10      16      22     10     22       8     18       9       2     20
1       2003_010_1272_1393    2003 2002-11-14       1272       1393      Memphis        Syracuse          7    1            0       0          22      10      0        70      26      62        8       20   

In [None]:
def as_struct(nodes):
  """Pack four columns of `nodes` into a 1-D NumPy structured array.

  The fields are "Index", "Date", "Season" and "NCAATourney", each keeping the
  dtype of the corresponding DataFrame column; there is one record per row.
  """
  cols = ["Index", "Date", "Season", "NCAATourney"]
  field_types = [(name, nodes[name].dtype) for name in cols]
  # Walk the four columns in lockstep to get one plain tuple per row.
  rows = list(zip(*(nodes[name] for name in cols)))
  return np.array(rows, dtype=field_types)


def create_edges(source, target, edge_type):
  """Build one edge row for every (source, target) pair of node records.

  Parameters
  ----------
  source, target : 1-D NumPy structured arrays; both must have a "Date" field.
      Every field is copied into the result with a "Source" / "Target" prefix,
      except the two date fields, which are consumed to compute the time columns.
  edge_type : integer stored in the "Type" column (int32) of every row.

  Returns
  -------
  DataFrame with len(source) * len(target) rows holding the prefixed fields
  followed by "Type", "Direction" (sign of TargetDate - SourceDate in whole
  days: -1, 0 or 1) and "Delta" (absolute number of days), all int32.
  Rows are ordered target-major: every source for the first target, then every
  source for the second target, and so on.
  """
  source_grid, target_grid = np.meshgrid(source, target)
  edges = pd.concat([
      pd.DataFrame(source_grid.ravel()).add_prefix("Source"),
      pd.DataFrame(target_grid.ravel()).add_prefix("Target"),
  ], axis=1)
  edges["Type"] = np.full(len(edges), edge_type, dtype="int32")

  signed_days = (edges["TargetDate"] - edges["SourceDate"]).dt.days.astype("int32")
  # Appending Direction before Delta gives the Type, Direction, Delta order without
  # a hard-coded insert position, so the layout holds for any number of struct fields.
  edges["Direction"] = np.sign(signed_days).astype("int32")
  edges["Delta"] = signed_days.abs()
  return edges.drop(columns=["SourceDate", "TargetDate"])


def index(df):
  """Return a copy of `df` sorted by "Key" with a leading "Index" column.

  "Index" holds each row's 0-based position in the sorted order (taken from an
  index cast to int32); the returned frame has a fresh default index. The input
  frame is left untouched.
  """
  ordered = df.sort_values("Key", ignore_index=True)
  ordered.index = ordered.index.astype("int32")
  return ordered.rename_axis("Index").reset_index()


def _side_edges(nodes_season, own, other, edge_types):
  """Collect the edge frames whose source games have a given team on the `own` side.

  `own` / `other` are the column prefixes "Le" / "Ri" (or swapped). For every team T
  appearing in f"{own}_TeamID", the source nodes are T's games on that side, and
  `edge_types` = (same, cross, opp_same, opp_cross) labels the four target sets:

  - same:      T's games on the `own` side (the source set itself)
  - cross:     T's games on the `other` side
  - opp_same:  for each opponent O met in the source games, O's games on the `own` side
  - opp_cross: O's games on the `other` side, excluding those with T on the `own` side

  Returns a list of DataFrames produced by create_edges.
  """
  same, cross, opp_same, opp_cross = edge_types
  own_col, other_col = f"{own}_TeamID", f"{other}_TeamID"
  collected = []

  for team_id, own_nodes in nodes_season.groupby(own_col):
    own_struct = as_struct(own_nodes)
    collected.append(create_edges(own_struct, own_struct, same))

    cross_nodes = nodes_season[nodes_season[other_col] == team_id]
    collected.append(create_edges(own_struct, as_struct(cross_nodes), cross))

    for opponent_id in own_nodes[other_col].unique():
      opp_same_nodes = nodes_season[nodes_season[own_col] == opponent_id]
      collected.append(create_edges(own_struct, as_struct(opp_same_nodes), opp_same))

      opp_cross_nodes = nodes_season[
        (nodes_season[other_col] == opponent_id) &
        (nodes_season[own_col] != team_id)
      ]
      collected.append(create_edges(own_struct, as_struct(opp_cross_nodes), opp_cross))

  return collected


# Every (Season, Men) combination present in nodes_asc, in sorted order.
season_genders = nodes_asc[["Season", "Men"]].drop_duplicates().sort_values(["Season", "Men"])

for season, men in season_genders.itertuples(index=False):
  gender = "Men" if men else "Women"
  print(f"Processing {season} {gender}")

  nodes_season = index(nodes_asc[
    (nodes_asc["Season"] == season) &
    (nodes_asc["Men"] == men)
  ])
  nodes_season.to_csv(f"{gyat_path}/{season}_{gender}_nodes_asc.csv", index=False)

  # Select the mirrored rows by season and gender. index() has already replaced the
  # row labels of nodes_season with 0..n-1, so matching nodes_des on those labels
  # would pick unrelated games rather than this season's.
  nodes_season_des = index(nodes_des[
    (nodes_des["Season"] == season) &
    (nodes_des["Men"] == men)
  ])
  # Both frames are sorted by Key, so equal keys mean row i describes the same game in each.
  assert nodes_season["Key"].equals(nodes_season_des["Key"]), (
    f"asc/des nodes out of sync for {season} {gender}"
  )
  nodes_season_des.to_csv(f"{gyat_path}/{season}_{gender}_nodes_des.csv", index=False)

  # Edge types 0, 1, 4, 5 start from games grouped by their Le team,
  # types 2, 3, 6, 7 from games grouped by their Ri team.
  edges = pd.concat(
    _side_edges(nodes_season, "Le", "Ri", (0, 1, 4, 5)) +
    _side_edges(nodes_season, "Ri", "Le", (2, 3, 6, 7))
  )
  # Drop self-loops, then fix a deterministic row order.
  edges = edges[edges["SourceIndex"] != edges["TargetIndex"]]
  edges = edges.sort_values(["SourceIndex", "TargetIndex", "Type"]).reset_index(drop=True)
  edges.index = edges.index.astype("int32")
  edges.to_csv(f"{gyat_path}/{season}_{gender}_edges.csv", index=False)

  # Show the edge table for two sample seasons only.
  if season in (2006, 2007):
    print(f"edges {edges.shape}")
    print(edges)
    print()
    edges.info()
    print()
    print()

Processing 2003 Men
Processing 2004 Men
Processing 2005 Men
Processing 2006 Men
edges (4002127, 9)
         SourceIndex  SourceSeason  SourceNCAATourney  TargetIndex  TargetSeason  TargetNCAATourney  Type  Direction  Delta
0                  0          2006                  0            1          2006                  0     5          0      0
1                  0          2006                  0            1          2006                  0     7          0      0
2                  0          2006                  0            3          2006                  0     0          1      1
3                  0          2006                  0            3          2006                  0     7          1      1
4                  0          2006                  0            4          2006                  0     2          1      1
...              ...           ...                ...          ...           ...                ...   ...        ...    ...
4002122         4820          200