In [1]:
import os
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import xgboost as xgb
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
)
from torch_geometric.data import Data

import wandb
from soccerai.data.converters import ConnectionMode
from soccerai.data.converters.shot_prediction_converter import (
    ShotPredictionGraphConverter,
)
from wandb.integration.xgboost import WandbCallback

# Graph converter using the FULLY_CONNECTED connection mode.
# NOTE(review): the graph-building cell further down constructs an identical
# converter again right before using it, so this instance is never used in
# between.
converter = ShotPredictionGraphConverter(ConnectionMode.FULLY_CONNECTED)

In [2]:
# Location of the raw parquet dataset. The default is the original author's
# absolute path; set the SOCCERAI_DATASET environment variable to point the
# notebook at a copy on another machine without editing this cell.
DATASET_PATH = os.environ.get(
    "SOCCERAI_DATASET",
    "/home/fbaiocchi/soccerai/soccerai/data/resources/raw/dataset.parquet",
)

df = pl.read_parquet(DATASET_PATH)
print("Raw dataset shape:", df.shape)
print(df.columns)

Raw dataset shape: (428698, 50)
['index', 'gameId', 'gameEventId', 'possessionEventId', 'startTime', 'endTime', 'duration', 'gameEventType', 'possessionEventType', 'teamName', 'playerName', 'videoUrl', 'frameTime', 'label', 'index_right', 'gameId_right', 'team', 'x', 'y', 'z', 'jerseyNum', 'visibility', 'velocity', 'direction', 'homeTeamName', 'awayTeamName', 'homeTeamStartLeft', 'startPeriod2', 'playerId', 'playerName_right', 'playerRole', 'Full Name', 'Height', 'Weight', 'Age Info', 'goals', 'shots', 'shots_on_target', 'shots_on_target_pct', 'shots_per90', 'shots_on_target_per90', 'goals_per_shot', 'goals_per_shot_on_target', 'average_shot_distance', 'pens_made', 'pens_att', 'Market Value', 'birth_date', 'age', 'height_cm']


In [3]:
def split_dataframe_by_event_keys(
    df: pl.DataFrame,
    key_cols: Optional[list[str]] = None,
    val_ratio: float = 0.2,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Split ``df`` into train/validation frames without splitting an event.

    The distinct combinations of ``key_cols`` are sorted; the first
    ``1 - val_ratio`` fraction of them selects the training rows and the
    remaining keys select the validation rows. All rows sharing a key therefore
    end up on the same side of the split.

    Args:
        df: Frame to split; must contain every column listed in ``key_cols``.
        key_cols: Columns that together identify one event. Defaults to
            ``["gameEventId", "possessionEventId"]``.
        val_ratio: Fraction of distinct keys (not of rows) assigned to the
            validation frame; must lie in ``[0, 1]``.

    Returns:
        A ``(train_df, val_df)`` tuple.

    Raises:
        ValueError: If ``val_ratio`` is outside ``[0, 1]`` (including NaN).
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio!r}")

    # A None sentinel replaces the former mutable list default, which was a
    # single object shared by every call.
    if key_cols is None:
        key_cols = ["gameEventId", "possessionEventId"]
    else:
        key_cols = list(key_cols)

    # The sort is what makes the split reproducible: unique() on its own does
    # not promise any particular row order.
    event_keys_df = df.select(key_cols).unique().sort(key_cols)
    num_events = event_keys_df.height

    train_end_idx = int((1.0 - val_ratio) * num_events)

    train_event_keys = event_keys_df.slice(0, train_end_idx)
    # slice() takes (offset, length); omitting the length means "to the end".
    val_event_keys = event_keys_df.slice(train_end_idx)

    # Semi joins keep the rows of df whose key appears in the chosen key set,
    # without adding any columns.
    train_df = df.join(train_event_keys, on=key_cols, how="semi")
    val_df = df.join(val_event_keys, on=key_cols, how="semi")
    return train_df, val_df


train_df, val_df = split_dataframe_by_event_keys(df, val_ratio=0.2)

# The two key sets partition the distinct event keys, so every input row should
# land in exactly one of the two frames. Fail loudly if rows went missing
# (presumably possible for rows whose key columns are null -- TODO confirm).
assert train_df.height + val_df.height == df.height, (
    "train/val split lost or duplicated rows: "
    f"{train_df.height} + {val_df.height} != {df.height}"
)
print("Train rows:", train_df.height, " Val rows:", val_df.height)

Train rows: 342953  Val rows: 85745


In [4]:
# Fresh converter with the same configuration as the one built in the first
# cell.
converter = ShotPredictionGraphConverter(ConnectionMode.FULLY_CONNECTED)

# Convert each split into a list of torch_geometric Data objects; the training
# split is converted first.
# NOTE(review): the output saved with this cell is a polars
# ColumnNotFoundError for "Weight", raised on a 16-column frame. The raw frame
# printed earlier does list a "Weight" column, so it is presumably dropped or
# renamed somewhere inside the converter -- TODO investigate; every cell below
# depends on these two lists.
train_graphs: List[Data] = converter.convert_dataframe_to_data_list(train_df)
val_graphs: List[Data] = converter.convert_dataframe_to_data_list(val_df)


# [22, d] → [22*d]
def flatten_graphs(graphs: List[Data]) -> tuple[np.ndarray, np.ndarray]:
    """Turn a list of graphs into a dense feature matrix and a label vector.

    Each graph's node-feature tensor ``g.x`` is flattened row by row into one
    1-D vector, and the vectors are stacked into one row per graph. Every graph
    must therefore have the same ``x`` shape.

    Args:
        graphs: Non-empty list of graphs exposing a 2-D tensor ``x`` and a
            single-element tensor ``y``.

    Returns:
        ``(X, y)`` where ``X`` has one row per graph and ``y`` holds the labels
        as Python floats converted to a NumPy array.

    Raises:
        ValueError: If ``graphs`` is empty, or (from ``np.stack``) if the
            flattened graphs do not all have the same length.
    """
    if not graphs:
        raise ValueError("flatten_graphs() needs at least one graph")

    # detach()/cpu() make the conversion work for tensors that track gradients
    # or live on an accelerator; for plain CPU tensors they are no-ops.
    X = np.stack(
        [g.x.detach().cpu().numpy().reshape(-1) for g in graphs], axis=0
    )
    y = np.array([float(g.y.item()) for g in graphs])
    return X, y


# Flatten both splits (training first, then validation) into dense arrays.
(X_train, y_train), (X_val, y_val) = (
    flatten_graphs(train_graphs),
    flatten_graphs(val_graphs),
)

# Quick shape check of the resulting matrices and label vectors.
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape, "y_val:", y_val.shape)

ColumnNotFoundError: Weight

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'sink' <---
DF ["gameEventId", "possessionEventId", "duration", "possessionEventType", ...]; PROJECT */16 COLUMNS

In [None]:
# Wrap the flattened feature matrices and their labels in XGBoost DMatrix
# containers.
# NOTE(review): the training cell below builds the same two DMatrix objects
# again, so this cell is redundant on a top-to-bottom run.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [None]:
# Single train/validation run. Every hyperparameter is registered in the W&B
# run config first and then read back from it, so the values stored with the
# run are the ones actually handed to XGBoost.
wandb.init(
    project="soccerai",
    name="xgb-many-rounds",
    reinit=True,
    config=dict(
        objective="binary:logistic",
        eval_metric=["auc", "logloss"],
        eta=0.005,
        max_depth=1,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=42,
        num_boost_round=1000,
    ),
)
cfg = wandb.config

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Booster parameters: every config entry except num_boost_round, which is
# passed to xgb.train() as its own argument.
params = {
    name: getattr(cfg, name)
    for name in (
        "objective",
        "eval_metric",
        "eta",
        "max_depth",
        "subsample",
        "colsample_bytree",
        "seed",
    )
}

# Both splits are evaluated every round; WandbCallback forwards the metrics to
# the active run.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=cfg.num_boost_round,
    evals=[(dtrain, "train"), (dval, "validation")],
    callbacks=[WandbCallback()],
)

# Model scores for both splits, then hard labels using a 0.5 cut-off.
train_probs, val_probs = bst.predict(dtrain), bst.predict(dval)
train_preds, val_preds = (
    (train_probs > 0.5).astype(int),
    (val_probs > 0.5).astype(int),
)

# Final summary metrics for the run.
wandb.log(
    {
        "train/accuracy": accuracy_score(y_train, train_preds),
        "val/accuracy": accuracy_score(y_val, val_preds),
        "val/roc_auc": roc_auc_score(y_val, val_probs),
    }
)


def log_cm(y_true, y_pred, tag):
    """Plot a binary confusion matrix and log it to the active W&B run.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1).
        tag: Human-readable title. It is also the source of the W&B key:
            whitespace runs are collapsed, spaces become underscores and the
            result is lower-cased, e.g. "Train Confusion Matrix" ->
            "train_confusion_matrix".
    """
    # Collapse padding such as "Val   Confusion Matrix"; replacing every single
    # space would otherwise produce the key "val___confusion_matrix" and a
    # padded plot title.
    clean_tag = " ".join(tag.split())
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        disp = ConfusionMatrixDisplay(cm, display_labels=["no_shot", "shot"])
        disp.plot(cmap="Blues", ax=ax, colorbar=False)
        ax.set_title(clean_tag)
        wandb.log({clean_tag.replace(" ", "_").lower(): wandb.Image(fig)})
    finally:
        # Always release the figure, even if plotting or logging raised.
        plt.close(fig)


# Log one confusion-matrix image per split, then close the W&B run that the
# training code above opened.
log_cm(y_train, train_preds, "Train Confusion Matrix")
log_cm(y_val, val_preds, "Val   Confusion Matrix")

wandb.finish()

In [None]:
# Column names of the preprocessed training frame, minus the two event-key
# columns and the label.
# NOTE(review): relies on the converter's private _preprocess_dataframe();
# presumably the remaining columns are the per-node features -- the assert
# below checks at least that the count matches.
node_cols = list(
    filter(
        lambda col: col not in ("gameEventId", "possessionEventId", "label"),
        converter._preprocess_dataframe(train_df).columns,
    )
)
node_feature_count = train_graphs[0].x.shape[1]
assert len(node_cols) == node_feature_count

In [None]:
# One name per flattened column, "<node index>_<feature>", in the node-major
# order that flatten_graphs() produces with reshape(-1).
# The node count is read from the first graph instead of hard-coding 22.
num_nodes = train_graphs[0].x.shape[0]
flat_feat_names = [
    f"{node:02d}_{feat}" for node in range(num_nodes) for feat in node_cols
]
total_flat = len(flat_feat_names)
assert total_flat == X_train.shape[1], (
    f"built {total_flat} feature names for {X_train.shape[1]} flattened columns"
)

In [None]:
# Gain-based importance per feature, as reported by the booster.
importance_dict = bst.get_score(importance_type="gain")

# Keys have the form "f<column index>" (e.g. "f27" -> 27); map each one back to
# its readable flattened feature name, keeping names and scores aligned.
real_names = [flat_feat_names[int(key[1:])] for key in importance_dict]
scores = list(importance_dict.values())


In [None]:
top_k = 20

# Positions into `scores`, highest gain first (ties keep their original order).
order = [
    pos
    for pos, _ in sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
]

# Print the best top_k features as an aligned two-column listing.
for i in order[:top_k]:
    feature, gain = real_names[i], scores[i]
    print(f"{feature:<30} gain = {gain:.3f}")


In [None]:
# Names and gains of the top_k highest-scoring features, best first.
top_names = [real_names[i] for i in order[:top_k]]
top_scores = [scores[i] for i in order[:top_k]]

fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(range(len(top_names)), top_scores, align="center")
ax.set_yticks(range(len(top_names)))
ax.set_yticklabels(top_names)
# Highest-gain feature at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel("Importance (gain)")
# Report the number of bars actually drawn: it follows top_k and can be smaller
# when fewer features received an importance score.
ax.set_title(f"Top {len(top_names)} Feature importances")
plt.tight_layout()

In [None]:
# XGBoost stratified k-fold cross-validation with logging to W&B.

# NOTE(review): xgboost, wandb and WandbCallback are already imported in the
# first cell; consider moving the StratifiedKFold import there as well.
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

import wandb
from wandb.integration.xgboost import WandbCallback

# Pool the earlier train/validation split; the folds below re-partition it.
X_all = np.vstack([X_train, X_val])
y_all = np.concatenate([y_train, y_val])

# The run is used as a context manager so that it is closed even when one of
# the folds raises.
with wandb.init(
    project="soccerai",
    name="xgb-kfold",
    config=dict(
        objective="binary:logistic",
        eval_metric=["auc", "logloss"],
        eta=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=42,
        num_boost_round=1000,
        early_stopping_rounds=20,
        n_splits=5,
    ),
) as run:
    cfg = run.config

    params = dict(
        objective=cfg.objective,
        eval_metric=cfg.eval_metric,
        eta=cfg.eta,
        max_depth=cfg.max_depth,
        subsample=cfg.subsample,
        colsample_bytree=cfg.colsample_bytree,
        seed=cfg.seed,
    )

    skf = StratifiedKFold(
        n_splits=cfg.n_splits,
        shuffle=True,
        random_state=cfg.seed,
    )

    fold_best_scores = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_all), 1):
        print(f"Fold {fold}/{cfg.n_splits}")

        dtrain = xgb.DMatrix(X_all[tr_idx], label=y_all[tr_idx])
        dval = xgb.DMatrix(X_all[va_idx], label=y_all[va_idx])

        # Early stopping watches the LAST eval set and the LAST metric in
        # eval_metric, i.e. logloss on this fold's held-out part; best_score
        # below is therefore a logloss, not an AUC.
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=cfg.num_boost_round,
            evals=[(dtrain, "train"), (dval, f"fold{fold}")],
            early_stopping_rounds=cfg.early_stopping_rounds,
            callbacks=[WandbCallback(log_model=False)],
        )

        wandb.log(
            {
                f"fold{fold}/best_iteration": bst.best_iteration,
                f"fold{fold}/best_score": bst.best_score,
            }
        )
        fold_best_scores.append(bst.best_score)

    # Cross-validated summary: the per-fold numbers alone do not give the
    # aggregate estimate that k-fold CV is run for.
    wandb.log(
        {
            "cv/mean_best_score": float(np.mean(fold_best_scores)),
            "cv/std_best_score": float(np.std(fold_best_scores)),
        }
    )

Fold 1/5
[0]	train-auc:0.64485	train-logloss:0.61131	fold1-auc:0.63381	fold1-logloss:0.61168
[1]	train-auc:0.68800	train-logloss:0.60749	fold1-auc:0.67568	fold1-logloss:0.60814
[2]	train-auc:0.69913	train-logloss:0.60420	fold1-auc:0.67982	fold1-logloss:0.60535
[3]	train-auc:0.70608	train-logloss:0.60079	fold1-auc:0.68604	fold1-logloss:0.60262
[4]	train-auc:0.71590	train-logloss:0.59800	fold1-auc:0.69529	fold1-logloss:0.60022
[5]	train-auc:0.72018	train-logloss:0.59546	fold1-auc:0.69726	fold1-logloss:0.59821
[6]	train-auc:0.72487	train-logloss:0.59260	fold1-auc:0.69679	fold1-logloss:0.59589
[7]	train-auc:0.73534	train-logloss:0.59030	fold1-auc:0.71062	fold1-logloss:0.59363
[8]	train-auc:0.74050	train-logloss:0.58780	fold1-auc:0.71379	fold1-logloss:0.59163
[9]	train-auc:0.74345	train-logloss:0.58541	fold1-auc:0.71462	fold1-logloss:0.58991
[10]	train-auc:0.74634	train-logloss:0.58334	fold1-auc:0.71787	fold1-logloss:0.58811
[11]	train-auc:0.75055	train-logloss:0.58067	fold1-auc:0.71961	fol

0,1
best_iteration,▁▁▁▁▁
best_score,▇▅▁▃█
epoch,▁▂▂▃▃▃▄▅▆▇▂▃▄▅▅▆▆███▂▃▄▄▄▆▇██▁▄▆▁▂▃▄▅▅▅█
fold1-auc,▁▁▂▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇█████████████
fold1-logloss,███▇▆▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
fold1/best_iteration,▁
fold1/best_score,▁
fold2-auc,▁▁▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
fold2-logloss,█▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
fold2/best_iteration,▁

0,1
best_iteration,999.0
best_score,0.34647
epoch,999.0
fold1/best_iteration,999.0
fold1/best_score,0.34531
fold2/best_iteration,999.0
fold2/best_score,0.34309
fold3/best_iteration,999.0
fold3/best_score,0.33897
fold4/best_iteration,999.0
