In [25]:
import pandas as pd
from pathlib import Path

# Input/output folders, relative to the notebook's working directory.
# The processed folder is created up front (no error if it already exists).
DATA_RAW = Path("../data/raw")
DATA_PROCESSED = Path("../data/processed")
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# ---- 1. Load venue/external factors ----
venue = pd.read_csv(DATA_RAW / "1_venue.csv")

# Columns carried forward into the external-factors table, in output order.
external_cols = [
    # identifiers
    "id", "league_id", "season_id",
    # venue
    "venue_id", "venue_city", "venue_capacity", "venue_surface_isgrass",
    # conditions
    "temp", "cloudcover", "humidity", "pressure",
    "visibility", "winddir", "windspeed", "night_game",
    # travel / crowd
    "travel_km", "home_not_home", "travel_outside_state", "attendance_ratio",
    # target
    "home_win",
]

# One pipeline: rename the travel column, derive the binary target, then keep
# only the columns listed above.
# result: 2 = home win, 1 = draw, 0 = away win  (dataset convention)
# NOTE(review): a missing `result` compares unequal to 2 and therefore ends up
# as home_win == 0 -- confirm the raw file has no missing results.
df_ext = (
    venue
    .rename(columns={"travel_dist(km)": "travel_km"})
    .assign(home_win=lambda frame: frame["result"].eq(2).astype(int))
    .loc[:, external_cols]
    .copy()
)

# Quick look at the table, followed by the ten columns with the highest
# share of missing values.
missing_share = df_ext.isna().mean()

print(df_ext.head().to_string())
print("\nMissingness:")
print(missing_share.sort_values(ascending=False).head(10))


       id  league_id  season_id  venue_id     venue_city  venue_capacity  venue_surface_isgrass  temp  cloudcover  humidity  pressure  visibility  winddir  windspeed  night_game  travel_km  home_not_home  travel_outside_state  attendance_ratio  home_win
0  251728        301        765        37        Furiani           16078                      1  23.9          27        48      1022          30       21       3.06           1        886              0                     0          0.816146         1
1  251738        301        765      4451         Monaco           18523                      1  17.9          25        73      1014          10      312       0.92           1        977              0                     1          0.432921         0
2    2188          8         13       199           Hull           25400                      1  20.2          54        62      1014          10      315       3.36           0        136              0                     0          0.7

In [26]:
odds = pd.read_csv(DATA_RAW / "5_odds.csv")

# Raw odds column -> friendlier name used from here on.
odds_column_names = {
    "3W__1_mean": "odds_home",   # home win odds
    "3W__X_mean": "odds_draw",   # draw odds
    "3W__2_mean": "odds_away",   # away win odds
}
odds_key_columns = ["id", "league_id", "season_id"]

# Keep only the identifiers plus the three outcome odds, renamed in one step.
# Column selection and rename both return new frames, so `odds` is untouched.
odds_small = (
    odds[odds_key_columns + list(odds_column_names)]
    .rename(columns=odds_column_names)
)

print(odds_small.head().to_string())


        id  league_id  season_id  odds_home  odds_draw  odds_away
0  1710802          8       6397   1.766429   4.372500   6.534643
1  1710803          8       6397  10.623215   5.670357   1.303571
2  1710804          8       6397   1.285714   5.611072  12.416785
3  1710805          8       6397   1.806071   3.547857   4.892858
4  1710806          8       6397   1.760714   3.545714   5.221428


In [27]:
# Keep only rows whose three odds are all strictly positive. A missing value
# fails the `> 0` comparison, so rows with missing odds are dropped as well.
odds_columns = ["odds_home", "odds_draw", "odds_away"]
mask_valid = odds_small[odds_columns].gt(0).all(axis=1)
odds_small = odds_small.loc[mask_valid].copy()

# Reciprocal of each odds column, in home / draw / away order.
inv_home, inv_draw, inv_away = (1 / odds_small[column] for column in odds_columns)

# Sum of the three reciprocals; dividing by it rescales them to sum to 1.
margin = inv_home + inv_draw + inv_away

odds_small = odds_small.assign(
    p_home_implied=inv_home / margin,
    p_draw_implied=inv_draw / margin,
    p_away_implied=inv_away / margin,
)

implied_preview = odds_small[["id", "p_home_implied", "p_draw_implied", "p_away_implied"]]
print(implied_preview.head().to_string())


        id  p_home_implied  p_draw_implied  p_away_implied
0  1710802        0.597263        0.241286        0.161451
1  1710803        0.090721        0.169963        0.739316
2  1710804        0.750365        0.171938        0.077698
3  1710805        0.532429        0.271038        0.196532
4  1710806        0.545320        0.270793        0.183887


In [28]:
# Attach the implied probabilities to the external-factors table.
# The join key is the match `id` only; league_id / season_id are not used as
# keys here. With how="inner", matches lacking a row on either side are dropped.
implied_columns = ["p_home_implied", "p_draw_implied", "p_away_implied"]
odds_for_merge = odds_small[["id"] + implied_columns]

df_model = pd.merge(df_ext, odds_for_merge, on="id", how="inner")

print("Shape after merge:", df_model.shape)
print(df_model.head().to_string())

# Save modeling-ready dataset (row index is not written to the file)
out_path = DATA_PROCESSED / "matches_modeling_dataset.csv"
df_model.to_csv(out_path, index=False)
print("Saved:", out_path)


Shape after merge: (8415, 23)
        id  league_id  season_id  venue_id           venue_city  venue_capacity  venue_surface_isgrass  temp  cloudcover  humidity  pressure  visibility  winddir  windspeed  night_game  travel_km  home_not_home  travel_outside_state  attendance_ratio  home_win  p_home_implied  p_draw_implied  p_away_implied
0  1726170        301       6405      4451               Monaco           18523                      1  17.8           0        76      1014          10       12       0.00           1        482              0                     1          0.495060         0        0.729825        0.175775        0.094400
1  1726172        301       6405       131                Paris           47929                      1  23.3          38        45      1018          28      291       4.75           0        117              0                     0          0.978489         0        0.873354        0.092171        0.034475
2  1726167        301       6405      6161 

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss

# Predictor columns fed to the baseline model, one per line.
features_core = [
    "p_home_implied",
    "p_draw_implied",
    "p_away_implied",
    "temp",
    "humidity",
    "windspeed",
    "travel_km",
    "night_game",
    "attendance_ratio",
]

X = df_model[features_core]
y = df_model["home_win"]

# 80/20 split with a fixed seed; stratifying on y keeps the class balance
# the same in the train and test parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Second column of predict_proba = probability of the second entry in
# clf.classes_ (the positive class for a 0/1 target).
y_pred_proba = clf.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_pred_proba)
brier_score = brier_score_loss(y_test, y_pred_proba)
print("AUC:", auc_score)
print("Brier:", brier_score)


AUC: 0.7537468612680476
Brier: 0.17542905439177361
