# FinalCode — World Cup match forecasting (CS 418)

This notebook runs end-to-end and generates all artifacts referenced by `FinalReport.ipynb`:

## 0. Setup

In [42]:
from __future__ import annotations

from pathlib import Path
import json
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

_CWD = Path.cwd().resolve()
PROJECT_ROOT = None
for p in [_CWD] + list(_CWD.parents):
    if (p / "src" / "config.py").exists():
        PROJECT_ROOT = p
        break
if PROJECT_ROOT is None:
    raise FileNotFoundError("Could not locate project root (expected `src/config.py`).")

if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import src.config as cfg
import src.cleaning as cleaning
import src.elo as elo_mod
import src.features as feat_mod

RANDOM_STATE = getattr(cfg, "TrainConfig", object) and getattr(getattr(cfg, "TrainConfig", object), "random_state", 42)
RANDOM_STATE = 42 if not isinstance(RANDOM_STATE, int) else RANDOM_STATE
np.random.seed(RANDOM_STATE)

# Canonical artifact paths (do not rely on cfg defining every constant)
FIG_DIR = PROJECT_ROOT / "reports" / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)
METRICS_PATH = PROJECT_ROOT / "reports" / "metrics_holdout_2022.json"

# Cleaned data paths
MATCHES_CLEAN_PARQUET = getattr(cfg, "MATCHES_CLEAN_PARQUET", (PROJECT_ROOT / "data" / "processed" / "matches_clean.parquet"))
DATA_PROCESSED_DIR = getattr(cfg, "DATA_PROCESSED_DIR", MATCHES_CLEAN_PARQUET.parent)
MATCHES_CLEAN_CSV = DATA_PROCESSED_DIR / "matches_clean.csv"

print("PROJECT_ROOT =", PROJECT_ROOT)
print("Cleaned parquet =", MATCHES_CLEAN_PARQUET)
print("Cleaned CSV =", MATCHES_CLEAN_CSV)
print("Figures dir =", FIG_DIR)
print("Metrics path =", METRICS_PATH)

PROJECT_ROOT = C:\CS418\project-check-in-team-4\worldcup-predictor
Cleaned parquet = C:\CS418\project-check-in-team-4\worldcup-predictor\data\processed\matches_clean.parquet
Cleaned CSV = C:\CS418\project-check-in-team-4\worldcup-predictor\data\processed\matches_clean.csv
Figures dir = C:\CS418\project-check-in-team-4\worldcup-predictor\reports\figures
Metrics path = C:\CS418\project-check-in-team-4\worldcup-predictor\reports\metrics_holdout_2022.json


## 1. Data loading and cleaning

In [43]:
# Load cleaned artifacts
matches_clean = None

# Prefer parquet if available
if MATCHES_CLEAN_PARQUET.exists():
    try:
        matches_clean = pd.read_parquet(MATCHES_CLEAN_PARQUET)
    except Exception:
        matches_clean = None  # parquet engine may be unavailable in some environments

# Fall back to CSV, then cleaning pipeline
if matches_clean is None:
    if MATCHES_CLEAN_CSV.exists():
        matches_clean = pd.read_csv(MATCHES_CLEAN_CSV)
    else:
        matches_clean = cleaning.run_clean_and_save()

# Ensure CSV exists for submission / compatibility
DATA_PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
if not MATCHES_CLEAN_CSV.exists():
    matches_clean.to_csv(MATCHES_CLEAN_CSV, index=False)

# Lightweight sanity checks (avoid huge outputs)
matches_clean[["year","team_home","team_away","score_home","score_away","match_result_1x2"]].head(3)


Unnamed: 0,year,team_home,team_away,score_home,score_away,match_result_1x2
0,1930,FRA,MEX,4,1,win
1,1930,USA,BEL,3,0,win
2,1930,ROU,PER,3,1,win


In [44]:
# Compact data summary for the report
data_summary = {
    "n_matches_clean": int(len(matches_clean)),
    "year_min": int(matches_clean["year"].min()),
    "year_max": int(matches_clean["year"].max()),
    "n_unique_teams": int(pd.unique(pd.concat([matches_clean["team_home"], matches_clean["team_away"]])).size),
    "label_counts": matches_clean["match_result_1x2"].value_counts().to_dict(),
}
pd.DataFrame({"Metric": list(data_summary.keys()), "Value": list(data_summary.values())})

Unnamed: 0,Metric,Value
0,n_matches_clean,755
1,year_min,1930
2,year_max,2022
3,n_unique_teams,76
4,label_counts,"{'win': 410, 'loss': 201, 'draw': 144}"


## 2. Visualization — Elo trajectories

In [45]:
# Build Elo features (chronological)
feat_df = feat_mod.engineer_features(matches_clean)

# Create a tidy team-level series of pre-match Elo for plotting
home_part = feat_df[["date","year","team_home","elo_home_pre"]].rename(columns={"team_home":"team","elo_home_pre":"elo"})
away_part = feat_df[["date","year","team_away","elo_away_pre"]].rename(columns={"team_away":"team","elo_away_pre":"elo"})
elo_long = pd.concat([home_part, away_part], ignore_index=True).sort_values("date")

# Pick a small set of teams for the plot (top by match count)
top_teams = (
    elo_long["team"].value_counts().head(6).index.tolist()
)

plt.figure(figsize=(10, 5))
for t in top_teams:
    sub = elo_long[elo_long["team"] == t]
    # Downsample slightly if needed (still deterministic)
    sub = sub.iloc[::max(1, len(sub)//200)]
    plt.plot(sub["date"], sub["elo"], label=t)

plt.title("Elo trajectories (pre-match) for frequently appearing teams")
plt.xlabel("Date")
plt.ylabel("Elo (pre-match)")
plt.legend(loc="best", fontsize=8)
plt.tight_layout()

out_path = FIG_DIR / "elo_trajectories.png"
plt.savefig(out_path, dpi=200)
plt.close()

out_path

WindowsPath('C:/CS418/project-check-in-team-4/worldcup-predictor/reports/figures/elo_trajectories.png')

## 3. ML/Stats — temporal holdout evaluation (2022)

In [46]:
TEST_YEAR = 2022
LABELS_ORDER = ["win", "draw", "loss"]

# Strict temporal split to avoid leakage
train_df = feat_df[feat_df["year"] < TEST_YEAR].copy()
test_df  = feat_df[feat_df["year"] == TEST_YEAR].copy()

FEATURE_COLS = ["elo_home_pre", "elo_away_pre", "elo_delta_pre", "is_knockout"]
TARGET_COL = "match_result_1x2"

X_train = train_df[FEATURE_COLS]
y_train = train_df[TARGET_COL].astype(str)

X_test = test_df[FEATURE_COLS]
y_test = test_df[TARGET_COL].astype(str)

holdout_label_counts = y_test.value_counts().to_dict()

len(train_df), len(test_df), holdout_label_counts

(694, 61, {'win': 32, 'loss': 20, 'draw': 9})

In [47]:
def plot_confusion_matrix(cm: np.ndarray, labels: list[str], title: str, out_file: Path) -> None:
    plt.figure(figsize=(5, 4))
    im = plt.imshow(cm, interpolation="nearest")
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(range(len(labels)), labels, rotation=45, ha="right")
    plt.yticks(range(len(labels)), labels)

    # annotate counts
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, str(int(cm[i, j])), ha="center", va="center")

    plt.tight_layout()
    plt.savefig(out_file, dpi=200)
    plt.close()

In [48]:
# Technique #1: Multinomial Logistic Regression baseline (with scaling) using a Pipeline
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
preproc_lr = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURE_COLS)],
    remainder="drop",
)

model1 = Pipeline(
    steps=[
        ("prep", preproc_lr),
        ("clf", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, multi_class="multinomial")),
    ]
)

model1.fit(X_train, y_train)

pred1 = model1.predict(X_test)
proba1 = model1.predict_proba(X_test)
classes1 = list(model1.named_steps["clf"].classes_)

# Align probabilities to LABELS_ORDER for log loss
proba1_df = pd.DataFrame(proba1, columns=classes1)[LABELS_ORDER].to_numpy()

res_lr = {
    "accuracy": float(accuracy_score(y_test, pred1)),
    "macro_f1": float(f1_score(y_test, pred1, average="macro", labels=LABELS_ORDER)),
    "log_loss": float(log_loss(y_test, proba1_df, labels=LABELS_ORDER)),
    "classes": classes1,
}
cm1 = confusion_matrix(y_test, pred1, labels=LABELS_ORDER)

plot_confusion_matrix(
    cm1, LABELS_ORDER,
    title="Confusion matrix — Model 1 (LogReg) on 2022 holdout",
    out_file=FIG_DIR / "cm_model1_2022.png"
)

res_lr



{'accuracy': 0.4918032786885246,
 'macro_f1': 0.2974036191974823,
 'log_loss': 1.4816352099180283,
 'classes': ['draw', 'loss', 'win']}

In [49]:
# Technique #2: Random Forest (non-linear model) using a Pipeline (no scaling needed)
preproc_rf = ColumnTransformer(
    transformers=[("num", "passthrough", FEATURE_COLS)],
    remainder="drop",
)

model2 = Pipeline(
    steps=[
        ("prep", preproc_rf),
        ("clf", RandomForestClassifier(
            n_estimators=300,
            random_state=RANDOM_STATE,
            class_weight=None,
            min_samples_leaf=2,
        )),
    ]
)

model2.fit(X_train, y_train)

pred2 = model2.predict(X_test)
proba2 = model2.predict_proba(X_test)
classes2 = list(model2.named_steps["clf"].classes_)

proba2_df = pd.DataFrame(proba2, columns=classes2)[LABELS_ORDER].to_numpy()

res_rf = {
    "accuracy": float(accuracy_score(y_test, pred2)),
    "macro_f1": float(f1_score(y_test, pred2, average="macro", labels=LABELS_ORDER)),
    "log_loss": float(log_loss(y_test, proba2_df, labels=LABELS_ORDER)),
    "classes": classes2,
}
cm2 = confusion_matrix(y_test, pred2, labels=LABELS_ORDER)

plot_confusion_matrix(
    cm2, LABELS_ORDER,
    title="Confusion matrix — Model 2 (RF) on 2022 holdout",
    out_file=FIG_DIR / "cm_model2_2022.png"
)

res_rf



{'accuracy': 0.5081967213114754,
 'macro_f1': 0.4291486291486291,
 'log_loss': 1.6192756076819315,
 'classes': ['draw', 'loss', 'win']}

## 4. Additional work — permutation importance

In [50]:
# Extra deliverable: permutation importance on the 2022 holdout (macro-F1 scoring)
# We'll compute this on the Random Forest pipeline (model2).
perm = permutation_importance(
    model2,
    X_test,
    y_test,
    scoring="f1_macro",
    n_repeats=15,
    random_state=RANDOM_STATE,
    n_jobs=1,
)

imp = pd.DataFrame(
    {"feature": FEATURE_COLS, "importance_mean": perm.importances_mean, "importance_std": perm.importances_std}
).sort_values("importance_mean", ascending=False)

plt.figure(figsize=(7, 4))
plt.bar(imp["feature"], imp["importance_mean"], yerr=imp["importance_std"])
plt.title("Permutation importance (macro-F1) — 2022 holdout")
plt.xlabel("Feature")
plt.ylabel("Mean decrease in macro-F1")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()

plt.savefig(FIG_DIR / "perm_importance.png", dpi=200)
plt.close()

imp

Unnamed: 0,feature,importance_mean,importance_std
0,elo_home_pre,0.125608,0.032628
2,elo_delta_pre,0.05655,0.039081
3,is_knockout,0.015635,0.036389
1,elo_away_pre,0.01395,0.047102


## 5. Save metrics artifact for the report

In [51]:
metrics = {
    "data_summary": {
        **data_summary,
        "holdout_year": int(TEST_YEAR),
        "holdout_n_matches": int(len(y_test)),
        "holdout_label_counts": holdout_label_counts,
    },
    "models": {
        "model1_logreg": {
            "name": "Multinomial Logistic Regression",
            **res_lr,
            "confusion_matrix_labels": LABELS_ORDER,
            "confusion_matrix": cm1.tolist(),
        },
        "model2_random_forest": {
            "name": "Random Forest",
            **res_rf,
            "confusion_matrix_labels": LABELS_ORDER,
            "confusion_matrix": cm2.tolist(),
        },
    },
    "artifacts": {
        "fig_dir": str(FIG_DIR.relative_to(PROJECT_ROOT)),
        "metrics_path": str(METRICS_PATH.relative_to(PROJECT_ROOT)),
        "figures": {
            "elo_trajectories": "reports/figures/elo_trajectories.png",
            "cm_model1_2022": "reports/figures/cm_model1_2022.png",
            "cm_model2_2022": "reports/figures/cm_model2_2022.png",
            "perm_importance": "reports/figures/perm_importance.png",
        },
        "cleaned_csv": str(MATCHES_CLEAN_CSV.relative_to(PROJECT_ROOT)),
    },
}

METRICS_PATH.parent.mkdir(parents=True, exist_ok=True)
METRICS_PATH.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

METRICS_PATH

WindowsPath('C:/CS418/project-check-in-team-4/worldcup-predictor/reports/metrics_holdout_2022.json')