In [255]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


In [256]:
# Load the train/test splits. The *_full frames keep every column from disk,
# including the context columns listed in `non_features` below.
X_train_full = pd.read_csv("../data/X_train.csv", index_col=0)
X_test_full  = pd.read_csv("../data/X_test.csv", index_col=0)

# squeeze("columns") collapses only the column axis, so the targets are always
# Series; a bare squeeze() would return a scalar for a single-row file.
y_train = pd.read_csv("../data/y_train.csv", index_col=0).squeeze("columns")
y_test  = pd.read_csv("../data/y_test.csv", index_col=0).squeeze("columns")

# Make stripped-down versions for the model: these columns are dropped
# before the frames are used as model features.
non_features = ["Div", "Date",
                "HomeTeam_ShotOnTarget", "AwayTeam_ShotOnTarget"]

X_train_features = X_train_full.drop(columns=non_features)
X_test_features  = X_test_full.drop(columns=non_features)

# Save the feature order from the stripped-down training frame as a one-column
# CSV with header "feature" (no index column is written).
feature_columns = X_train_features.columns
feature_columns.to_series(name="feature").to_csv("../data/feature_columns.csv", index=False)


In [257]:
# Display the B365_overround column of the stripped-down training frame.
X_train_features['B365_overround']

0       1.066188
1       1.056489
2       1.066188
3       1.065873
4       1.070850
          ...   
4175    1.046154
4176    1.065605
4177    1.050770
4178    1.054094
4179    1.057995
Name: B365_overround, Length: 4180, dtype: float64

In [258]:
import pandas as pd

# Read the saved feature order back from feature_columns.csv as a plain list of column names.
feat_order = pd.read_csv("../data/feature_columns.csv")["feature"].tolist()

def align_features(df: pd.DataFrame, feat_order: list[str]) -> pd.DataFrame:
    """Return a new frame whose columns are exactly ``feat_order``, in that order.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.
    feat_order : list[str]
        Column names the model expects, in the expected order.

    Returns
    -------
    pd.DataFrame
        A new frame in which
        * columns named in ``feat_order`` but absent from ``df`` are added and
          filled with ``0.0``;
        * columns of ``df`` that are not in ``feat_order`` are dropped;
        * the remaining columns keep their values (existing NaNs are untouched).
    """
    # A single reindex adds the missing columns (fill_value applies only to
    # newly created labels), drops the extras and fixes the order, so no
    # separate add/drop passes are needed.
    return df.reindex(columns=feat_order, fill_value=0.0)

# Project both splits onto the saved feature order; these are the matrices
# the model cells below consume.
X_train, X_test = (
    align_features(split, feat_order)
    for split in (X_train_features, X_test_features)
)

# Safety check: both matrices must expose exactly the saved columns, in order.
assert list(X_train.columns) == feat_order
assert feat_order == list(X_test.columns)



In [259]:
# Display the aligned training matrix.
X_train

Unnamed: 0,HomeTeam,AwayTeam,HomeTeam_avg_goal_diff,HomeTeam_points,AwayTeam_avg_goal_diff,AwayTeam_points,B365H,B365D,B365A,BWH,...,PS_overround,PSC_overround,pH_mean,pD_mean,pA_mean,overround_mean,overround_std,home_adv,draw_tightness,elo_diff
0,Espanol,Valladolid,0.375,0.000000,0.439024,0.000000,0.468960,0.284218,0.246821,0.501789,...,1.084635,1.084635,0.486191,0.282183,0.231626,1.087405,0.042728,0.254565,0.282183,0.000000
1,Valencia,Mallorca,0.375,0.000000,0.439024,0.000000,0.556783,0.262925,0.180292,0.548686,...,1.086819,1.086819,0.544034,0.266139,0.189827,1.088623,0.037464,0.354208,0.266139,0.000000
2,Ath Bilbao,Almeria,0.375,0.000000,0.439024,0.000000,0.468960,0.284218,0.246821,0.477612,...,1.090072,1.090072,0.467183,0.285318,0.247500,1.089762,0.033230,0.219683,0.285318,0.000000
3,Ath Madrid,Malaga,0.375,0.000000,0.439024,0.000000,0.651526,0.223380,0.125093,0.647450,...,1.084317,1.084317,0.642133,0.234665,0.123202,1.082695,0.041275,0.518931,0.234665,0.000000
4,Betis,Recreativo,0.375,0.000000,0.439024,0.000000,0.466919,0.287335,0.245747,0.518664,...,1.085347,1.085347,0.492309,0.283573,0.224118,1.085243,0.041920,0.268191,0.283573,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4175,Levante,Ath Madrid,0.550,0.666667,0.439024,0.666667,0.254902,0.254902,0.490196,0.271493,...,1.025927,1.026803,0.260182,0.250866,0.488952,1.037138,0.016111,-0.228769,0.250866,-235.644217
4176,Sevilla,Ath Bilbao,0.325,0.266667,0.414634,0.466667,0.507262,0.276010,0.216728,0.511111,...,1.026614,1.026469,0.501490,0.285331,0.213179,1.040258,0.019152,0.288311,0.285331,7.004375
4177,Valladolid,Valencia,0.425,0.666667,0.560976,0.600000,0.118960,0.181273,0.699767,0.119069,...,1.032115,1.034061,0.119226,0.192054,0.688720,1.039219,0.014740,-0.569493,0.192054,-161.814985
4178,Eibar,Barcelona,0.325,0.400000,0.536585,0.800000,0.263523,0.237171,0.499307,0.257662,...,1.028292,1.027454,0.249476,0.232252,0.518272,1.040577,0.015316,-0.268796,0.232252,-302.934468


In [260]:
# Display the MaxH / MaxD / MaxA columns of the aligned training matrix.
X_train.loc[:, ["MaxH", "MaxD", "MaxA"]]

Unnamed: 0,MaxH,MaxD,MaxA
0,0.480966,0.295652,0.223382
1,0.563347,0.254087,0.182566
2,0.466712,0.288264,0.245024
3,0.673031,0.214797,0.112172
4,0.480418,0.300261,0.219321
...,...,...,...
4175,0.263796,0.253650,0.482554
4176,0.514058,0.283466,0.202476
4177,0.115016,0.188407,0.696577
4178,0.256331,0.234970,0.508699


# <h1 style='font-size:30px;'>Model</h1>

In [261]:
# ---- Feature groups: named lists of column names ----

# Form features (points first, then average goal difference; home before away)
form_features = [
    f"{side}Team_{stat}"
    for stat in ("points", "avg_goal_diff")
    for side in ("Home", "Away")
]

# Column-name prefixes of the odds sources, in the order their columns are listed
_odds_prefixes = ("B365", "BW", "IW", "WH", "VC", "Max", "PS", "PSC")

# Raw bookmaker odds: one H/D/A triplet per prefix
bookmaker_odds = [f"{prefix}{outcome}" for prefix in _odds_prefixes for outcome in "HDA"]

# Overrounds: one per bookmaker set
overrounds = [f"{prefix}_overround" for prefix in _odds_prefixes]

# Elo ratings
elo_features = "home_elo away_elo elo_diff".split()

# Consensus features
consensus_features = [f"p{outcome}_mean" for outcome in "HDA"] + ["overround_mean", "overround_std"]

# Engineered extras
engineered_features = "home_adv draw_tightness".split()




In [262]:
# --- 0) Feature set: raw team names (categorical) + form features + bookmaker odds ---
feat_cats = ["HomeTeam", "AwayTeam"]   # handed to CatBoost as cat_features in the cells below
feat_nums = form_features
feat_all = [*feat_cats, *feat_nums, *bookmaker_odds]

# Select the model columns and cast the two categorical columns to str in one
# step; astype returns new frames, so X_train / X_test are left untouched.
Xtr = X_train[feat_all].astype(dict.fromkeys(feat_cats, str))
Xte = X_test[feat_all].astype(dict.fromkeys(feat_cats, str))

# Short aliases for the targets (same objects, no copy)
ytr = y_train
yte = y_test

In [225]:
# Feature set for this run: feat_cats + feat_nums + bookmaker_odds
# --- 1) Hyper-parameter grid; insertion order of the keys is preserved ---
param_grid = dict(
    learning_rate=[0.10, 0.12, 0.30],
    l2_leaf_reg=[15, 20, 25],
    random_strength=[0.5, 1.0],
    bagging_temperature=[0.3, 0.5, 0.9],  # Bayesian bootstrap
    rsm=[0.85, 1.0],
)

# --- 2) Helper to iterate a dict grid (readable) ---
import itertools, pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_recall_fscore_support

def combos(grid):
    """Yield one ``{name: value}`` dict per point of the Cartesian product of ``grid``'s value lists.

    Keys keep the insertion order of ``grid``; the last key varies fastest.
    """
    names = list(grid)
    value_lists = [grid[name] for name in names]
    for point in itertools.product(*value_lists):
        yield dict(zip(names, point))

# --- 3) Run grid (CatBoost with categorical team IDs) ---
# NOTE(review): the test split is used as eval_set (with use_best_model=True)
# and also to rank the grid points, so the scores below are tuned on the same
# data they are reported on -- consider a separate validation split.
rows = []
for p in combos(param_grid):
    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="Accuracy",
        iterations=1000,
        depth=6,                         # fixed for every grid point
        class_weights=[1, 1.5, 1],       # extra weight on the middle class
        verbose=False,
        random_state=42,
        **p
    )

    model.fit(
        Xtr, ytr,
        eval_set=(Xte, yte),
        use_best_model=True,
        cat_features=feat_cats          # treat the team columns as categorical
    )

    y_pred  = model.predict(Xte)
    y_proba = model.predict_proba(Xte)

    # Pin the label order to the model's own class order so rc[i] / f1[i] and
    # the predict_proba columns always refer to the same class, even when a
    # class is never predicted (0=Away, 1=Draw, 2=Home for your labels).
    class_labels = list(model.classes_)
    pr, rc, f1, _ = precision_recall_fscore_support(
        yte, y_pred, labels=class_labels, zero_division=0
    )

    rows.append({
        **p,
        "acc": accuracy_score(yte, y_pred),
        "logloss": log_loss(yte, y_proba, labels=class_labels),
        "c0_rec": rc[0], "c1_rec": rc[1], "c2_rec": rc[2],
        "c0_f1":  f1[0], "c1_f1":  f1[1], "c2_f1":  f1[2],
    })

# Best accuracy first; ties broken by lower log-loss.
results = pd.DataFrame(rows).sort_values(["acc","logloss"], ascending=[False, True])
display(results.head(15))

Unnamed: 0,learning_rate,l2_leaf_reg,random_strength,bagging_temperature,rsm,acc,logloss,c0_rec,c1_rec,c2_rec,c0_f1,c1_f1,c2_f1
39,0.12,15,0.5,0.5,1.0,0.583333,1.014689,0.510638,0.4,0.73494,0.521739,0.43956,0.689266
11,0.1,15,1.0,0.9,1.0,0.561111,0.997094,0.446809,0.44,0.698795,0.506024,0.435644,0.659091
90,0.3,20,1.0,0.3,0.85,0.55,0.998774,0.489362,0.3,0.73494,0.505495,0.348837,0.666667
105,0.3,25,1.0,0.5,1.0,0.55,1.002298,0.425532,0.38,0.722892,0.45977,0.4,0.674157
74,0.3,15,0.5,0.5,0.85,0.55,1.006687,0.425532,0.4,0.710843,0.465116,0.43956,0.644809
51,0.12,20,0.5,0.5,1.0,0.55,1.015061,0.404255,0.42,0.710843,0.457831,0.415842,0.670455
84,0.3,20,0.5,0.3,0.85,0.55,1.021977,0.468085,0.38,0.698795,0.494382,0.395833,0.662857
80,0.3,15,1.0,0.5,0.85,0.544444,0.987681,0.404255,0.34,0.746988,0.44186,0.386364,0.666667
32,0.1,25,1.0,0.5,0.85,0.544444,0.997917,0.404255,0.32,0.759036,0.447059,0.351648,0.684783
38,0.12,15,0.5,0.5,0.85,0.544444,0.998953,0.468085,0.36,0.698795,0.5,0.378947,0.655367


In [252]:
# feat_cats + feat_nums + bookmaker_odds

from catboost import CatBoostClassifier
# Single CatBoost fit with fixed hyper-parameters, evaluated on the test split.
catboost_params = {
    "iterations": 1000,
    "loss_function": "MultiClass",
    "eval_metric": "Accuracy",
    "learning_rate": 0.12,
    "l2_leaf_reg": 15,
    "random_strength": 0.5,
    "bagging_temperature": 0.5,
    "rsm": 1,
    "class_weights": [1, 1.5, 1],
    "random_state": 42,
}
model = CatBoostClassifier(**catboost_params)

model.fit(Xtr, ytr, eval_set=(Xte, yte), cat_features=feat_cats, verbose=False)

y_pred = model.predict(Xte)
test_accuracy = accuracy_score(yte, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(classification_report(yte, y_pred, zero_division=0))
print(model.get_best_score())

Test Set Accuracy: 0.58
              precision    recall  f1-score   support

           0       0.53      0.51      0.52        47
           1       0.49      0.40      0.44        50
           2       0.65      0.73      0.69        83

    accuracy                           0.58       180
   macro avg       0.56      0.55      0.55       180
weighted avg       0.57      0.58      0.58       180

{'learn': {'Accuracy': 0.880949834206867, 'MultiClass': 0.5646166049591457}, 'validation': {'Accuracy': 0.5609756097560976, 'MultiClass': 1.0115546897344136}}


In [228]:
# Feature set as labelled by the author: feat_cats + feat_nums + bookmaker_odds + elo_features
# NOTE(review): the feature-set cell builds feat_all WITHOUT elo_features --
# confirm which columns Xtr / Xte actually held when this grid was run.

# --- 1) Hyper-parameter grid; insertion order of the keys is preserved ---
param_grid = dict(
    learning_rate=[0.10, 0.12, 0.30],
    l2_leaf_reg=[15, 20, 25],
    random_strength=[0.5, 1.0],
    bagging_temperature=[0.3, 0.5, 0.9],  # Bayesian bootstrap
    rsm=[0.85, 1.0],
)

# --- 2) Helper to iterate a dict grid (readable) ---
import itertools, pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_recall_fscore_support

def combos(grid):
    """Generate every combination of the grid's values as a ``{name: value}`` dict.

    Combinations follow ``itertools.product`` order over the grid's keys in
    insertion order (last key varies fastest).
    """
    names = list(grid)
    value_lists = [grid[name] for name in names]
    for point in itertools.product(*value_lists):
        yield dict(zip(names, point))

# --- 3) Run grid (CatBoost with categorical team IDs) ---
# NOTE(review): the test split is used as eval_set (with use_best_model=True)
# and also to rank the grid points, so the scores below are tuned on the same
# data they are reported on -- consider a separate validation split.
rows = []
for p in combos(param_grid):
    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="Accuracy",
        iterations=1000,
        depth=6,                         # fixed for every grid point
        class_weights=[1, 1.5, 1],       # extra weight on the middle class
        random_state=42,
        verbose=False,
        **p
    )

    model.fit(
        Xtr, ytr,
        eval_set=(Xte, yte),
        use_best_model=True,
        cat_features=feat_cats          # treat the team columns as categorical
    )

    y_pred  = model.predict(Xte)
    y_proba = model.predict_proba(Xte)

    # Pin the label order to the model's own class order so rc[i] / f1[i] and
    # the predict_proba columns always refer to the same class, even when a
    # class is never predicted (0=Away, 1=Draw, 2=Home for your labels).
    class_labels = list(model.classes_)
    pr, rc, f1, _ = precision_recall_fscore_support(
        yte, y_pred, labels=class_labels, zero_division=0
    )

    rows.append({
        **p,
        "acc": accuracy_score(yte, y_pred),
        "logloss": log_loss(yte, y_proba, labels=class_labels),
        "c0_rec": rc[0], "c1_rec": rc[1], "c2_rec": rc[2],
        "c0_f1":  f1[0], "c1_f1":  f1[1], "c2_f1":  f1[2],
    })

# Best accuracy first; ties broken by lower log-loss.
results = pd.DataFrame(rows).sort_values(["acc","logloss"], ascending=[False, True])
display(results.head(15))

Unnamed: 0,learning_rate,l2_leaf_reg,random_strength,bagging_temperature,rsm,acc,logloss,c0_rec,c1_rec,c2_rec,c0_f1,c1_f1,c2_f1
78,0.3,15,1.0,0.3,0.85,0.55,1.029679,0.425532,0.4,0.710843,0.465116,0.425532,0.655556
105,0.3,25,1.0,0.5,1.0,0.538889,1.009837,0.404255,0.34,0.73494,0.469136,0.361702,0.659459
65,0.12,25,0.5,0.9,1.0,0.538889,1.011861,0.446809,0.36,0.698795,0.47191,0.391304,0.648045
50,0.12,20,0.5,0.5,0.85,0.538889,1.015997,0.468085,0.4,0.662651,0.505747,0.4,0.635838
7,0.1,15,1.0,0.3,1.0,0.538889,1.016868,0.468085,0.32,0.710843,0.505747,0.351648,0.648352
85,0.3,20,0.5,0.3,1.0,0.538889,1.026381,0.468085,0.4,0.662651,0.53012,0.40404,0.617978
87,0.3,20,0.5,0.5,1.0,0.538889,1.027686,0.468085,0.34,0.698795,0.5,0.365591,0.648045
62,0.12,25,0.5,0.5,0.85,0.538889,1.028556,0.510638,0.26,0.722892,0.527473,0.298851,0.659341
93,0.3,20,1.0,0.5,1.0,0.538889,1.038728,0.468085,0.34,0.698795,0.478261,0.377778,0.651685
100,0.3,25,0.5,0.9,0.85,0.538889,1.038742,0.446809,0.4,0.674699,0.494118,0.4,0.64


In [264]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC
from collections import Counter

# --- Integer-encode the categorical columns (encoders are fitted on the training frame only) ---
encoders = {col: LabelEncoder().fit(Xtr[col]) for col in feat_cats}  # e.g. ["HomeTeam", "AwayTeam"]
Xtr_enc = Xtr.assign(**{col: enc.transform(Xtr[col]) for col, enc in encoders.items()})

# Positional indices of the categorical columns, as SMOTENC expects
cat_idx = list(map(Xtr_enc.columns.get_loc, feat_cats))

# --- Apply SMOTENC ---
smote = SMOTENC(
    categorical_features=cat_idx,
    sampling_strategy="not majority",  # upsample draws & away, keep home
    random_state=42
)
Xtr_bal, ytr_bal = smote.fit_resample(Xtr_enc, ytr)

# Class counts before and after resampling
print("Before:", Counter(ytr))
print("After :", Counter(ytr_bal))

# --- Decode the categorical columns back to their original strings ---
Xtr_bal = pd.DataFrame(Xtr_bal, columns=Xtr_enc.columns)
for col, enc in encoders.items():
    Xtr_bal[col] = enc.inverse_transform(Xtr_bal[col].astype(int))


Before: Counter({2: 2013, 0: 1178, 1: 989})
After : Counter({2: 2013, 0: 2013, 1: 2013})


In [None]:
# Feature set as labelled by the author: feat_cats + feat_nums + bookmaker_odds + elo_features
# NOTE(review): the feature-set cell builds feat_all WITHOUT elo_features --
# confirm which columns the training frame actually held when this grid was run.

# Hyper-parameter grid; insertion order of the keys is preserved
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.15],
    depth=[6, 7, 8],
    l2_leaf_reg=[3, 5, 10],
    random_strength=[1.0, 2.0],
    bagging_temperature=[0.5, 1.0],
    rsm=[0.9, 1.0],
    grow_policy=["SymmetricTree", "Lossguide"],  # compare both
)

# --- 2) Helper to iterate a dict grid (readable) ---
import itertools, pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_recall_fscore_support

def combos(grid):
    """Yield one ``{name: value}`` dict per point of the Cartesian product of ``grid``'s value lists.

    Keys keep the insertion order of ``grid``; the last key varies fastest.
    """
    names = list(grid)
    value_lists = [grid[name] for name in names]
    for point in itertools.product(*value_lists):
        yield dict(zip(names, point))

# --- 3) Run grid (CatBoost with categorical team IDs) on the SMOTENC-balanced training data ---
# NOTE(review): the test split is used as eval_set (with use_best_model=True)
# and also to rank the grid points, so the scores below are tuned on the same
# data they are reported on -- consider a separate validation split.
rows = []
from sklearn.utils import shuffle
# NOTE(review): this overwrites Xtr_bal / ytr_bal with a shuffled copy, so the
# row order (and possibly the results) depends on how many times a shuffling
# cell has been executed.
Xtr_bal, ytr_bal = shuffle(Xtr_bal, ytr_bal, random_state=42)

for p in combos(param_grid):
    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="Accuracy",
        iterations=1000,
        # class_weights=[1, 1.5, 1],     # class weighting disabled for this run
        verbose=False,
        random_state=42,
        **p
    )

    model.fit(
        Xtr_bal, ytr_bal,
        eval_set=(Xte, yte),
        use_best_model=True,
        cat_features=feat_cats          # treat the team columns as categorical
    )

    y_pred  = model.predict(Xte)
    y_proba = model.predict_proba(Xte)

    # Pin the label order to the model's own class order so rc[i] / f1[i] and
    # the predict_proba columns always refer to the same class, even when a
    # class is never predicted (0=Away, 1=Draw, 2=Home for your labels).
    class_labels = list(model.classes_)
    pr, rc, f1, _ = precision_recall_fscore_support(
        yte, y_pred, labels=class_labels, zero_division=0
    )

    rows.append({
        **p,
        "acc": accuracy_score(yte, y_pred),
        "logloss": log_loss(yte, y_proba, labels=class_labels),
        "c0_rec": rc[0], "c1_rec": rc[1], "c2_rec": rc[2],
        "c0_f1":  f1[0], "c1_f1":  f1[1], "c2_f1":  f1[2],
    })

# Best accuracy first; ties broken by lower log-loss.
results = pd.DataFrame(rows).sort_values(["acc","logloss"], ascending=[False, True])
display(results.head(15))

Unnamed: 0,learning_rate,depth,l2_leaf_reg,random_strength,bagging_temperature,rsm,grow_policy,acc,logloss,c0_rec,c1_rec,c2_rec,c0_f1,c1_f1,c2_f1
313,0.15,6,5,2.0,0.5,0.9,Lossguide,0.577778,0.982381,0.617021,0.3,0.722892,0.568627,0.365854,0.681818
397,0.15,8,3,2.0,1.0,0.9,Lossguide,0.572222,1.010951,0.531915,0.36,0.722892,0.520833,0.404494,0.685714
113,0.05,8,5,1.0,0.5,0.9,Lossguide,0.566667,1.000367,0.574468,0.3,0.722892,0.534653,0.357143,0.685714
171,0.1,6,5,2.0,0.5,1.0,Lossguide,0.561111,1.001364,0.595745,0.3,0.698795,0.571429,0.344828,0.662857
41,0.05,6,10,2.0,0.5,0.9,Lossguide,0.561111,1.013455,0.595745,0.32,0.686747,0.565657,0.372093,0.651429
339,0.15,7,3,1.0,0.5,1.0,Lossguide,0.561111,1.014537,0.574468,0.28,0.722892,0.568421,0.333333,0.662983
393,0.15,8,3,2.0,0.5,0.9,Lossguide,0.561111,1.025662,0.553191,0.32,0.710843,0.530612,0.367816,0.674286
401,0.15,8,5,1.0,0.5,0.9,Lossguide,0.561111,1.044534,0.510638,0.34,0.722892,0.510638,0.386364,0.674157
169,0.1,6,5,2.0,0.5,0.9,Lossguide,0.555556,0.998675,0.553191,0.34,0.686747,0.536082,0.373626,0.662791
151,0.1,6,3,1.0,1.0,1.0,Lossguide,0.555556,1.008282,0.574468,0.28,0.710843,0.54,0.337349,0.666667


In [266]:
# Feature set for this run: feat_cats + feat_nums + bookmaker_odds

# Hyper-parameter grid; insertion order of the keys is preserved
param_grid = dict(
    learning_rate=[0.05, 0.1, 0.15],
    depth=[6, 7, 8],
    l2_leaf_reg=[3, 5, 10],
    random_strength=[1.0, 2.0],
    bagging_temperature=[0.5, 1.0],
    rsm=[0.9, 1.0],
    grow_policy=["SymmetricTree", "Lossguide"],  # compare both
)

# --- 2) Helper to iterate a dict grid (readable) ---
import itertools, pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss, precision_recall_fscore_support

def combos(grid):
    """Generate every combination of the grid's values as a ``{name: value}`` dict.

    Combinations follow ``itertools.product`` order over the grid's keys in
    insertion order (last key varies fastest).
    """
    names = list(grid)
    value_lists = [grid[name] for name in names]
    for point in itertools.product(*value_lists):
        yield dict(zip(names, point))

# --- 3) Run grid (CatBoost with categorical team IDs) on the SMOTENC-balanced training data ---
# NOTE(review): the test split is used as eval_set (with use_best_model=True)
# and also to rank the grid points, so the scores below are tuned on the same
# data they are reported on -- consider a separate validation split.
rows = []
from sklearn.utils import shuffle
# NOTE(review): this overwrites Xtr_bal / ytr_bal with a shuffled copy, so the
# row order (and possibly the results) depends on how many times a shuffling
# cell has been executed.
Xtr_bal, ytr_bal = shuffle(Xtr_bal, ytr_bal, random_state=42)

for p in combos(param_grid):
    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="Accuracy",
        iterations=1000,
        # class_weights=[1, 1.5, 1],     # class weighting disabled for this run
        verbose=False,
        random_state=42,
        **p
    )

    model.fit(
        Xtr_bal, ytr_bal,
        eval_set=(Xte, yte),
        use_best_model=True,
        cat_features=feat_cats          # treat the team columns as categorical
    )

    y_pred  = model.predict(Xte)
    y_proba = model.predict_proba(Xte)

    # Pin the label order to the model's own class order so rc[i] / f1[i] and
    # the predict_proba columns always refer to the same class, even when a
    # class is never predicted (0=Away, 1=Draw, 2=Home for your labels).
    class_labels = list(model.classes_)
    pr, rc, f1, _ = precision_recall_fscore_support(
        yte, y_pred, labels=class_labels, zero_division=0
    )

    rows.append({
        **p,
        "acc": accuracy_score(yte, y_pred),
        "logloss": log_loss(yte, y_proba, labels=class_labels),
        "c0_rec": rc[0], "c1_rec": rc[1], "c2_rec": rc[2],
        "c0_f1":  f1[0], "c1_f1":  f1[1], "c2_f1":  f1[2],
    })

# Best accuracy first; ties broken by lower log-loss.
results = pd.DataFrame(rows).sort_values(["acc","logloss"], ascending=[False, True])
display(results.head(15))

Unnamed: 0,learning_rate,depth,l2_leaf_reg,random_strength,bagging_temperature,rsm,grow_policy,acc,logloss,c0_rec,c1_rec,c2_rec,c0_f1,c1_f1,c2_f1
147,0.1,6,3,1.0,0.5,1.0,Lossguide,0.594444,1.027716,0.510638,0.42,0.746988,0.510638,0.466667,0.704545
57,0.05,7,3,2.0,0.5,0.9,Lossguide,0.588889,1.000995,0.553191,0.38,0.73494,0.547368,0.436782,0.685393
275,0.1,8,10,1.0,0.5,1.0,Lossguide,0.588889,1.004595,0.510638,0.44,0.722892,0.505263,0.483516,0.689655
163,0.1,6,5,1.0,0.5,1.0,Lossguide,0.588889,1.005583,0.574468,0.42,0.698795,0.568421,0.446809,0.678363
241,0.1,8,3,1.0,0.5,0.9,Lossguide,0.588889,1.006023,0.553191,0.4,0.722892,0.541667,0.454545,0.681818
303,0.15,6,3,2.0,1.0,1.0,Lossguide,0.583333,0.993366,0.510638,0.42,0.722892,0.516129,0.456522,0.685714
1,0.05,6,3,1.0,0.5,0.9,Lossguide,0.583333,1.005941,0.489362,0.44,0.722892,0.5,0.483516,0.677966
283,0.1,8,10,2.0,0.5,1.0,Lossguide,0.583333,1.006859,0.574468,0.44,0.674699,0.55102,0.44898,0.682927
293,0.15,6,3,1.0,1.0,0.9,Lossguide,0.583333,1.012315,0.574468,0.4,0.698795,0.556701,0.434783,0.678363
7,0.05,6,3,1.0,1.0,1.0,Lossguide,0.583333,1.015563,0.553191,0.44,0.686747,0.541667,0.463158,0.674556


In [275]:
# feat_cats + feat_nums + bookmaker_odds

from catboost import CatBoostClassifier
# Single CatBoost fit with fixed hyper-parameters on the balanced training data,
# evaluated on the test split.
catboost_params = {
    "iterations": 1000,
    "loss_function": "MultiClass",
    "eval_metric": "Accuracy",
    "learning_rate": 0.1,
    "depth": 6,
    "grow_policy": "Lossguide",
    "l2_leaf_reg": 3,
    "random_strength": 1,
    "bagging_temperature": 0.5,
    "rsm": 1,
    "random_state": 42,
}
model = CatBoostClassifier(**catboost_params)

model.fit(Xtr_bal, ytr_bal, eval_set=(Xte, yte), cat_features=feat_cats, verbose=False)

y_pred = model.predict(Xte)
test_accuracy = accuracy_score(yte, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(classification_report(yte, y_pred, zero_division=0))
print(model.get_best_score())

Test Set Accuracy: 0.59
              precision    recall  f1-score   support

           0       0.51      0.51      0.51        47
           1       0.53      0.42      0.47        50
           2       0.67      0.75      0.70        83

    accuracy                           0.59       180
   macro avg       0.57      0.56      0.56       180
weighted avg       0.59      0.59      0.59       180

{'learn': {'Accuracy': 0.9769829441960589, 'MultiClass': 0.3006730253169239}, 'validation': {'Accuracy': 0.5944444444444444, 'MultiClass': 1.00171692944458}}


# Model

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score

def run_model(classifier, param_grid, X_train, y_train, X_test, y_test):
    """Grid-search ``classifier``, report test-set metrics, and return the best parameters.

    The search uses 5-fold cross-validation on the training data, scored by
    balanced accuracy. The best estimator found is then used to predict
    ``X_test``; accuracy and a classification report are printed.

    Parameters
    ----------
    classifier : estimator passed to GridSearchCV.
    param_grid : parameter grid passed to GridSearchCV.
    X_train, y_train : data the grid search is fitted on.
    X_test, y_test : data used only for the printed report.

    Returns
    -------
    The ``best_params_`` of the fitted grid search.
    """
    search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        scoring='balanced_accuracy',
        cv=5,
        verbose=1,
    )
    search.fit(X_train, y_train)

    predictions = search.best_estimator_.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    print(f"Test Set Accuracy: {test_accuracy:.2f}")
    print(classification_report(y_test, predictions, zero_division=0))

    return search.best_params_

# <h1 style='font-size:30px;'>Random Forest Classifier</h1>

In [10]:
# New

from sklearn.ensemble import RandomForestClassifier
# Random-forest grid: forest size and the two minimum-sample constraints.
# NOTE(review): the aligned X_train previewed earlier in this notebook contains
# the string columns HomeTeam / AwayTeam -- confirm the frame used here is numeric.
param_grid = dict(
    n_estimators=[100, 500, 1000],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
classifier = RandomForestClassifier(random_state=42)
best_params_rf = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)
best_params_rf

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Test Set Accuracy: 0.51
              precision    recall  f1-score   support

           0       0.44      0.40      0.42        47
           1       0.38      0.18      0.24        50
           2       0.56      0.76      0.64        83

    accuracy                           0.51       180
   macro avg       0.46      0.45      0.44       180
weighted avg       0.48      0.51      0.47       180



{'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

In [30]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest grid (same grid as the cell above): forest size and the two
# minimum-sample constraints.
param_grid = dict(
    n_estimators=[100, 500, 1000],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
classifier = RandomForestClassifier(random_state=42)
best_params_rf = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)
best_params_rf

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Test Set Accuracy: 0.52
              precision    recall  f1-score   support

           0       0.42      0.36      0.39        47
           1       0.48      0.24      0.32        50
           2       0.57      0.78      0.66        83

    accuracy                           0.52       180
   macro avg       0.49      0.46      0.46       180
weighted avg       0.50      0.52      0.49       180



{'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}

# <h1 style='font-size:30px;'>Gradient Boosting Classifier</h1>

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

# Single gradient-boosting fit with fixed hyper-parameters (no grid search).
# random_state is pinned to the seed used by the other models in this
# notebook so the reported scores are reproducible between runs.
model = GradientBoostingClassifier(n_estimators=1000, max_depth=10, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))

Test Set Accuracy: 0.50
              precision    recall  f1-score   support

           0       0.42      0.38      0.40        47
           1       0.38      0.20      0.26        50
           2       0.56      0.75      0.64        83

    accuracy                           0.50       180
   macro avg       0.45      0.44      0.43       180
weighted avg       0.47      0.50      0.47       180



# <h1 style='font-size:30px;'>Naive Bayes</h1>

In [34]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes grid: ten log-spaced smoothing values from 1 down to
# 1e-12, with a single fixed set of class priors.
param_grid = dict(
    var_smoothing=np.logspace(0, -12, num=10),
    priors=[[0.3, 0.4, 0.3]],
)
classifier = GaussianNB()

best_params_nb = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)
best_params_nb

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Test Set Accuracy: 0.49
              precision    recall  f1-score   support

           0       0.46      0.55      0.50        47
           1       0.34      0.46      0.39        50
           2       0.70      0.47      0.56        83

    accuracy                           0.49       180
   macro avg       0.50      0.49      0.48       180
weighted avg       0.54      0.49      0.50       180



{'priors': [0.3, 0.4, 0.3], 'var_smoothing': 2.1544346900318868e-11}

# <h1 style='font-size:30px;'>Stacking Classifier</h1>

# <h1 style='font-size:15px;'>Using a stacking classifier gives a more balanced result, with better performance at predicting draws</h1>

In [35]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

# Base learners. The boosting model subsamples rows and the forest bootstraps
# them, so both get a fixed random_state to make the stacked result reproducible.
classifier_1 = GradientBoostingClassifier(n_estimators=1000, criterion='friedman_mse', learning_rate=0.1,
                                          subsample=0.5, random_state=42)
classifier_2 = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_leaf_nodes=5,
                                      random_state=42)
classifier_3 = GaussianNB(var_smoothing=1e-09)

# Stack the three base learners; the Gaussian naive Bayes instance is also
# passed as the final estimator.
sclf = StackingClassifier(estimators = [('rf', classifier_2), ('gb', classifier_1), ('gnb', classifier_3)],
                          final_estimator = classifier_3
                          )

model = sclf.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))

Test Set Accuracy: 0.53
              precision    recall  f1-score   support

           0       0.46      0.62      0.53        47
           1       0.42      0.30      0.35        50
           2       0.64      0.63      0.63        83

    accuracy                           0.53       180
   macro avg       0.51      0.51      0.50       180
weighted avg       0.53      0.53      0.53       180



# <h1 style='font-size:30px;'>CatBoost</h1>

In [12]:
from imblearn.over_sampling import SMOTE
# Oversample the TRAINING split only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# The test split must keep its real class distribution and contain no synthetic
# rows, otherwise evaluation metrics are computed on generated data. The
# *_resampled names are kept as plain aliases of the untouched test split.
X_test_resampled, y_test_resampled = X_test, y_test

In [13]:
# New

from catboost import CatBoostClassifier
# CatBoost fit with fixed hyper-parameters on X_train, evaluated on the test split.
catboost_settings = {
    "iterations": 1000,
    "loss_function": "MultiClass",
    "eval_metric": "Accuracy",
    "learning_rate": 0.3,
    "l2_leaf_reg": 9,
    "class_weights": [1, 1.5, 1],
}
model = CatBoostClassifier(**catboost_settings)

model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(classification_report(y_test, y_pred, zero_division=0))
print(model.get_best_score())

Test Set Accuracy: 0.56
              precision    recall  f1-score   support

           0       0.54      0.47      0.50        47
           1       0.46      0.36      0.40        50
           2       0.60      0.72      0.66        83

    accuracy                           0.56       180
   macro avg       0.53      0.52      0.52       180
weighted avg       0.54      0.56      0.55       180

{'learn': {'Accuracy': 0.9836346133276286, 'MultiClass': 0.2857588260274736}, 'validation': {'Accuracy': 0.5317073170731708, 'MultiClass': 1.013489871666235}}


In [10]:
from catboost import CatBoostClassifier
# CatBoost fit with fixed hyper-parameters on X_train, evaluated on the test split
# (same configuration as the cell above).
catboost_settings = dict(
    iterations=1000,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    learning_rate=0.3,
    l2_leaf_reg=9,
    class_weights=[1, 1.5, 1],
)
model = CatBoostClassifier(**catboost_settings)

model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(classification_report(y_test, y_pred, zero_division=0))
print(model.get_best_score())

Test Set Accuracy: 0.58
              precision    recall  f1-score   support

           0       0.60      0.45      0.51        47
           1       0.50      0.46      0.48        50
           2       0.61      0.72      0.66        83

    accuracy                           0.58       180
   macro avg       0.57      0.54      0.55       180
weighted avg       0.58      0.58      0.57       180

{'learn': {'Accuracy': 0.9829928334581238, 'MultiClass': 0.2874334984834144}, 'validation': {'Accuracy': 0.5634146341463414, 'MultiClass': 1.0087578860801056}}


# <h1 style='font-size:30px;'>XGBoost</h1>

In [37]:
from xgboost import XGBClassifier

# XGBoost grid: tree depth, minimum child weight and learning rate are varied;
# the objective and the number of trees are fixed.
grid_params = dict(
    max_depth=[3, 6, 9],
    min_child_weight=[1, 3, 5],
    learning_rate=[0.1, 0.5, 1],
    objective=['multi:softmax'],
    n_estimators=[1000],
)

classifier = XGBClassifier()
best_params_gb = run_model(classifier, grid_params, X_train, y_train, X_test, y_test)
best_params_gb

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Test Set Accuracy: 0.52
              precision    recall  f1-score   support

           0       0.48      0.43      0.45        47
           1       0.37      0.20      0.26        50
           2       0.57      0.76      0.65        83

    accuracy                           0.52       180
   macro avg       0.47      0.46      0.45       180
weighted avg       0.49      0.52      0.49       180



{'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 1000,
 'objective': 'multi:softmax'}

# <h1 style='font-size:30px;'>Save Model</h1>

# <h1 style='font-size:15px;'>CatBoost is the most effective model: it gives the best-balanced predictions across all 3 possible outcomes</h1>

In [19]:
import pickle

# Refit CatBoost with fixed hyper-parameters and pickle it to catboost.pkl.
# NOTE(review): these hyper-parameters and the X_train input differ from the
# tuned Xtr / Xtr_bal cells earlier in the notebook -- confirm this is the
# model that is meant to be saved.
catboost = CatBoostClassifier(iterations=1000, loss_function="MultiClass",
                                eval_metric="Accuracy", learning_rate=0.3, l2_leaf_reg=9, class_weights=[1, 1.5, 1])

catboost.fit(X_train,
          y_train,
          eval_set=(X_test, y_test),
          verbose=False)

# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() call left the handle open.
with open('catboost.pkl', 'wb') as model_file:
    pickle.dump(catboost, model_file)