# ML Model Training and Persistence - Prototyping Notebook

This notebook is part of **Story 2.1: ML Model Training and Persistence**.

Goals:
- Load football match data from SQLite.
- Preprocess features.
- Train a classifier on match outcomes.
- Save the trained model for inference.

Steps:
1. Setup and Data Loading
2. Data Exploration
3. Data Preprocessing
4. Model Training, Evaluation, and Persistence

## Setup and Data Loading

In [11]:
import sqlite3
import pandas as pd
import os
from pandas.io.sql import DatabaseError

# Location of the SQLite database, relative to the notebook's working directory
db_path = '../football.db'

# Fail early with an actionable message instead of letting sqlite3.connect
# silently create a new, empty database file at this path.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database '{db_path}' not found. Run db_setup.py first.")

conn = sqlite3.connect(db_path)

try:
    # Pull the whole 'matches' table into memory as the raw working frame
    df_raw = pd.read_sql_query("SELECT * FROM matches", conn)
except DatabaseError as e:
    # Best effort: report the problem and continue with an empty frame
    df_raw = pd.DataFrame()
    print(f"❌ Error loading data: {e}")
else:
    print(f"✅ Loaded {len(df_raw)} rows from 'matches' table")
finally:
    # The connection is released whether or not the query succeeded
    conn.close()

✅ Loaded 1508 rows from 'matches' table


## Preprocessing step 1: add a `Season` column

In [13]:
def add_season(df):
    """Label every match with the season its date falls into.

    The 'Date' column is converted to datetimes and a 'Season' column is
    added. Both changes are made on ``df`` itself (the caller's frame is
    modified in place) and the same object is returned.

    Dates that fall inside none of the listed season windows, including
    dates in the gaps between two windows, are labelled '2019/2020'.
    """
    df['Date'] = pd.to_datetime(df['Date'])

    # (label, first day, last day), both ends inclusive
    season_windows = (
        ('2024/2025', pd.Timestamp('2024-07-26'), pd.Timestamp('2025-05-29')),
        ('2023/2024', pd.Timestamp('2023-07-28'), pd.Timestamp('2024-06-02')),
        ('2022/2023', pd.Timestamp('2022-07-22'), pd.Timestamp('2023-04-23')),
        ('2021/2022', pd.Timestamp('2021-07-23'), pd.Timestamp('2022-04-10')),
        ('2020/2021', pd.Timestamp('2020-08-08'), pd.Timestamp('2021-04-18')),
    )

    def get_season(date):
        for label, first_day, last_day in season_windows:
            if first_day <= date <= last_day:
                return label
        # Catch-all for everything outside the windows above
        return '2019/2020'

    df['Season'] = df['Date'].apply(get_season)
    return df

## Preprocessing step 2: add rolling overall-performance columns (form, goals and match stats)

In [None]:
import numpy as np

def add_overall_perf(df, rolling_window):
    """Add recency-weighted rolling form and match-stat features for both teams.

    Every match is expanded into two per-team rows (home side and away side).
    For each team, the previous ``rolling_window`` matches (home and away
    combined, the current match excluded) are averaged with linearly
    increasing weights 1..rolling_window, normalised to sum to 1, so the most
    recent match counts most. The results are merged back as
    ``HomeTeam_*`` / ``AwayTeam_*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table containing 'Date', 'HomeTeam', 'AwayTeam', 'FTR', the
        home stats 'HS', 'HST', 'HC', 'HF', 'HY', 'HR', the away stats
        'AS', 'AST', 'AC', 'AF', 'AY', 'AR', and the goals 'FTHG', 'FTAG'.
        It is not modified.
    rolling_window : int
        Number of previous matches per team to average over (>= 1).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by 'Date' with a fresh 0..n-1 index, a
        'rowid' column, and the columns ``{HomeTeam,AwayTeam}_FormScore``,
        ``_AvgShots``, ``_AvgShotsOnTarget``, ``_AvgCorners``, ``_AvgFouls``,
        ``_AvgYellowCards``, ``_AvgRedCards``, ``_AvgGoalsScored`` and
        ``_AvgGoalsConceded``. A feature is NaN until the team has
        ``rolling_window`` earlier matches with non-missing values.

    Raises
    ------
    ValueError
        If ``rolling_window`` is smaller than 1.
    """
    if rolling_window < 1:
        raise ValueError(f"rolling_window must be >= 1, got {rolling_window}")

    # (long-format name, home source column, away source column, output label)
    stat_specs = [
        ('shots', 'HS', 'AS', 'AvgShots'),
        ('shots_on_target', 'HST', 'AST', 'AvgShotsOnTarget'),
        ('corners', 'HC', 'AC', 'AvgCorners'),
        ('fouls', 'HF', 'AF', 'AvgFouls'),
        ('yellow_cards', 'HY', 'AY', 'AvgYellowCards'),
        ('red_cards', 'HR', 'AR', 'AvgRedCards'),
        ('goals_scored', 'FTHG', 'FTAG', 'AvgGoalsScored'),
        ('goals_conceded', 'FTAG', 'FTHG', 'AvgGoalsConceded'),
    ]
    stat_names = [spec[0] for spec in stat_specs]

    # sort_values returns a new frame, so the caller's df stays untouched
    df_sorted = df.sort_values('Date').reset_index(drop=True)
    df_sorted['rowid'] = df_sorted.index  # Unique ID to merge later

    def team_view(side, team_col, source_cols, points_by_result):
        """One row per match, seen from the team playing on `side`."""
        view = df_sorted[['rowid', 'Date']].copy()
        view['team'] = df_sorted[team_col]
        view['side'] = side
        view['points'] = df_sorted['FTR'].map(points_by_result)
        for name, source in zip(stat_names, source_cols):
            view[name] = df_sorted[source]
        return view

    # ignore_index gives every per-team row its own label; without it the
    # home and away halves share labels and label-based alignment is ambiguous.
    team_games = pd.concat(
        [
            team_view('home', 'HomeTeam', [spec[1] for spec in stat_specs],
                      {'H': 3, 'D': 1, 'A': 0}),
            team_view('away', 'AwayTeam', [spec[2] for spec in stat_specs],
                      {'A': 3, 'D': 1, 'H': 0}),
        ],
        ignore_index=True,
    ).sort_values(by=['team', 'Date'])

    # Linearly increasing weights, oldest match first, normalised to sum to 1
    weights = np.arange(1, rolling_window + 1)
    weights = weights / weights.sum()

    def weighted_avg(window):
        return np.dot(window, weights[-len(window):])

    def prior_weighted_avg(series):
        # shift(1) keeps the current match out of its own feature (no leakage)
        return (
            series.shift(1)
            .rolling(window=rolling_window, min_periods=rolling_window)
            .apply(weighted_avg, raw=True)
        )

    # transform returns a result aligned to team_games' own index on every
    # pandas version, unlike apply + reset_index(level=0), which depends on
    # whether apply prepends the group key to the index.
    team_games['form_score'] = (
        team_games.groupby('team')['points'].transform(prior_weighted_avg)
    )
    for name in stat_names:
        team_games[f'avg_{name}'] = (
            team_games.groupby('team')[name].transform(prior_weighted_avg)
        )

    # Long-format feature column -> label used in the output column name
    feature_labels = {'form_score': 'FormScore'}
    feature_labels.update({f'avg_{name}': label for name, _, _, label in stat_specs})

    # Attach the home team's features, then the away team's, by match id
    df_enriched = df_sorted
    for side, prefix in (('home', 'HomeTeam'), ('away', 'AwayTeam')):
        side_features = team_games.loc[
            team_games['side'] == side, ['rowid', *feature_labels]
        ]
        df_enriched = df_enriched.merge(side_features, on='rowid', how='left').rename(
            columns={col: f'{prefix}_{label}' for col, label in feature_labels.items()}
        )

    return df_enriched


## *Preprocessing step 3 (optional): add rolling head-to-head performance columns*

In [None]:
def add_h2h_perf(df, rolling_window=4):
    """Add exponentially-weighted head-to-head features for both teams.

    History is tracked per (team, opponent, side): a home team's features
    only use its earlier *home* matches against the same opponent, and
    likewise for the away team. For every match, up to ``rolling_window``
    previous meetings (the current one excluded) are averaged with weights
    0.8**k, where k = 0 is the most recent meeting; the weights are
    renormalised when fewer meetings are available. The weighted average is
    then multiplied by 10 for every feature (points, shots, corners and
    fouls alike).

    Parameters
    ----------
    df : pandas.DataFrame
        Match table containing 'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'HS',
        'HC', 'HF', 'AS', 'AC' and 'AF'. It is not modified. Its index is
        copied into 'rowid' and used as the merge key, so it should be unique.
    rolling_window : int, default 4
        Maximum number of previous meetings to use (>= 1).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'rowid' plus ``{HomeTeam,AwayTeam}_FormScore_h2h``,
        ``_AvgShots_h2h``, ``_AvgCorners_h2h`` and ``_AvgFouls_h2h``. A feature
        is NaN only when no earlier meeting with a non-missing value exists.

    Raises
    ------
    ValueError
        If ``rolling_window`` is smaller than 1.
    """
    if rolling_window < 1:
        raise ValueError(f"rolling_window must be >= 1, got {rolling_window}")

    df = df.copy()
    df['rowid'] = df.index

    def side_view(side, team_col, opponent_col, stat_cols, points_by_result):
        """One row per match, seen from the team playing on `side`."""
        shots_col, corners_col, fouls_col = stat_cols
        view = df[['rowid', 'Date', team_col, opponent_col, 'FTR', *stat_cols]].rename(
            columns={
                team_col: 'team',
                opponent_col: 'opponent',
                shots_col: 'shots',
                corners_col: 'corners',
                fouls_col: 'fouls',
            }
        )
        view['side'] = side
        view['points'] = view['FTR'].map(points_by_result)
        return view

    home = side_view('home', 'HomeTeam', 'AwayTeam', ('HS', 'HC', 'HF'),
                     {'H': 3, 'D': 1, 'A': 0})
    away = side_view('away', 'AwayTeam', 'HomeTeam', ('AS', 'AC', 'AF'),
                     {'A': 3, 'D': 1, 'H': 0})

    group_keys = ['team', 'opponent', 'side']
    combined = pd.concat([home, away], ignore_index=True)
    combined.sort_values(group_keys + ['Date'], inplace=True)

    # Exponential-decay weights, oldest meeting first, newest weighted 1.0
    decay = 0.8
    weights = np.array([decay ** i for i in range(rolling_window)][::-1])

    def decayed_average(window):
        # With raw=True the window still contains NaNs (the slot introduced
        # by shift(1), or missing stats). Drop them before weighting,
        # otherwise np.dot returns NaN and min_periods=1 has no effect.
        seen = window[~np.isnan(window)]
        if seen.size == 0:
            return np.nan
        recent_weights = weights[-seen.size:]
        return np.dot(seen, recent_weights) / recent_weights.sum() * 10

    def weighted_rolling_apply(x):
        # Shift by 1 to exclude current match from calculation
        return x.shift(1).rolling(window=rolling_window, min_periods=1).apply(
            decayed_average, raw=True
        )

    # transform keeps the result aligned with combined's index regardless of
    # how the installed pandas version indexes groupby.apply output.
    feature_sources = {
        'form_score': 'points',
        'avg_shots_h2h': 'shots',
        'avg_corners_h2h': 'corners',
        'avg_fouls_h2h': 'fouls',
    }
    for feature, source in feature_sources.items():
        combined[feature] = combined.groupby(group_keys)[source].transform(
            weighted_rolling_apply
        )

    # Long-format feature column -> label used in the output column name
    feature_labels = {
        'form_score': 'FormScore_h2h',
        'avg_shots_h2h': 'AvgShots_h2h',
        'avg_corners_h2h': 'AvgCorners_h2h',
        'avg_fouls_h2h': 'AvgFouls_h2h',
    }

    # Merge the home side's features, then the away side's, back onto df
    for side, prefix in (('home', 'HomeTeam'), ('away', 'AwayTeam')):
        side_feats = combined.loc[combined['side'] == side, ['rowid', *feature_labels]]
        df = df.merge(side_feats, on='rowid', how='left').rename(
            columns={col: f'{prefix}_{label}' for col, label in feature_labels.items()}
        )

    return df

## Try single conventional ML models in scikit-learn:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
import itertools

# Exhaustive search over feature-group combinations and classifiers.
#
# Every non-empty combination of `feature_groups` is tried together with the
# two team-name columns; each classifier is fitted on all seasons except
# 2024/2025 and scored on 2024/2025.
#
# NOTE(review): the winner is picked by accuracy on the 2024/2025 rows
# themselves, so the reported best accuracy is optimistic. Also, dropna() is
# applied per feature combination, so different combinations may be scored on
# different numbers of rows.
#
# optional1: Select all betting (Note: betting high collinearity) -> selected_features = [col for col in df.columns if col not in ['Div', 'Date', 'Time', 'FTR', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']]
# optional2: additionally perform PCA in the numerical pipeline.
feature_groups = [['B365H', 'B365D', 'B365A'], # odds
                     ['HomeTeam_FormScore', 'AwayTeam_FormScore'],
                     ['HomeTeam_AvgShots', 'AwayTeam_AvgShots'],
                     ['HomeTeam_AvgShotsOnTarget', 'AwayTeam_AvgShotsOnTarget'],
                     ['HomeTeam_AvgCorners', 'AwayTeam_AvgCorners'],
                     ['HomeTeam_AvgFouls', 'AwayTeam_AvgFouls'],
                     ['HomeTeam_AvgYellowCards', 'AwayTeam_AvgYellowCards'],
                     ['HomeTeam_AvgRedCards', 'AwayTeam_AvgRedCards'],
                     ['HomeTeam_AvgGoalsScored', 'AwayTeam_AvgGoalsScored'],
                     ['HomeTeam_AvgGoalsConceded', 'AwayTeam_AvgGoalsConceded']]

# Feature engineering does not depend on which features are selected, so it
# is done once here instead of once per feature combination.
df_features = add_season(df_raw)
df_features = add_overall_perf(df_features, rolling_window=10)

# Best (accuracy, model name, feature list) seen over the whole search
acc_ultra_max = 0
model_ultra_max = "Init"
features_ultra_max = []

for r in range(1, len(feature_groups) + 1):
    for combo in itertools.combinations(feature_groups, r):
        selected_features = ['HomeTeam', 'AwayTeam'] + [feature for group in combo for feature in group]
        print(f"📂 Selected features: {selected_features}")

        # Keep only complete rows for this feature set, then split by season:
        # the 2024/2025 season is held out, everything else is training data.
        df = df_features[selected_features + ['FTR'] + ['Season']].dropna()
        train_df = df[df['Season'] != '2024/2025']
        test_df = df[df['Season'] == '2024/2025']
        X_train = train_df[selected_features]
        y_train = train_df['FTR']
        X_test = test_df[selected_features]
        y_test = test_df['FTR']

        # Separate categorical and numerical features for preprocessing
        categorical_features = ['HomeTeam', 'AwayTeam']
        numerical_features = [col for col in X_train.columns if col not in categorical_features]

        # Most models take one-hot encoded team names (preprocessor1); the
        # models mapped to preprocessor2 below get ordinal encoding instead.
        numeric_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        # optional: PCA
        # numeric_pipeline = Pipeline([
        #     ('imputer', SimpleImputer(strategy='mean')),
        #     ('scaler', StandardScaler()),
        #     ('pca', PCA(n_components=0.95))
        # ])

        categorical_pipeline1 = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        categorical_pipeline2 = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])

        preprocessor1 = ColumnTransformer([
            ('num', numeric_pipeline, numerical_features),
            ('cat', categorical_pipeline1, categorical_features)
        ])
        preprocessor2 = ColumnTransformer([
            ('num', numeric_pipeline, numerical_features),
            ('cat', categorical_pipeline2, categorical_features)
        ])

        # Model name -> [classifier, preprocessor to pair it with]
        models_and_preprocessors = {
            "RandomForest": [RandomForestClassifier(random_state=42), preprocessor1],
            "LogisticRegression": [LogisticRegression(max_iter=1000, random_state=42), preprocessor1],
            "PassiveAggressiveClassifier": [PassiveAggressiveClassifier(max_iter=1000, random_state=42), preprocessor1],
            "RidgeClassifier": [RidgeClassifier(max_iter=1000), preprocessor1],
            "KNN": [KNeighborsClassifier(), preprocessor1],
            "SVC": [SVC(random_state=42), preprocessor1],
            "DecisionTree": [DecisionTreeClassifier(random_state=42), preprocessor1],
            "GradientBoosting": [GradientBoostingClassifier(random_state=42), preprocessor1],
            "HistGradientBoosting": [HistGradientBoostingClassifier(random_state=42), preprocessor2],
            "AdaBoost": [AdaBoostClassifier(random_state=42), preprocessor1],
            "ExtraTrees": [ExtraTreesClassifier(random_state=42), preprocessor1],
            "NaiveBayes": [GaussianNB(), preprocessor2],
            "MLP": [MLPClassifier(max_iter=1000, random_state=42), preprocessor1],
            "QDA": [QuadraticDiscriminantAnalysis(), preprocessor2],
            "LDA": [LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'), preprocessor2]
        }

        # Best model for this feature combination
        acc_max = 0
        model_max = "Init"
        for name, (classifier, preprocessor) in models_and_preprocessors.items():
            model_pipeline = Pipeline([
                ('preprocess', preprocessor),
                ('classifier', classifier)
            ])
            model_pipeline.fit(X_train, y_train)
            y_pred = model_pipeline.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            print(f"🎯 {name} Accuracy: {acc:.4f}")
            if acc > acc_max:
                acc_max = acc
                model_max = name

        # Promote to overall best only on a strict improvement
        if acc_max > acc_ultra_max:
            acc_ultra_max = acc_max
            model_ultra_max = model_max
            features_ultra_max = selected_features

    # Running summary, printed once per combination size r
    print(f"The best model is {model_ultra_max}.\nThe accuracy is {acc_ultra_max}.\nUsed features: {features_ultra_max}.")

📂 Selected features: ['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A']


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 RandomForest Accuracy: 0.5217
🎯 LogisticRegression Accuracy: 0.5000
🎯 PassiveAggressiveClassifier Accuracy: 0.4783
🎯 RidgeClassifier Accuracy: 0.4783
🎯 KNN Accuracy: 0.4130
🎯 SVC Accuracy: 0.4348
🎯 DecisionTree Accuracy: 0.3696
🎯 GradientBoosting Accuracy: 0.5217
🎯 HistGradientBoosting Accuracy: 0.4348
🎯 AdaBoost Accuracy: 0.4565
🎯 ExtraTrees Accuracy: 0.4783
🎯 NaiveBayes Accuracy: 0.4130


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.5217
🎯 QDA Accuracy: 0.3913
🎯 LDA Accuracy: 0.4130
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_FormScore', 'AwayTeam_FormScore']
🎯 RandomForest Accuracy: 0.4500
🎯 LogisticRegression Accuracy: 0.4250
🎯 PassiveAggressiveClassifier Accuracy: 0.5500
🎯 RidgeClassifier Accuracy: 0.4250
🎯 KNN Accuracy: 0.5000
🎯 SVC Accuracy: 0.4750
🎯 DecisionTree Accuracy: 0.5000
🎯 GradientBoosting Accuracy: 0.5000
🎯 HistGradientBoosting Accuracy: 0.4750
🎯 AdaBoost Accuracy: 0.4750
🎯 ExtraTrees Accuracy: 0.3750
🎯 NaiveBayes Accuracy: 0.4500


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.4000
🎯 QDA Accuracy: 0.4500
🎯 LDA Accuracy: 0.4750
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgShots', 'AwayTeam_AvgShots']
🎯 RandomForest Accuracy: 0.4667
🎯 LogisticRegression Accuracy: 0.4667
🎯 PassiveAggressiveClassifier Accuracy: 0.4222
🎯 RidgeClassifier Accuracy: 0.4667
🎯 KNN Accuracy: 0.4667
🎯 SVC Accuracy: 0.4889
🎯 DecisionTree Accuracy: 0.4000
🎯 GradientBoosting Accuracy: 0.4889
🎯 HistGradientBoosting Accuracy: 0.4889
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.5111
🎯 NaiveBayes Accuracy: 0.4000


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.5333
🎯 QDA Accuracy: 0.3778
🎯 LDA Accuracy: 0.3556
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgShotsOnTarget', 'AwayTeam_AvgShotsOnTarget']
🎯 RandomForest Accuracy: 0.3556
🎯 LogisticRegression Accuracy: 0.4889
🎯 PassiveAggressiveClassifier Accuracy: 0.4667
🎯 RidgeClassifier Accuracy: 0.4889
🎯 KNN Accuracy: 0.3556
🎯 SVC Accuracy: 0.4889
🎯 DecisionTree Accuracy: 0.4000
🎯 GradientBoosting Accuracy: 0.4444
🎯 HistGradientBoosting Accuracy: 0.4000
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.3556
🎯 NaiveBayes Accuracy: 0.3556


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.4667
🎯 QDA Accuracy: 0.3333
🎯 LDA Accuracy: 0.3556
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgCorners', 'AwayTeam_AvgCorners']
🎯 RandomForest Accuracy: 0.4444
🎯 LogisticRegression Accuracy: 0.4889
🎯 PassiveAggressiveClassifier Accuracy: 0.5111
🎯 RidgeClassifier Accuracy: 0.4889
🎯 KNN Accuracy: 0.4000
🎯 SVC Accuracy: 0.4889
🎯 DecisionTree Accuracy: 0.4444
🎯 GradientBoosting Accuracy: 0.4889
🎯 HistGradientBoosting Accuracy: 0.5333
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.4889
🎯 NaiveBayes Accuracy: 0.3556


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.4667
🎯 QDA Accuracy: 0.4667
🎯 LDA Accuracy: 0.3556
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgFouls', 'AwayTeam_AvgFouls']
🎯 RandomForest Accuracy: 0.4444
🎯 LogisticRegression Accuracy: 0.4889
🎯 PassiveAggressiveClassifier Accuracy: 0.4000
🎯 RidgeClassifier Accuracy: 0.4889
🎯 KNN Accuracy: 0.4444
🎯 SVC Accuracy: 0.4222
🎯 DecisionTree Accuracy: 0.4000
🎯 GradientBoosting Accuracy: 0.4667
🎯 HistGradientBoosting Accuracy: 0.4667
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.4667
🎯 NaiveBayes Accuracy: 0.3556


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.4222
🎯 QDA Accuracy: 0.3111
🎯 LDA Accuracy: 0.3556
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgYellowCards', 'AwayTeam_AvgYellowCards']
🎯 RandomForest Accuracy: 0.4222
🎯 LogisticRegression Accuracy: 0.4667
🎯 PassiveAggressiveClassifier Accuracy: 0.3333
🎯 RidgeClassifier Accuracy: 0.4889
🎯 KNN Accuracy: 0.5111
🎯 SVC Accuracy: 0.4667
🎯 DecisionTree Accuracy: 0.4444
🎯 GradientBoosting Accuracy: 0.5111
🎯 HistGradientBoosting Accuracy: 0.5556
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.4889
🎯 NaiveBayes Accuracy: 0.3778


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.4000
🎯 QDA Accuracy: 0.3556
🎯 LDA Accuracy: 0.3778
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgRedCards', 'AwayTeam_AvgRedCards']
🎯 RandomForest Accuracy: 0.4444
🎯 LogisticRegression Accuracy: 0.4889
🎯 PassiveAggressiveClassifier Accuracy: 0.4667
🎯 RidgeClassifier Accuracy: 0.4889
🎯 KNN Accuracy: 0.4667
🎯 SVC Accuracy: 0.5111
🎯 DecisionTree Accuracy: 0.4889
🎯 GradientBoosting Accuracy: 0.4222
🎯 HistGradientBoosting Accuracy: 0.4222
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.4889
🎯 NaiveBayes Accuracy: 0.3556
🎯 MLP Accuracy: 0.4444
🎯 QDA Accuracy: 0.2444
🎯 LDA Accuracy: 0.3556
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgGoalsScored', 'AwayTeam_AvgGoalsScored']


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 RandomForest Accuracy: 0.4000
🎯 LogisticRegression Accuracy: 0.5111
🎯 PassiveAggressiveClassifier Accuracy: 0.4000
🎯 RidgeClassifier Accuracy: 0.5111
🎯 KNN Accuracy: 0.4444
🎯 SVC Accuracy: 0.4444
🎯 DecisionTree Accuracy: 0.3778
🎯 GradientBoosting Accuracy: 0.4000
🎯 HistGradientBoosting Accuracy: 0.4444
🎯 AdaBoost Accuracy: 0.4444
🎯 ExtraTrees Accuracy: 0.4222
🎯 NaiveBayes Accuracy: 0.2889


  team_games['form_score'] = team_games.groupby('team', group_keys=False).apply(compute_form_score)


🎯 MLP Accuracy: 0.4222
🎯 QDA Accuracy: 0.4667
🎯 LDA Accuracy: 0.3556
📂 Selected features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgGoalsConceded', 'AwayTeam_AvgGoalsConceded']
🎯 RandomForest Accuracy: 0.4000
🎯 LogisticRegression Accuracy: 0.4889
🎯 PassiveAggressiveClassifier Accuracy: 0.2667
🎯 RidgeClassifier Accuracy: 0.4889
🎯 KNN Accuracy: 0.3111
🎯 SVC Accuracy: 0.4667
🎯 DecisionTree Accuracy: 0.3111
🎯 GradientBoosting Accuracy: 0.4444
🎯 HistGradientBoosting Accuracy: 0.4222
🎯 AdaBoost Accuracy: 0.4222
🎯 ExtraTrees Accuracy: 0.4222
🎯 NaiveBayes Accuracy: 0.3778
🎯 MLP Accuracy: 0.4222
🎯 QDA Accuracy: 0.3111
🎯 LDA Accuracy: 0.3556
The best model is HistGradientBoosting.
The accuracy is 0.5555555555555556.
Used features: ['HomeTeam', 'AwayTeam', 'HomeTeam_AvgYellowCards', 'AwayTeam_AvgYellowCards'].




## Try mixed model in H2O AutoML:

In [None]:
import h2o
from h2o.automl import H2OAutoML
from h2o.automl import get_leaderboard

# Initialise H2O and hand it the train and test sets
# (H2O handles cross-validation on the training set automatically)
h2o.init()
df = pd.concat([X, y], axis=1)
train, test = train_test_split(df, test_size=0.2, random_state=42)
train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

# Mark the target as a factor so H2O treats this as a classification problem,
# and mark the team-name predictors as categorical as well
target_col = y.name
for col in [target_col, 'HomeTeam', 'AwayTeam']:
    train_h2o[col] = train_h2o[col].asfactor()
    test_h2o[col] = test_h2o[col].asfactor()

# Configure AutoML and train it on every column except the target
aml = H2OAutoML(
    # max_models=50, # try maximum 50 models
    max_runtime_secs=60, # limit total runtime to 60 seconds
    balance_classes=True, # upsample the minority classes - "Draw"
    sort_metric='logloss', # good for multi-class classification
    nfolds=5, # use 5-fold cross-validation
    stopping_metric='logloss', # early stopping based on log loss
    stopping_rounds=30, # early stop after 30 rounds of no improvement
    seed=42 # random state 42
)
predictor_cols = [name for name in train_h2o.columns if name != target_col]
aml.train(x=predictor_cols, y=target_col, training_frame=train_h2o)

# Fetch the top-ranked model from the leaderboard
lb = get_leaderboard(aml, extra_columns='ALL')
top_models = lb.head(rows=1) # only need the top 1 model here
model_id = top_models.as_data_frame().iloc[0]['model_id']
model = h2o.get_model(model_id)

# Show which model won, plus its training/validation info
if model.algo != "stackedensemble":
    print(f"🖥️ {model.algo}")
    print("Info:", model._model_json['output'])
else:
    print(f"🖥️ {model.metalearner()}")

# Accuracy on the held-out test split
preds = model.predict(test_h2o).as_data_frame()['predict']
true = test_h2o[target_col].as_data_frame()[target_col]
acc = accuracy_score(true, preds)
print(f"🎯 Accuracy: {acc:.4f}")

## Try a neural network in PyTorch: