## Libraries

In [394]:
from datetime import datetime

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import random
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,f1_score
import itertools




tqdm.pandas()
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)

## Preprocessing

In [395]:
# Read the entire dataset
df = pd.read_csv("train_data.csv", index_col=0).reset_index(drop=True)

# Set the number of last x relevant seasons
last_seasons = 5
# Check for NA in seasons
print(df["season"].isna().sum())

# Find relevant seasons
relevant_seasons = df["season"].unique()[-last_seasons:]
print("Relevant seasons: ", relevant_seasons)

# Filter out relevant seasons from DF
df = df.loc[df["season"].isin(relevant_seasons) , :]

# Modify the season format for better data manipulation
df["season"] = df["season"].apply(lambda x: int(str(x)[:2]))
print("Modified seasons: ", df["season"].unique().tolist())

  df = pd.read_csv("train_data.csv", index_col=0).reset_index(drop=True)


0
Relevant seasons:  [1718 1819 1920 2021 2122]
Modified seasons:  [17, 18, 19, 20, 21]


In [396]:
X_train["AvgHomeOdds"].describe()

count    27212.000000
mean         2.647933
std          1.674930
min          0.000000
25%          1.800000
50%          2.250000
75%          2.870000
max         32.333333
Name: AvgHomeOdds, dtype: float64

In [397]:
# Check missing values
missing_vals = pd.DataFrame((df.isna().sum()/df.shape[0]).sort_values( ascending=False)).T
missing_vals

Unnamed: 0,AT,SJD,SYD,SYA,GB>2.5,GB<2.5,GBAHH,GBAHA,GBAH,B365AH,SJH,SJA,SOD,BSH,BSD,BSA,SOH,HBP,Attendance,HHW,AHW,HO,SYH,ABP,SOA,GBA,HT,LB.2,LB.1,LB,LBAHH,LBAH,LBAHA,GBH,GBD,AO,SBA,SBH,SBD,AFKC,HFKC,LBH,LBD,LBA,Referee,BbMxH,Bb1X2,BbAv>2.5,BbAH,BbAvAHH,BbAvH,BbMxAHA,BbMxAHH,BbAvAHA,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbMx<2.5,BbAv<2.5,BbAHh,WHCH,WHCA,WHCD,B365AHA,B365AHH,P<2.5,BWCD,BWCA,BWCH,P>2.5,PAHH,PAHA,B365<2.5,IWCA,IWCD,B365>2.5,IWCH,B365C>2.5,B365CAHA,B365CAHH,PC<2.5,B365CH,AHh,AvgH,PCAHH,PCAHA,Avg>2.5,PC>2.5,B365C<2.5,VCCH,B365CA,B365CD,MaxAHA,AvgA,AvgCAHA,AvgAHA,AvgAHH,MaxAHH,AvgCA,AHCh,Max>2.5,Max<2.5,Avg<2.5,MaxH,VCCD,VCCA,MaxA,MaxD,AvgD,MaxCA,AvgCH,Time,MaxC>2.5,MaxCAHH,MaxCD,MaxCH,MaxC<2.5,AvgC>2.5,AvgC<2.5,MaxCAHA,AvgCAHH,AvgCD,HF,AF,PSH,PSA,PSD,WHD,WHA,WHH,BWD,BWA,BWH,VCH,VCD,VCA,B365A,B365D,IWD,B365H,IWH,IWA,PSCH,PSCD,PSCA,AR,HTR,HST,HC,HTHG,AST,HY,HR,AY,Date,AC,AS,HS,HTAG,FTAG,FTHG,HomeTeam,FTR,season,league,country,AwayTeam,Div
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.96227,0.96227,0.797885,0.797625,0.79751,0.62102,0.593979,0.593864,0.593662,0.593459,0.593459,0.593459,0.593402,0.593199,0.593199,0.593199,0.593199,0.593199,0.593199,0.593199,0.593199,0.593199,0.593199,0.593199,0.414312,0.413937,0.413937,0.413388,0.413359,0.41203,0.411568,0.41151,0.41151,0.41151,0.411279,0.41099,0.410008,0.409979,0.409776,0.409776,0.409661,0.409083,0.408939,0.408939,0.408765,0.40865,0.408621,0.408621,0.408592,0.408563,0.408505,0.408419,0.408361,0.408303,0.408245,0.408245,0.408216,0.408101,0.408072,0.408072,0.408072,0.408014,0.407956,0.407956,0.407956,0.40787,0.40787,0.407841,0.407841,0.407841,0.407841,0.407841,0.407841,0.407812,0.407812,0.407725,0.407696,0.40761,0.407494,0.407494,0.407494,0.407494,0.407494,0.407494,0.407494,0.407494,0.039146,0.039059,0.011527,0.010892,0.010747,0.009967,0.009505,0.009447,0.007049,0.006616,0.006471,0.003785,0.003525,0.003525,0.003496,0.003438,0.00338,0.003293,0.003091,0.003062,0.0026,0.0026,0.002485,0.00182,0.00182,0.001705,0.001473,0.001387,0.001329,0.001329,0.001184,0.001127,0.000982,0.000896,0.000896,0.000896,0.000838,0.000693,0.000549,0.000202,2.9e-05,0.0,0.0,0.0,0.0,0.0


In [398]:
bet_columns = ["B365", "BS", "BW", "GB", "IW", "LB", "PS", "SO", "SB", "SJ", "SY", "VC", "WH"]
home_bet = [col+"H" for col in bet_columns]
draw_bet = [col+"D" for col in bet_columns]
away_bet = [col+"A" for col in bet_columns]

#Check missing values
bet_cols_missing_vals = (pd.DataFrame(df[home_bet + draw_bet + away_bet].isna().sum())/df.shape[0]).sort_values(by=0, ascending=False).T
display(bet_cols_missing_vals)

# We can see that IW, B365 have the least missing values, so we pick B365 odds and impute the
# missing values based on the other odd columns
df.loc[df["B365H"].isna(), "B365H"] = df[home_bet].mean(axis=1)
df.loc[df["B365D"].isna(), "B365D"] = df[draw_bet].mean(axis=1)
df.loc[df["B365A"].isna(), "B365A"] = df[away_bet].mean(axis=1)

# Rename columns
df["AvgH"] = df["B365H"].copy()
df["AvgD"] = df["B365D"].copy()
df["AvgA"] = df["B365A"].copy()


# Drop the extra odds columns data and matches where we have completely missing odds
print("Number of completely missing odds: \n", df.loc[:,["AvgH", "AvgD", "AvgA"]].isna().sum())

#Drop unnecessary columns
df.drop(home_bet + draw_bet + away_bet, axis = 1, inplace=True)
df.drop(["PSCH", "PSCD", "PSCA"], axis = 1, inplace=True)

def dropnas(df):
    # We drop the values where there are missing non-imputable values in main columns
    df = df.loc[~df["FTHG"].isna() & ~df["FTAG"].isna() & (df["FTAG"] > -1) & (df["FTHG"] > -1)].copy()
    df = df.loc[~df["HTHG"].isna() & ~df["HTAG"].isna()].copy()
    df = df.loc[~df["HomeTeam"].isna() & ~df["AwayTeam"].isna()].copy()
    df = df.loc[~df["AvgH"].isna() & ~df["AvgD"].isna() & ~df["AvgA"].isna(), :]
    df = df.loc[~df["Div"].isna() & ~df["Date"].isna()].copy()
    df = df.loc[~df["FTR"].isna() & ~df["HTR"].isna()].copy()
    	
    return df

def process_dates(df):
    df["Date"] = df["Date"].apply(
        lambda date: datetime.strptime(date, "%d/%m/%y") if len(date)==8 else datetime.strptime(date, "%d/%m/%Y")
    )
    return df

def fill_goals(df):
    df["HTR"] = calculate_win(df["HTHG"], df["HTAG"])
    df["FTR_test"] = calculate_win(df["FTHG"], df["FTAG"])
    return df

def calculate_win(home_goals, away_goals):
    results = []
    for home, away in zip(home_goals, away_goals):
        if home < away:
            results.append("A")
        elif home > away:
            results.append("H")
        else:
            results.append("D")
    return results

def drop_goal_outliers(df):
    df = df.loc[(df["FTHG"] + df["FTAG"]) < 20, :].copy()
    return df

df = dropnas(df)
df = process_dates(df)
df = fill_goals(df)
df = drop_goal_outliers(df)

Unnamed: 0,SOA,SBA,SYD,SJD,SBD,SOD,BSH,GBA,GBD,BSD,BSA,SYH,SJH,SBH,SOH,SJA,GBH,SYA,LBH,LBD,LBA,PSH,PSA,PSD,WHD,WHA,WHH,BWD,BWA,BWH,VCH,VCD,VCA,B365A,B365D,IWD,B365H,IWH,IWA
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.797885,0.797625,0.79751,0.011527,0.010892,0.010747,0.009967,0.009505,0.009447,0.007049,0.006616,0.006471,0.003785,0.003525,0.003525,0.003496,0.003438,0.00338,0.003293,0.003091,0.003062


Number of completely missing odds: 
 AvgH    37
AvgD    37
AvgA    37
dtype: int64


In [399]:
# Check missing values
missing_vals = pd.DataFrame((df.isna().sum()/df.shape[0]).sort_values( ascending=False)).T
display(missing_vals)
# We can see there is a group of columns that have a lot of missing values and 
# are irrelevant to our analysis, so we also drop them
to_drop = ['GBAHA', 'HBP', 'HO', 'AHW', 'B365AH', 'HHW', 'Attendance', 'GBAH',
       'GBAHH', 'GB>2.5', 'AO', 'GB<2.5', 'ABP', 'LB', 'AT', 'HT',
       'LBAHH', 'LB.1', 'LB.2', 'LBAH', 'LBAHA', 'AFKC', 'HFKC', 'Bb1X2',
       'BbAv>2.5', 'BbAH', 'BbAvH', 'BbAvAHH', 'BbMxAHA', 'BbAvAHA',
       'BbMxAHH', 'BbAHh', 'BbMx<2.5', 'BbAv<2.5', 'BbMxA', 'BbMxD',
       'BbAvD', 'BbMxH', 'BbAvA', 'BbOU', 'BbMx>2.5', 'Referee', 'WHCH',
       'WHCD', 'WHCA', 'B365AHA', 'B365AHH', 'P<2.5', 'BWCD', 'BWCH',
       'BWCA', 'P>2.5', 'PAHH', 'PAHA', 'IWCA', 'B365<2.5', 'IWCD',
       'IWCH', 'B365>2.5', 'B365C>2.5', 'B365CAHH', 'B365CAHA', 'PC<2.5',
       'B365CH', 'PCAHH', 'PCAHA', 'PC>2.5', 'B365C<2.5', 'AHh', 'VCCH',
       'B365CD', 'B365CA', 'AvgCAHA', 'Avg>2.5', 'AvgCA', 'AHCh', 'MaxCA',
       'MaxAHA', 'Time', 'AvgCH', 'VCCD', 'VCCA', 'AvgAHA', 'MaxC>2.5',
       'AvgAHH', 'MaxAHH', 'MaxCAHH', 'Max>2.5', 'MaxC<2.5', 'AvgC>2.5',
       'AvgC<2.5', 'AvgCD', 'Max<2.5', 'MaxD', 'MaxCD', 'AvgCAHH',
       'MaxCH', 'MaxH', 'MaxCAHA', 'MaxA', 'Avg<2.5']

df = df.drop(to_drop, axis=1).reset_index(drop=True)

Unnamed: 0,HHW,HBP,Attendance,B365AH,HO,AHW,GBAH,GBAHA,GBAHH,GB<2.5,AO,GB>2.5,ABP,LBAHH,LBAHA,LBAH,LB,LB.1,LB.2,HT,AT,AFKC,HFKC,Referee,BbMxH,Bb1X2,BbAv>2.5,BbAvAHH,BbAvH,BbAH,BbMxAHA,BbMxD,BbOU,BbMxA,BbAvAHA,BbAvA,BbAvD,BbAv<2.5,BbAHh,BbMx>2.5,BbMxAHH,BbMx<2.5,WHCH,WHCA,WHCD,B365AHA,B365AHH,P<2.5,BWCD,BWCH,BWCA,P>2.5,PAHH,PAHA,IWCA,B365<2.5,IWCD,IWCH,B365>2.5,B365C>2.5,B365CAHH,B365CAHA,PC<2.5,B365CH,PCAHH,PCAHA,PC>2.5,B365C<2.5,AHh,VCCH,B365CA,B365CD,AvgCAHA,Avg>2.5,AHCh,AvgCA,MaxCA,MaxAHA,AvgCH,VCCA,Time,VCCD,MaxC>2.5,AvgAHA,AvgAHH,MaxAHH,MaxCAHH,Max>2.5,AvgCD,MaxCAHA,AvgC>2.5,MaxC<2.5,AvgCAHH,AvgC<2.5,Max<2.5,MaxCD,MaxH,MaxCH,MaxA,MaxD,Avg<2.5,HF,AF,AR,HST,HC,HY,AST,HR,AY,AC,AS,HS,Div,Date,AvgA,AvgD,AvgH,season,league,country,HTR,HTAG,HTHG,FTR,FTAG,FTHG,AwayTeam,HomeTeam,FTR_test
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.962415,0.962415,0.620513,0.59402,0.593816,0.5937,0.593496,0.593496,0.593496,0.593438,0.593234,0.593234,0.593234,0.593234,0.593234,0.593234,0.593234,0.593234,0.593234,0.593234,0.593234,0.413433,0.413112,0.413112,0.412239,0.41221,0.4109,0.410609,0.410609,0.410609,0.410405,0.410172,0.409881,0.409095,0.40892,0.408891,0.408775,0.408687,0.408251,0.408047,0.408047,0.407901,0.407814,0.407727,0.407697,0.407552,0.407523,0.407523,0.407494,0.407406,0.407406,0.407348,0.407348,0.407232,0.407232,0.407086,0.407057,0.407028,0.407028,0.407028,0.407028,0.40697,0.40697,0.406941,0.406911,0.406882,0.406853,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.406766,0.038138,0.038051,0.000961,0.000844,0.000553,0.000437,0.000408,0.000262,0.000262,2.9e-05,2.9e-05,2.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [400]:
print("Wrong result in matches: ", (~(df["FTR"] == df["FTR_test"])).sum())
# We fix the wrong results
df["FTR"] = df["FTR_test"].copy()

# Check missing values again
missing_vals = pd.DataFrame((df.isna().sum()/df.shape[0]).sort_values( ascending=False)).T
display(missing_vals)

# For now we will exclude the fouls columns due to missing values, 
#and for the rest of the values, we impute them using mean

df_cleaned = df.drop(["HF", "AF", "FTR_test"], axis=1).copy()
missing_vals = pd.DataFrame((df_cleaned.isna().sum()/df_cleaned.shape[0]).sort_values( ascending=False)).T

# Which columns need to be imputed
to_impute = (missing_vals.T)[(missing_vals.T)[0] > 0].index.tolist()
to_impute

# a tiny bit data leakage, to be fixed
def mean_imputer(df, features):
    for col in features:
        mean = df.loc[~df[col].isna(), col].mean()
        df.loc[df[col].isna(), col] = mean 
    return df
df_cleaned = mean_imputer(df_cleaned, to_impute).reset_index(drop=True)

# We identify where we need to swap order of matches in order to generate unique match key
df_cleaned["swap_needed"] = df_cleaned["HomeTeam"] > df_cleaned["AwayTeam"]

Wrong result in matches:  0


Unnamed: 0,HF,AF,AR,HST,HC,HY,AST,HR,AY,AS,AC,HS,AvgA,AvgD,AvgH,Div,Date,season,league,country,HTR,HTAG,HTHG,FTR,FTAG,FTHG,AwayTeam,HomeTeam,FTR_test
0,0.038138,0.038051,0.000961,0.000844,0.000553,0.000437,0.000408,0.000262,0.000262,2.9e-05,2.9e-05,2.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [401]:
# Final check for missing values
missing_vals = pd.DataFrame((df_cleaned.isna().sum()/df_cleaned.shape[0]).sort_values( ascending=False)).T
display(missing_vals)

Unnamed: 0,Div,AS,AvgA,AvgD,AvgH,AR,HR,AY,HY,AC,HC,AST,HST,HS,Date,season,league,country,HTR,HTAG,HTHG,FTR,FTAG,FTHG,AwayTeam,HomeTeam,swap_needed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Renaming and reordering

In [402]:
# Rename columns for clarity and reorder them
new_names = {
    "country": "Country",
    "league": "League",
    "Div": "Division",
    "season": "Season",
    "Date": "Date",
    "HomeTeam": "HomeTeam",
    "AwayTeam": "AwayTeam",
    "FTHG": "HomeGoals",
    "FTAG": "AwayGoals",
    "FTR": "Result",
    "AvgH": "AvgHomeOdds",
    "AvgD": "AvgDrawOdds",
    "AvgA": "AvgAwayOdds",
    "HTHG": "HomeGoalsHalf",
    "HTAG": "AwayGoalsHalf",
    "HTR": "HalfResult",
    "HS": "HomeShots",
    "AS": "AwayShots",
    "HST": "HomeShotsTarget",
    "AST": "AwayShotsTarget",
    "HC": "HomeCorners",
    "AC": "AwayCorners",
    "HY": "HomeYellowCards",
    "AY": "AwayYellowCards",
    "HR": "HomeRedCards",
    "AR": "AwayRedCards",
    "swap_needed": "swap_needed"
    }
df_cleaned = df_cleaned.rename(columns=new_names)
df_cleaned.columns.values
df_cleaned = df_cleaned.loc[:, new_names.values()].reset_index(drop=True)

## Feature engineering and modeling

In [403]:
# Separate into train and test dataframe, and test is season 21, train is rest
X_train = df_cleaned.loc[(df_cleaned["Season"] != 21), :].drop(["Result"], axis=1)
y_train = df_cleaned.loc[(df_cleaned["Season"] != 21) , "Result"]
X_test = df_cleaned.loc[(df_cleaned["Season"] == 21), :].drop(["Result"], axis=1)
y_test = df_cleaned.loc[(df_cleaned["Season"] == 21), "Result"]

### Matchup specific statistics calculation per season

In [404]:
# Here we will try to get specific matchup statistics
hf = ['HomeTeam', 'HomeGoals', 'AvgHomeOdds', 'HomeGoalsHalf', 'HomeShots',  'HomeShotsTarget', 'HomeCorners',  'HomeYellowCards', 'HomeRedCards']
af = ['AwayTeam', 'AwayGoals', 'AvgAwayOdds', 'AwayGoalsHalf', 'AwayShots',  'AwayShotsTarget', 'AwayCorners',  'AwayYellowCards', 'AwayRedCards']

# We identify matches where we have to swap their order, so that we can calculate matchup specific statistics

def swap_result(x):
    if x == "H":
        return "A"
    if x == "A":
        return "H"
    return "D"
def get_matchup_statistics(df,home_features,away_features, target, stats):
    grouped_df = df.copy()    
    mask = grouped_df["swap_needed"]    
    for hf, af in zip(home_features, away_features):
        # Swap the features where swap_needed is True
        temp = grouped_df.loc[mask, hf].copy()
        grouped_df.loc[mask, hf] = grouped_df.loc[mask, af]
        grouped_df.loc[mask, af] = temp
    
    # Also correct the result according to the swapped statistics
    grouped_df.loc[mask,"HalfResult"] = grouped_df.loc[mask,"HalfResult"].apply(lambda x: swap_result(x))
    
    # Add result and swap it correctly
    grouped_df["Result"] = target
    grouped_df.loc[mask, "Result"] = grouped_df.loc[mask, "Result"].apply(lambda x: swap_result(x))
    
    # Create unique identifier for specific matchup
    grouped_df["MatchKey"] = grouped_df["HomeTeam"] + grouped_df["AwayTeam"]

    # Group by Divison, Season and matchkey to get per-season statistics for specific unique  matchups
    group = grouped_df.groupby(["Division", "Season", "MatchKey"])

    #Get the aggregated statistics that were passed to function
    statistics = group[stats].agg(["sum","count"]).reset_index()
    
    #Now reset the two level column index to one level with the specific aggregates only
    selected_columns = [
    ('Division', ''),
    ('Season', ''),
    ('MatchKey', ''),
    ('HomeGoals', 'sum'),
    ('AwayGoals', 'sum'),
    ('HomeShots', 'sum'),
    ('AwayShots', 'sum'),
    ('HomeCorners', 'sum'),
    ('AwayCorners', 'sum'),
        
    ]
    
    # Reducing to a single level index
    statistics = statistics[selected_columns]
    statistics.columns = [col[0] if type(col) is tuple else col for col in statistics.columns.values]

    count_H = group['Result'].apply(lambda x: (x == 'H').sum()).reset_index().rename(columns={"Result": "HomeWins"})
    count_D = group['Result'].apply(lambda x: (x == 'D').sum()).reset_index().rename(columns={"Result": "Draws"})
    count_A = group['Result'].apply(lambda x: (x == 'A').sum()).reset_index().rename(columns={"Result": "AwayWins"})

    # Merge with home/draw/away result statistics
    statistics = pd.merge(statistics, count_H, on = ["Division", "Season", "MatchKey"], how='left')
    statistics = pd.merge(statistics, count_D, on = ["Division", "Season", "MatchKey"], how='left')
    statistics = pd.merge(statistics, count_A, on = ["Division", "Season", "MatchKey"], how='left')

    statistics = statistics.sort_values(by='Season')
    cumulative_stats = ["HomeGoals", "AwayGoals", "HomeShots", "AwayShots", "HomeWins", "Draws", "AwayWins", "HomeCorners", "AwayCorners"]
    
    for col in cumulative_stats:
        statistics[col + "_cum"] = statistics.groupby(['Division', 'MatchKey'])[col].cumsum()
        
    statistics["matches"] = statistics["HomeWins_cum"] + statistics["Draws_cum"] + statistics["AwayWins_cum"]
        
    return statistics
stats = ["HomeGoals", "AwayGoals", "HomeShots", "AwayShots","HomeCorners", "AwayCorners", "Result"]
statistics = get_matchup_statistics(X_train, hf, af, y_train, stats)
statistics = statistics.rename(columns={"Season": "SeasonKey"}).reset_index(drop=True)
display(statistics.loc[statistics.MatchKey=="AnderlechtAntwerp", :].head(10))

Unnamed: 0,Division,SeasonKey,MatchKey,HomeGoals,AwayGoals,HomeShots,AwayShots,HomeCorners,AwayCorners,HomeWins,Draws,AwayWins,HomeGoals_cum,AwayGoals_cum,HomeShots_cum,AwayShots_cum,HomeWins_cum,Draws_cum,AwayWins_cum,HomeCorners_cum,AwayCorners_cum,matches
0,B1,17,AnderlechtAntwerp,2.0,1.0,15.0,15.0,5.0,14.0,1,1,0,2.0,1.0,15.0,15.0,1,1,0,5.0,14.0,2
6527,B1,18,AnderlechtAntwerp,2.0,1.0,12.0,16.0,11.0,11.0,1,1,0,4.0,2.0,27.0,31.0,2,2,0,16.0,25.0,4
8033,B1,19,AnderlechtAntwerp,1.0,2.0,18.0,24.0,11.0,13.0,0,1,1,5.0,4.0,45.0,55.0,2,3,1,27.0,38.0,6
11067,B1,20,AnderlechtAntwerp,5.0,1.0,28.0,24.0,11.0,7.0,2,0,0,10.0,5.0,73.0,79.0,4,3,1,38.0,45.0,8


### Merging matchup specific statistics and matches

In [405]:
X_train = df_cleaned.loc[(df_cleaned["Season"] != 21), :].drop(["Result"], axis=1)
y_train = df_cleaned.loc[(df_cleaned["Season"] != 21) , "Result"]
X_test = df_cleaned.loc[(df_cleaned["Season"] == 21), :].drop(["Result"], axis=1)
y_test = df_cleaned.loc[(df_cleaned["Season"] == 21), "Result"]

# Create keys for merging with statistics
def generate_match_keys(row):
    match_key = row["AwayTeam"] + row["HomeTeam"] if row["swap_needed"] else row["HomeTeam"] + row["AwayTeam"] 
    return match_key

X_train["MatchKey"] = X_train.apply(lambda x: generate_match_keys(x), axis=1) 
X_test["MatchKey"] = X_test.apply(lambda x: generate_match_keys(x), axis=1)

X_train["SeasonKey"] = X_train["Season"] - 1
X_test["SeasonKey"] = X_test["Season"] - 1


merging_keys = ["Division", "SeasonKey", "MatchKey"]
home_stats = ['HomeGoals_cum',
              'HomeShots_cum',
              'HomeWins_cum',
              "HomeCorners_cum"
             ]
away_stats = ['AwayGoals_cum',
              'AwayShots_cum',
              'AwayWins_cum',
              'AwayCorners_cum'
             ]
other_stats = ["Draws_cum", "matches"]
all_stats = home_stats + away_stats + other_stats

# Define a function to swap statistics in case the teams
# have different order than the MatchKey
def swap_feature_values(data, home_stats, away_stats):
    mask = data["swap_needed"]
    for hst, ast in zip(home_stats, away_stats):
        # Swap the features where swap_needed is True
        temp = data.loc[mask, hst].copy()
        data.loc[mask, hst] = data.loc[mask, ast]
        data.loc[mask, ast] = temp
    return data

X_train = pd.merge(X_train, statistics[merging_keys + all_stats], on=merging_keys, how='left')
X_test = pd.merge(X_test, statistics[merging_keys + all_stats], on=merging_keys, how='left')

X_train = swap_feature_values(X_train, home_stats, away_stats).reset_index(drop=True)
X_test = swap_feature_values(X_test, home_stats, away_stats).reset_index(drop=True)

# We have to drop these stats since they cannot be used for prediction of matches
#(they are known only after the match occured)
to_drop = ['HomeGoals', 'AwayGoals','HomeGoalsHalf', 'AwayGoalsHalf', 'HalfResult',
       'HomeShots', 'AwayShots', 'HomeShotsTarget', 'AwayShotsTarget',
       'HomeCorners', 'AwayCorners', 'HomeYellowCards', 'AwayYellowCards',
       'HomeRedCards', 'AwayRedCards']

X_train = X_train.drop(to_drop, axis=1).reset_index(drop=True)
X_test = X_test.drop(to_drop, axis=1).reset_index(drop=True)
X_train["HWR"] = X_train["HomeWins_cum"]/X_train["matches"] 
X_test["HWR"] = X_test["HomeWins_cum"]/X_test["matches"]

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


#### Cross-validation

In [406]:
seasons = X_train.Season.unique()

In [407]:
def get_folds(train_data):
    folds = []
    for x in range(len(train_data.Season.unique())-1):
        folds.append({"train": seasons[:(x+1)],
                      "val": [seasons[x]+1]})
    return folds

In [408]:

def get_param_combinations(grid):
    # Extract the keys and the corresponding lists of possible values
    keys, values = zip(*grid.items())
    
    # Use itertools.product to get all combinations of parameter values
    all_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
    
    return all_combinations

def cross_validate(model, data_x, data_y, grid):
    folds = get_folds(data_x)
    parameters_grid = {}
    param_sets = get_param_combinations(grid)
    scores = []
    total_sets = len(param_sets)
    cou = 0
    for hyperp in param_sets:
        temp = []
        for i in folds:
            X_train = data_x.loc[data_x.Season.isin(i["train"]), :]
            X_val = data_x.loc[data_x.Season.isin(i["val"]), :]
            y_train = data_y[X_train.index.values] 
            y_val = data_y[X_val.index.values]
            
            model.set_params(**hyperp)
            model.fit(X_train, y_train)
            
            y_pred = model.predict(X_val)
            temp.append(f1_score(y_val, y_pred, average="macro"))

        mean_score = np.mean(np.array(temp))
        cou = cou + 1
        print("Progress: ", str(cou/total_sets), "Score: ", mean_score)
        scores.append({"hyperparameters": hyperp,
                      "mean_score": mean_score})
        
    return scores

In [409]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [7, 9, 11, 13],
    'min_samples_split': [3, 6, 9],
    'n_jobs': [4]
}
#model = RandomForestClassifier()
#features = ["AvgHomeOdds", "AvgDrawOdds", "AvgAwayOdds"]
#result = cross_validate(model, X_train[features+["Season"]], y_train, param_grid)

In [410]:
result

[{'hyperparameters': {'n_estimators': 100,
   'max_depth': 7,
   'min_samples_split': 3,
   'n_jobs': 4},
  'mean_score': 0.3966671943881847},
 {'hyperparameters': {'n_estimators': 100,
   'max_depth': 7,
   'min_samples_split': 6,
   'n_jobs': 4},
  'mean_score': 0.3905699319561627},
 {'hyperparameters': {'n_estimators': 100,
   'max_depth': 7,
   'min_samples_split': 9,
   'n_jobs': 4},
  'mean_score': 0.3945267527962247},
 {'hyperparameters': {'n_estimators': 100,
   'max_depth': 9,
   'min_samples_split': 3,
   'n_jobs': 4},
  'mean_score': 0.3986917185581216},
 {'hyperparameters': {'n_estimators': 100,
   'max_depth': 9,
   'min_samples_split': 6,
   'n_jobs': 4},
  'mean_score': 0.3985645370264521},
 {'hyperparameters': {'n_estimators': 100,
   'max_depth': 9,
   'min_samples_split': 9,
   'n_jobs': 4},
  'mean_score': 0.39849699793968724},
 {'hyperparameters': {'n_estimators': 100,
   'max_depth': 11,
   'min_samples_split': 3,
   'n_jobs': 4},
  'mean_score': 0.4018276167749564

#### Model for each division

In [411]:
def predictive_pipeline(X_train, X_test, y_train, y_test, na_features, non_na_features):
    train_na = X_train.isna().any(axis=1)
    test_na = X_test.isna().any(axis=1)
    
    train_ind = X_train[train_na].index.values
    test_ind = X_test[test_na].index.values

    train_ind_nonna = X_train[~train_na].index.values
    test_ind_nonna = X_test[~test_na].index.values
    
    
    # Splitting the datasets
    X_train_na, y_train_na = X_train[train_na], y_train[train_ind]
    X_train_no_na, y_train_no_na = X_train[~train_na], y_train[train_ind_nonna]
    
    X_test_na, y_test_na = X_test[test_na], y_test[test_ind]
    X_test_no_na, y_test_no_na = X_test[~test_na], y_test[test_ind_nonna]
    
    # Initialize models (you can choose any model)
    model_na = xgb.XGBClassifier(nthread=6,max_depth=4,n_estimators=100)
    model_no_na = xgb.XGBClassifier(nthread=6,max_depth=4,n_estimators=100)
    
    # Train models
    model_na.fit(X_train_na[na_features]
                 , y_train_na)
    model_no_na.fit(X_train_no_na[non_na_features],
                    y_train_no_na)
    
    # Evaluate models (optional)
    preds_na = model_na.predict(X_test_na[na_features])
    preds_no_na = model_no_na.predict(X_test_no_na[non_na_features])
    
    f1_score_na = f1_score(y_test_na, preds_na, average="macro")
    f1_score_no_na = f1_score(y_test_no_na, preds_no_na, average="macro")
    
    return {
    #"model_na": model_na,
    #"model_no_na": model_no_na,    
    "f1_test_na": f1_score_na,
    "f1_test_no_na": f1_score_no_na}


#na_features = ["AvgHomeOdds", "AvgDrawOdds", "AvgAwayOdds"]
#non_na_features = ["AvgHomeOdds", "AvgDrawOdds", "AvgAwayOdds"] +  #+ all_stats
#scores = predictive_pipeline(X_train, X_test, y_train, y_test, na_features, non_na_features)
#scores

In [412]:
#results = {}
#for div in unique_divisions:
#    a = X_train.Division == div 
#    b = X_test.Division == div
#    X_temp_tr = X_train.loc[a, :]
#    X_temp_te = X_test.loc[b, :]
#    y_temp_tr = y_train[a]
#    y_temp_te = y_test[b]
#    results[div] = predictive_pipeline(X_temp_tr, X_temp_te, y_temp_tr, y_temp_te, na_features, non_na_features)

#### Random classifier performance

In [413]:
# Random classifier
y_pred = [random.choice(["H", "A", "D"]) for _ in range(len(y_train))]

accuracy = accuracy_score(y_train, y_pred)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print(f'Training Accuracy: {accuracy:.3f}')
print(f'Training Macro F1 Score: {macro_f1:.3f}')

y_pred = [random.choice(["H", "A", "D"]) for _ in range(len(y_test))]

accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f'Testing Accuracy: {accuracy:.3f}')
print(f'Testing Macro F1 Score: {macro_f1:.3f}')

#class_report = classification_report(y_test, y_pred)
#print('Classification Report:\n', class_report)


Training Accuracy: 0.334
Training Macro F1 Score: 0.330
Testing Accuracy: 0.330
Testing Macro F1 Score: 0.326


#### Random Forest model with engineered features

In [417]:
X_train = X_train.loc[~X_train.HWR.isna(),:]
X_test = X_test.loc[~X_test.HWR.isna(),:]
y_train = y_train[X_train.index.values]
y_test = y_test[X_test.index.values]



In [420]:
X_train.isna().sum()

Country            0
League             0
Division           0
Season             0
Date               0
HomeTeam           0
AwayTeam           0
AvgHomeOdds        0
AvgDrawOdds        0
AvgAwayOdds        0
swap_needed        0
MatchKey           0
SeasonKey          0
HomeGoals_cum      0
HomeShots_cum      0
HomeWins_cum       0
HomeCorners_cum    0
AwayGoals_cum      0
AwayShots_cum      0
AwayWins_cum       0
AwayCorners_cum    0
Draws_cum          0
matches            0
HWR                0
dtype: int64

In [432]:
features = ["HWR"] + ["AvgHomeOdds", "AvgDrawOdds", "AvgAwayOdds"] # + all_stats
rf_params = {'n_estimators': 200,
   'max_depth': 15,
   'min_samples_split': 6,
   'n_jobs': 4}


model = RandomForestClassifier(**rf_params)
model.fit(X_train[features], y_train)

# Evaluate on train
y_pred = model.predict(X_train[features])

accuracy = accuracy_score(y_train, y_pred)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print(f'Training Accuracy: {accuracy:.3f}')
print(f'Training Macro F1 Score: {macro_f1:.3f}')

# Evaluate on test
y_pred = model.predict(X_test[features])

accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f'Testing Accuracy: {accuracy:.3f}')
print(f'Testing Macro F1 Score: {macro_f1:.3f}')
#class_report = classification_report(y_test, y_pred)
#print('Classification Report:\n', class_report)

Training Accuracy: 0.704
Training Macro F1 Score: 0.681
Testing Accuracy: 0.470
Testing Macro F1 Score: 0.418


### XGBoost

In [440]:
#XGboost 
model = xgb.XGBClassifier(nthread=6,max_depth=5,n_estimators=300, learning_rate=0.075)

mapping_dict = {"H": 0, "D": 2, "A": 1}

# Map the categorical values to integers
y_train_mapped = pd.Series(y_train).map(mapping_dict)
y_test_mapped = pd.Series(y_test).map(mapping_dict)

# Fit the classifier to the training set
model.fit(X_train[features], y_train_mapped)

# Evaluate train/test
y_pred = model.predict(X_train[features])

accuracy = accuracy_score(y_train_mapped, y_pred)
macro_f1 = f1_score(y_train_mapped, y_pred, average='macro')
print(f'Training Accuracy: {accuracy:.3f}')
print(f'Training Macro F1 Score: {macro_f1:.3f}')

y_pred = model.predict(X_test[features])

accuracy = accuracy_score(y_test_mapped, y_pred)
macro_f1 = f1_score(y_test_mapped, y_pred, average='macro')
print(f'Testing Accuracy: {accuracy:.3f}')
print(f'Testing Macro F1 Score: {macro_f1:.3f}')

#class_report = classification_report(y_test, y_pred)
#print('Classification Report:\n', class_report)

Training Accuracy: 0.547
Training Macro F1 Score: 0.478
Testing Accuracy: 0.497
Testing Macro F1 Score: 0.423


#### Betting strategy for classification

In [28]:
unique_divs = X_test.Division.unique()
unique_divs
for div in unique_divs:
    mask = X_test.Division == div
    matches = X_test.loc[mask, :].shape[0]
    bet = 10000/matches
    X_test.loc[mask, "bet"] = bet


In [65]:
X_test["preds"] = y_pred
X_test["act"] = y_test_mapped

In [71]:
def evaluate_bet(row):
    if ((row["preds"] == 0) & (row["act"] == 0)):
        return (row["bet"]*row["AvgHomeOdds"])

    if ((row["preds"] == 2) & (row["act"] == 2)):
        return (row["bet"]*row["AvgDrawOdds"])

    if ((row["preds"] == 1) & (row["act"] == 1)):
        return (row["bet"]*row["AvgAwayOdds"])
    return 0

In [72]:
X_test["profit"] = X_test.apply(lambda row: evaluate_bet(row), axis=1)

In [79]:
total_profit = round(X_test["profit"].sum()-X_test["bet"].sum(),2)
print("Our profit would be:", total_profit, "$")

Our profit would be: -8486.86 $
