## Non-shot-based expected goals regression model

Predicting an expected number of goals for each match given the different in-game statistics excluding shots. 


In [466]:
import sys
import os
import numpy as np
import pandas as pd

In [467]:
FILEPATH = os.path.join(os.getcwd(), "data/non-shot-xG/non_shot_data.csv")
data = pd.read_csv(FILEPATH)

In [468]:
data

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,1,16/08/08,Arsenal,West Brom,1,0,H,1,0,H,H Webb,11,8,7,5,0,0,0,0
1,2,16/08/08,Bolton,Stoke,3,1,H,3,0,H,C Foy,13,12,4,3,1,2,0,0
2,3,16/08/08,Everton,Blackburn,2,3,A,1,1,D,A Marriner,11,9,3,5,2,2,0,0
3,4,16/08/08,Hull,Fulham,2,1,H,1,1,D,P Walton,10,9,5,6,3,0,0,0
4,5,16/08/08,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,11,12,7,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4733,4734,16/01/2021,Fulham,Chelsea,0,1,A,0,0,D,P Bankes,11,10,1,12,2,3,1,0
4734,4735,16/01/2021,Leicester,Southampton,2,0,H,1,0,H,S Attwell,10,11,2,5,2,2,0,0
4735,4736,17/01/2021,Sheffield United,Tottenham,1,3,A,0,2,A,A Marriner,13,4,5,7,4,0,0,0
4736,4737,17/01/2021,Liverpool,Man United,0,0,D,0,0,D,P Tierney,15,6,7,3,2,1,0,0


In [469]:
general_training_data = data.drop(['GameID', 'Date'], axis=1)

In [470]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,Arsenal,West Brom,1,0,H,1,0,H,H Webb,11,8,7,5,0,0,0,0
1,Bolton,Stoke,3,1,H,3,0,H,C Foy,13,12,4,3,1,2,0,0
2,Everton,Blackburn,2,3,A,1,1,D,A Marriner,11,9,3,5,2,2,0,0
3,Hull,Fulham,2,1,H,1,1,D,P Walton,10,9,5,6,3,0,0,0
4,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,11,12,7,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4733,Fulham,Chelsea,0,1,A,0,0,D,P Bankes,11,10,1,12,2,3,1,0
4734,Leicester,Southampton,2,0,H,1,0,H,S Attwell,10,11,2,5,2,2,0,0
4735,Sheffield United,Tottenham,1,3,A,0,2,A,A Marriner,13,4,5,7,4,0,0,0
4736,Liverpool,Man United,0,0,D,0,0,D,P Tierney,15,6,7,3,2,1,0,0


### Data preperation

Mapping team names and referee to respective unique IDs

__Note__: Standard team names and referee names along with their respective unique IDs are located in [this](data/standard) directory

In [471]:
teams_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.teamnames.csv"))
referee_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.referee.names.csv"))

In [472]:
# Generating teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamID_mapping = dict(zip(teamname, teamID))

generate_teamID_mappings = lambda teamnames: [teamID_mapping[teamname] for teamname in teamnames]

In [473]:
# Generating referees mappings 
referee, refereeID = list(referee_data['Standard referee name']), list(referee_data['RefereeID'])
refereeID_mapping = dict(zip(referee, refereeID))

generate_refereeID_mappings = lambda referees: [refereeID_mapping[referee] for referee in referees]

Encoding result:

- A __Home__ win is encoded as 1
- An __Away__ win is encoded as 0
- A __Draw__ is encoded as 0.5

In [474]:
def encode_results(results):
    encode = {
        'H': 1,
        'A': 0,
        'D': 0.5
    }
    return [encode[result] for result in results]

Applying transformations to the (general) training dataset.

In [475]:
# Teams
general_training_data['HomeTeam'] = generate_teamID_mappings(general_training_data['HomeTeam'])
general_training_data['AwayTeam'] = generate_teamID_mappings(general_training_data['AwayTeam'])

# Referees
general_training_data['Referee'] = generate_refereeID_mappings(general_training_data['Referee'])

# Encoding Results
general_training_data['FTR'] = encode_results(general_training_data['FTR'])
general_training_data['HTR'] = encode_results(general_training_data['HTR'])

In [476]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,1,37,1,0,1.0,1,0,1.0,11,11,8,7,5,0,0,0,0
1,7,32,3,1,1.0,3,0,1.0,5,13,12,4,3,1,2,0,0
2,14,5,2,3,0.0,1,1,0.5,2,11,9,3,5,2,2,0,0
3,18,15,2,1,1.0,1,1,0.5,31,10,9,5,6,3,0,0,0
4,24,35,2,1,1.0,0,0,0.5,18,11,12,7,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4733,15,12,0,1,0.0,0,0,0.5,28,11,10,1,12,2,3,1,0
4734,20,31,2,0,1.0,1,0,1.0,37,10,11,2,5,2,2,0,0
4735,30,35,1,3,0.0,0,2,0.0,2,13,4,5,7,4,0,0,0
4736,21,23,0,0,0.5,0,0,0.5,30,15,6,7,3,2,1,0,0


### Building the regression model for the home team

Removing Half Time Away Team Goals, Full Time Home and Away Team Goals, and Away Team Corners from the training data

In [477]:
X_home = home_training_data = general_training_data.drop(['HTAG', 'FTAG', 'FTHG', 'AC'], axis=1)

In [478]:
Y_home = general_training_data.FTHG

In evaluating the model performance, the standard practice is to split the dataset into 2 (or more partitions) partitions and here we will be using the 80/20 split ratio whereby the 80% subset will be used as the train set and the 20% subset the test set. As scikit-learn requires that the data be further separated to their X and Y components, the train_test_split() function can readily perform the above-mentioned task.

In [479]:
from sklearn.model_selection import train_test_split 

In [480]:
X_home_train, X_home_test, Y_home_train, Y_home_test = train_test_split(X_home, Y_home, test_size=0.2)
X_home_train

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR
906,38,39,1.0,1,1.0,21,8,8,7,0,0,0,0
4623,25,14,1.0,0,0.5,37,9,10,5,2,4,0,0
1591,23,32,1.0,2,1.0,3,9,10,3,1,1,0,0
2617,18,21,1.0,1,1.0,17,10,10,7,2,0,0,0
1065,39,35,0.5,0,0.5,2,11,13,8,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,5,24,0.5,0,0.5,20,17,11,9,3,1,0,0
2034,23,14,0.0,0,0.5,18,12,12,7,2,0,0,0
3375,31,18,0.5,0,0.5,20,9,13,6,0,3,0,0
1270,5,34,1.0,2,1.0,5,12,13,3,1,1,0,1


Using Random Forest Regressor model with a forest size of 100 decision trees

In [481]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [482]:
model_home = RandomForestRegressor(n_estimators = 100)
model_home.fit(X_home_train, Y_home_train)

RandomForestRegressor()

We will now apply the trained model to make predictions on the training set (X_train).

In [483]:
Y_home_pred_train = model_home.predict(X_home_train)

In [484]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.09
Mean absolute error (MAE): 0.23
Coefficient of determination (R^2): 0.95


We will now apply the trained model to make predictions on the test set (X_test).

In [485]:
Y_home_pred_test = model_home.predict(X_home_test)

In [486]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_test, Y_home_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_test, Y_home_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_test, Y_home_pred_test))

Mean squared error (MSE): 0.66
Mean absolute error (MAE): 0.61
Coefficient of determination (R^2): 0.61


### Now repeating the process for the away team

In [487]:
X_away = home_training_data = general_training_data.drop(['HTHG', 'FTHG', 'FTAG', 'HC'], axis=1)

In [488]:
Y_away = general_training_data.FTAG

In [489]:
from sklearn.model_selection import train_test_split 

In [490]:
X_away_train, X_away_test, Y_away_train, Y_away_test = train_test_split(X_away, Y_away, test_size=0.2)

In [491]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [492]:
model_away = RandomForestRegressor(n_estimators = 100)
model_away.fit(X_away_train, Y_away_train)

RandomForestRegressor()

In [493]:
Y_away_pred_train = model_away.predict(X_away_train)

In [494]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.09
Mean absolute error (MAE): 0.23
Coefficient of determination (R^2): 0.95


In [495]:
Y_away_pred_test = model_away.predict(X_away_test)

In [496]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_away_test, Y_away_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_away_test, Y_away_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_away_test, Y_away_pred_test))

Mean squared error (MSE): 0.49
Mean absolute error (MAE): 0.55
Coefficient of determination (R^2): 0.64
