## Non-shot-based expected goals regression model

Predicting an expected number of goals for each match given the different in-game statistics excluding shots. 


In [71]:
import sys
import os
import numpy as np
import pandas as pd

In [72]:
FILEPATH = os.path.join(os.getcwd(), "data/non-shot-xG/non_shot_data.csv")
complete_data = pd.read_csv(FILEPATH)

In [73]:
complete_data

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG
0,1,13/08/2016,Burnley,Swansea,J Moss,7,4,10,14,3,2,0,0,0,1
1,2,13/08/2016,Crystal Palace,West Brom,C Pawson,3,6,12,15,2,2,0,0,0,1
2,3,13/08/2016,Everton,Tottenham,M Atkinson,5,6,10,14,0,0,0,0,1,1
3,4,13/08/2016,Hull,Leicester,M Dean,5,3,8,17,2,2,0,0,2,1
4,5,13/08/2016,Man City,Sunderland,R Madley,9,6,11,14,1,2,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,1680,02/01/2021,Brighton,Wolves,A Madley,5,8,13,8,2,1,0,0,3,3
1680,1681,02/01/2021,West Brom,Arsenal,M Atkinson,3,5,7,4,1,2,0,0,0,4
1681,1682,03/01/2021,Newcastle,Leicester,R Jones,3,6,10,11,0,2,0,0,1,2
1682,1683,03/01/2021,Chelsea,Man City,A Taylor,5,3,11,10,3,1,0,0,1,3


In [74]:
general_training_data = complete_data.drop(['GameID','Date'], axis=1)

In [75]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG
0,Burnley,Swansea,J Moss,7,4,10,14,3,2,0,0,0,1
1,Crystal Palace,West Brom,C Pawson,3,6,12,15,2,2,0,0,0,1
2,Everton,Tottenham,M Atkinson,5,6,10,14,0,0,0,0,1,1
3,Hull,Leicester,M Dean,5,3,8,17,2,2,0,0,2,1
4,Man City,Sunderland,R Madley,9,6,11,14,1,2,0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,Brighton,Wolves,A Madley,5,8,13,8,2,1,0,0,3,3
1680,West Brom,Arsenal,M Atkinson,3,5,7,4,1,2,0,0,0,4
1681,Newcastle,Leicester,R Jones,3,6,10,11,0,2,0,0,1,2
1682,Chelsea,Man City,A Taylor,5,3,11,10,3,1,0,0,1,3


### Data preperation

Mapping team names and referee to respective unique IDs

__Note__: Standard team names and referee names along with their respective unique IDs are located in [this](data/standard) directory

In [76]:
teams_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.teamnames.csv"))
referee_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.referee.names.csv"))

In [77]:
# Generating teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamID_mapping = dict(zip(teamname, teamID))

generate_teamID_mappings = lambda teamnames: [teamID_mapping[teamname] for teamname in teamnames]

In [78]:
# Generating referees mappings 
referee, refereeID = list(referee_data['Standard referee name']), list(referee_data['RefereeID'])
refereeID_mapping = dict(zip(referee, refereeID))

generate_refereeID_mappings = lambda referees: [refereeID_mapping[referee] for referee in referees]

Encoding result:

- A __Home__ win is encoded as 1
- An __Away__ win is encoded as 0
- A __Draw__ is encoded as 0.5

In [79]:
def encode_results(results):
    encode = {
        'H': 1,
        'A': 0,
        'D': 0.5
    }
    return [encode[result] for result in results]

Applying transformations to the (general) training dataset.

In [80]:
# Teams
general_training_data['HomeTeam'] = generate_teamID_mappings(general_training_data['HomeTeam'])
general_training_data['AwayTeam'] = generate_teamID_mappings(general_training_data['AwayTeam'])

# Referees
general_training_data['Referee'] = generate_refereeID_mappings(general_training_data['Referee'])

### Integrating expected goals (xG) data from shots-based model 

Extracting all the output from the shots-based model

In [81]:
shots_xg_predictions = pd.read_csv(os.path.join(os.getcwd(), 'output/shots_xG_predictions.csv'))

In [82]:
# Add expected goals for the home team
general_training_data['Shots_XGH'] = shots_xg_predictions['xG_h']

# Add expected goals for the away team
general_training_data['Shots_XGA'] = shots_xg_predictions['xG_a']

In [83]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,Shots_XGH,Shots_XGA
0,10,34,12,7,4,10,14,3,2,0,0,0,1,0.000000,1.008553
1,13,37,7,3,6,12,15,2,2,0,0,0,1,0.000000,1.008553
2,14,35,18,5,6,10,14,0,0,0,0,1,1,0.950882,1.008553
3,18,20,20,5,3,8,17,2,2,0,0,2,1,1.950882,1.000000
4,22,33,34,9,6,11,14,1,2,0,0,2,1,1.000000,1.008553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,8,13,8,2,1,0,0,3,3,1.000000,1.000000
1680,37,1,18,3,5,7,4,1,2,0,0,0,4,0.000000,4.008553
1681,25,20,33,3,6,10,11,0,2,0,0,1,2,0.950882,2.008553
1682,12,22,3,5,3,11,10,3,1,0,0,1,3,0.950882,3.008553


### Building the regression model for the home team

Removing Full Time Home and Away Team Goals, and Away Team Corners from the training data

In [84]:
X_home = home_training_data = general_training_data.drop(['FTAG', 'FTHG', 'AC'], axis=1)

In [85]:
Y_home = general_training_data.FTHG

In evaluating the model performance, the standard practice is to split the dataset into 2 (or more partitions) partitions and here we will be using the 80/20 split ratio whereby the 80% subset will be used as the train set and the 20% subset the test set. As scikit-learn requires that the data be further separated to their X and Y components, the train_test_split() function can readily perform the above-mentioned task.

In [86]:
from sklearn.model_selection import train_test_split 

In [87]:
X_home_train, X_home_test, Y_home_train, Y_home_test = train_test_split(X_home, Y_home, test_size=0.2)
X_home_train

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,HF,AF,HY,AY,HR,AR,Shots_XGH,Shots_XGA
908,35,31,3,8,7,5,0,0,0,0,2.682392,3.988528
1252,10,38,14,11,17,10,2,1,0,0,2.950882,0.008553
510,9,13,2,10,13,8,2,0,0,0,0.363475,0.328147
1017,36,14,17,8,21,10,2,0,0,1,1.253627,1.006811
479,10,25,20,5,9,10,3,0,0,0,0.985392,1.077645
...,...,...,...,...,...,...,...,...,...,...,...,...
480,17,37,32,3,8,15,1,4,1,0,0.245334,0.810992
1334,38,20,8,5,16,4,4,1,0,0,0.950882,2.008553
17,37,14,26,7,16,10,3,1,0,0,0.950882,2.008553
1114,31,8,10,9,8,9,2,1,0,0,1.917476,0.918836


Using Random Forest Regressor model with a forest size of 100 decision trees

In [88]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [89]:
model_home = RandomForestRegressor(n_estimators = 100)
model_home.fit(X_home_train, Y_home_train)

RandomForestRegressor()

We will now apply the trained model to make predictions on the training set (X_train).

In [90]:
Y_home_pred_train = model_home.predict(X_home_train)

In [91]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.08
Mean absolute error (MAE): 0.17
Coefficient of determination (R^2): 0.95


We will now apply the trained model to make predictions on the test set (X_test).

In [92]:
Y_home_pred_test = model_home.predict(X_home_test)

In [93]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_test, Y_home_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_test, Y_home_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_test, Y_home_pred_test))

Mean squared error (MSE): 0.75
Mean absolute error (MAE): 0.53
Coefficient of determination (R^2): 0.60


### Now repeating the process for the away team

In [94]:
X_away = home_training_data = general_training_data.drop(['FTHG', 'FTAG', 'HC'], axis=1)

In [95]:
Y_away = general_training_data.FTAG

In [96]:
from sklearn.model_selection import train_test_split 

In [97]:
X_away_train, X_away_test, Y_away_train, Y_away_test = train_test_split(X_away, Y_away, test_size=0.2)

In [98]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [99]:
model_away = RandomForestRegressor(n_estimators = 100)
model_away.fit(X_away_train, Y_away_train)

RandomForestRegressor()

In [100]:
Y_away_pred_train = model_away.predict(X_away_train)

In [101]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.08
Mean absolute error (MAE): 0.17
Coefficient of determination (R^2): 0.95


In [102]:
Y_away_pred_test = model_away.predict(X_away_test)

In [103]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_away_test, Y_away_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_away_test, Y_away_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_away_test, Y_away_pred_test))

Mean squared error (MSE): 0.49
Mean absolute error (MAE): 0.42
Coefficient of determination (R^2): 0.66


### FTHG Results

Extract model input data from the training dataset.

In [104]:
home_training_data = general_training_data.copy().drop(['FTAG', 'AC'], axis=1)
home_model_input_data = home_training_data.copy().drop(columns=['FTHG'])

In [105]:
home_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,HF,AF,HY,AY,HR,AR,FTHG,Shots_XGH,Shots_XGA
0,10,34,12,7,10,14,3,2,0,0,0,0.000000,1.008553
1,13,37,7,3,12,15,2,2,0,0,0,0.000000,1.008553
2,14,35,18,5,10,14,0,0,0,0,1,0.950882,1.008553
3,18,20,20,5,8,17,2,2,0,0,2,1.950882,1.000000
4,22,33,34,9,11,14,1,2,0,0,2,1.000000,1.008553
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,13,8,2,1,0,0,3,1.000000,1.000000
1680,37,1,18,3,7,4,1,2,0,0,0,0.000000,4.008553
1681,25,20,33,3,10,11,0,2,0,0,1,0.950882,2.008553
1682,12,22,3,5,11,10,3,1,0,0,1,0.950882,3.008553


Extract predictions

In [106]:
home_pred_data = pd.get_dummies(home_model_input_data)
home_r = model_home.predict(home_pred_data)
home_r = pd.DataFrame(home_r)

In [107]:
home_r.columns= ['Predicted FTHG']
home_training_data.reset_index(drop=True, inplace=True)
home_results = pd.concat([home_training_data, home_r], axis=1)
home_results["Deviation in FTHG"] = abs(home_results["Predicted FTHG"] - home_results["FTHG"])

In [108]:
'''
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
home_results.to_csv(path, index=False)
'''

'\npath = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")\nhome_results.to_csv(path, index=False)\n'

### FTAG Testing

In [109]:
away_training_data = general_training_data.copy().drop(['FTHG', 'HC'], axis=1)
away_model_input_data = away_training_data.copy().drop(columns=['FTAG'])

In [110]:
away_pred_data = pd.get_dummies(away_model_input_data)
away_r = model_away.predict(away_pred_data)
away_r = pd.DataFrame(away_r)

In [111]:
away_r.columns= ['Predicted FTAG']
away_training_data.reset_index(drop=True, inplace=True)
away_results = pd.concat([away_training_data, away_r], axis=1)
away_results["Deviation in FTAG"] = abs(away_results["Predicted FTAG"] - away_results["FTAG"])

In [112]:
'''
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
home_results.to_csv(path, index=False)
'''

'\npath = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")\nhome_results.to_csv(path, index=False)\n'

### Merging results of both models

In [113]:
complete_non_shot_predictions = general_training_data.copy()

# Add predicted FTHG
complete_non_shot_predictions['Predicted FTHG'] = home_results['Predicted FTHG']

# Add predicted FTAG
complete_non_shot_predictions['Predicted FTAG'] = away_results['Predicted FTAG']

complete_non_shot_predictions

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,Shots_XGH,Shots_XGA,Predicted FTHG,Predicted FTAG
0,10,34,12,7,4,10,14,3,2,0,0,0,1,0.000000,1.008553,0.00,0.99
1,13,37,7,3,6,12,15,2,2,0,0,0,1,0.000000,1.008553,0.00,1.00
2,14,35,18,5,6,10,14,0,0,0,0,1,1,0.950882,1.008553,1.00,1.00
3,18,20,20,5,3,8,17,2,2,0,0,2,1,1.950882,1.000000,2.00,1.36
4,22,33,34,9,6,11,14,1,2,0,0,2,1,1.000000,1.008553,1.92,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,8,13,8,2,1,0,0,3,3,1.000000,1.000000,2.60,2.92
1680,37,1,18,3,5,7,4,1,2,0,0,0,4,0.000000,4.008553,0.00,4.02
1681,25,20,33,3,6,10,11,0,2,0,0,1,2,0.950882,2.008553,1.00,1.99
1682,12,22,3,5,3,11,10,3,1,0,0,1,3,0.950882,3.008553,1.00,3.00


In [114]:
path = os.path.join(os.getcwd(), "output/non_shot_predictions.csv")
complete_non_shot_predictions.to_csv(path, index=False)