## Non-shot-based expected goals regression model

Predicting an expected number of goals for each match given the different in-game statistics excluding shots. 


In [30]:
import sys
import os
import numpy as np
import pandas as pd

In [31]:
FILEPATH = os.path.join(os.getcwd(), "data/non-shot-xG/non_shot_data.csv")
complete_data = pd.read_csv(FILEPATH)

In [32]:
complete_data

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,3041,13/08/2016,Burnley,Swansea,0,1,A,0,0,D,J Moss,10,14,7,4,3,2,0,0
1,3042,13/08/2016,Crystal Palace,West Brom,0,1,A,0,0,D,C Pawson,12,15,3,6,2,2,0,0
2,3043,13/08/2016,Everton,Tottenham,1,1,D,1,0,H,M Atkinson,10,14,5,6,0,0,0,0
3,3044,13/08/2016,Hull,Leicester,2,1,H,1,0,H,M Dean,8,17,5,3,2,2,0,0
4,3045,13/08/2016,Man City,Sunderland,2,1,H,1,0,H,R Madley,11,14,9,6,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,4720,02/01/2021,Brighton,Wolves,3,3,D,1,3,A,A Madley,13,8,5,8,2,1,0,0
1680,4721,02/01/2021,West Brom,Arsenal,0,4,A,0,2,A,M Atkinson,7,4,3,5,1,2,0,0
1681,4722,03/01/2021,Newcastle,Leicester,1,2,A,0,0,D,R Jones,10,11,3,6,0,2,0,0
1682,4723,03/01/2021,Chelsea,Man City,1,3,A,0,3,A,A Taylor,11,10,5,3,3,1,0,0


In [33]:
general_training_data = complete_data.drop(['GameID','Date'], axis=1)

In [34]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,Burnley,Swansea,0,1,A,0,0,D,J Moss,10,14,7,4,3,2,0,0
1,Crystal Palace,West Brom,0,1,A,0,0,D,C Pawson,12,15,3,6,2,2,0,0
2,Everton,Tottenham,1,1,D,1,0,H,M Atkinson,10,14,5,6,0,0,0,0
3,Hull,Leicester,2,1,H,1,0,H,M Dean,8,17,5,3,2,2,0,0
4,Man City,Sunderland,2,1,H,1,0,H,R Madley,11,14,9,6,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,Brighton,Wolves,3,3,D,1,3,A,A Madley,13,8,5,8,2,1,0,0
1680,West Brom,Arsenal,0,4,A,0,2,A,M Atkinson,7,4,3,5,1,2,0,0
1681,Newcastle,Leicester,1,2,A,0,0,D,R Jones,10,11,3,6,0,2,0,0
1682,Chelsea,Man City,1,3,A,0,3,A,A Taylor,11,10,5,3,3,1,0,0


### Data preperation

Mapping team names and referee to respective unique IDs

__Note__: Standard team names and referee names along with their respective unique IDs are located in [this](data/standard) directory

In [35]:
teams_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.teamnames.csv"))
referee_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.referee.names.csv"))

In [36]:
# Generating teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamID_mapping = dict(zip(teamname, teamID))

generate_teamID_mappings = lambda teamnames: [teamID_mapping[teamname] for teamname in teamnames]

In [37]:
# Generating referees mappings 
referee, refereeID = list(referee_data['Standard referee name']), list(referee_data['RefereeID'])
refereeID_mapping = dict(zip(referee, refereeID))

generate_refereeID_mappings = lambda referees: [refereeID_mapping[referee] for referee in referees]

Encoding result:

- A __Home__ win is encoded as 1
- An __Away__ win is encoded as 0
- A __Draw__ is encoded as 0.5

In [38]:
def encode_results(results):
    encode = {
        'H': 1,
        'A': 0,
        'D': 0.5
    }
    return [encode[result] for result in results]

Applying transformations to the (general) training dataset.

In [39]:
# Teams
general_training_data['HomeTeam'] = generate_teamID_mappings(general_training_data['HomeTeam'])
general_training_data['AwayTeam'] = generate_teamID_mappings(general_training_data['AwayTeam'])

# Referees
general_training_data['Referee'] = generate_refereeID_mappings(general_training_data['Referee'])

# Encoding Results
general_training_data['FTR'] = encode_results(general_training_data['FTR'])
general_training_data['HTR'] = encode_results(general_training_data['HTR'])

### Integrating expected goals (xG) data from shots-based model 

Extracting all the output from the shots-based model

In [40]:
shots_xg_predictions = pd.read_csv(os.path.join(os.getcwd(), 'output/shots_xG_predictions.csv'))
shots_xg_predictions

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HF,AF,HC,AC,HY,AY,HR,AR,xG_h,xG_a
0,1,2008-08-16,Arsenal,West Brom,1,0,H,1,0,H,...,11,8,7,5,0,0,0,0,,
1,2,2008-08-16,Bolton,Stoke,3,1,H,3,0,H,...,13,12,4,3,1,2,0,0,,
2,3,2008-08-16,Everton,Blackburn,2,3,A,1,1,D,...,11,9,3,5,2,2,0,0,,
3,4,2008-08-16,Hull,Fulham,2,1,H,1,1,D,...,10,9,5,6,3,0,0,0,,
4,5,2008-08-16,Middlesbrough,Tottenham,2,1,H,0,0,D,...,11,12,7,9,1,2,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,4720,2021-01-02,Brighton,Wolves,3,3,D,1,3,A,...,13,8,5,8,2,1,0,0,1.0,1.0
4720,4721,2021-01-02,West Brom,Arsenal,0,4,A,0,2,A,...,7,4,3,5,1,2,0,0,,
4721,4722,2021-01-03,Newcastle,Leicester,1,2,A,0,0,D,...,10,11,3,6,0,2,0,0,,
4722,4723,2021-01-03,Chelsea,Man City,1,3,A,0,3,A,...,11,10,5,3,3,1,0,0,,


``in the process of integrating``

In [41]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,10,34,0,1,0.0,0,0,0.5,12,10,14,7,4,3,2,0,0
1,13,37,0,1,0.0,0,0,0.5,7,12,15,3,6,2,2,0,0
2,14,35,1,1,0.5,1,0,1.0,18,10,14,5,6,0,0,0,0
3,18,20,2,1,1.0,1,0,1.0,20,8,17,5,3,2,2,0,0
4,22,33,2,1,1.0,1,0,1.0,34,11,14,9,6,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,3,3,0.5,1,3,0.0,1,13,8,5,8,2,1,0,0
1680,37,1,0,4,0.0,0,2,0.0,18,7,4,3,5,1,2,0,0
1681,25,20,1,2,0.0,0,0,0.5,33,10,11,3,6,0,2,0,0
1682,12,22,1,3,0.0,0,3,0.0,3,11,10,5,3,3,1,0,0


### Building the regression model for the home team

Removing Full Time Home and Away Team Goals, and Away Team Corners from the training data

In [42]:
X_home = home_training_data = general_training_data.drop(['HTAG', 'FTAG', 'FTHG', 'AC'], axis=1)

In [43]:
Y_home = general_training_data.FTHG

In evaluating the model performance, the standard practice is to split the dataset into 2 (or more partitions) partitions and here we will be using the 80/20 split ratio whereby the 80% subset will be used as the train set and the 20% subset the test set. As scikit-learn requires that the data be further separated to their X and Y components, the train_test_split() function can readily perform the above-mentioned task.

In [44]:
from sklearn.model_selection import train_test_split 

In [45]:
X_home_train, X_home_test, Y_home_train, Y_home_test = train_test_split(X_home, Y_home, test_size=0.2)
X_home_train

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR
1330,25,14,0.0,0,0.0,16,13,13,5,2,2,0,0
1464,12,36,1.0,2,1.0,14,9,12,5,0,2,0,0
1468,31,22,1.0,1,1.0,2,6,7,2,1,2,0,0
1510,1,36,1.0,3,1.0,20,9,12,4,3,3,0,0
1325,30,36,0.5,1,0.5,8,11,8,9,1,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,12,10,0.0,0,0.0,7,16,11,8,3,3,2,0
184,21,22,1.0,1,1.0,7,12,12,4,2,1,0,0
1057,12,40,0.5,0,0.5,23,8,14,13,1,4,0,0
917,38,13,1.0,0,0.0,3,10,8,5,1,2,0,0


Using Random Forest Regressor model with a forest size of 100 decision trees

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [47]:
model_home = RandomForestRegressor(n_estimators = 100)
model_home.fit(X_home_train, Y_home_train)

RandomForestRegressor()

We will now apply the trained model to make predictions on the training set (X_train).

In [48]:
Y_home_pred_train = model_home.predict(X_home_train)

In [49]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.08
Mean absolute error (MAE): 0.22
Coefficient of determination (R^2): 0.95


We will now apply the trained model to make predictions on the test set (X_test).

In [50]:
Y_home_pred_test = model_home.predict(X_home_test)

In [51]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_test, Y_home_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_test, Y_home_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_test, Y_home_pred_test))

Mean squared error (MSE): 0.69
Mean absolute error (MAE): 0.65
Coefficient of determination (R^2): 0.62


### Now repeating the process for the away team

In [52]:
X_away = home_training_data = general_training_data.drop(['HTHG', 'FTHG', 'FTAG', 'HC'], axis=1)

In [53]:
Y_away = general_training_data.FTAG

In [54]:
from sklearn.model_selection import train_test_split 

In [55]:
X_away_train, X_away_test, Y_away_train, Y_away_test = train_test_split(X_away, Y_away, test_size=0.2)

In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [57]:
model_away = RandomForestRegressor(n_estimators = 100)
model_away.fit(X_away_train, Y_away_train)

RandomForestRegressor()

In [58]:
Y_away_pred_train = model_away.predict(X_away_train)

In [59]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.08
Mean absolute error (MAE): 0.22
Coefficient of determination (R^2): 0.95


In [60]:
Y_away_pred_test = model_away.predict(X_away_test)

In [61]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_away_test, Y_away_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_away_test, Y_away_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_away_test, Y_away_pred_test))

Mean squared error (MSE): 0.52
Mean absolute error (MAE): 0.56
Coefficient of determination (R^2): 0.68


### FTHG Results

Extract model input data from the training dataset.

In [121]:
home_training_data = general_training_data.copy().drop(['HTAG', 'FTAG', 'AC'], axis=1)
home_model_input_data = home_training_data.copy().drop(columns=['FTHG'])

In [122]:
home_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR
0,10,34,0,0.0,0,0.5,12,10,14,7,3,2,0,0
1,13,37,0,0.0,0,0.5,7,12,15,3,2,2,0,0
2,14,35,1,0.5,1,1.0,18,10,14,5,0,0,0,0
3,18,20,2,1.0,1,1.0,20,8,17,5,2,2,0,0
4,22,33,2,1.0,1,1.0,34,11,14,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,3,0.5,1,0.0,1,13,8,5,2,1,0,0
1680,37,1,0,0.0,0,0.0,18,7,4,3,1,2,0,0
1681,25,20,1,0.0,0,0.5,33,10,11,3,0,2,0,0
1682,12,22,1,0.0,0,0.0,3,11,10,5,3,1,0,0


Extract predictions

In [123]:
home_pred_data = pd.get_dummies(home_model_input_data)
home_r = model_home.predict(home_pred_data)
home_r = pd.DataFrame(home_r)

In [124]:
home_r.columns= ['Predicted FTHG']
home_training_data.reset_index(drop=True, inplace=True)
home_results = pd.concat([home_training_data, home_r], axis=1)
home_results["Deviation in FTHG"] = abs(home_results["Predicted FTHG"] - home_results["FTHG"])

In [125]:
'''
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
home_results.to_csv(path, index=False)
'''

'\npath = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")\nhome_results.to_csv(path, index=False)\n'

### FTAG Testing

In [126]:
away_training_data = general_training_data.copy().drop(['HTHG', 'FTHG', 'HC'], axis=1)
away_model_input_data = away_training_data.copy().drop(columns=['FTAG'])

In [127]:
away_pred_data = pd.get_dummies(away_model_input_data)
away_r = model_away.predict(away_pred_data)
away_r = pd.DataFrame(away_r)

In [128]:
away_r.columns= ['Predicted FTAG']
away_training_data.reset_index(drop=True, inplace=True)
away_results = pd.concat([away_training_data, away_r], axis=1)
away_results["Deviation in FTAG"] = abs(away_results["Predicted FTAG"] - away_results["FTAG"])

In [129]:
'''
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
home_results.to_csv(path, index=False)
'''

'\npath = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")\nhome_results.to_csv(path, index=False)\n'

### Merging results of both models

In [130]:
complete_non_shot_predictions = general_training_data.copy()

# Add predicted FTHG
complete_non_shot_predictions['Predicted FTHG'] = home_results['Predicted FTHG']

# Add predicted FTAG
complete_non_shot_predictions['Predicted FTAG'] = away_results['Predicted FTAG']

complete_non_shot_predictions

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR,Predicted FTHG,Predicted FTAG
0,10,34,0,1,0.0,0,0,0.5,12,10,14,7,4,3,2,0,0,0.06,1.19
1,13,37,0,1,0.0,0,0,0.5,7,12,15,3,6,2,2,0,0,0.02,1.13
2,14,35,1,1,0.5,1,0,1.0,18,10,14,5,6,0,0,0,0,1.09,1.16
3,18,20,2,1,1.0,1,0,1.0,20,8,17,5,3,2,2,0,0,1.75,0.68
4,22,33,2,1,1.0,1,0,1.0,34,11,14,9,6,1,2,0,0,2.11,0.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,3,3,0.5,1,3,0.0,1,13,8,5,8,2,1,0,0,2.47,2.65
1680,37,1,0,4,0.0,0,2,0.0,18,7,4,3,5,1,2,0,0,0.16,3.72
1681,25,20,1,2,0.0,0,0,0.5,33,10,11,3,6,0,2,0,0,0.60,1.32
1682,12,22,1,3,0.0,0,3,0.0,3,11,10,5,3,3,1,0,0,0.20,3.31


In [132]:
path = os.path.join(os.getcwd(), "output/non_shot_predictions.csv")
complete_non_shot_predictions.to_csv(path, index=False)