## Non-shot-based expected goals regression model

Predicting an expected number of goals for each match given the different in-game statistics excluding shots. 


In [37]:
import sys
import os
import numpy as np
import pandas as pd

In [38]:
FILEPATH = os.path.join(os.getcwd(), "data/non-shot-xG/non_shot_data.csv")
data = pd.read_csv(FILEPATH)

In [39]:
complete_data

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,1,16/08/2008,Arsenal,West Brom,1,0,H,1,0,H,H Webb,11,8,7,5,0,0,0,0
1,2,16/08/2008,Bolton,Stoke,3,1,H,3,0,H,C Foy,13,12,4,3,1,2,0,0
2,3,16/08/2008,Everton,Blackburn,2,3,A,1,1,D,A Marriner,11,9,3,5,2,2,0,0
3,4,16/08/2008,Hull,Fulham,2,1,H,1,1,D,P Walton,10,9,5,6,3,0,0,0
4,5,16/08/2008,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,11,12,7,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,4720,02/01/2021,Brighton,Wolves,3,3,D,1,3,A,A Madley,13,8,5,8,2,1,0,0
4720,4721,02/01/2021,West Brom,Arsenal,0,4,A,0,2,A,M Atkinson,7,4,3,5,1,2,0,0
4721,4722,03/01/2021,Newcastle,Leicester,1,2,A,0,0,D,R Jones,10,11,3,6,0,2,0,0
4722,4723,03/01/2021,Chelsea,Man City,1,3,A,0,3,A,A Taylor,11,10,5,3,3,1,0,0


In [40]:
general_training_data = complete_data.drop(['GameID','Date'], axis=1)

In [41]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,Arsenal,West Brom,1,0,H,1,0,H,H Webb,11,8,7,5,0,0,0,0
1,Bolton,Stoke,3,1,H,3,0,H,C Foy,13,12,4,3,1,2,0,0
2,Everton,Blackburn,2,3,A,1,1,D,A Marriner,11,9,3,5,2,2,0,0
3,Hull,Fulham,2,1,H,1,1,D,P Walton,10,9,5,6,3,0,0,0
4,Middlesbrough,Tottenham,2,1,H,0,0,D,M Atkinson,11,12,7,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,Brighton,Wolves,3,3,D,1,3,A,A Madley,13,8,5,8,2,1,0,0
4720,West Brom,Arsenal,0,4,A,0,2,A,M Atkinson,7,4,3,5,1,2,0,0
4721,Newcastle,Leicester,1,2,A,0,0,D,R Jones,10,11,3,6,0,2,0,0
4722,Chelsea,Man City,1,3,A,0,3,A,A Taylor,11,10,5,3,3,1,0,0


### Data preperation

Mapping team names and referee to respective unique IDs

__Note__: Standard team names and referee names along with their respective unique IDs are located in [this](data/standard) directory

In [42]:
teams_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.teamnames.csv"))
referee_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.referee.names.csv"))

In [43]:
# Generating teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamID_mapping = dict(zip(teamname, teamID))

generate_teamID_mappings = lambda teamnames: [teamID_mapping[teamname] for teamname in teamnames]

In [44]:
# Generating referees mappings 
referee, refereeID = list(referee_data['Standard referee name']), list(referee_data['RefereeID'])
refereeID_mapping = dict(zip(referee, refereeID))

generate_refereeID_mappings = lambda referees: [refereeID_mapping[referee] for referee in referees]

Encoding result:

- A __Home__ win is encoded as 1
- An __Away__ win is encoded as 0
- A __Draw__ is encoded as 0.5

In [45]:
def encode_results(results):
    encode = {
        'H': 1,
        'A': 0,
        'D': 0.5
    }
    return [encode[result] for result in results]

Applying transformations to the (general) training dataset.

In [46]:
# Teams
general_training_data['HomeTeam'] = generate_teamID_mappings(general_training_data['HomeTeam'])
general_training_data['AwayTeam'] = generate_teamID_mappings(general_training_data['AwayTeam'])

# Referees
general_training_data['Referee'] = generate_refereeID_mappings(general_training_data['Referee'])

# Encoding Results
general_training_data['FTR'] = encode_results(general_training_data['FTR'])
general_training_data['HTR'] = encode_results(general_training_data['HTR'])

In [47]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HF,AF,HC,AC,HY,AY,HR,AR
0,1,37,1,0,1.0,1,0,1.0,11,11,8,7,5,0,0,0,0
1,7,32,3,1,1.0,3,0,1.0,5,13,12,4,3,1,2,0,0
2,14,5,2,3,0.0,1,1,0.5,2,11,9,3,5,2,2,0,0
3,18,15,2,1,1.0,1,1,0.5,31,10,9,5,6,3,0,0,0
4,24,35,2,1,1.0,0,0,0.5,18,11,12,7,9,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,9,40,3,3,0.5,1,3,0.0,1,13,8,5,8,2,1,0,0
4720,37,1,0,4,0.0,0,2,0.0,18,7,4,3,5,1,2,0,0
4721,25,20,1,2,0.0,0,0,0.5,33,10,11,3,6,0,2,0,0
4722,12,22,1,3,0.0,0,3,0.0,3,11,10,5,3,3,1,0,0


### Building the regression model for the home team

Removing Full Time Home and Away Team Goals, and Away Team Corners from the training data

In [48]:
X_home = home_training_data = general_training_data.drop(['HTAG', 'FTAG', 'FTHG', 'AC'], axis=1)

In [49]:
Y_home = general_training_data.FTHG

In evaluating the model performance, the standard practice is to split the dataset into 2 (or more partitions) partitions and here we will be using the 80/20 split ratio whereby the 80% subset will be used as the train set and the 20% subset the test set. As scikit-learn requires that the data be further separated to their X and Y components, the train_test_split() function can readily perform the above-mentioned task.

In [50]:
from sklearn.model_selection import train_test_split 

In [51]:
X_home_train, X_home_test, Y_home_train, Y_home_test = train_test_split(X_home, Y_home, test_size=0.2)
X_home_train

Unnamed: 0,HomeTeam,AwayTeam,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR
3822,17,11,0.5,0,0.5,23,8,10,7,0,1,1,0
3823,21,9,1.0,1,1.0,6,8,14,8,1,1,0,0
2105,31,37,1.0,0,0.5,11,11,17,4,1,2,0,0
4341,10,25,1.0,0,0.5,43,15,12,4,0,2,0,0
1412,14,35,1.0,1,1.0,21,13,11,0,2,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,22,2,1.0,1,1.0,12,9,4,9,0,1,0,0
1124,1,2,0.0,0,0.0,23,9,10,13,1,4,0,0
2835,25,14,0.0,0,0.5,16,16,12,1,2,2,0,0
500,18,38,0.5,3,1.0,19,9,17,5,0,4,1,0


Using Random Forest Regressor model with a forest size of 100 decision trees

In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [53]:
model_home = RandomForestRegressor(n_estimators = 100)
model_home.fit(X_home_train, Y_home_train)

RandomForestRegressor()

We will now apply the trained model to make predictions on the training set (X_train).

In [54]:
Y_home_pred_train = model_home.predict(X_home_train)

In [55]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.09
Mean absolute error (MAE): 0.23
Coefficient of determination (R^2): 0.95


We will now apply the trained model to make predictions on the test set (X_test).

In [56]:
Y_home_pred_test = model_home.predict(X_home_test)

In [57]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_test, Y_home_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_test, Y_home_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_test, Y_home_pred_test))

Mean squared error (MSE): 0.59
Mean absolute error (MAE): 0.60
Coefficient of determination (R^2): 0.64


### Now repeating the process for the away team

In [58]:
X_away = home_training_data = general_training_data.drop(['HTHG', 'FTHG', 'FTAG', 'HC'], axis=1)

In [59]:
Y_away = general_training_data.FTAG

In [60]:
from sklearn.model_selection import train_test_split 

In [61]:
X_away_train, X_away_test, Y_away_train, Y_away_test = train_test_split(X_away, Y_away, test_size=0.2)

In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [63]:
model_away = RandomForestRegressor(n_estimators = 100)
model_away.fit(X_away_train, Y_away_train)

RandomForestRegressor()

In [64]:
Y_away_pred_train = model_away.predict(X_away_train)

In [65]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.09
Mean absolute error (MAE): 0.23
Coefficient of determination (R^2): 0.95


In [66]:
Y_away_pred_test = model_away.predict(X_away_test)

In [67]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_away_test, Y_away_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_away_test, Y_away_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_away_test, Y_away_pred_test))

Mean squared error (MSE): 0.50
Mean absolute error (MAE): 0.55
Coefficient of determination (R^2): 0.67


### FTHG Results

Extract model input data from the training dataset.

In [80]:
training_data = general_training_data.copy().drop(['HTAG', 'FTAG', 'AC'], axis=1)
model_input_data = training_data.copy().drop(columns=['FTHG'])

In [81]:
training_data

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTR,HTHG,HTR,Referee,HF,AF,HC,HY,AY,HR,AR
0,1,37,1,1.0,1,1.0,11,11,8,7,0,0,0,0
1,7,32,3,1.0,3,1.0,5,13,12,4,1,2,0,0
2,14,5,2,0.0,1,0.5,2,11,9,3,2,2,0,0
3,18,15,2,1.0,1,0.5,31,10,9,5,3,0,0,0
4,24,35,2,1.0,0,0.5,18,11,12,7,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,9,40,3,0.5,1,0.0,1,13,8,5,2,1,0,0
4720,37,1,0,0.0,0,0.0,18,7,4,3,1,2,0,0
4721,25,20,1,0.0,0,0.5,33,10,11,3,0,2,0,0
4722,12,22,1,0.0,0,0.0,3,11,10,5,3,1,0,0


Extract predictions

In [78]:
pred_data = pd.get_dummies(model_input_data)
r = model_home.predict(pred_data)
r = pd.DataFrame(r)

ValueError: Number of features of the model must match the input. Model n_features is 13 and input n_features is 133 

In [79]:
r.columns= ['Predicted FTHG']
training_data.reset_index(drop=True, inplace=True)
result = pd.concat([training_data, r], axis=1)
result["Deviation"] = abs(result["Predicted FTHG"] - result["FTHG"])

In [35]:
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
result.to_csv(path, index=False)

### FTAG Testing

In [36]:
training_data = general_training_data.copy().drop(['HTHG', 'FTHG', 'HC'], axis=1)
model_input_data = training_data.copy().drop(columns=['FTAG'])

In [37]:
pred_data = pd.get_dummies(model_input_data)
r = model_away.predict(pred_data)
r = pd.DataFrame(r)

In [38]:
r.columns= ['Predicted FTAG']
training_data.reset_index(drop=True, inplace=True)
result = pd.concat([training_data, r], axis=1)
result["Deviation"] = abs(result["Predicted FTAG"] - result["FTAG"])

In [39]:
path = os.path.join(os.getcwd(), "output/non_shot_FTAG_predictions.csv")
result.to_csv(path, index=False)