## Non-shot-based expected goals regression model

Predicting an expected number of goals for each match given the different in-game statistics excluding shots. 


In [1]:
import sys
import os
import numpy as np
import pandas as pd

In [2]:
FILEPATH = os.path.join(os.getcwd(), "data/non-shot-xG/non_shot_data.csv")
complete_data = pd.read_csv(FILEPATH)

In [3]:
complete_data

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR
0,1,13/08/2016,Burnley,Swansea,J Moss,7,4,10,14,3,2,0,0,0,1,A
1,2,13/08/2016,Crystal Palace,West Brom,C Pawson,3,6,12,15,2,2,0,0,0,1,A
2,3,13/08/2016,Everton,Tottenham,M Atkinson,5,6,10,14,0,0,0,0,1,1,D
3,4,13/08/2016,Hull,Leicester,M Dean,5,3,8,17,2,2,0,0,2,1,H
4,5,13/08/2016,Man City,Sunderland,R Madley,9,6,11,14,1,2,0,0,2,1,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,1680,02/01/2021,Brighton,Wolves,A Madley,5,8,13,8,2,1,0,0,3,3,D
1680,1681,02/01/2021,West Brom,Arsenal,M Atkinson,3,5,7,4,1,2,0,0,0,4,A
1681,1682,03/01/2021,Newcastle,Leicester,R Jones,3,6,10,11,0,2,0,0,1,2,A
1682,1683,03/01/2021,Chelsea,Man City,A Taylor,5,3,11,10,3,1,0,0,1,3,A


In [4]:
general_training_data = complete_data.drop(['GameID','Date'], axis=1)

In [5]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR
0,Burnley,Swansea,J Moss,7,4,10,14,3,2,0,0,0,1,A
1,Crystal Palace,West Brom,C Pawson,3,6,12,15,2,2,0,0,0,1,A
2,Everton,Tottenham,M Atkinson,5,6,10,14,0,0,0,0,1,1,D
3,Hull,Leicester,M Dean,5,3,8,17,2,2,0,0,2,1,H
4,Man City,Sunderland,R Madley,9,6,11,14,1,2,0,0,2,1,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,Brighton,Wolves,A Madley,5,8,13,8,2,1,0,0,3,3,D
1680,West Brom,Arsenal,M Atkinson,3,5,7,4,1,2,0,0,0,4,A
1681,Newcastle,Leicester,R Jones,3,6,10,11,0,2,0,0,1,2,A
1682,Chelsea,Man City,A Taylor,5,3,11,10,3,1,0,0,1,3,A


### Data preperation

Mapping team names and referee to respective unique IDs

__Note__: Standard team names and referee names along with their respective unique IDs are located in [this](data/standard) directory

In [6]:
teams_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.teamnames.csv"))
referee_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.referee.names.csv"))

In [7]:
# Generating teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamID_mapping = dict(zip(teamname, teamID))

generate_teamID_mappings = lambda teamnames: [teamID_mapping[teamname] for teamname in teamnames]

In [8]:
# Generating referees mappings 
referee, refereeID = list(referee_data['Standard referee name']), list(referee_data['RefereeID'])
refereeID_mapping = dict(zip(referee, refereeID))

generate_refereeID_mappings = lambda referees: [refereeID_mapping[referee] for referee in referees]

Encoding result:

- A __Home__ win is encoded as 1
- An __Away__ win is encoded as 0
- A __Draw__ is encoded as 0.5

In [9]:
def encode_results(results):
    encode = {
        'H': 1,
        'A': 0,
        'D': 0.5
    }
    return [encode[result] for result in results]

Applying transformations to the (general) training dataset.

In [10]:
# Teams
general_training_data['HomeTeam'] = generate_teamID_mappings(general_training_data['HomeTeam'])
general_training_data['AwayTeam'] = generate_teamID_mappings(general_training_data['AwayTeam'])

# Referees
general_training_data['Referee'] = generate_refereeID_mappings(general_training_data['Referee'])

### Integrating expected goals (xG) data from shots-based model 

Extracting all the output from the shots-based model

In [11]:
shots_xg_predictions = pd.read_csv(os.path.join(os.getcwd(), 'output/shots_xG_predictions.csv'))

In [12]:
# Add expected goals for the home team
general_training_data['xHG'] = shots_xg_predictions['xG_h']

# Add expected goals for the away team
general_training_data['xAG'] = shots_xg_predictions['xG_a']

In [13]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR,xHG,xAG
0,10,34,12,7,4,10,14,3,2,0,0,0,1,A,0.000000,0.995993
1,13,37,7,3,6,12,15,2,2,0,0,0,1,A,0.000000,0.995993
2,14,35,18,5,6,10,14,0,0,0,0,1,1,D,0.955742,0.995993
3,18,20,20,5,3,8,17,2,2,0,0,2,1,H,1.955742,0.999136
4,22,33,34,9,6,11,14,1,2,0,0,2,1,H,0.999340,0.995993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,8,13,8,2,1,0,0,3,3,D,0.994346,0.996148
1680,37,1,18,3,5,7,4,1,2,0,0,0,4,A,0.000000,3.995993
1681,25,20,33,3,6,10,11,0,2,0,0,1,2,A,0.955742,1.995993
1682,12,22,3,5,3,11,10,3,1,0,0,1,3,A,0.955742,2.995993


### Building the regression model for the home team

Removing Full Time Home and Away Team Goals, and Away Team Corners from the training data

In [14]:
X_home = home_training_data = general_training_data.drop(['FTAG', 'FTHG', 'AC', 'xAG', 'FTR'], axis=1)

In [15]:
Y_home = general_training_data.FTHG

In evaluating the model performance, the standard practice is to split the dataset into 2 (or more partitions) partitions and here we will be using the 80/20 split ratio whereby the 80% subset will be used as the train set and the 20% subset the test set. As scikit-learn requires that the data be further separated to their X and Y components, the train_test_split() function can readily perform the above-mentioned task.

In [16]:
from sklearn.model_selection import train_test_split 

In [17]:
X_home_train, X_home_test, Y_home_train, Y_home_test = train_test_split(X_home, Y_home, test_size=0.2)
X_home_train

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,HF,AF,HY,AY,HR,AR,xHG
407,21,1,7,4,6,9,2,4,0,0,2.949755
261,32,24,7,6,16,20,1,2,0,0,1.955742
883,23,13,16,10,13,12,1,2,0,0,1.098129
236,36,10,23,3,12,10,5,0,0,1,1.955742
507,10,1,16,5,8,14,2,0,0,0,0.577655
...,...,...,...,...,...,...,...,...,...,...,...
1034,12,35,2,2,7,14,1,1,0,0,0.682569
751,13,37,12,4,10,11,2,3,0,0,1.248354
779,13,21,23,6,6,13,1,1,1,0,0.987752
430,10,17,6,3,9,13,3,2,0,0,0.750236


Using Random Forest Regressor model with a forest size of 100 decision trees

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [19]:
model_home = RandomForestRegressor(n_estimators = 100)
model_home.fit(X_home_train, Y_home_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

We will now apply the trained model to make predictions on the training set (X_train).

In [20]:
Y_home_pred_train = model_home.predict(X_home_train)

In [21]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.09
Mean absolute error (MAE): 0.18
Coefficient of determination (R^2): 0.95


We will now apply the trained model to make predictions on the test set (X_test).

In [22]:
Y_home_pred_test = model_home.predict(X_home_test)

In [23]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_test, Y_home_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_test, Y_home_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_test, Y_home_pred_test))

Mean squared error (MSE): 0.72
Mean absolute error (MAE): 0.52
Coefficient of determination (R^2): 0.56


### Now repeating the process for the away team

In [24]:
X_away = away_training_data = general_training_data.drop(['FTHG', 'FTAG', 'HC', 'xHG', 'FTR'], axis=1)

In [25]:
Y_away = general_training_data.FTAG

In [26]:
from sklearn.model_selection import train_test_split 

In [27]:
X_away_train, X_away_test, Y_away_train, Y_away_test = train_test_split(X_away, Y_away, test_size=0.2)

In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [29]:
model_away = RandomForestRegressor(n_estimators = 100)
model_away.fit(X_away_train, Y_away_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [30]:
Y_away_pred_train = model_away.predict(X_away_train)

In [31]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_home_train, Y_home_pred_train))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_home_train, Y_home_pred_train))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_home_train, Y_home_pred_train))

Mean squared error (MSE): 0.09
Mean absolute error (MAE): 0.18
Coefficient of determination (R^2): 0.95


In [32]:
Y_away_pred_test = model_away.predict(X_away_test)

In [33]:
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(Y_away_test, Y_away_pred_test))
print('Mean absolute error (MAE): %.2f'
      % mean_absolute_error(Y_away_test, Y_away_pred_test))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(Y_away_test, Y_away_pred_test))

Mean squared error (MSE): 0.64
Mean absolute error (MAE): 0.47
Coefficient of determination (R^2): 0.62


### FTHG Results

Extract model input data from the training dataset.

In [34]:
home_training_data = general_training_data.copy().drop(['FTAG', 'AC', 'xAG', 'FTR'], axis=1)
home_model_input_data = home_training_data.copy().drop(columns=['FTHG'])

In [35]:
home_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,HF,AF,HY,AY,HR,AR,FTHG,xHG
0,10,34,12,7,10,14,3,2,0,0,0,0.000000
1,13,37,7,3,12,15,2,2,0,0,0,0.000000
2,14,35,18,5,10,14,0,0,0,0,1,0.955742
3,18,20,20,5,8,17,2,2,0,0,2,1.955742
4,22,33,34,9,11,14,1,2,0,0,2,0.999340
...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,13,8,2,1,0,0,3,0.994346
1680,37,1,18,3,7,4,1,2,0,0,0,0.000000
1681,25,20,33,3,10,11,0,2,0,0,1,0.955742
1682,12,22,3,5,11,10,3,1,0,0,1,0.955742


Extract predictions

In [36]:
home_pred_data = pd.get_dummies(home_model_input_data)
home_r = model_home.predict(home_pred_data)
home_r = pd.DataFrame(home_r)

In [37]:
home_r.columns= ['Predicted FTHG']
home_training_data.reset_index(drop=True, inplace=True)
home_results = pd.concat([home_training_data, home_r], axis=1)
home_results["Deviation in FTHG"] = abs(home_results["Predicted FTHG"] - home_results["FTHG"])

In [38]:
'''
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
home_results.to_csv(path, index=False)
'''

'\npath = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")\nhome_results.to_csv(path, index=False)\n'

### FTAG Results

In [39]:
away_training_data = general_training_data.copy().drop(['FTHG', 'HC', 'xHG', 'FTR'], axis=1)
away_model_input_data = away_training_data.copy().drop(columns=['FTAG'])

In [40]:
away_pred_data = pd.get_dummies(away_model_input_data)
away_r = model_away.predict(away_pred_data)
away_r = pd.DataFrame(away_r)

In [41]:
away_r.columns= ['Predicted FTAG']
away_training_data.reset_index(drop=True, inplace=True)
away_results = pd.concat([away_training_data, away_r], axis=1)
away_results["Deviation in FTAG"] = abs(away_results["Predicted FTAG"] - away_results["FTAG"])

In [42]:
'''
path = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")
home_results.to_csv(path, index=False)
'''

'\npath = os.path.join(os.getcwd(), "output/non_shot_FTHG_predictions.csv")\nhome_results.to_csv(path, index=False)\n'

### Merging results of both models

In [43]:
complete_non_shot_predictions = general_training_data.copy()

# Add predicted FTHG
complete_non_shot_predictions['Predicted FTHG'] = home_results['Predicted FTHG']

# Add predicted FTAG
complete_non_shot_predictions['Predicted FTAG'] = away_results['Predicted FTAG']

complete_non_shot_predictions

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR,xHG,xAG,Predicted FTHG,Predicted FTAG
0,10,34,12,7,4,10,14,3,2,0,0,0,1,A,0.000000,0.995993,0.00,1.05
1,13,37,7,3,6,12,15,2,2,0,0,0,1,A,0.000000,0.995993,0.00,1.01
2,14,35,18,5,6,10,14,0,0,0,0,1,1,D,0.955742,0.995993,1.00,1.03
3,18,20,20,5,3,8,17,2,2,0,0,2,1,H,1.955742,0.999136,1.96,1.21
4,22,33,34,9,6,11,14,1,2,0,0,2,1,H,0.999340,0.995993,2.17,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,8,13,8,2,1,0,0,3,3,D,0.994346,0.996148,2.74,2.27
1680,37,1,18,3,5,7,4,1,2,0,0,0,4,A,0.000000,3.995993,0.00,3.85
1681,25,20,33,3,6,10,11,0,2,0,0,1,2,A,0.955742,1.995993,1.00,1.98
1682,12,22,3,5,3,11,10,3,1,0,0,1,3,A,0.955742,2.995993,1.00,2.98


In [44]:
path = os.path.join(os.getcwd(), "output/non_shot_predictions.csv")
complete_non_shot_predictions.to_csv(path, index=False)

### Results Analysis

In [45]:
import pandas
from sklearn.model_selection import KFold
from sklearn.svm import SVR

### FTHG model

In [46]:
scores = []
best_svr = SVR(kernel='rbf')
cv = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in cv.split(X_home):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_home_train, X_home_test, Y_home_train, Y_home_test = X_home.loc[train_index], X_home.loc[test_index], Y_home.loc[train_index], Y_home.loc[test_index]
    best_svr.fit(X_home_train, Y_home_train)
    scores.append(best_svr.score(X_home_test, Y_home_test))

Train Index:  [   0    1    2 ... 1681 1682 1683] 

Test Index:  [  23   29   30   32   44   49   59   65   69   70   73   78   99  109
  115  124  135  162  168  173  185  203  210  220  244  247  259  261
  270  271  275  289  297  298  300  303  339  342  351  352  366  371
  382  383  398  405  411  413  414  416  422  425  427  439  462  478
  479  481  485  490  493  495  497  522  527  532  548  551  552  567
  570  576  582  583  585  598  613  617  619  628  629  664  677  679
  680  701  707  722  730  736  741  756  765  781  787  839  855  859
  864  881  887  889  906  914  931  940  943  946 1000 1001 1006 1034
 1052 1053 1063 1080 1090 1091 1094 1131 1160 1163 1165 1169 1175 1189
 1190 1191 1193 1197 1200 1247 1262 1277 1298 1307 1311 1318 1331 1341
 1358 1364 1380 1395 1414 1422 1424 1433 1435 1440 1444 1450 1471 1524
 1535 1542 1550 1597 1605 1607 1621 1623 1625 1629 1635 1646 1652 1661
 1672]
Train Index:  [   0    1    2 ... 1681 1682 1683] 

Test Index:  [  15   43 

Train Index:  [   0    1    2 ... 1681 1682 1683] 

Test Index:  [  13   14   20   21   34   40   64   87   91   95   98  121  130  134
  161  166  187  189  200  201  205  206  216  230  241  252  269  276
  288  295  315  330  337  343  378  379  385  387  391  392  397  401
  406  418  455  459  466  474  484  492  502  508  510  520  562  563
  564  565  592  600  612  642  646  647  683  686  699  702  719  725
  729  742  747  763  766  769  775  776  779  791  794  804  805  815
  831  840  856  860  863  870  871  878  891  897  929  955  957  960
  975  995 1016 1017 1020 1021 1025 1028 1038 1044 1051 1056 1064 1071
 1076 1082 1086 1095 1104 1123 1126 1129 1130 1152 1154 1162 1180 1184
 1215 1238 1241 1264 1267 1275 1294 1297 1332 1337 1363 1367 1369 1390
 1396 1411 1413 1437 1451 1459 1478 1482 1484 1485 1495 1499 1500 1508
 1513 1515 1522 1528 1544 1579 1584 1585 1586 1590 1638 1640 1657 1680]


In [47]:
print('Accuracy: {n}%'.format(n= np.mean(scores) * 100))

Accuracy: 55.49671414510821%


### FTAG model

In [48]:
scores = []
best_svr = SVR(kernel='rbf')
cv = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in cv.split(X_away):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_away_train, X_away_test, Y_away_train, Y_away_test = X_away.loc[train_index], X_away.loc[test_index], Y_away.loc[train_index], Y_away.loc[test_index]
    best_svr.fit(X_away_train, Y_away_train)
    scores.append(best_svr.score(X_away_test, Y_away_test))

Train Index:  [   0    1    2 ... 1681 1682 1683] 

Test Index:  [  23   29   30   32   44   49   59   65   69   70   73   78   99  109
  115  124  135  162  168  173  185  203  210  220  244  247  259  261
  270  271  275  289  297  298  300  303  339  342  351  352  366  371
  382  383  398  405  411  413  414  416  422  425  427  439  462  478
  479  481  485  490  493  495  497  522  527  532  548  551  552  567
  570  576  582  583  585  598  613  617  619  628  629  664  677  679
  680  701  707  722  730  736  741  756  765  781  787  839  855  859
  864  881  887  889  906  914  931  940  943  946 1000 1001 1006 1034
 1052 1053 1063 1080 1090 1091 1094 1131 1160 1163 1165 1169 1175 1189
 1190 1191 1193 1197 1200 1247 1262 1277 1298 1307 1311 1318 1331 1341
 1358 1364 1380 1395 1414 1422 1424 1433 1435 1440 1444 1450 1471 1524
 1535 1542 1550 1597 1605 1607 1621 1623 1625 1629 1635 1646 1652 1661
 1672]
Train Index:  [   0    1    2 ... 1681 1682 1683] 

Test Index:  [  15   43 

In [49]:
print('Accuracy: {n}%'.format(n= np.mean(scores) * 100))

Accuracy: 55.412607142961804%
