# COMP0036: Group Coursework 

## Beat the Bookie

`<insert Introduction here>`

### Package Import

In [237]:
import os
import sys
from collections import defaultdict

#Standard Python libraries for data and visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Import models
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

#Import error metric
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score, accuracy_score

#Import data munging tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

#Display charts in the notebook
%matplotlib inline

### Data Import
``<insert a brief description of our datasets here>``

### Shots xG model

### Non-shots xG model

#### Load Data

For the non-shot-based expected goals (xG) model, we have obtained a dataset from [football-data.co.uk] (https://www.football-data.co.uk/) consisting of football match information over the past 10+ years. We have reduced this dataset to data from the last 5 seasons (including the current one), i.e., __2016-2021__. This version contains 1684 samples, each of which consists of 12 features and 2 labels of the full time home team goals (FTHG) and the full time away team goals (FTAG). The features are:

1. GameID = Unique ID for the match
2. Date = Match Date (dd/mm/yy)
3. HomeTeam = Home Team
4. AwayTeam = Away Team
5. Referee = Match Referee
6. HC = Home Team Corners
7. AC = Away Team Corners
8. HF = Home Team Fouls Committed
9. AF = Away Team Fouls Committed
10. HY = Home Team Yellow Cards
11. AY = Away Team Yellow Cards
12. HR = Home Team Red Cards
13. AR = Away Team Red Cards

In [238]:
DATASET_PATH = os.path.join(os.getcwd(), "data/non-shot-xG/non_shot_data.csv")
complete_data = pd.read_csv(DATASET_PATH)

In [239]:
complete_data.head()

Unnamed: 0,GameID,Date,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR
0,1,13/08/2016,Burnley,Swansea,J Moss,7,4,10,14,3,2,0,0,0,1,A
1,2,13/08/2016,Crystal Palace,West Brom,C Pawson,3,6,12,15,2,2,0,0,0,1,A
2,3,13/08/2016,Everton,Tottenham,M Atkinson,5,6,10,14,0,0,0,0,1,1,D
3,4,13/08/2016,Hull,Leicester,M Dean,5,3,8,17,2,2,0,0,2,1,H
4,5,13/08/2016,Man City,Sunderland,R Madley,9,6,11,14,1,2,0,0,2,1,H


### Data Transformation and Exploration
``<insert a brief description of our preperation and standardisation process here>``

### Shots xG model

### Non-shots xG model

Now we will take a look at different features — sanitise and standardise them for use in other models.

__Note__: Standard team names and referee names along with their respective unique IDs are located in [this](data/standard) directory

#### 1. Dropping columns that are not essential to train our model.

In [240]:
general_training_data = complete_data.drop(['GameID','Date'], axis=1)

#### 2. Encode names of the teams and referees using standardised data

In [241]:
teams_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.teamnames.csv"))
referee_data = pd.read_csv(os.path.join(os.getcwd(), "data/standard/standard.referee.names.csv"))

In [242]:
# Generating teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamID_mapping = dict(zip(teamname, teamID))

generate_teamID_mappings = lambda teamnames: [teamID_mapping[teamname] for teamname in teamnames]

In [243]:
# Generating referees mappings 
referee, refereeID = list(referee_data['Standard referee name']), list(referee_data['RefereeID'])
refereeID_mapping = dict(zip(referee, refereeID))

generate_refereeID_mappings = lambda referees: [refereeID_mapping[referee] for referee in referees]

#### Applying transformations to the (general) training dataset.

In [244]:
# Teams
general_training_data['HomeTeam'] = generate_teamID_mappings(general_training_data['HomeTeam'])
general_training_data['AwayTeam'] = generate_teamID_mappings(general_training_data['AwayTeam'])

# Referees
general_training_data['Referee'] = generate_refereeID_mappings(general_training_data['Referee'])

#### 3. Integrating expected goals (xG) data from shots-based model 

Extracting the results of the shots-based model.

In [245]:
shots_xg_predictions = pd.read_csv(os.path.join(os.getcwd(), 'output/shots_xG_predictions.csv'))

In [246]:
# Add expected goals for the home team
general_training_data['xHG'] = shots_xg_predictions['xG_h']

# Add expected goals for the away team
general_training_data['xAG'] = shots_xg_predictions['xG_a']

In [247]:
general_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR,xHG,xAG
0,10,34,12,7,4,10,14,3,2,0,0,0,1,A,0.000000,1.008553
1,13,37,7,3,6,12,15,2,2,0,0,0,1,A,0.000000,1.008553
2,14,35,18,5,6,10,14,0,0,0,0,1,1,D,0.950882,1.008553
3,18,20,20,5,3,8,17,2,2,0,0,2,1,H,1.950882,1.000000
4,22,33,34,9,6,11,14,1,2,0,0,2,1,H,1.000000,1.008553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,8,13,8,2,1,0,0,3,3,D,1.000000,1.000000
1680,37,1,18,3,5,7,4,1,2,0,0,0,4,A,0.000000,4.008553
1681,25,20,33,3,6,10,11,0,2,0,0,1,2,A,0.950882,2.008553
1682,12,22,3,5,3,11,10,3,1,0,0,1,3,A,0.950882,3.008553


For the purposes of the non-shot-based model, we will split the dataset into two: one containing data for predicting the __FTHG__ (Full Time Home Goals) and one containing data for predicting the __FTAG__ (Full Time Away Goals).

#### 4. Split the dataset into two parts: ``home_training_data`` and ``away_training_data``

In [248]:
X_home = home_training_data = general_training_data.drop(['FTAG', 'FTHG', 'AC', 'xAG', 'FTR'], axis=1)

In [249]:
X_home.head()

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,HF,AF,HY,AY,HR,AR,xHG
0,10,34,12,7,10,14,3,2,0,0,0.0
1,13,37,7,3,12,15,2,2,0,0,0.0
2,14,35,18,5,10,14,0,0,0,0,0.950882
3,18,20,20,5,8,17,2,2,0,0,1.950882
4,22,33,34,9,11,14,1,2,0,0,1.0


In [250]:
X_away = home_training_data = general_training_data.drop(['FTHG', 'FTAG', 'HC', 'xHG', 'FTR'], axis=1)

In [251]:
X_away.head()

Unnamed: 0,HomeTeam,AwayTeam,Referee,AC,HF,AF,HY,AY,HR,AR,xAG
0,10,34,12,4,10,14,3,2,0,0,1.008553
1,13,37,7,6,12,15,2,2,0,0,1.008553
2,14,35,18,6,10,14,0,0,0,0,1.008553
3,18,20,20,3,8,17,2,2,0,0,1.0
4,22,33,34,6,11,14,1,2,0,0,1.008553


### Methodology Overview
``Pipeline overview``

### Model training and validation

### Shots xG model

### Non-shots xG model

### FTHG model

In [252]:
Y_home = general_training_data.FTHG

In [253]:
X_home_train, X_home_test, Y_home_train, Y_home_test = train_test_split(X_home, Y_home, test_size=0.2)

In [254]:
model_home = RandomForestRegressor(n_estimators = 100)
model_home.fit(X_home_train, Y_home_train)

RandomForestRegressor()

### FTAG model

In [255]:
Y_away = general_training_data.FTAG

In [256]:
X_away_train, X_away_test, Y_away_train, Y_away_test = train_test_split(X_away, Y_away, test_size=0.2)

In [257]:
model_away = RandomForestRegressor(n_estimators = 100)
model_away.fit(X_away_train, Y_away_train)

RandomForestRegressor()

### FTHG Results

In [258]:
home_training_data = general_training_data.copy().drop(['FTAG', 'AC', 'xAG', 'FTR'], axis=1)
home_model_input_data = home_training_data.copy().drop(columns=['FTHG'])

In [259]:
home_training_data

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,HF,AF,HY,AY,HR,AR,FTHG,xHG
0,10,34,12,7,10,14,3,2,0,0,0,0.000000
1,13,37,7,3,12,15,2,2,0,0,0,0.000000
2,14,35,18,5,10,14,0,0,0,0,1,0.950882
3,18,20,20,5,8,17,2,2,0,0,2,1.950882
4,22,33,34,9,11,14,1,2,0,0,2,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,13,8,2,1,0,0,3,1.000000
1680,37,1,18,3,7,4,1,2,0,0,0,0.000000
1681,25,20,33,3,10,11,0,2,0,0,1,0.950882
1682,12,22,3,5,11,10,3,1,0,0,1,0.950882


In [260]:
home_pred_data = pd.get_dummies(home_model_input_data)
home_r = model_home.predict(home_pred_data)
home_r = pd.DataFrame(home_r)

In [261]:
home_r.columns= ['Predicted FTHG']
home_training_data.reset_index(drop=True, inplace=True)
home_results = pd.concat([home_training_data, home_r], axis=1)
home_results["Deviation in FTHG"] = abs(home_results["Predicted FTHG"] - home_results["FTHG"])

### FTAG Results

In [262]:
away_training_data = general_training_data.copy().drop(['FTHG', 'HC', 'xHG', 'FTR'], axis=1)
away_model_input_data = away_training_data.copy().drop(columns=['FTAG'])

In [263]:
away_pred_data = pd.get_dummies(away_model_input_data)
away_r = model_away.predict(away_pred_data)
away_r = pd.DataFrame(away_r)

In [264]:
away_r.columns= ['Predicted FTAG']
away_training_data.reset_index(drop=True, inplace=True)
away_results = pd.concat([away_training_data, away_r], axis=1)
away_results["Deviation in FTAG"] = abs(away_results["Predicted FTAG"] - away_results["FTAG"])

### Merging results of both models

In [265]:
complete_non_shot_predictions = general_training_data.copy()

# Add predicted FTHG
complete_non_shot_predictions['Predicted FTHG'] = home_results['Predicted FTHG']

# Add predicted FTAG
complete_non_shot_predictions['Predicted FTAG'] = away_results['Predicted FTAG']

complete_non_shot_predictions

Unnamed: 0,HomeTeam,AwayTeam,Referee,HC,AC,HF,AF,HY,AY,HR,AR,FTHG,FTAG,FTR,xHG,xAG,Predicted FTHG,Predicted FTAG
0,10,34,12,7,4,10,14,3,2,0,0,0,1,A,0.000000,1.008553,0.00,1.01
1,13,37,7,3,6,12,15,2,2,0,0,0,1,A,0.000000,1.008553,0.00,1.00
2,14,35,18,5,6,10,14,0,0,0,0,1,1,D,0.950882,1.008553,0.99,1.00
3,18,20,20,5,3,8,17,2,2,0,0,2,1,H,1.950882,1.000000,2.00,1.34
4,22,33,34,9,6,11,14,1,2,0,0,2,1,H,1.000000,1.008553,1.96,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,9,40,1,5,8,13,8,2,1,0,0,3,3,D,1.000000,1.000000,2.35,2.76
1680,37,1,18,3,5,7,4,1,2,0,0,0,4,A,0.000000,4.008553,0.00,3.59
1681,25,20,33,3,6,10,11,0,2,0,0,1,2,A,0.950882,2.008553,1.00,2.00
1682,12,22,3,5,3,11,10,3,1,0,0,1,3,A,0.950882,3.008553,1.00,2.99


In [266]:
path = os.path.join(os.getcwd(), "output/non_shot_predictions.csv")
complete_non_shot_predictions.to_csv(path, index=False)

### ELO Rating classifier

In [267]:
class GoalElo:
    def __init__(self, initial_rating=0, learning_rate=0.05, draw_size=0.5):
        self.offensive_ratings = defaultdict(lambda: initial_rating)
        self.defensive_ratings = defaultdict(lambda: initial_rating)
        self.match_count = defaultdict(lambda: 0)
        self.learning_rate = learning_rate
        self.draw_size = draw_size

    def predict(self, team, opponent):
        ''' Predicts the number of goals team will score against opponent. '''
        return self.offensive_ratings[team] - self.defensive_ratings[opponent]

    def predict_result(self, team, opponent):
        ''' Predicts the result of a match. 1 if team wins, 0 if opponent wins and 0.5 if it is a draw. '''
        goals_scored = self.predict(team, opponent)
        goals_conceded = self.predict(opponent, team)
        return self.classify_result(goals_scored, goals_conceded)

    def classify_result(self, goals_scored, goals_conceded):
        ''' Piecewise function to predict result from number of goals '''
        goal_difference = goals_scored - goals_conceded
        # result = round(1 / (1 + 10**(-goal_difference)))
        result = 1 if goal_difference > 0 else 0
        if abs(goal_difference) < self.draw_size:
            result = 0.5
        return result


    def predict_data(self, df):
        ''' Takes a data frame of home and away teams to predict the number of goals and result of '''
        out = df.copy()
        for i, row in out.iterrows():
            out.at[i, 'EHG'] = self.predict(row['HomeTeam'], row['AwayTeam'])
            out.at[i, 'EAG'] = self.predict(row['AwayTeam'], row['HomeTeam'])
            out.at[i, 'ER'] = decode_result(self.predict_result(row['HomeTeam'], row['AwayTeam']))
        return out

    def update_match(self, home, away, home_actual_goals, away_actual_goals):
        ''' Updates the offensive and defensive ratings of both teams in a match. '''
        home_expected_goals = self.predict(home, away)
        away_expected_goals = self.predict(away, home)
        self.offensive_ratings[home] += self.learning_rate * (home_actual_goals - home_expected_goals)
        self.offensive_ratings[away] += self.learning_rate * (away_actual_goals - away_expected_goals)
        self.defensive_ratings[home] += self.learning_rate * (away_expected_goals - away_actual_goals)
        self.defensive_ratings[away] += self.learning_rate * (home_expected_goals - home_actual_goals)
        self.match_count[home] += 1
        self.match_count[away] += 1

    def ratings_dataframe(self):
        ''' Creates an easy to read dataframe of the ratings '''
        df = pd.DataFrame(self.offensive_ratings.items(), columns=['Team', 'Offensive Rating'])
        df['Defensive Rating'] = df['Team'].map(self.defensive_ratings)
        df['Matches'] = df['Team'].map(self.match_count)
        df = df.sort_values('Offensive Rating', ascending=False)
        return df

    def fit(self, df):
        ''' Takes a data frame of matches with columns HomeTeam, AwayTeam, Predicted FTHG, Predicted FTAG and updates teams ratings using the data in order. '''
        for i, row in df.iterrows():
            if 'Predicted FTHG' in row:
                self.update_match(row['HomeTeam'], row['AwayTeam'], row['Predicted FTHG'], row['Predicted FTAG'])
            else:
                self.update_match(row['HomeTeam'], row['AwayTeam'], row['FTHG'], row['FTAG'])


In [268]:
def decode_result(result):
    if result == 1:
        return 'H'
    elif result == 0:
        return 'A'
    return 'D'

In [269]:
match_data = pd.read_csv(os.path.join(os.getcwd(), "output/non_shot_predictions.csv"))

In [270]:
training, test = train_test_split(match_data, test_size=0.05, shuffle=False)

In [271]:
# Train elo ratings
goal_elo = GoalElo()
goal_elo.fit(training)


In [272]:
# Predict number of goals for use in training result classifier
goal_predicted_data = goal_elo.predict_data(training)

X = np.array([goal_predicted_data['EHG'].to_numpy(), goal_predicted_data['EAG'].to_numpy()]).T
y = goal_predicted_data['FTR'].to_numpy()
# y = [decode_result(r) for r in goal_predicted_data['FTR'].to_numpy()]

# Train classifier to predict result from number of goals
goal_result_classifier = SVC(gamma='auto')
goal_result_classifier.fit(X, y)

SVC(gamma='auto')

In [273]:
# Predict number of goals scored using goal_elo
goal_prediction = goal_elo.predict_data(test)
X_test = np.array([goal_prediction['EHG'].to_numpy(), goal_prediction['EAG'].to_numpy()]).T
y_test = goal_prediction['FTR'].to_numpy()
# Predict result using SVC and elo predicted number of goals
y_pred = goal_result_classifier.predict(X_test)
# Predict result using piecewise function and elo predicted number of goals
y_pred2 = goal_prediction['ER'].to_numpy()

# Measure accuracy
print("Accuracy:")
print("DS: ", accuracy_score(y_test, y_pred2))
print("SVC: ", accuracy_score(y_test, y_pred))

# Measure f1 score
print("F1 Score:")
print("DS: ", f1_score(y_test, y_pred2, average='weighted'))
print("SVC: ", f1_score(y_test, y_pred, average='weighted'))

Accuracy:
DS:  0.43529411764705883
SVC:  0.47058823529411764
F1 Score:
DS:  0.4425421086378445
SVC:  0.3753501400560224


### Results

``<insert model tests here>``

### Final predictions

In [274]:
# Generating reverse teams mappings 
teamname, teamID = list(teams_data['Standard teamname']), list(teams_data['TeamID'])
teamname_mapping = dict(zip(teamname, teamID))

In [275]:
final_test_data = pd.read_csv(os.path.join(os.getcwd(), 'data/epl-test.csv'))
final_test_data

Unnamed: 0,Date,HomeTeam,AwayTeam
0,16 Jan 21,Arsenal,Newcastle
1,16 Jan 21,Aston Villa,Everton
2,16 Jan 21,Fulham,Chelsea
3,16 Jan 21,Leeds,Brighton
4,16 Jan 21,Leicester,Southampton
5,16 Jan 21,Liverpool,Man United
6,16 Jan 21,Man City,Crystal Palace
7,16 Jan 21,Sheffield United,Tottenham
8,16 Jan 21,West Ham,Burnley
9,16 Jan 21,Wolves,West Brom


In [276]:
for i, r in final_test_data.iterrows():
    final_test_data.at[i, 'HomeGoals'] = goal_elo.predict(teamname_mapping[r['HomeTeam']], teamname_mapping[r['AwayTeam']])
    final_test_data.at[i, 'AwayGoals'] = goal_elo.predict(teamname_mapping[r['AwayTeam']], teamname_mapping[r['HomeTeam']])

In [277]:
final_test_data

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,16 Jan 21,Arsenal,Newcastle,1.565024,0.953415
1,16 Jan 21,Aston Villa,Everton,1.570183,1.488105
2,16 Jan 21,Fulham,Chelsea,0.919229,2.475648
3,16 Jan 21,Leeds,Brighton,1.125199,0.848336
4,16 Jan 21,Leicester,Southampton,1.681845,1.387079
5,16 Jan 21,Liverpool,Man United,2.005681,1.700238
6,16 Jan 21,Man City,Crystal Palace,2.294442,0.705616
7,16 Jan 21,Sheffield United,Tottenham,0.813384,1.688809
8,16 Jan 21,West Ham,Burnley,1.53305,0.857576
9,16 Jan 21,Wolves,West Brom,1.383862,0.510904


In [278]:
goal_predictions = np.array([final_test_data['HomeGoals'].to_numpy(), final_test_data['AwayGoals'].to_numpy()]).T

In [279]:
final_test_data['FTR'] = goal_result_classifier.predict(goal_predictions)

In [280]:
final_test_data

Unnamed: 0,Date,HomeTeam,AwayTeam,HomeGoals,AwayGoals,FTR
0,16 Jan 21,Arsenal,Newcastle,1.565024,0.953415,H
1,16 Jan 21,Aston Villa,Everton,1.570183,1.488105,H
2,16 Jan 21,Fulham,Chelsea,0.919229,2.475648,A
3,16 Jan 21,Leeds,Brighton,1.125199,0.848336,H
4,16 Jan 21,Leicester,Southampton,1.681845,1.387079,H
5,16 Jan 21,Liverpool,Man United,2.005681,1.700238,H
6,16 Jan 21,Man City,Crystal Palace,2.294442,0.705616,H
7,16 Jan 21,Sheffield United,Tottenham,0.813384,1.688809,A
8,16 Jan 21,West Ham,Burnley,1.53305,0.857576,H
9,16 Jan 21,Wolves,West Brom,1.383862,0.510904,H
