## Offensive and defensive ELO ratings to predict number of goals

We keep track of two ratings for all teams offensive rating ($R_O$) and defensive rating ($R_D$).

We can then predict the number of goals a team will score by taking the difference of their offensive rating and the opponent's defensive rating.

The number of goals scored against them can be calculated by considering it from the opponent's perspective.

$E[\text{team}] = R_O[\text{team}] - R_D[\text{opponent}]$

We can update a team's offensive rating by adding the difference between the actual number of goals and the expected goals multiplied by the learning rate.
We can update a team's defensive rating by adding the difference between the expected goals scored against them and the actual number of goals scored against them multiplied by the learning rate.

$R_O[\text{team}] = R_O[\text{team}] + k(G[\text{team}] - E[\text{team}])$

$R_D[\text{team}] = R_D[\text{team}] + k(E[\text{opponent}] - G[\text{opponent}])$

We start every team with a rating of 0. The order of the training data makes a difference to the model and so the training data should be in chronological order in order to account for teams changing over team.


In [76]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm, tqdm_notebook
import sklearn.model_selection
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [77]:
match_data = pd.read_csv(os.path.join(os.getcwd(), "output/non_shot_predictions.csv"))

In [78]:
def encode_result(result):
    if result == 'H':
        return 1
    elif result == 'A':
        return 0
    return 0.5

In [79]:
def decode_result(result):
    if result == 1:
        return 'H'
    elif result == 0:
        return 'A'
    return 'D'

In [80]:
class GoalElo:
    def __init__(self, initial_rating=0, learning_rate=0.05, draw_size=0.5):
        self.offensive_ratings = defaultdict(lambda: initial_rating)
        self.defensive_ratings = defaultdict(lambda: initial_rating)
        self.match_count = defaultdict(lambda: 0)
        self.learning_rate = learning_rate
        self.draw_size = draw_size

    def predict(self, team, opponent):
        ''' Predicts the number of goals team will score against opponent. '''
        return self.offensive_ratings[team] - self.defensive_ratings[opponent]

    def predict_result(self, team, opponent):
        ''' Predicts the result of a match. 1 if team wins, 0 if opponent wins and 0.5 if it is a draw. '''
        goals_scored = self.predict(team, opponent)
        goals_conceded = self.predict(opponent, team)
        return self.classify_result(goals_scored, goals_conceded)

    def classify_result(self, goals_scored, goals_conceded):
        goal_difference = goals_scored - goals_conceded
        # result = round(1 / (1 + 10**(-goal_difference)))
        result = 1 if goal_difference > 0 else 0
        if abs(goal_difference) < self.draw_size:
            result = 0.5
        return result


    def predict_data(self, df):
        out = df.copy()
        for i, row in out.iterrows():
            out.at[i, 'EHG'] = self.predict(row['HomeTeam'], row['AwayTeam'])
            out.at[i, 'EAG'] = self.predict(row['AwayTeam'], row['HomeTeam'])
            out.at[i, 'ER'] = decode_result(self.predict_result(row['HomeTeam'], row['AwayTeam']))
        return out

    def update_match(self, home, away, home_actual_goals, away_actual_goals):
        ''' Updates the offensive and defensive ratings of both teams in a match. '''
        home_expected_goals = self.predict(home, away)
        away_expected_goals = self.predict(away, home)
        self.offensive_ratings[home] += self.learning_rate * (home_actual_goals - home_expected_goals)
        self.offensive_ratings[away] += self.learning_rate * (away_actual_goals - away_expected_goals)
        self.defensive_ratings[home] += self.learning_rate * (away_expected_goals - away_actual_goals)
        self.defensive_ratings[away] += self.learning_rate * (home_expected_goals - home_actual_goals)
        self.match_count[home] += 1
        self.match_count[away] += 1

    def ratings_dataframe(self):
        ''' Creates an easy to read dataframe of the ratings '''
        df = pd.DataFrame(self.offensive_ratings.items(), columns=['Team', 'Offensive Rating'])
        df['Defensive Rating'] = df['Team'].map(self.defensive_ratings)
        df['Matches'] = df['Team'].map(self.match_count)
        df = df.sort_values('Offensive Rating', ascending=False)
        return df

    def fit(self, df):
        ''' Takes a data frame of matches with columns HomeTeam, AwayTeam, Predicted FTHG, Predicted FTAG and updates teams ratings using the data in order. '''
        for i, row in df.iterrows():
            self.update_match(row['HomeTeam'], row['AwayTeam'], row['Predicted FTHG'], row['Predicted FTAG'])

    def test(self, df):
        ''' Takes a data frame of matches with columns HomeTeam, AwayTeam, FTHG, FTAG and uses the ratings to predict the number of goals scored by each side. It measures the average mean square error and average mean absolute error per match. '''
        mse = 0
        mae = 0
        count = 0
        for i, row in df.iterrows():
            error_home = self.predict(row['HomeTeam'], row['AwayTeam']) - row['FTHG']
            error_away = self.predict(row['AwayTeam'], row['HomeTeam']) - row['FTAG']
            mse += error_home ** 2 + error_away ** 2
            mae += abs(error_home) + abs(error_away)
            count += 1
        return mse / count, mae / count

    def test_result(self, df):
        ''' Takes a data frame of matches with columns HomeTeam, AwayTeam, FTR and predicts the outcome using the ratings. It measures the number of correct predictions as a percentage of the size of the data. '''
        correct = 0
        count = 0
        for i, row in df.iterrows():
            result = self.predict_result(row['HomeTeam'], row['AwayTeam'])
            if decode_result(result) == row['FTR']:
                correct += 1
            count += 1
        return correct / count



In [81]:
training, test = sklearn.model_selection.train_test_split(match_data, test_size=0.05, shuffle=False)

In [82]:
goal_elo = GoalElo()
goal_elo.fit(training)

data = goal_elo.predict_data(training)

X = np.array([data['EHG'].to_numpy(), data['EAG'].to_numpy()]).T

# decode FTR
y = [decode_result(r) for r in data['FTR'].to_numpy()]

model = SVC(gamma='auto')
model.fit(X, y)

data = goal_elo.predict_data(test)
X_test = np.array([data['EHG'].to_numpy(), data['EAG'].to_numpy()]).T

# decode FTR
y_test = [decode_result(r) for r in data['FTR'].to_numpy()]
y_pred = model.predict(X_test)
y_pred2 = data['ER'].to_numpy()

print("DS: ", accuracy_score(y_test, y_pred2))
print("SVC: ", accuracy_score(y_test, y_pred))



DS:  0.4472573839662447
SVC:  0.48523206751054854


In [83]:
data

Unnamed: 0,HomeTeam,AwayTeam,FTR,Predicted FTHG,Predicted FTAG,EHG,EAG,ER
4487,10,36,1.0,1.23,0.03,1.182680,1.283071,D
4488,31,1,0.0,0.73,2.04,1.456899,1.677019,D
4489,12,22,1.0,2.24,0.40,1.157836,2.139827,A
4490,2,40,0.0,0.04,1.17,0.823476,2.003290,A
4491,36,31,0.0,0.80,1.78,1.242987,1.368657,D
...,...,...,...,...,...,...,...,...
4719,9,40,0.5,2.71,3.21,0.828468,1.505999,A
4720,37,1,0.0,0.15,3.76,1.028317,1.570481,A
4721,25,20,0.0,0.70,1.56,0.685969,1.356323,A
4722,12,22,0.0,0.85,3.18,1.157836,2.139827,A


In [84]:
goal_elo.ratings_dataframe()

Unnamed: 0,Team,Offensive Rating,Defensive Rating,Matches
15,22.0,1.856353,-0.040949,448
11,21.0,1.726955,0.144985,449
18,23.0,1.139023,-0.15706,449
16,12.0,1.116887,-0.283474,448
9,35.0,1.07606,-0.36351,449
0,1.0,1.051467,-0.590851,448
31,20.0,0.950492,-0.108064,221
21,40.0,0.943878,-0.248146,183
28,31.0,0.866048,-0.625552,296
4,14.0,0.849213,-0.408862,449


In [85]:
training_data = training.copy().drop(['HTHG', 'FTHG', 'HC'], axis=1)
model_input_data = training_data.copy().drop(columns=['FTAG'])

KeyError: "['HTHG' 'FTHG' 'HC'] not found in axis"

In [86]:
pred_data = pd.get_dummies(model_input_data)
r = model_away.predict(pred_data)
r = pd.DataFrame(r)

NameError: name 'model_input_data' is not defined

In [None]:
r.columns= ['Predicted FTAG']
training_data.reset_index(drop=True, inplace=True)
result = pd.concat([training_data, r], axis=1)
result["Deviation"] = abs(result["Predicted FTAG"] - result["FTAG"])

In [None]:
path = os.path.join(os.getcwd(), "output/non_shot_FTAG_predictions.csv")
result.to_csv(path, index=False)

In [43]:
learning_rates = np.logspace(-4, -1, num=50)
mse_list = []
mae_list = []
accuracy = []

for learning_rate in tqdm(learning_rates):
    goal_elo = GoalElo(learning_rate=learning_rate)
    goal_elo.fit(training)
    mse, mae = goal_elo.test(test)
    mse_list.append(mse)
    mae_list.append(mae)
    accuracy.append(goal_elo.test_result(test))
print("Accuracy:") 
pd.DataFrame(zip(learning_rates, mse_list, mae_list, accuracy), columns=['Learning Rate', 'Mean Square Error / Match', 'Mean Absolute Error / Match', 'Accuracy'])

  0%|          | 0/50 [00:01<?, ?it/s]


KeyError: 'FTHG'

In [65]:
draw_sizes = np.logspace(-4, 1, num=50)
mse_list = []
mae_list = []
accuracy = []

for draw_size in tqdm(draw_sizes):
    goal_elo = GoalElo(draw_size=draw_size)
    goal_elo.fit(training)
    mse, mae = goal_elo.test(test)
    mse_list.append(mse)
    mae_list.append(mae)
    accuracy.append(goal_elo.test_result(test))
print("Accuracy:") 
pd.DataFrame(zip(draw_sizes, mse_list, mae_list, accuracy), columns=['Draw Size', 'Mean Square Error / Match', 'Mean Absolute Error / Match', 'Accuracy'])

100%|██████████| 50/50 [00:16<00:00,  3.03it/s]Accuracy:



Unnamed: 0,Draw Size,Mean Square Error / Match,Mean Absolute Error / Match,Accuracy
0,0.0001,3.565736,2.034976,0.510549
1,0.000126,3.565736,2.034976,0.510549
2,0.00016,3.565736,2.034976,0.510549
3,0.000202,3.565736,2.034976,0.510549
4,0.000256,3.565736,2.034976,0.510549
5,0.000324,3.565736,2.034976,0.510549
6,0.000409,3.565736,2.034976,0.510549
7,0.000518,3.565736,2.034976,0.510549
8,0.000655,3.565736,2.034976,0.510549
9,0.000829,3.565736,2.034976,0.510549


In [44]:
goal_elo = GoalElo(learning_rate=0.1)
goal_elo.train(training)
# print(goal_elo.ratings_dataframe())
print(goal_elo.predict('Man City', 'Chelsea'))
print(goal_elo.predict('Chelsea', 'Man City'))

AttributeError: 'GoalElo' object has no attribute 'train'