# Fantasy Football - Predicting Team Goals

In this notebook, we use the dataset from the previous notebook to create a BayesianRidge linear regression model. This model helps us predict the number of goals a team will score and concede in future matches.

In [1]:
import pandas as pd
import warnings
from functools import reduce
import itertools
import numpy as np
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import linear_model
from joblib import dump, load

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings("ignore")

Importing data.

In [3]:
team_fixtures = pd.read_csv('data/wrangled_data_final.csv')

In [4]:
team_fixtures.head()

Unnamed: 0,game_week,week_day,date,team,opponent,team_xg,opponent_xg,season,game_id,time,team_goals,opponent_goals,fpl_game_week,team_elo,opponent_elo,team_goals_scored_xg_3game_form,team_goals_conceded_xg_3game_form,opponent_goals_scored_xg_3game_form,opponent_goals_conceded_xg_3game_form,team_goals_scored_actual_3game_form,team_goals_conceded_actual_3game_form,opponent_goals_scored_actual_3game_form,opponent_goals_conceded_actual_3game_form,team_goals_scored_xg_5game_form,team_goals_conceded_xg_5game_form,opponent_goals_scored_xg_5game_form,opponent_goals_conceded_xg_5game_form,team_goals_scored_actual_5game_form,team_goals_conceded_actual_5game_form,opponent_goals_scored_actual_5game_form,opponent_goals_conceded_actual_5game_form,team_goals_scored_xg_10game_form,team_goals_conceded_xg_10game_form,opponent_goals_scored_xg_10game_form,opponent_goals_conceded_xg_10game_form,team_goals_scored_actual_10game_form,team_goals_conceded_actual_10game_form,opponent_goals_scored_actual_10game_form,opponent_goals_conceded_actual_10game_form,elodiff,home
0,6,Sat,2017-09-23,Watford,Swansea City,1.2,1.7,2017-2018,51,15:00,2.0,1.0,6,1635.701538,1660.233643,0.5,1.566667,0.7,1.0,0.666667,2.0,0.666667,0.333333,1.2,1.66,0.56,1.6,1.4,1.8,0.4,1.0,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,-24.532105,0
1,6,Sat,2017-09-23,Swansea City,Watford,1.7,1.2,2017-2018,51,15:00,1.0,2.0,6,1660.233643,1635.701538,0.7,1.0,0.5,1.566667,0.666667,0.333333,0.666667,2.0,0.56,1.6,1.2,1.66,0.4,1.0,1.4,1.8,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,24.532105,1
2,6,Sat,2017-09-23,Burnley,Huddersfield,0.3,0.3,2017-2018,52,15:00,0.0,0.0,6,1661.930664,1518.883423,0.466667,2.166667,0.833333,1.233333,1.0,0.666667,0.333333,1.0,0.66,1.78,0.86,1.1,1.2,1.0,1.0,0.6,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,143.047241,1
3,6,Sat,2017-09-23,Huddersfield,Burnley,0.3,0.3,2017-2018,52,15:00,0.0,0.0,6,1518.883423,1661.930664,0.833333,1.233333,0.466667,2.166667,0.333333,1.0,1.0,0.666667,0.86,1.1,0.66,1.78,1.0,0.6,1.2,1.0,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,-143.047241,0
4,6,Sat,2017-09-23,Crystal Palace,Manchester City,0.6,4.5,2017-2018,53,15:00,0.0,5.0,6,1598.233276,1908.661133,1.633333,0.666667,2.366667,0.566667,0.0,1.333333,4.333333,0.333333,1.34,1.2,2.02,0.52,0.0,1.6,3.2,0.4,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,-310.427857,0


Assessing the accuracy of various models by comparing their performance using 3, 5, and 10 expected and actual form features.

In [5]:
kind  = ['xg_3', 'xg_5', 'xg_10', 'actual_3', 'actual_5', 'actual_10']

In [6]:
for k in kind:
    # set features
    x = team_fixtures[['fpl_game_week', 'team_elo', 'opponent_elo', f'team_goals_scored_{k}game_form',
                                          f'opponent_goals_conceded_{k}game_form', 'elodiff', 'home']]
    
    # sclae data
    x = x[x.columns[0:]] 
    ss = preprocessing.StandardScaler()
    x = pd.DataFrame(ss.fit_transform(x),columns = x.columns)
    
    y = team_fixtures.team_goals
    
    # train and test model
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, train_size=0.75, test_size=0.25, random_state=1)
    
    # fit model to data
    reg_bay = linear_model.BayesianRidge()
    reg_bay.fit(x_train, y_train)
    
    # accuracy scores
    cv_mae = cross_val_score(reg_bay, x_train, y_train, cv=10,scoring='neg_mean_absolute_error')
    cv_rmse = cross_val_score(reg_bay, x_train, y_train, cv=10,scoring='neg_root_mean_squared_error')
    cv_r2 = cross_val_score(reg_bay, x_train, y_train, cv=10,scoring='r2')
    print(f'{k}, MAE =',round(cv_mae.mean(),4))
    print(f'{k}, RSME =',round(cv_rmse.mean(),4))
    print(f'{k}, R2 =',round(cv_r2.mean(),4))
    print('----------------------')

xg_3, MAE = -0.9006
xg_3, RSME = -1.1348
xg_3, R2 = 0.1654
----------------------
xg_5, MAE = -0.8984
xg_5, RSME = -1.1313
xg_5, R2 = 0.1706
----------------------
xg_10, MAE = -0.9021
xg_10, RSME = -1.1369
xg_10, R2 = 0.1625
----------------------
actual_3, MAE = -0.9061
actual_3, RSME = -1.1396
actual_3, R2 = 0.1584
----------------------
actual_5, MAE = -0.907
actual_5, RSME = -1.1408
actual_5, R2 = 0.1566
----------------------
actual_10, MAE = -0.906
actual_10, RSME = -1.14
actual_10, R2 = 0.1581
----------------------


After comparing different models and features, BayesianRidge linear regression using 5 game xg form with the features outlined below produces the best results.

In [7]:
x = team_fixtures[['fpl_game_week', 'team_elo', 'opponent_elo', f'team_goals_scored_xg_5game_form',
                                      f'opponent_goals_conceded_xg_5game_form', 'elodiff', 'home']]

In [8]:
x = x[x.columns[0:]] 
ss = preprocessing.StandardScaler()
x = pd.DataFrame(ss.fit_transform(x),columns = x.columns)

In [9]:
y = team_fixtures.team_goals

In [10]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, train_size=0.75, test_size=0.25, random_state=1)

In [11]:
reg_bay = linear_model.BayesianRidge()
reg_bay.fit(x_train, y_train)

Importance of each feature used.

In [12]:
print(x.columns)
importance = reg_bay.coef_
for i, v in enumerate(importance):
    print(f'Feature: %0d, Score: %.5f' % (i, v))  

Index(['fpl_game_week', 'team_elo', 'opponent_elo',
       'team_goals_scored_xg_5game_form',
       'opponent_goals_conceded_xg_5game_form', 'elodiff', 'home'],
      dtype='object')
Feature: 0, Score: -0.03407
Feature: 1, Score: 0.21502
Feature: 2, Score: -0.05630
Feature: 3, Score: 0.11087
Feature: 4, Score: 0.14764
Feature: 5, Score: 0.18579
Feature: 6, Score: 0.18230


Export model for future use.

In [13]:
dump(reg_bay, 'predict_goals_model.joblib') 

['predict_goals_model.joblib']

Fitting model do data set resulting in predicted goals scored columns.

In [14]:
team_fixtures['pred_goals_scored'] = reg_bay.predict(x)

In [15]:
team_fixtures[['fpl_game_week', 'team', 'opponent', 'team_goals', 'pred_goals_scored']].head()

Unnamed: 0,fpl_game_week,team,opponent,team_goals,pred_goals_scored
0,6,Watford,Swansea City,2.0,1.213222
1,6,Swansea City,Watford,1.0,1.545529
2,6,Burnley,Huddersfield,0.0,1.53414
3,6,Huddersfield,Burnley,0.0,0.882562
4,6,Crystal Palace,Manchester City,0.0,0.389978


On average, our model exhibits a deviation of 0.8984 goals from the actual goals scored. Considering that we employ team goals as a metric for predicting player points, it is not necessary for this metric to be absolutely precise. Rather, it serves as a reliable indicator of team performance and the efficacy of their offensive and defensive units in any given gameweek.

Since football is a zero-sum game, 
the number of goals one team scores will be equal to 
the number of goals the opposing team concedes. 
Therefore, once we can predict the goals scored by 
each team, we can also predict the goals conceded.

In [16]:
match_team_1 = team_fixtures.query('home==1')[['game_id', 'pred_goals_scored', 'home', 'team']]

In [17]:
match_team_2 = team_fixtures.query('home==0')[['game_id', 'pred_goals_scored', 'home', 'team']]

In [18]:
combine = pd.merge(match_team_1,match_team_2, on='game_id')

In [19]:
combine.head()

Unnamed: 0,game_id,pred_goals_scored_x,home_x,team_x,pred_goals_scored_y,home_y,team_y
0,51,1.545529,1,Swansea City,1.213222,0,Watford
1,52,1.53414,1,Burnley,0.882562,0,Huddersfield
2,53,2.447532,1,Manchester City,0.389978,0,Crystal Palace
3,54,1.426761,1,Leicester City,1.868528,0,Liverpool
4,55,1.109021,1,Southampton,2.068537,0,Manchester Utd


In [20]:
cols = ['game_id', 'pred_goals_scored_x', 'pred_goals_scored_y']
team_fixtures = pd.merge(team_fixtures, combine[cols], on='game_id')

In [21]:
team_fixtures[['fpl_game_week', 'team', 'opponent', 'team_goals', 'pred_goals_scored',
                                  'pred_goals_scored_x', 'pred_goals_scored_y']].head()

Unnamed: 0,fpl_game_week,team,opponent,team_goals,pred_goals_scored,pred_goals_scored_x,pred_goals_scored_y
0,6,Watford,Swansea City,2.0,1.213222,1.545529,1.213222
1,6,Swansea City,Watford,1.0,1.545529,1.545529,1.213222
2,6,Burnley,Huddersfield,0.0,1.53414,1.53414,0.882562
3,6,Huddersfield,Burnley,0.0,0.882562,1.53414,0.882562
4,6,Crystal Palace,Manchester City,0.0,0.389978,2.447532,0.389978


Renaming columns and removing negative values for goals predicted.

In [22]:
team_fixtures = (
    team_fixtures
    .assign(pred_goals_conceded=team_fixtures.pred_goals_scored_x)
    .assign(pred_goals_conceded=lambda x: x.pred_goals_conceded.where(x.pred_goals_scored_x != x.pred_goals_scored, x.pred_goals_scored_y))
    .assign(pred_goals_conceded=lambda x: x.pred_goals_conceded.clip(lower=0.119887576083739))
    .assign(pred_goals_scored=lambda x: x.pred_goals_scored.clip(lower=0.119887576083739))
    .drop(columns=['pred_goals_scored_x', 'pred_goals_scored_y'])
)

In [23]:
team_fixtures.head()

Unnamed: 0,game_week,week_day,date,team,opponent,team_xg,opponent_xg,season,game_id,time,team_goals,opponent_goals,fpl_game_week,team_elo,opponent_elo,team_goals_scored_xg_3game_form,team_goals_conceded_xg_3game_form,opponent_goals_scored_xg_3game_form,opponent_goals_conceded_xg_3game_form,team_goals_scored_actual_3game_form,team_goals_conceded_actual_3game_form,opponent_goals_scored_actual_3game_form,opponent_goals_conceded_actual_3game_form,team_goals_scored_xg_5game_form,team_goals_conceded_xg_5game_form,opponent_goals_scored_xg_5game_form,opponent_goals_conceded_xg_5game_form,team_goals_scored_actual_5game_form,team_goals_conceded_actual_5game_form,opponent_goals_scored_actual_5game_form,opponent_goals_conceded_actual_5game_form,team_goals_scored_xg_10game_form,team_goals_conceded_xg_10game_form,opponent_goals_scored_xg_10game_form,opponent_goals_conceded_xg_10game_form,team_goals_scored_actual_10game_form,team_goals_conceded_actual_10game_form,opponent_goals_scored_actual_10game_form,opponent_goals_conceded_actual_10game_form,elodiff,home,pred_goals_scored,pred_goals_conceded
0,6,Sat,2017-09-23,Watford,Swansea City,1.2,1.7,2017-2018,51,15:00,2.0,1.0,6,1635.701538,1660.233643,0.5,1.566667,0.7,1.0,0.666667,2.0,0.666667,0.333333,1.2,1.66,0.56,1.6,1.4,1.8,0.4,1.0,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,-24.532105,0,1.213222,1.545529
1,6,Sat,2017-09-23,Swansea City,Watford,1.7,1.2,2017-2018,51,15:00,1.0,2.0,6,1660.233643,1635.701538,0.7,1.0,0.5,1.566667,0.666667,0.333333,0.666667,2.0,0.56,1.6,1.2,1.66,0.4,1.0,1.4,1.8,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,24.532105,1,1.545529,1.213222
2,6,Sat,2017-09-23,Burnley,Huddersfield,0.3,0.3,2017-2018,52,15:00,0.0,0.0,6,1661.930664,1518.883423,0.466667,2.166667,0.833333,1.233333,1.0,0.666667,0.333333,1.0,0.66,1.78,0.86,1.1,1.2,1.0,1.0,0.6,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,143.047241,1,1.53414,0.882562
3,6,Sat,2017-09-23,Huddersfield,Burnley,0.3,0.3,2017-2018,52,15:00,0.0,0.0,6,1518.883423,1661.930664,0.833333,1.233333,0.466667,2.166667,0.333333,1.0,1.0,0.666667,0.86,1.1,0.66,1.78,1.0,0.6,1.2,1.0,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,-143.047241,0,0.882562,1.53414
4,6,Sat,2017-09-23,Crystal Palace,Manchester City,0.6,4.5,2017-2018,53,15:00,0.0,5.0,6,1598.233276,1908.661133,1.633333,0.666667,2.366667,0.566667,0.0,1.333333,4.333333,0.333333,1.34,1.2,2.02,0.52,0.0,1.6,3.2,0.4,1.0,1.48,1.48,1.0,0.9,1.7,1.7,0.9,-310.427857,0,0.389978,2.447532


In [24]:
team_fixtures.to_csv('predicting_team_goals.csv', index=False)

We have successfully generated predictions for goals scored and conceded for every game using the available data. However, as we are required to forecast FPL teams well in advance of each game, rather than just before each game week, we need to predict goals scored and conceded for each specific game based on past game data from all preceding weeks. For example, in GW10, we will predict the goals scored and conceded from GW10 until the end of the season using game data from GW1 to GW9. 

In [25]:
final_team_data = team_fixtures[['season', 'fpl_game_week', 'date', 'team', 'opponent', 'game_id', 
                                 'team_elo', 'opponent_elo', 'elodiff', 'home', 'team_goals_scored_xg_5game_form',
                                 'team_goals_conceded_xg_5game_form', 'opponent_goals_conceded_xg_5game_form',
                                 'team_goals', 'opponent_goals']]

In [26]:
final_team_data.head()

Unnamed: 0,season,fpl_game_week,date,team,opponent,game_id,team_elo,opponent_elo,elodiff,home,team_goals_scored_xg_5game_form,team_goals_conceded_xg_5game_form,opponent_goals_conceded_xg_5game_form,team_goals,opponent_goals
0,2017-2018,6,2017-09-23,Watford,Swansea City,51,1635.701538,1660.233643,-24.532105,0,1.2,1.66,1.6,2.0,1.0
1,2017-2018,6,2017-09-23,Swansea City,Watford,51,1660.233643,1635.701538,24.532105,1,0.56,1.6,1.66,1.0,2.0
2,2017-2018,6,2017-09-23,Burnley,Huddersfield,52,1661.930664,1518.883423,143.047241,1,0.66,1.78,1.1,0.0,0.0
3,2017-2018,6,2017-09-23,Huddersfield,Burnley,52,1518.883423,1661.930664,-143.047241,0,0.86,1.1,1.78,0.0,0.0
4,2017-2018,6,2017-09-23,Crystal Palace,Manchester City,53,1598.233276,1908.661133,-310.427857,0,1.34,1.2,0.52,0.0,5.0


In [27]:
game_weeks = [i for i in range(1,39)]

Getting current form and future games we need to predict goals for based on current form.

In [28]:
def get_current_form(df):
    return (df
            .query(f'fpl_game_week <= {week} & season == "2018-2019"')
            .loc[:, ['team', 'team_elo', 'team_goals_scored_xg_5game_form', 'team_goals_conceded_xg_5game_form']]
            .drop_duplicates(subset='team', keep='last'))

Getting future games and convert future and current form into data sets that matches model format.

In [29]:
def get_future_games(df, final_team_data):
    # future games
    future_games = (final_team_data
                    .query(f'fpl_game_week >= {week} & season == "2018-2019"')
                    .loc[:, ['fpl_game_week', 'date', 'team', 'opponent', 'home']])
    
    return (df
            # merging current form and future games
            .merge(future_games, on='team', how='outer')
            .merge(df, left_on='opponent', right_on='team', how='outer')

            # setting column names
            .set_axis(['team', 'team_elo', 'team_goals_scored_xg_5game_form', 'team_goals_conceded_xg_5game_form', 
                       'fpl_game_week', 'date', 'opponent', 'home', 'opponent_2', 'opponent_elo', 
                       'opponent_goals_scored_xg_5game_form', 'opponent_goals_conceded_xg_5game_form'], axis='columns')

            # dropping columns we dont need and sort, reset, index of df
            .drop(columns=['team_goals_conceded_xg_5game_form', 'opponent_2', 'opponent_goals_scored_xg_5game_form'])
            .sort_values(by='fpl_game_week')
            .reset_index(drop=True)

            # create elo difference column and drop na
            .assign(elodiff=lambda x: x['team_elo'] - x['opponent_elo'])
            .dropna())

Predict goals scored for all future matches based on current form.

In [30]:
def fitting_model(df):
    # scale data
    x = df[['fpl_game_week', 'team_elo', 'opponent_elo', 'team_goals_scored_xg_5game_form',
                   'opponent_goals_conceded_xg_5game_form', 'elodiff', 'home']]
    x = x[x.columns[0:]] 
    ss = preprocessing.StandardScaler()
    x = pd.DataFrame(ss.fit_transform(x),columns = x.columns)

    # predict goals
    df['pred_goals_scored'] = reg_bay.predict(x)
    return df

Adding predicted goals conceded and adjust for when teams have multiple matches in a single game week by getting total predicted goals scored and conceded from all games played in game week rather than having too seperate rows for each game played.

In [31]:
def add_conceded_adjust(df):
    return (df[['team', 'pred_goals_scored', 'fpl_game_week', 'date']]

            # getting predicted goals conceded from opponents predicted goals scored
            .merge(df[['opponent', 'pred_goals_scored', 'fpl_game_week']], 
                   left_on=['fpl_game_week', 'team'], right_on=['fpl_game_week', 'opponent'])
            .loc[:, ['date', 'fpl_game_week', 'team', 'pred_goals_scored_x', 'pred_goals_scored_y']]
            .rename(columns={'pred_goals_scored_x': 'pred_goals_scored', 
                             'pred_goals_scored_y': 'pred_goals_conceded'})

            # where teams play multiple games in a single game week getting mean of both games and 
            # multiplying by 2 to get final predicted scored and conceded
            .groupby(['team', 'fpl_game_week'])
            .agg(count=('team', 'count'), 
                 team=('team', 'first'), 
                 fpl_gw=('fpl_game_week', 'first'), 
                 date=('date', 'first'), 
                 scored=('pred_goals_scored', 'mean'), 
                 conceded=('pred_goals_conceded', 'mean'))
            .assign(scored=lambda x: x['scored'].where(x['count'] != 4, x['scored'] * 2))
            .assign(conceded=lambda x: x['conceded'].where(x['count'] != 4, x['conceded'] * 2))
            .assign(count=lambda x: x['count'].where(x['count'] != 4, 2)))

Looping through each game week getting updated form and predicting future goals scored and conceded based on updated form.

In [32]:
for week in game_weeks:
    predict_goals = (
        get_current_form(final_team_data)
        .pipe(get_future_games, final_team_data)
        .pipe(fitting_model)
        .pipe(add_conceded_adjust)
    )
    predict_goals.to_csv(f'predicting_weekly_team_goals/2018-19/game_week_{week}.csv', index=False)

Having obtained the predicted goals scored and conceded for every game played by a team, we are now ready to progress to the next notebook, where we will focus on predicting specific FPL player points.