In [21]:
# Custom package
import footballanalysis.model.bayes as fmb

import pickle
import numpy as np
import pandas as pd

import pymc3 as pm
from sklearn.metrics import log_loss

In [2]:
def _match_outcome(row):
    """Label one result row as 'HomeWin', 'Draw' or 'AwayWin' from its two scores."""
    if row['HomeScore'] > row['AwayScore']:
        return 'HomeWin'
    if row['HomeScore'] == row['AwayScore']:
        return 'Draw'
    return 'AwayWin'


# Load the match results and tag every row with its categorical outcome.
results = pd.read_csv('../data/results.csv')
results['Outcome'] = results.apply(_match_outcome, axis=1)

# Keep only the rows belonging to season 1.
season1 = results.loc[results['SeasonID'] == 1]

In [3]:
# Number of distinct home-team IDs in season 1.
num_teams = season1['HomeTeamID'].nunique()

# Reshape to "long" form: each match contributes one row from the home side
# (Home_ad = 1) and one row from the away side (Home_ad = 0).
home_ad_results = (
    season1[['HomeTeamID', 'AwayTeamID', 'HomeScore']]
    .rename(columns={'HomeTeamID': 'Team', 'AwayTeamID': 'Opponent', 'HomeScore': 'goals'})
    .assign(Home_ad=1)
)
away_ad_results = (
    season1[['AwayTeamID', 'HomeTeamID', 'AwayScore']]
    .rename(columns={'AwayTeamID': 'Team', 'HomeTeamID': 'Opponent', 'AwayScore': 'goals'})
    .assign(Home_ad=0)
)
all_ad_results = pd.concat([home_ad_results, away_ad_results])

# Shift both ID columns down by one.
# NOTE(review): presumably the raw IDs run 1..num_teams so that the shifted
# values are valid zero-based positions - confirm against the data.
all_ad_results['Team'] = all_ad_results['Team'].sub(1)
all_ad_results['Opponent'] = all_ad_results['Opponent'].sub(1)

In [4]:
# Log-linear Poisson model for goals scored:
#   log(expected goals) = team_effect[team] - opponent_effect[opponent]
#                         + home_adv_effect * Home_ad
with pm.Model() as model:
    # Priors: team, opponent, home
    team_effect = pm.Normal("team_effect", mu=0, sigma=1, shape=num_teams)
    opponent_effect = pm.Normal("opponent_effect", mu=0, sigma=1, shape=num_teams)
    home_adv_effect = pm.Normal("home_adv_effect", mu=0, sigma=1)
    # NOTE(review): `sigma` never enters `mu` or the Poisson likelihood below,
    # so it is only sampled from its prior. It is kept so the trace still
    # contains the same variables; confirm nothing downstream reads it, then
    # consider removing it.
    sigma = pm.HalfNormal("sigma", sigma=1)

    # data
    teams = all_ad_results['Team'].to_list()
    opponents = all_ad_results['Opponent'].to_list()
    home_advantage = all_ad_results['Home_ad'].to_list()
    goals = all_ad_results['goals'].to_list()

    # model (linear predictor on the log scale)
    mu = (team_effect[teams] - opponent_effect[opponents] + home_adv_effect * home_advantage)

    # y-values: exponentiate so the Poisson rate is always positive
    y_obs = pm.Poisson("y_obs", mu=pm.math.exp(mu), observed=goals)

    # A previous run reported divergences and the sampler suggested raising
    # `target_accept`; the fixed seed makes the posterior draws reproducible.
    trace = pm.sample(
        10000,
        target_accept=0.9,
        random_seed=42,
        return_inferencedata=True,
    )

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, home_adv_effect, opponent_effect, team_effect]


Sampling 4 chains for 1_000 tune and 10_000 draw iterations (4_000 + 40_000 draws total) took 344 seconds.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 10% for some parameters.


#### Use model to predict outcomes and test

In [5]:
# Read the fixture list and pull the home ID, away ID and gameweek columns
# out as plain Python lists.
fixtures = pd.read_csv('../data/fixtures.csv')
homeids, awayids, gameweeks = (
    fixtures[column].to_list()
    for column in ('HomeTeamID', 'AwayTeamID', 'Gameweek')
)

In [6]:
# Build one row per fixture from fmb.get_probs, in fixture order.
match_probs = pd.DataFrame([
    fmb.get_probs(home_id, away_id, gameweek, trace)
    for home_id, away_id, gameweek in zip(homeids, awayids, gameweeks)
])


def _model_predicted_outcome(row):
    """Pass a row's home/away/draw predictions to fmb.prob_result."""
    return fmb.prob_result(row['HomeWinPred'], row['AwayWinPred'], row['DrawPred'])


# NOTE(review): prob_result presumably returns the most likely outcome label
# (it is later compared against 'Outcome') - confirm in fmb.
match_probs['Prob_outcome'] = match_probs.apply(_model_predicted_outcome, axis=1)

##### Test: compare predictions with the true outcomes (accuracy) and with the bookmaker odds (log loss)

In [7]:
# Actual season-2 outcomes, encoded as integers in the same order as the
# probability columns used for log loss: HomeWin=0, AwayWin=1, Draw=2.
season2 = results[results['SeasonID']==2].copy()
season2['Outcome_encoded'] = season2['Outcome'].map({'HomeWin': 0, 'AwayWin': 1, 'Draw': 2})
season2_outcome = season2[['Outcome', 'Outcome_encoded']].reset_index()

# The assignment below pairs predictions and outcomes purely by row position,
# so verify that the fixture list and the season-2 results describe the same
# matches in the same order before trusting any metric computed from it.
if len(season2_outcome) != len(match_probs):
    raise ValueError(
        f"Season 2 has {len(season2_outcome)} results but there are "
        f"{len(match_probs)} predicted fixtures; cannot align them by position."
    )
if (
    (fixtures['HomeTeamID'].to_numpy() != season2['HomeTeamID'].to_numpy()).any()
    or (fixtures['AwayTeamID'].to_numpy() != season2['AwayTeamID'].to_numpy()).any()
):
    raise ValueError(
        "Fixture order does not match the season 2 results order; "
        "predictions and outcomes would be misaligned."
    )

match_probs[['Outcome', 'Outcome_encoded']] = season2_outcome[['Outcome', 'Outcome_encoded']]

In [8]:
### Accuracy
# Fraction of rows whose predicted outcome label equals the recorded outcome.
lm_accuracy = match_probs['Prob_outcome'].eq(match_probs['Outcome']).mean()

In [9]:
### Log-loss of new pred Vs. odds
# `labels` pins the column order (0=HomeWin, 1=AwayWin, 2=Draw) explicitly, so
# the probabilities stay matched to the right class even if one outcome never
# occurs in the evaluated matches.
log_loss_lm = log_loss(
    match_probs['Outcome_encoded'].values,
    match_probs[['HomeWinPred', 'AwayWinPred', 'DrawPred']].values,
    labels=[0, 1, 2],
)

In [10]:
# Load the odds and replace each of the three columns with its reciprocal.
odds = pd.read_csv('../data/odds.csv')
odds[['Home', 'Draw', 'Away']] = odds[['Home', 'Draw', 'Away']].rdiv(1)

# Join results to odds on MatchID, then keep season 2 with a fresh 0..n-1 index
# and the same integer outcome encoding used for the model predictions.
results_odds = results.merge(odds, on='MatchID')
season2_ro = (
    results_odds.loc[results_odds['SeasonID'] == 2]
    .reset_index(drop=True)
    .assign(
        Outcome_encoded=lambda frame: frame['Outcome'].map(
            {'HomeWin': 0, 'AwayWin': 1, 'Draw': 2}
        )
    )
)

In [11]:
def _odds_predicted_outcome(row):
    """Pass a row's inverted home/away/draw odds to fmb.prob_result."""
    return fmb.prob_result(row['Home'], row['Away'], row['Draw'])


season2_ro['Prob_outcome_odds'] = season2_ro.apply(_odds_predicted_outcome, axis=1)
### Accuracy
# Fraction of season-2 matches where the odds-based label equals the outcome.
odds_accuracy = season2_ro['Prob_outcome_odds'].eq(season2_ro['Outcome']).mean()

In [12]:
# Normalise odds: the inverted odds do not add up to 1, so divide each row by
# its own total to obtain a proper probability distribution.
season2_ro['norm_factor'] = season2_ro[['Home', 'Away', 'Draw']].sum(axis=1)
season2_ro[['Home_n', 'Away_n', 'Draw_n']] = season2_ro[['Home', 'Away', 'Draw']].div(season2_ro['norm_factor'], axis=0)

# `labels` pins the column order (0=HomeWin, 1=AwayWin, 2=Draw) explicitly, so
# the probabilities stay matched to the right class even if one outcome never
# occurs in the evaluated matches.
log_loss_odds = log_loss(
    season2_ro['Outcome_encoded'].values,
    season2_ro[['Home_n', 'Away_n', 'Draw_n']].values,
    labels=[0, 1, 2],
)

In [13]:
# Display season-2 accuracy: model predictions first, odds-based predictions second.
lm_accuracy, odds_accuracy

(0.6124338624338624, 0.6309523809523809)

In [14]:
# Display season-2 log loss: model probabilities first, normalised odds second.
log_loss_lm, log_loss_odds

(0.8735667797459136, 0.8623406373108837)

#### Table Finishes Data 

In [16]:
# Simulate every fixture `n_sims` times.
# sim_results[i, 0, s] / sim_results[i, 1, s] hold the home / away value drawn
# for fixture i in simulation s.
homeids = fixtures['HomeTeamID'].to_list()
awayids = fixtures['AwayTeamID'].to_list()

n_sims = 1000
# Seeded generator so the simulated seasons are reproducible across runs.
sim_rng = np.random.RandomState(42)

# Size the array from the fixture list instead of a hard-coded match count.
sim_results = np.zeros([fixtures.shape[0], 2, n_sims])
for i, (homeid, awayid) in enumerate(zip(homeids, awayids)):
    h, a = fmb.get_match_predictions(homeid, awayid, trace)
    # Draw n_sims values (without replacement) from each side's predictions.
    # NOTE(review): h and a are presumably posterior-predictive score draws
    # with at least n_sims entries each - confirm in fmb.
    homescore = pd.Series(h).sample(n_sims, random_state=sim_rng).values
    awayscore = pd.Series(a).sample(n_sims, random_state=sim_rng).values
    sim_results[i, 0, :] = homescore
    sim_results[i, 1, :] = awayscore

In [19]:
# For every simulated season, rank the teams by total points and record each
# team's finishing position (1 = top of the table).
table_finishes = {team_id: [] for team_id in fixtures['HomeTeamID'].unique()}

# Iterate over however many simulations sim_results actually holds.
for x in range(sim_results.shape[2]):
    points_by_week_sim = fmb.create_sim_season(fixtures, sim_results, x)
    final_table_sim = (
        points_by_week_sim.groupby(['ID'])
        .agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum'})
        .reset_index()
        .sort_values('Points', ascending=False)
        .reset_index(drop=True)
    )
    # NOTE(review): teams level on points are ordered arbitrarily here; 'For'
    # and 'Against' are aggregated but not used as tie-breakers. Decide whether
    # goal difference should break ties.
    for position, team_id in enumerate(final_table_sim['ID'], start=1):
        table_finishes[team_id].append(position)

In [22]:
# Serialise the per-team finishing positions to disk.
with open('table_finishes_lm.pkl', mode='wb') as pkl_file:
    pickle.dump(table_finishes, pkl_file)