In [1]:
import pickle

import pandas as pd
import numpy as np
import pymc3 as pm
from scipy.stats import gamma

import footballanalysis.transform.transform as ft
import footballanalysis.model.bayes as fmb



In [3]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
fixtures = pd.read_csv('../data/fixtures.csv')
# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1)

In [4]:
home = results[['Gameweek', 'HomeTeamID', 'HomeScore']]
home.columns = ['Gameweek', 'TeamID', 'Score']
away = results[['Gameweek', 'AwayTeamID', 'AwayScore']]
away.columns = ['Gameweek', 'TeamID', 'Score']
goals_df = pd.concat([home, away])

teamids = sorted(goals_df['TeamID'].drop_duplicates().to_list())
past_goals = {str(x): goals_df[goals_df['TeamID']==x]['Score'].to_list() for x in teamids}

In [5]:
model = pm.Model()

with model:
    alpha = pm.Exponential('alpha', lam=1)
    beta = pm.Exponential('beta', lam=1)
    
    mu = dict()
    goals = dict()
    for name, observed in past_goals.items():
        mu[name] = pm.Gamma('mu_'+ str(name), alpha, beta)
        goals[name] = pm.Poisson(name, mu[name], observed=observed)
        
    trace = pm.sample(500)

  trace = pm.sample(500)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_28, mu_27, mu_26, mu_25, mu_24, mu_23, mu_22, mu_21, mu_20, mu_19, mu_18, mu_17, mu_16, mu_15, mu_14, mu_13, mu_12, mu_11, mu_10, mu_9, mu_8, mu_7, mu_6, mu_5, mu_4, mu_3, mu_2, mu_1, beta, alpha]


Sampling 4 chains for 1_000 tune and 500 draw iterations (4_000 + 2_000 draws total) took 52 seconds.
There were 7 divergences after tuning. Increase `target_accept` or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.


In [6]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=10000)

In [7]:
# Create grid of results
results_df = pd.DataFrame({x : post_pred[x].flatten() for x in post_pred.keys()})
# Get HomeID, Away ID
homeid = fixtures['HomeTeamID'].to_list()
awayid = fixtures['AwayTeamID'].to_list()
gameweek = fixtures['Gameweek'].to_list()

In [8]:
# Get results from all seasons, filter for season 2
season2 = results[results['SeasonID']==2]
results_df_m = pd.DataFrame([fmb.predict_match(str(homeid[i]),
                                           str(awayid[i]),
                                           gameweek[i],
                                           results_df) for i in range(len(homeid))])
results_df_m['HomeTeamID'] = results_df_m['HomeTeamID'].astype('int')
results_df_m['AwayTeamID'] = results_df_m['AwayTeamID'].astype('int')

results_df_m['Prob_outcome'] = results_df_m.apply(lambda row: fmb.prob_result(row['HomeWin'], row['AwayWin'],
                                                             row['Draw']), axis=1)
season2_outcome = season2['Outcome'].reset_index()
results_df_m['Outcome'] = season2_outcome['Outcome']

In [9]:
(results_df_m['Prob_outcome']==results_df_m['Outcome']).sum() / results_df_m.shape[0]
# to add: precision, recall, f1 score, confusion matrix, Cohen's kappa?, log loss

0.6084656084656085

In [10]:
# Predict season outcome x1000 - done, nice work
homeids = fixtures['HomeTeamID'].to_list()
awayids = fixtures['AwayTeamID'].to_list()
sim_results = np.zeros([756, 2, 1000])
for i in range(fixtures.shape[0]):
    homeid = str(homeids[i])
    awayid = str(awayids[i])
    homescore = results_df[homeid].sample(1000).values
    awayscore = results_df[awayid].sample(1000).values
    sim_results[i, 0, :] = homescore
    sim_results[i, 1, :] = awayscore
    
table_finishes = {x : [] for x in fixtures['HomeTeamID'].unique()}

In [11]:
table_finishes = {x : [] for x in fixtures['HomeTeamID'].unique()}
for x in range(1000):
    points_by_week_sim = fmb.create_sim_season(fixtures, sim_results, x)
    final_table_sim = (points_by_week_sim.groupby(['ID']).agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum'})
                .reset_index()
                .sort_values('Points', ascending=False)
                )
    final_table_sim = final_table_sim.reset_index(drop=True)
    [table_finishes[final_table_sim['ID'][i]].append(i+1) for i in range(final_table_sim.shape[0])]

In [12]:
with open('table_finishes.pkl', 'wb') as file:
    pickle.dump(table_finishes, file)