In [1]:
import pandas as pd
import pymc3 as pm
import numpy as np
from scipy.stats import gamma

from empiricaldist import Pmf
import matplotlib.pyplot as plt



In [2]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
fixtures = pd.read_csv('../data/fixtures.csv')
# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1
)

In [3]:
home = results[['Gameweek', 'HomeTeamID', 'HomeScore']]
home.columns = ['Gameweek', 'TeamID', 'Score']
away = results[['Gameweek', 'AwayTeamID', 'AwayScore']]
away.columns = ['Gameweek', 'TeamID', 'Score']
goals_df = pd.concat([home, away])

In [4]:
teamids = sorted(goals_df['TeamID'].drop_duplicates().to_list())
past_goals = {str(x): goals_df[goals_df['TeamID']==x]['Score'].to_list() for x in teamids}

In [5]:
model = pm.Model()

with model:
    alpha = pm.Exponential('alpha', lam=1)
    beta = pm.Exponential('beta', lam=1)
    
    mu = dict()
    goals = dict()
    for name, observed in past_goals.items():
        mu[name] = pm.Gamma('mu_'+ str(name), alpha, beta)
        goals[name] = pm.Poisson(name, mu[name], observed=observed)
        
    trace = pm.sample(500)#, nuts_kwargs=dict(target_accept=0.95))

  trace = pm.sample(500)#, nuts_kwargs=dict(target_accept=0.95))
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_28, mu_27, mu_26, mu_25, mu_24, mu_23, mu_22, mu_21, mu_20, mu_19, mu_18, mu_17, mu_16, mu_15, mu_14, mu_13, mu_12, mu_11, mu_10, mu_9, mu_8, mu_7, mu_6, mu_5, mu_4, mu_3, mu_2, mu_1, beta, alpha]


Sampling 4 chains for 1_000 tune and 500 draw iterations (4_000 + 2_000 draws total) took 309 seconds.
There were 5 divergences after tuning. Increase `target_accept` or reparameterize.


In [6]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=10000)

In [7]:
goals_1 = post_pred['1'].flatten()
goals_2 = post_pred['2'].flatten()

In [8]:
win = np.mean(goals_1 > goals_2)
lose = np.mean(goals_1 < goals_2)
draw = np.mean(goals_1 == goals_2)

In [9]:
win, lose, draw

(0.5088824074074074, 0.24066018518518517, 0.2504574074074074)

In [10]:
## Predict the outcome of all results

In [11]:
# Create grid of results
results_df = pd.DataFrame({x : post_pred[x].flatten() for x in post_pred.keys()})
# Get HomeID, Away ID
homeid = fixtures['HomeTeamID'].to_list()
awayid = fixtures['AwayTeamID'].to_list()
gameweek = fixtures['Gameweek'].to_list()
# Generate W, L, D
# Create Output

In [12]:
def predict_match(homeid, awayid, gameweek, results_df):
    """ Take Home and Away IDs for a fixture and
    predict the result from the means of the simulated goals.
    Assign W, L, D and return dict
    """

    goals_1 = results_df[homeid]
    goals_2 = results_df[awayid]

    win = np.mean(goals_1 > goals_2)
    lose = np.mean(goals_1 < goals_2)
    draw = np.mean(goals_1 == goals_2)

    results = {'Gameweek': gameweek,'HomeTeamID': homeid,
               'AwayTeamID': awayid,
               'HomeWin': win,
               'AwayWin': lose,
               'Draw': draw}

    return results

In [13]:
results_df = pd.DataFrame([predict_match(str(homeid[i]), str(awayid[i]), gameweek[i], results_df) for i in range(len(homeid))])

In [14]:
results_df['HomeTeamID'] = results_df['HomeTeamID'].astype('int')
results_df['AwayTeamID'] = results_df['AwayTeamID'].astype('int')

In [15]:
results_df_n = (results_df.merge(teams, left_on='HomeTeamID', right_on='TeamID')
                .rename(columns={'TeamName': 'HomeTeamName'})
                .drop('TeamID', axis=1)
                .merge(teams, left_on='AwayTeamID', right_on='TeamID')
                .rename(columns={'TeamName': 'AwayTeamName'})
                .drop('TeamID', axis=1)
)
results_df_n[['Gameweek','HomeTeamName', 'AwayTeamName', 'HomeWin', 'AwayWin', 'Draw']].sort_values('Gameweek').head()

Unnamed: 0,Gameweek,HomeTeamName,AwayTeamName,HomeWin,AwayWin,Draw
0,1,Anaheim,Arlington,0.24066,0.508882,0.250457
30,1,San Francisco,Boston,0.134476,0.68083,0.184694
223,1,Philadelphia,Cleveland Queens,0.472811,0.268348,0.258841
416,1,Montreal,Kansas City,0.306266,0.436746,0.256988
191,1,Seattle,Baltimore,0.374327,0.424727,0.200946


In [16]:
# Get results from all seasons, filter for season 2