In [18]:
import pandas as pd
import pymc3 as pm
import numpy as np
from scipy.stats import gamma

import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
def results_points(a, b):
    """Return the league points earned by each side of one match.

    Scoring is 3 points for a win, 1 each for a draw and 0 for a loss.

    Parameters
    ----------
    a : number
        Goals scored by the first side.
    b : number
        Goals scored by the second side.

    Returns
    -------
    list of int
        ``[points_for_a, points_for_b]``.

    Raises
    ------
    ValueError
        If the two scores cannot be ordered, e.g. either one is NaN.
    """
    if a > b:
        return [3, 0]
    if b > a:
        return [0, 3]
    if a == b:
        return [1, 1]
    # Reached only when every comparison above is False (NaN scores).
    # Raise instead of handing the exception class back as if it were points.
    raise ValueError("cannot compare scores {!r} and {!r}".format(a, b))
    
def prob_result(hw, aw, d):
    """Return the label of the most probable match outcome.

    Parameters
    ----------
    hw, aw, d : float
        Probabilities of a home win, an away win and a draw.

    Returns
    -------
    str
        ``'HomeWin'``, ``'AwayWin'`` or ``'Draw'``. When probabilities tie,
        the first maximum in that order is chosen (``np.argmax`` semantics).

    Raises
    ------
    ValueError
        If any of the three probabilities is NaN.
    """
    labels = ('HomeWin', 'AwayWin', 'Draw')
    probs = np.array([hw, aw, d], dtype=float)
    # argmax would silently return the position of the first NaN, turning a
    # missing probability into a confident-looking prediction.
    if np.isnan(probs).any():
        raise ValueError(
            "outcome probabilities must not be NaN: {!r}".format((hw, aw, d)))
    return labels[int(np.argmax(probs))]
    
def get_points_by_week(df):
    """Stack the home and away view of every match into one long table.

    Each input row (one match) yields two output rows, one per side, with
    the columns ``Gameweek``, ``ID``, ``Points``, ``For`` and ``Against``.
    All home rows come first, followed by all away rows; the original row
    index is kept, so every index label appears twice. ``df`` is not
    modified.
    """
    shared_names = ['Gameweek', 'ID', 'Points', 'For', 'Against']
    # Column selections in the same positional order as ``shared_names``.
    per_side_columns = (
        ['Gameweek', 'HomeTeamID', 'HomePoints', 'HomeScore', 'AwayScore'],
        ['Gameweek', 'AwayTeamID', 'AwayPoints', 'AwayScore', 'HomeScore'],
    )
    sides = []
    for wanted in per_side_columns:
        side = df[wanted].copy()
        side.columns = shared_names
        sides.append(side)
    return pd.concat(sides)

def predict_match(homeid, awayid, gameweek, results_df):
    """Estimate outcome probabilities for one fixture from simulated goals.

    ``results_df[homeid]`` and ``results_df[awayid]`` are compared element
    by element; the share of positions where the home value is higher,
    lower or equal becomes the ``HomeWin``, ``AwayWin`` and ``Draw``
    probability respectively.

    Returns a dict with the keys ``Gameweek``, ``HomeTeamID``,
    ``AwayTeamID``, ``HomeWin``, ``AwayWin`` and ``Draw`` (in that order).
    """
    home_goals, away_goals = results_df[homeid], results_df[awayid]

    outcome_probs = {
        'HomeWin': np.mean(home_goals > away_goals),
        'AwayWin': np.mean(home_goals < away_goals),
        'Draw': np.mean(home_goals == away_goals),
    }

    summary = {'Gameweek': gameweek, 'HomeTeamID': homeid, 'AwayTeamID': awayid}
    summary.update(outcome_probs)
    return summary

def create_sim_season(fixtures, sim_results, i):
    """Build the per-team points table for simulated season number ``i``.

    The fixture list is combined with the home scores
    ``sim_results[:, 0, i]`` and away scores ``sim_results[:, 1, i]``,
    each match is converted to points with ``results_points``, and the
    result is reshaped to one row per team and match by
    ``get_points_by_week``.
    """
    season = fixtures[['Gameweek', 'HomeTeamID', 'AwayTeamID']].copy()
    season['HomeScore'], season['AwayScore'] = sim_results[:, 0, i], sim_results[:, 1, i]

    def _match_points(match):
        return results_points(match['HomeScore'], match['AwayScore'])

    # Each entry of 'Points' is a two-element [home, away] list; it is then
    # split into two separate columns.
    season['Points'] = season.apply(_match_points, axis=1)
    split_points = pd.DataFrame(season['Points'].tolist(), index=season.index)
    season[['HomePoints', 'AwayPoints']] = split_points

    return get_points_by_week(season)

In [11]:
# Load the historical results, the team list and the fixture list.
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
fixtures = pd.read_csv('../data/fixtures.csv')

# Label every historical match with its outcome. Vectorised instead of a
# row-wise apply; as before, anything that is neither a home win nor a draw
# (including rows with a missing score) is labelled 'AwayWin'.
results['Outcome'] = np.where(
    results['HomeScore'] > results['AwayScore'],
    'HomeWin',
    np.where(results['HomeScore'] == results['AwayScore'], 'Draw', 'AwayWin'),
)

In [12]:
# Reshape the results into one long (Gameweek, TeamID, Score) table that
# holds both the home and the away side of every match.
home = results[['Gameweek', 'HomeTeamID', 'HomeScore']].rename(
    columns={'HomeTeamID': 'TeamID', 'HomeScore': 'Score'})
away = results[['Gameweek', 'AwayTeamID', 'AwayScore']].rename(
    columns={'AwayTeamID': 'TeamID', 'AwayScore': 'Score'})
goals_df = pd.concat([home, away])

# Goals scored per team, keyed by the team ID as a string.
teamids = sorted(goals_df['TeamID'].unique().tolist())
past_goals = {
    str(team): goals_df.loc[goals_df['TeamID'] == team, 'Score'].tolist()
    for team in teamids
}

In [13]:
# Hierarchical Poisson model of the goals each team scores per match.
model = pm.Model()

with model:
    # Hyper-priors shared by every team's scoring rate.
    alpha = pm.Exponential('alpha', lam=1)
    beta = pm.Exponential('beta', lam=1)
    
    # Per-team random variables, keyed by the same string IDs as past_goals.
    mu = dict()
    goals = dict()
    for name, observed in past_goals.items():
        # Team scoring rate: Gamma prior built from the shared alpha and beta.
        mu[name] = pm.Gamma('mu_'+ str(name), alpha, beta)
        # Poisson likelihood over that team's recorded goal counts; the
        # variable is named after the team ID itself.
        goals[name] = pm.Poisson(name, mu[name], observed=observed)
        
    # 500 draws with the default sampler settings. The recorded output of this
    # cell reports a few divergences; the commented-out argument on the next
    # line sketches a higher target_accept.
    trace = pm.sample(500)#, nuts_kwargs=dict(target_accept=0.95))

  trace = pm.sample(500)#, nuts_kwargs=dict(target_accept=0.95))
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_28, mu_27, mu_26, mu_25, mu_24, mu_23, mu_22, mu_21, mu_20, mu_19, mu_18, mu_17, mu_16, mu_15, mu_14, mu_13, mu_12, mu_11, mu_10, mu_9, mu_8, mu_7, mu_6, mu_5, mu_4, mu_3, mu_2, mu_1, beta, alpha]


Sampling 4 chains for 1_000 tune and 500 draw iterations (4_000 + 2_000 draws total) took 52 seconds.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.


In [14]:
# Draw posterior-predictive samples for the observed goal variables from the
# fitted trace (samples=10000 is requested).
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=10000)

In [15]:
# One column of flattened posterior-predictive draws per variable in post_pred
# (presumably one per team ID -- verify against the model cell).
results_df = pd.DataFrame(
    {name: draws.flatten() for name, draws in post_pred.items()})

# Fixture columns as plain Python lists: home ID, away ID and gameweek.
homeid, awayid, gameweek = (
    fixtures[column].to_list()
    for column in ('HomeTeamID', 'AwayTeamID', 'Gameweek')
)

In [22]:
# Keep only the season-2 rows of the historical results.
season2 = results.loc[results['SeasonID'] == 2]

# Outcome probabilities for every fixture. Team IDs are passed as strings
# because that is how the columns of results_df are looked up here.
results_df_m = pd.DataFrame([
    predict_match(str(home_team), str(away_team), week, results_df)
    for home_team, away_team, week in zip(homeid, awayid, gameweek)
])
# predict_match echoes the string IDs back; restore them to integers.
results_df_m = results_df_m.astype({'HomeTeamID': 'int', 'AwayTeamID': 'int'})

# Most probable outcome label per fixture.
results_df_m['Prob_outcome'] = results_df_m.apply(
    lambda fixture: prob_result(fixture['HomeWin'], fixture['AwayWin'], fixture['Draw']),
    axis=1,
)

# Actual outcomes are attached by row position.
# NOTE(review): assumes the season-2 results are in the same order as the
# fixtures -- confirm.
season2_outcome = season2[['Outcome']].reset_index()
results_df_m['Outcome'] = season2_outcome['Outcome']

In [23]:
# Accuracy: share of fixtures whose most probable outcome equals the actual one.
# TODO: add precision, recall, F1 score, confusion matrix, Cohen's kappa (?), log loss
(results_df_m['Prob_outcome'] == results_df_m['Outcome']).mean()

0.6084656084656085

In [25]:
# Simulate every fixture n_sims times by sampling each side's goals from its
# posterior-predictive draws in results_df.
# sim_results has shape (n_fixtures, 2, n_sims): axis 1 is [home, away].
n_sims = 1000
homeids = fixtures['HomeTeamID'].to_list()
awayids = fixtures['AwayTeamID'].to_list()
# Size the array from the fixture list rather than a hard-coded row count.
sim_results = np.zeros([len(homeids), 2, n_sims])
# Loop names are deliberately distinct from the module-level lists homeid and
# awayid, so that re-running the per-fixture prediction cell after this one
# still sees lists rather than a single string ID.
for match_idx, (home_team, away_team) in enumerate(zip(homeids, awayids)):
    sim_results[match_idx, 0, :] = results_df[str(home_team)].sample(n_sims).values
    sim_results[match_idx, 1, :] = results_df[str(away_team)].sample(n_sims).values

In [26]:
# Tally each team's finishing position over every simulated season.
table_finishes = {team_id: [] for team_id in fixtures['HomeTeamID'].unique()}
# Iterate over however many simulations sim_results actually holds.
for sim_idx in range(sim_results.shape[2]):
    points_by_week_sim = create_sim_season(fixtures, sim_results, sim_idx)
    # A stable sort keeps teams that are level on points in the groupby (ID)
    # order, so tied positions are deterministic.
    # NOTE(review): no goal-difference tie-break is applied -- confirm whether
    # the league rules need one.
    final_table_sim = (points_by_week_sim
                       .groupby(['ID'])
                       .agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum'})
                       .reset_index()
                       .sort_values('Points', ascending=False, kind='mergesort')
                       .reset_index(drop=True))
    # Positions are 1-based: the first row is the season winner.
    for position, team_id in enumerate(final_table_sim['ID'], start=1):
        table_finishes[team_id].append(position)

In [None]:
# How often team ID 15 finished in each position across the simulated seasons.
pd.Series(table_finishes[15]).value_counts()