In [189]:
import pandas as pd
import pymc3 as pm
import numpy as np
import theano.tensor.nnet as nnet
import theano.tensor as tt
from scipy.stats import gamma
from empiricaldist import Pmf

import matplotlib.pyplot as plt

In [190]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
fixtures = pd.read_csv('../data/fixtures.csv')
# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1
)

In [191]:
def results_points(a, b):
    """ Win, lose, draw 3, 0, 1 """
    if a > b:
        return [3, 0]
    if b > a:
        return [0, 3]
    if b == a:
        return [1, 1]
    else:
        return ValueError

In [192]:
results['Points'] = results.apply(lambda row: results_points(row['HomeScore'], row['AwayScore']), axis=1)
results[['HomePoints', 'AwayPoints']] = pd.DataFrame(results['Points'].tolist(), index=results.index)
season1 = results[results['SeasonID']==1]

In [193]:
def get_points_by_week(df):
    new_df = df.copy()
    home = new_df[['Gameweek', 'HomeTeamID', 'HomePoints', 'HomeScore', 'AwayScore']].copy()
    away = new_df[['Gameweek', 'AwayTeamID', 'AwayPoints', 'AwayScore', 'HomeScore']].copy()
    cols = ['Gameweek', 'ID', 'Points', 'For', 'Against']
    home.columns = cols
    away.columns = cols
    points_by_week = pd.concat([home, away])
    return points_by_week

In [194]:
points_by_week_s1 = get_points_by_week(season1)

In [195]:
# Sum points over all weeks by Team
final_table_s1 = (points_by_week_s1.groupby(['ID']).agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum',
                                                         'Gameweek': 'count'})
               .reset_index()
               .sort_values('Points', ascending=False)
               )
final_table_s1['GD'] = final_table_s1['For'] - final_table_s1['Against']
final_table_s1 = final_table_s1.reset_index().drop('index', axis=1)

final_table_s1 = pd.merge(final_table_s1, teams, left_on='ID', right_on='TeamID')
final_table_s1[['TeamName', 'Gameweek','Points', 'For', 'Against', 'GD']].head(5)

Unnamed: 0,TeamName,Gameweek,Points,For,Against,GD
0,Miami,54,138,159,41,118
1,Cincinnati,54,125,130,51,79
2,Baltimore,54,117,136,41,95
3,New York S,54,113,108,52,56
4,Boston,54,106,130,58,72


In [197]:
# for fixture - get fixture
fixture_predict = fixtures[fixtures.index==0]
def get_goals_data(selected_fixture, results_data):
    """Given a fixture row from the data
    filter the data to return the head to head records for
    the teams and then give vectors for their goal scoring
    # TODO: (1) remove head to from all fixtures - remove double count
    (2) Treats home and away as the same - needs to factor this in - create separate model?
    (3) Doesn't use the data for shots taken - can be added into score as XG
    (4) Doesn't account for goals conceded - !! need to look up
    """
    # Assign team A home, and team B away 
    teams_ab = [selected_fixture['HomeTeamID'].values[0], selected_fixture['AwayTeamID'].values[0]]
    # Get matches for each scores for teams against each other
    head2head = results_data[results_data['HomeTeamID'].isin(teams_ab) & 
                             results_data['AwayTeamID'].isin(teams_ab)]
    # create team 1 and team 2 list of goals
    team_a_scores = [head2head[head2head['HomeTeamID']==teams_ab[0]]['HomeScore'].values[0],
                     head2head[head2head['AwayTeamID']==teams_ab[0]]['AwayScore'].values[0]]
    team_b_scores = [head2head[head2head['AwayTeamID']==teams_ab[1]]['AwayScore'].values[0],
                     head2head[head2head['HomeTeamID']==teams_ab[1]]['HomeScore'].values[0]]
    # get all goals for teams
    team_a_goals = pd.concat([results_data[results_data['HomeTeamID']==teams_ab[0]]['HomeScore'],
                              results_data[results_data['AwayTeamID']==teams_ab[0]]['AwayScore']])
    team_b_goals = pd.concat([results_data[results_data['AwayTeamID']==teams_ab[1]]['AwayScore'],
                              results_data[results_data['HomeTeamID']==teams_ab[1]]['HomeScore']])
    
    return {'teams': teams_ab, 'head2head': [team_a_scores, team_b_scores], 'team_goals': [team_a_goals, team_b_goals]}

In [198]:
def get_alpha_beta(teams_data):
    """Take in Teams data and create alpha, beta
    for gamma dist team A, team B
    TODO: (1)check formula for calculating alpha, beta,
    (2) Check what data should go into alpha, beta for goals
    Is the distribution of goals scored before a good measure?
    """
    team_goals = teams_data['team_goals']
    alpha_a = team_goals[0].mean()**2 / team_goals[0].var()
    beta_a = team_goals[0].var() / team_goals[0].mean()

    alpha_b = team_goals[1].mean()**2 / team_goals[1].var()
    beta_b = team_goals[1].var() / team_goals[1].mean()

    return [[alpha_a, beta_a], [alpha_b, beta_b]]

In [199]:
## Single prediction for ID 2 Vs. 1
def fit_gp_model(teams_data, alpha_beta):
    """ Initialise model, fit priors w/ alpha, beta
    update with observed results
    """
    model = pm.Model()
    with model:
        mu_a = pm.Gamma('mu_a', alpha_beta[0][0], alpha_beta[0][1])
        mu_b = pm.Gamma('mu_b', alpha_beta[1][0], alpha_beta[1][1])
        goals_1 = pm.Poisson('goals_a', mu_a, observed=teams_data['head2head'][0])
        goals_2 = pm.Poisson('goals_b', mu_b, observed=teams_data['head2head'][1])
        trace = pm.sample(1000)
    return model, trace

In [200]:
def predict_results(model, trace):
    """
    Sample posterior for predictions and return 1000 runs of
    goals for team A and team B
    TODO: Why do I get 2000 results from 1000 samples?
    """
    with model:
        post_pred = pm.sample_posterior_predictive(trace, samples=1000)
    goals_a = post_pred['goals_a'].flatten()
    goals_b = post_pred['goals_b'].flatten()
    return [goals_a, goals_b]

In [201]:
def get_probabilities(teams_data, match_results):
    """Calculate the result probabilities for
    win, lose, draw for the fixture and format
    with the teams data
    """
    win = np.mean(match_results[0] > match_results[1])
    lose = np.mean(match_results[0] < match_results[1])
    draw = np.mean(match_results[0] == match_results[1])
    result = pd.DataFrame({'HomeTeamID': [teams_data['teams'][0]],
                           'AwayTeamID': [teams_data['teams'][1]],
                           'HomeWin': [win],
                           'AwayWin': [lose],
                           'Draw': [draw]
                           })
    return result

In [202]:
# TODO: Create Bayes data object

In [203]:
# Example: get data
selected_fix = fixtures[fixtures.index==0]
season1 = results[results['SeasonID']==1]
teams_data = get_goals_data(selected_fix, season1)
alpha_beta = get_alpha_beta(teams_data)
model, trace = fit_gp_model(teams_data, alpha_beta)
match_results = predict_results(model, trace)
results_df = get_probabilities(teams_data, match_results)

  trace = pm.sample(1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_b, mu_a]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 39 seconds.


In [204]:
results_df

Unnamed: 0,HomeTeamID,AwayTeamID,HomeWin,AwayWin,Draw
0,2,1,0.093,0.742,0.165


In [34]:
season1 = results[results['SeasonID']==1]

In [37]:
season1[season1['HomeTeamID'].isin([1, 2]) & season1['AwayTeamID'].isin([1, 2])]

Unnamed: 0,SeasonID,Gameweek,MatchID,HomeTeamID,HomeScore,HomeShots,AwayTeamID,AwayScore,AwayShots,Outcome,Points,HomePoints,AwayPoints
14,1,2,15,2,0,9,1,2,18,AwayWin,"[0, 3]",0,3
154,1,12,155,1,6,28,2,1,8,HomeWin,"[3, 0]",3,0


In [63]:
team_1_r = [0, 1]
team_2_r = [2, 6]

In [64]:
# get data for teams
team_1 = 2
team_2 = 1
team_1_goals = pd.concat([season1[season1['HomeTeamID']==team_1]['HomeScore'], season1[season1['AwayTeamID']==team_1]['AwayScore']])
team_2_goals = pd.concat([season1[season1['HomeTeamID']==team_2]['HomeScore'], season1[season1['AwayTeamID']==team_2]['AwayScore']])

In [57]:
alpha1 = team_1_goals.mean()**2 / team_1_goals.var()
beta1 = team_1_goals.var() / team_1_goals.mean()

alpha2 = team_2_goals.mean()**2 / team_2_goals.var()
beta2 = team_2_goals.var() / team_2_goals.mean()

In [65]:
## Single prediction for ID 2 Vs. 1
model = pm.Model()
with model:
    mu_1 = pm.Gamma('mu_1', alpha1, beta1)
    mu_2 = pm.Gamma('mu_2', alpha2, beta2)
    goals_1 = pm.Poisson('goals_1', mu_1, observed=team_1_r)
    goals_2 = pm.Poisson('goals_2', mu_2, observed=team_2_r)
    trace = pm.sample(1000)

  trace = pm.sample(1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_2, mu_1]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 33 seconds.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.


In [93]:
# from empiricaldist import Cdf

# def plot_cdf(sample, **options):
#     """Compute and plot the CDF of a sample."""
#     Cdf.from_seq(sample).plot(**options)
    
# def decorate_rate(ylabel='PDF'):
#     """Decorate the axes."""
#     plt.xlabel('Goals per game (mu)')
#     plt.ylabel(ylabel)
#     plt.title('Distribution of goal scoring rate')
    
#     handles, labels = plt.gca().get_legend_handles_labels()
#     if len(labels):
#         plt.legend()

In [92]:
# mu_1 = trace['mu_1']
# plot_cdf(mu_1, label='mu_TBL posterior')

# mu_2 = trace['mu_2']
# plot_cdf(mu_2, label='mu_BOS posterior')

# decorate_rate('CDF')
# np.mean(mu_2), np.mean(mu_1)

# plt.savefig('zigzag14.png', dpi=150)

In [78]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=1000)



In [82]:
goals_1 = post_pred['goals_1'].flatten()
goals_2 = post_pred['goals_2'].flatten()

In [91]:
# def set_colors():
#     """Set the color cycle for goals"""
#     plt.gca().set_prop_cycle(color=['#2ca02c', '#9467bd',])

# def decorate_goals(ylabel='PMF'):
#     """Decorate the axes."""
#     plt.xlabel('Number of goals')
#     plt.ylabel(ylabel)
#     plt.title('Distribution of goals scored')
#     plt.legend()

# set_colors()
# plot_cdf(goals_1, label='TBL')
# plot_cdf(goals_2, label='BOS')
# decorate_goals('CDF')

# plt.savefig('zigzag15.png', dpi=150)

In [87]:
win = np.mean(goals_1 > goals_2)
lose = np.mean(goals_1 < goals_2)
draw = np.mean(goals_1 == goals_2)

In [89]:
win, lose, draw

(0.088, 0.757, 0.155)