In [72]:
import pandas as pd
import pymc3 as pm
import numpy as np
import theano.tensor.nnet as nnet
import theano.tensor as tt
from scipy.stats import gamma
from empiricaldist import Pmf

import matplotlib.pyplot as plt

In [21]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
fixtures = pd.read_csv('../data/fixtures.csv')
# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1
)

In [22]:
def results_points(a, b):
    """ Win, lose, draw 3, 0, 1 """
    if a > b:
        return [3, 0]
    if b > a:
        return [0, 3]
    if b == a:
        return [1, 1]
    else:
        return ValueError

In [23]:
results['Points'] = results.apply(lambda row: results_points(row['HomeScore'], row['AwayScore']), axis=1)
results[['HomePoints', 'AwayPoints']] = pd.DataFrame(results['Points'].tolist(), index=results.index)
season1 = results[results['SeasonID']==1]

In [24]:
def get_points_by_week(df):
    new_df = df.copy()
    home = new_df[['Gameweek', 'HomeTeamID', 'HomePoints', 'HomeScore', 'AwayScore']].copy()
    away = new_df[['Gameweek', 'AwayTeamID', 'AwayPoints', 'AwayScore', 'HomeScore']].copy()
    cols = ['Gameweek', 'ID', 'Points', 'For', 'Against']
    home.columns = cols
    away.columns = cols
    points_by_week = pd.concat([home, away])
    return points_by_week

In [25]:
points_by_week_s1 = get_points_by_week(season1)

In [26]:
# Sum points over all weeks by Team
final_table_s1 = (points_by_week_s1.groupby(['ID']).agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum',
                                                         'Gameweek': 'count'})
               .reset_index()
               .sort_values('Points', ascending=False)
               )
final_table_s1['GD'] = final_table_s1['For'] - final_table_s1['Against']
final_table_s1 = final_table_s1.reset_index().drop('index', axis=1)

final_table_s1 = pd.merge(final_table_s1, teams, left_on='ID', right_on='TeamID')
final_table_s1[['TeamName', 'Gameweek','Points', 'For', 'Against', 'GD']].head(5)

Unnamed: 0,TeamName,Gameweek,Points,For,Against,GD
0,Miami,54,138,159,41,118
1,Cincinnati,54,125,130,51,79
2,Baltimore,54,117,136,41,95
3,New York S,54,113,108,52,56
4,Boston,54,106,130,58,72


In [31]:
fixtures.head()

Unnamed: 0,SeasonID,Gameweek,MatchID,HomeTeamID,AwayTeamID
0,2,1,757,2,1
1,2,1,758,28,3
2,2,1,759,27,4
3,2,1,760,26,5
4,2,1,761,25,6


In [131]:
# for fixture - get fixture
fixture_predict = fixtures[fixtures.index==0]
def get_goals_data(selected_fixture):
    """Given a fixture row from the data
    filter the data to return the head to head records for
    the teams and then give vectors for their goal scoring
    # TODO: (1) remove head to from all fixtures - remove double count
    (2) Treats home and away as the same - needs to factor this in - create separate model?
    (3) Doesn't use the data for shots taken - can be added into score as XG
    (4) Doesn't account for goals conceded - !! need to look up
    """
    # Assign team A home, and team B away 
    teams_ab = [selected_fixture['HomeTeamID'].values[0], selected_fixture['AwayTeamID'].values[0]]
    # Get matches for each scores for teams against each other
    head2head = season1[season1['HomeTeamID'].isin(teams_ab) & season1['AwayTeamID'].isin(teams_ab)]
    # create team 1 and team 2 list of goals
    team_a_scores = [head2head[head2head['HomeTeamID']==teams_ab[0]]['HomeScore'].values[0],
                     head2head[head2head['AwayTeamID']==teams_ab[0]]['AwayScore'].values[0]]
    team_b_scores = [head2head[head2head['AwayTeamID']==teams_ab[1]]['AwayScore'].values[0],
                     head2head[head2head['HomeTeamID']==teams_ab[1]]['HomeScore'].values[0]]
    # get all goals for teams
    team_a_goals = pd.concat([season1[season1['HomeTeamID']==teams_ab[0]]['HomeScore'],
                              season1[season1['AwayTeamID']==teams_ab[0]]['AwayScore']])
    team_b_goals = pd.concat([season1[season1['AwayTeamID']==teams_ab[1]]['AwayScore'],
                              season1[season1['HomeTeamID']==teams_ab[1]]['HomeScore']])
    
    return {'teams': teams, 'head2head': head2head, 'team_goals': [team_a_goals, team_b_goals]}


In [None]:
get_goals_data()

In [128]:
team_a_goals = pd.concat([season1[season1['HomeTeamID']==teams_ab[0]]['HomeScore'],
 season1[season1['AwayTeamID']==teams_ab[0]]['AwayScore']])
team_b_goals = pd.concat([season1[season1['AwayTeamID']==teams_ab[1]]['AwayScore'],
 season1[season1['HomeTeamID']==teams_ab[1]]['HomeScore']])

In [34]:
season1 = results[results['SeasonID']==1]

In [37]:
season1[season1['HomeTeamID'].isin([1, 2]) & season1['AwayTeamID'].isin([1, 2])]

Unnamed: 0,SeasonID,Gameweek,MatchID,HomeTeamID,HomeScore,HomeShots,AwayTeamID,AwayScore,AwayShots,Outcome,Points,HomePoints,AwayPoints
14,1,2,15,2,0,9,1,2,18,AwayWin,"[0, 3]",0,3
154,1,12,155,1,6,28,2,1,8,HomeWin,"[3, 0]",3,0


In [63]:
team_1_r = [0, 1]
team_2_r = [2, 6]

In [64]:
# get data for teams
team_1 = 2
team_2 = 1
team_1_goals = pd.concat([season1[season1['HomeTeamID']==team_1]['HomeScore'], season1[season1['AwayTeamID']==team_1]['AwayScore']])
team_2_goals = pd.concat([season1[season1['HomeTeamID']==team_2]['HomeScore'], season1[season1['AwayTeamID']==team_2]['AwayScore']])

In [57]:
alpha1 = team_1_goals.mean()**2 / team_1_goals.var()
beta1 = team_1_goals.var() / team_1_goals.mean()

alpha2 = team_2_goals.mean()**2 / team_2_goals.var()
beta2 = team_2_goals.var() / team_2_goals.mean()

In [65]:
## Single prediction for ID 2 Vs. 1
model = pm.Model()
with model:
    mu_1 = pm.Gamma('mu_1', alpha1, beta1)
    mu_2 = pm.Gamma('mu_2', alpha2, beta2)
    goals_1 = pm.Poisson('goals_1', mu_1, observed=team_1_r)
    goals_2 = pm.Poisson('goals_2', mu_2, observed=team_2_r)
    trace = pm.sample(1000)

  trace = pm.sample(1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_2, mu_1]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 33 seconds.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.


In [93]:
# from empiricaldist import Cdf

# def plot_cdf(sample, **options):
#     """Compute and plot the CDF of a sample."""
#     Cdf.from_seq(sample).plot(**options)
    
# def decorate_rate(ylabel='PDF'):
#     """Decorate the axes."""
#     plt.xlabel('Goals per game (mu)')
#     plt.ylabel(ylabel)
#     plt.title('Distribution of goal scoring rate')
    
#     handles, labels = plt.gca().get_legend_handles_labels()
#     if len(labels):
#         plt.legend()

In [92]:
# mu_1 = trace['mu_1']
# plot_cdf(mu_1, label='mu_TBL posterior')

# mu_2 = trace['mu_2']
# plot_cdf(mu_2, label='mu_BOS posterior')

# decorate_rate('CDF')
# np.mean(mu_2), np.mean(mu_1)

# plt.savefig('zigzag14.png', dpi=150)

In [78]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, samples=1000)



In [82]:
goals_1 = post_pred['goals_1'].flatten()
goals_2 = post_pred['goals_2'].flatten()

In [91]:
# def set_colors():
#     """Set the color cycle for goals"""
#     plt.gca().set_prop_cycle(color=['#2ca02c', '#9467bd',])

# def decorate_goals(ylabel='PMF'):
#     """Decorate the axes."""
#     plt.xlabel('Number of goals')
#     plt.ylabel(ylabel)
#     plt.title('Distribution of goals scored')
#     plt.legend()

# set_colors()
# plot_cdf(goals_1, label='TBL')
# plot_cdf(goals_2, label='BOS')
# decorate_goals('CDF')

# plt.savefig('zigzag15.png', dpi=150)

In [87]:
win = np.mean(goals_1 > goals_2)
lose = np.mean(goals_1 < goals_2)
draw = np.mean(goals_1 == goals_2)

In [89]:
win, lose, draw

(0.088, 0.757, 0.155)

array([0, 0, 1, ..., 0, 2, 0])