In [1]:
# Loading libraries
import arviz as az
import pymc3 as pm
import theano.tensor as tt
import pandas as pd
import numpy as np
import warnings
# Blanket filter: every warning raised through the `warnings` module is
# suppressed from here on, in all later cells.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries; consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

In [2]:
# Load the match results into a DataFrame. The path is relative, so
# 'rugby.csv' must sit in the kernel's working directory.
# Later cells read the columns home_score, away_score, i_home, i_away,
# home_team and away_team from this frame.
df = pd.read_csv('rugby.csv')

In [6]:
# Observed scores, one entry per match (values of the home_score / away_score
# columns). These arrays are the `observed=` data of the two likelihoods.
observed_home_goals = df.home_score.values
observed_away_goals = df.away_score.values

# Team index of the home / away side for each match. The model uses these
# arrays to index the per-team attack and defence vectors.
home_team = df.i_home.values
away_team = df.i_away.values

# Count the distinct team indices over BOTH columns. Counting i_home alone
# would undersize the per-team parameter vectors if some team never appears
# as the home side.
num_teams = len(np.union1d(home_team, away_team))
num_games = len(home_team)

# Labels for the "team" coordinate of the exported InferenceData.
# NOTE(review): the order is assumed to follow the i_home / i_away index
# (label k belongs to index k) -- confirm against the CSV.
teams = np.array(['Wales', 'France', 'Ireland', 'Scotland', 'Italy', 'England'])

# Fail early, with a clear message, if the hard-coded labels and the data
# disagree on how many teams there are.
assert len(teams) == num_teams, (
    f"expected {len(teams)} teams from the hard-coded labels, "
    f"but the data contains {num_teams} distinct team indices"
)

# One "<home> <away>" label per match, used as the "match" coordinate.
matches = [f"{home} {away}" for home, away in zip(df.home_team, df.away_team)]

In [7]:
# Hierarchical Poisson model of the scores.
# Each side's expected score is exp(linear predictor), i.e. a log link, where
# the predictor combines a shared intercept, the scoring side's attack
# strength, the opponent's defence term and (home side only) a home effect.
# NOTE: the order of the pm.* calls below determines the order in which the
# variables are registered on the model; keep it stable.
with pm.Model() as model:
    # Global parameters (all on the log scale of the Poisson rate).
    # `home` is added to the home side's predictor only.
    home = pm.Normal('home', mu=0, sigma=1)
    # Hyperpriors: spread of the team attack / defence effects.
    sd_att = pm.HalfNormal('sd_att', sigma=2)
    sd_def = pm.HalfNormal('sd_def', sigma=2)
    # Baseline log-rate shared by every team and both sides.
    intercept = pm.Normal('intercept', mu=3, sigma=1)

    # Team-specific raw effects, one entry per team (shape=num_teams), drawn
    # from zero-mean normals whose scale is given by the hyperpriors above.
    atts_star = pm.Normal("atts_star", mu=0, sigma=sd_att, shape=num_teams)
    defs_star = pm.Normal("defs_star", mu=0, sigma=sd_def, shape=num_teams)

    # Centre the raw effects by subtracting their mean, so that `atts` and
    # `defs` each sum to zero across teams; stored as Deterministic so they
    # appear in the trace.
    atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))
    defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))
    # Expected score per match: the home side's rate uses the home team's
    # attack and the away team's defence term, plus `home`; the away side's
    # rate is the mirror image without the `home` term.
    home_theta = tt.exp(intercept + home + atts[home_team] + defs[away_team])
    away_theta = tt.exp(intercept + atts[away_team] + defs[home_team])

    # Likelihood: observed home / away scores are Poisson with the rates above.
    home_points = pm.Poisson('home_points', mu=home_theta, observed=observed_home_goals)
    away_points = pm.Poisson('away_points', mu=away_theta, observed=observed_away_goals)

In [8]:
# Fixed seed so that the prior draws, the NUTS trace and the posterior
# predictive draws are reproducible on a fresh "Restart Kernel -> Run All".
RANDOM_SEED = 8927

with model:
    # Draws from the prior (and prior predictive) of every model variable.
    prior = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    # 500 tuning steps and 500 kept draws per chain, run on 4 cores.
    # The recorded run of this cell reported 4 chains and a low
    # effective-sample-size warning for some parameters.
    trace = pm.sample(500, tune=500, cores=4, random_seed=RANDOM_SEED)
    # Simulated home/away scores conditional on the posterior draws.
    posterior_predictive = pm.sample_posterior_predictive(trace, random_seed=RANDOM_SEED)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [defs_star, atts_star, intercept, sd_def, sd_att, home]
Sampling 4 chains, 0 divergences: 100%|████████████████████████████████████████| 4000/4000 [00:10<00:00, 381.80draws/s]
The number of effective samples is smaller than 25% for some parameters.
100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:02<00:00, 764.82it/s]


In [9]:
# Bundle the prior, posterior and posterior-predictive samples into a single
# ArviZ InferenceData object, attaching named coordinates:
#   - per-team variables are labelled along "team"
#   - per-match observed variables are labelled along "match"
data = az.from_pymc3(
    trace=trace,
    prior=prior,
    posterior_predictive=posterior_predictive,
    model=model,
    coords=dict(team=teams, match=matches),
    dims={
        **{var_name: ["team"] for var_name in ("atts", "defs", "atts_star", "defs_star")},
        **{var_name: ["match"] for var_name in ("home_points", "away_points")},
    },
)

# Last expression of the cell: show the InferenceData summary.
data

In [10]:
# Write the InferenceData object (not the PyMC3 model itself) to a netCDF
# file in the working directory. The call is the cell's last expression, so
# its return value is displayed as the cell output.
data.to_netcdf('rugby.nc')

'rugby.nc'