# Model Comparisons

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import bpl

from wcpredictor.src.utils import get_and_train_model, test_model

- Fit each model on matches up to the end of 2020.
- Test on matches from 2021 and 2022.
- No covariates are used.

In [2]:
# Candidate start dates for the training window. Every window ends at
# `train_end`, so an earlier start date means more training data.
years = [
    "1872-1-1", "1925-1-1", "1975-1-1", "2000-1-1", "2008-1-1",
    "2012-1-1", "2014-1-1", "2015-1-1", "2016-1-1", "2017-1-1",
    "2018-1-1", "2019-1-1", "2020-1-1"
]
train_end = "2020-12-31"
test_start = "2021-1-1"
test_end = "2022-12-31"

# One instance of each predictor class under comparison.
models = [
    bpl.DixonColesMatchPredictor(),
    bpl.ExtendedDixonColesMatchPredictor(),
    bpl.NeutralDixonColesMatchPredictor(),
    bpl.NeutralDixonColesMatchPredictorWC()
]

# Maps predictor class name -> list of `test_model` results, one per entry in
# `years` (same order), so each list can be plotted directly against `years`.
likelihoods = {}

for model in models:
    key = type(model).__name__
    print(key)
    likelihoods[key] = []
    for train_start in years:
        print("=======")
        print(train_start)
        print("=======")
        # Keep the trained result under its own name. Rebinding `model` here
        # would overwrite the outer loop variable, so every later training
        # window would start from the previous window's output instead of the
        # predictor listed in `models`.
        fitted_model = get_and_train_model(
            start_date=train_start, end_date=train_end, model=model, rankings_source=None
        ).model
        likelihoods[key].append(
            test_model(model=fitted_model, start_date=test_start, end_date=test_end)
        )


DixonColesMatchPredictor
1872-1-1
in get_and_train_model
Using 34942 rows in training data
[MODEL FITTING] Setting training data for the model


KeyError: 'Czechoslovakia'

In [None]:
# Plot each model's test-period score against the training start date.
# An explicit Figure/Axes pair is used instead of the implicit pyplot state
# so the figure stays self-contained if more plots are added to the notebook.
fig, ax = plt.subplots()
for mod, lik in likelihoods.items():
    ax.plot(years, lik, "o-", label=mod)
ax.set_title("Model comparison by training start date")
ax.set_xlabel(f"train from 'year' until {train_end}")
ax.set_ylabel(f"log likelihood ({test_start} - {test_end})")
# The date strings are long, so rotate the tick labels to keep them legible.
ax.tick_params(axis="x", labelrotation=90)
# Trailing semicolon stops the notebook from echoing the Legend repr.
ax.legend();

In [None]:
# Tabulate the scores: one row per training start date, one column per model.
likelihood_table = pd.DataFrame(data=likelihoods, index=years)
likelihood_table

The maximum test log likelihood is obtained with a training start date of 2015, i.e. when training with 6 full years of data (2015-2020).

So for the 2022 World Cup we may want to train on results from 2016-2022.