In [1]:
%load_ext kedro.ipython
%reload_kedro
%load_ext autoreload
%autoreload 2

In [2]:
from collections import OrderedDict

import numpy as np
import pandas as pd

GOALS_HOME = "FTHG"
GOALS_AWAY = "FTAG"
df = catalog.load("D1_24-25")


def _get_teams(df: pd.DataFrame) -> np.array:
    team1 = df["HomeTeam"].unique().astype("U")
    team2 = df["AwayTeam"].unique().astype("U")
    teams = np.unique(np.concatenate((team1, team2)))

    assert len(teams) == 18
    # nb_teams = len(teams)

    return teams


def _get_teams_new(df: pd.DataFrame) -> np.array:
    teams, uniques = pd.factorize(
        df[["HomeTeam", "AwayTeam"]].values.flatten(), sort=True
    )

    return teams, uniques


def _build_team_lexicon(teams: np.array) -> pd.DataFrame:
    team_indices = OrderedDict()
    for i, t in enumerate(teams):
        team_indices[t] = i
    lex = pd.DataFrame(list(team_indices.items()), columns=["team", "index"])
    lex.set_index("team", inplace=True)
    return lex


def _get_goal_results(df: pd.DataFrame, team_indices: pd.DataFrame):
    home_goals = list()
    away_goals = list()
    for _, r in df.iterrows():
        home_team = r.HomeTeam

        away_team = r.AwayTeam
        home_goals.append(
            (
                team_indices.loc[home_team, "index"],
                team_indices.loc[away_team, "index"],
                r[GOALS_HOME],
            )
        )
        away_goals.append(
            (
                team_indices.loc[home_team, "index"],
                team_indices.loc[away_team, "index"],
                r[GOALS_AWAY],
            )
        )

    return home_goals, away_goals


def _vectorized_data(home_goals_, away_goals_) -> pd.DataFrame:
    home_id = np.array([hg[0] for hg in home_goals_])
    away_id = np.array([hg[1] for hg in home_goals_])
    home_goals = np.array([hg[2] for hg in home_goals_])
    away_goals = np.array([ag[2] for ag in away_goals_])
    toto = np.where(
        home_goals == away_goals, 0, np.where(home_goals > away_goals, 1, 2)
    )
    vectorized_data = pd.DataFrame(
        {
            "home_id": home_id,
            "away_id": away_id,
            "home_goals": home_goals,
            "away_goals": away_goals,
            "toto": toto,
        }
    )

    return vectorized_data


def preprocess_league_data(df: pd.DataFrame) -> pd.DataFrame:
    teams = _get_teams(df)
    team_indices = _build_team_lexicon(teams=teams)
    home_goals_, away_goals_ = _get_goal_results(df=df, team_indices=team_indices)

    # list of tuples
    # (home_team-index, away_team_index, scored_goals of home team resp. away team)

    return _vectorized_data(home_goals_, away_goals_), team_indices


def create_model_input_data(df: pd.DataFrame) -> pd.DataFrame:
    # merge if we use more then D1 Bundesliga data
    model_input_table = df
    return model_input_table

In [3]:
import pymc as pm

model_coords = {
    "team": _get_teams_new(df)[1],
    "match": np.arange(len(df)),
    "skill": ["offence", "defence"],
    "field": ["home", "away"],
}
model_config: dict = {
    "offence_mu_prior": 0.0,
    "offence_tau_prior": 1.0,
    "defence_mu_prior": 0.0,
    "defence_tau_prior": 1.0,
    "shape_priors": 18,  # toDo: check this for the number of teams
}
vec_df = preprocess_league_data(df)[0]

x_data = preprocess_league_data(df)[0]

In [4]:
# Set random seed
np.random.seed(42)
min_mu = 0.0001
with pm.Model(coords=model_coords) as model:
    # Create mutable data containers
    team_idx = pm.Data(
        "team_idx", vec_df[["home_id", "away_id"]].values, dims=("match", "field")
    )
    goals = pm.Data(
        "goals", vec_df[["home_goals", "away_goals"]], dims=("match", "field")
    )
    off_mu_prior = model_config.get("off_mu_prior", 3 / 2)
    def_mu_prior = model_config.get("def_mu_prior", 0.0)
    off_tau_prior = model_config.get("off_tau_prior", 1.0)
    def_tau_prior = model_config.get("def_tau_prior", 1.0)

    # priors
    offence = pm.Normal(
        "offence",
        mu=off_mu_prior,
        sigma=off_tau_prior,
        # shape=shape_priors, ### kann ich auf shapes verzichten, wenn dims verwendet werden?
        dims="team",
    )
    defence = pm.Normal(
        "defence",
        mu=def_mu_prior,
        tau=def_tau_prior,
        dims="team",
    )
    # offence2 = pm.Normal(
    #     "offence2",
    #     mu=off_mu_prior,
    #     sigma=off_tau_prior,
    #     shape=18,
    # )

    # print(team_idx.eval()[:5])
    # print("Offence NEW: \n", offence_home.eval()[:5])
    # print("Offence OLD: \n", x_data["home_id"].values[:5])
    # print("Offence OLD: \n", offence_home_old.eval()[:5])

    offence_home_away = offence[team_idx]
    defence_home_away = defence[team_idx]
    print(defence_home_away.eval()[:1][:1])

    mu_home_away = offence_home_away - defence_home_away.eval()[:, [1, 0]]

    mu_home = mu_home_away[:, 0]
    mu_away = mu_home_away[:, 1]

    print("OFFENCE: ", offence_home_away.eval()[:1])
    print("DEFENCE: ", defence_home_away.eval()[:1])
    print("Total: ", mu_home_away.eval()[:1])
    print("GOALS: ", mu_home_away.eval()[:1, 1])
    # # note: use exponent in practice instead of switch
    mu_home = pm.math.switch(mu_home > min_mu, mu_home, min_mu)
    mu_away = pm.math.switch(mu_away > min_mu, mu_away, min_mu)

    # observed
    # pm.Poisson(
    #     "points",
    #     observed=vec_df[["home_goals", "away_goals"]],
    #     mu=np.transpose([mu_home.eval(), mu_away.eval()]), #toDO: transpose in aesera auslagen, performance schlecht
    #     dims=("match", "field"),
    # )
    pm.Poisson("home_goals", observed=vec_df["home_goals"], mu=mu_home, dims="match")
    pm.Poisson("away_goals", observed=vec_df["away_goals"], mu=mu_away, dims="match")


[[1.52709373 0.56008613]]
OFFENCE:  [[0.53841742 1.3476087 ]]
DEFENCE:  [[1.52709373 0.56008613]]
Total:  [[-0.02166871 -0.17948503]]
GOALS:  [-0.17948503]


In [5]:
nb_samples = 2000
### tune adds additional number of samples.
### after sampling these additional samples get discarded
### as the very first samples are very inaccurate
tune = nb_samples // 10

with model:
    # with model1_home_advantage:
    trace = pm.sample(draws=nb_samples, tune=tune)

Output()

In [151]:
posterior = trace.posterior.stack(sample=["chain", "draw"])
offence = posterior["offence"]
defence = posterior["defence"]
# home_advantage = posterior["home_advantage"]

[autoreload of cutils_ext failed: Traceback (most recent call last):
  File "h:\Programs\Anaconda\envs\.conda_ba_env\Lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "h:\Programs\Anaconda\envs\.conda_ba_env\Lib\site-packages\IPython\extensions\autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "h:\Programs\Anaconda\envs\.conda_ba_env\Lib\importlib\__init__.py", line 130, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module 'cutils_ext'
]


In [152]:
print("Mean offence strenght per team:")
for i, t in enumerate(model_coords["team"]):
    print(t, " ", offence[i].mean().values)

Mean offence strenght per team:
Augsburg   1.8081668635871209
Bayern Munich   2.868903103847895
Bochum   1.6311599195036814
Darmstadt   1.4537332181871214
Dortmund   2.4115209925587653
Ein Frankfurt   1.7779422480186813
FC Koln   1.4084998782748248
Freiburg   1.6744086686216746
Heidenheim   1.8936196782829025
Hoffenheim   2.165119499848583
Leverkusen   2.8744730625970343
M'gladbach   1.9011109601995968
Mainz   1.5547430984259645
RB Leipzig   2.5107550157168306
Stuttgart   2.6209707557437545
Union Berlin   1.345922946496733
Werder Bremen   1.8128745338828118
Wolfsburg   1.7214169774238688


In [144]:
print("Mean offence strenght per team:")
for i, t in enumerate(model_coords["team"]):
    print(t, " ", defence[i].mean().values)

Mean offence strenght per team:
Augsburg   -0.007959309733199563
Bayern Munich   -0.011272908908565157
Bochum   -0.0031239432849411337
Darmstadt   0.0025963671530729905
Dortmund   0.006944529417375259
Ein Frankfurt   -0.014074155662830224
FC Koln   -0.0014080217587279632
Freiburg   0.0028746969854293604
Heidenheim   -0.00603717441324017
Hoffenheim   -0.0035334183670442973
Leverkusen   -0.01212433780132489
M'gladbach   -0.0032155458607579725
Mainz   0.005676573363724497
RB Leipzig   0.0077647002066565975
Stuttgart   -0.0033859249364242235
Union Berlin   0.008040982993080436
Werder Bremen   -0.011724093584508922
Wolfsburg   0.0006025411713604596


In [None]:
model_coords

In [None]:
# # Evaluate the data container
# data = model["team_idx"].eval()

# # Show the data by dimension
# home_field_data = data[
#     :, model_coords["field"].index("home")
# ]  # Data for the "home" field
# away_field_data = data[
#     :, model_coords["field"].index("away")
# ]  # Data for the "away" field

# print("Home Field Data:", home_field_data)
# print("Away Field Data:", away_field_data)


