In [2]:
%load_ext kedro.ipython
%reload_kedro
%load_ext autoreload
%autoreload 2

In [None]:
from collections import OrderedDict

import numpy as np
import pandas as pd

# Column keys for the per-match goal counts; _get_goal_results reads them from each row.
# Presumably "FTHG"/"FTAG" are full-time home/away goals — TODO confirm against the data source.
GOALS_HOME = "FTHG"
GOALS_AWAY = "FTAG"
# NOTE(review): `catalog` is not defined in this notebook; presumably it is injected by the
# kedro IPython extension loaded in the first cell — confirm.
df = catalog.load("D1_24-25")


def _get_teams(df: pd.DataFrame) -> np.array:
    team1 = df["HomeTeam"].unique().astype("U")
    team2 = df["AwayTeam"].unique().astype("U")
    teams = np.unique(np.concatenate((team1, team2)))

    assert len(teams) == 18
    # nb_teams = len(teams)

    return teams


def _get_teams_new(df: pd.DataFrame) -> np.array:
    teams, uniques = pd.factorize(
        df[["HomeTeam", "AwayTeam"]].values.flatten(), sort=True
    )

    return teams, uniques


def _build_team_lexicon(teams: np.array) -> pd.DataFrame:
    team_indices = OrderedDict()
    for i, t in enumerate(teams):
        team_indices[t] = i
    lex = pd.DataFrame(list(team_indices.items()), columns=["team", "index"])
    lex.set_index("team", inplace=True)
    return lex


def _get_goal_results(df: pd.DataFrame, team_indices: pd.DataFrame) -> "tuple[list, list]":
    """Translate each match into index-based goal records.

    Args:
        df: Match table with ``HomeTeam`` and ``AwayTeam`` columns plus the
            goal columns named by ``GOALS_HOME`` and ``GOALS_AWAY``.
        team_indices: Lookup table indexed by team name with an ``"index"``
            column, as produced by ``_build_team_lexicon``.

    Returns:
        ``(home_goals, away_goals)``: two lists with one tuple per match, in
        row order. Each tuple is ``(home_team_index, away_team_index, goals)``
        where ``goals`` is the home team's score in the first list and the
        away team's score in the second.

    Raises:
        KeyError: If a team in ``df`` is missing from ``team_indices``.
    """
    # Resolve the name -> index mapping once instead of issuing four
    # DataFrame.loc lookups per row inside an iterrows() loop.
    index_of = team_indices["index"].to_dict()

    home_ids = [index_of[team] for team in df["HomeTeam"]]
    away_ids = [index_of[team] for team in df["AwayTeam"]]

    home_goals = list(zip(home_ids, away_ids, df[GOALS_HOME]))
    away_goals = list(zip(home_ids, away_ids, df[GOALS_AWAY]))

    return home_goals, away_goals


def _vectorized_data(home_goals_, away_goals_) -> pd.DataFrame:
    home_id = np.array([hg[0] for hg in home_goals_])
    away_id = np.array([hg[1] for hg in home_goals_])
    home_goals = np.array([hg[2] for hg in home_goals_])
    away_goals = np.array([ag[2] for ag in away_goals_])
    toto = np.where(
        home_goals == away_goals, 0, np.where(home_goals > away_goals, 1, 2)
    )
    vectorized_data = pd.DataFrame(
        {
            "home_id": home_id,
            "away_id": away_id,
            "home_goals": home_goals,
            "away_goals": away_goals,
            "toto": toto,
        }
    )

    return vectorized_data


def preprocess_league_data(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Turn a raw match table into index-based model data plus the team lookup.

    Args:
        df: Raw match table; it is passed unchanged to ``_get_teams`` and
            ``_get_goal_results``.

    Returns:
        A pair ``(vectorized_data, team_indices)``: the table built by
        ``_vectorized_data`` and the team-name lookup table built by
        ``_build_team_lexicon``.
    """
    teams = _get_teams(df)
    team_indices = _build_team_lexicon(teams=teams)

    # Both results are lists of tuples:
    # (home_team_index, away_team_index, goals scored by the home resp. away team)
    home_goals_, away_goals_ = _get_goal_results(df=df, team_indices=team_indices)

    return _vectorized_data(home_goals_, away_goals_), team_indices


def create_model_input_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return the model input table for the given data.

    Currently a pass-through: the very same object is handed back unchanged.
    """
    # Placeholder: merge here once more than the D1 Bundesliga data is used.
    return df

In [19]:
# preprocess_league_data returns a pair; element 0 is the table built by
# _vectorized_data, element 1 the team lookup table. Preview the first rows.
pre = preprocess_league_data(df)
pre[0].head()



Unnamed: 0,home_id,away_id,home_goals,away_goals,toto
0,16,1,0,4,2
1,0,11,4,4,0
2,9,7,1,2,2
3,10,13,3,2,1
4,14,2,5,0,1


In [20]:
# Show only the two goal columns of the preprocessed match table.
pre[0][['home_goals','away_goals']]

Unnamed: 0,home_goals,away_goals
0,0,4
1,4,4
2,1,2
3,3,2
4,5,0
...,...,...
301,2,1
302,4,0
303,2,1
304,4,1


In [76]:
# Peek at the home teams of rows 10-13 (positional slice). Selecting the
# single column directly and slicing with .iloc avoids the redundant
# four-column selection and the ambiguous chained `[...][...][10:14]` indexing.
df["HomeTeam"].iloc[10:14]


[1;36m10[0m       Bochum
[1;36m11[0m    Darmstadt
[1;36m12[0m      FC Koln
[1;36m13[0m     Freiburg
Name: HomeTeam, dtype: object

In [11]:
# Re-run the individual steps that preprocess_league_data performs so the
# intermediate tuple lists stay available in the notebook namespace.
teams = _get_teams(df)
team_indices = _build_team_lexicon(teams=teams)
home_goals_, away_goals_ = _get_goal_results(df=df, team_indices=team_indices)

In [17]:
# Sorted distinct home-team ids present in the preprocessed table.
np.unique(pre[0]["home_id"].to_numpy())


[1;35marray[0m[1m([0m[1m[[0m [1;36m0[0m,  [1;36m1[0m,  [1;36m2[0m,  [1;36m3[0m,  [1;36m4[0m,  [1;36m5[0m,  [1;36m6[0m,  [1;36m7[0m,  [1;36m8[0m,  [1;36m9[0m, [1;36m10[0m, [1;36m11[0m, [1;36m12[0m, [1;36m13[0m, [1;36m14[0m, [1;36m15[0m, [1;36m16[0m,
       [1;36m17[0m[1m][0m, [33mdtype[0m=[35mint64[0m[1m)[0m

In [77]:
# Spot-check both legs of the fixture between team indices 10 and 13.
# A plain loop prints the matching records without building throw-away lists
# of None (which a list comprehension would leave behind as the cell output).
for fixture in [(10, 13), (13, 10)]:
    for results in (home_goals_, away_goals_):
        for hg in results:
            if (hg[0], hg[1]) == fixture:
                print(hg)

(10, 13, 3)
(10, 13, 2)
(13, 10, 2)
(13, 10, 3)


[1m[[0m[3;35mNone[0m[1m][0m

In [3]:
# Build the match table directly from the intermediate tuple lists.
_vectorized_data(home_goals_, away_goals_)


In [None]:
# Re-run the full preprocessing and look up both legs of the 10-vs-13 fixture
# in the resulting match table.
pre = preprocess_league_data(df)

d = pre[0]
print(d.query("home_id == 10 and away_id == 13"))
print(d.query("home_id == 13 and away_id == 10"))

   home_id  away_id  home_goals  away_goals  toto
3       10       13           3           2     1
     home_id  away_id  home_goals  away_goals  toto
157       13       10           2           3     2


In [58]:
def _get_teams2(df: pd.DataFrame) -> np.array:
    teams, uniques = pd.factorize(
        df[["HomeTeam", "AwayTeam"]].values.flatten(), sort=True
    )

    return teams, uniques


# Inspect the (codes, uniques) pair produced for the loaded match table.
_get_teams2(df)


[1m([0m
    [1;35marray[0m[1m([0m[1m[[0m[1;36m16[0m,  [1;36m1[0m,  [1;36m0[0m, [1;36m11[0m,  [1;36m9[0m,  [1;36m7[0m, [1;36m10[0m, [1;36m13[0m, [1;36m14[0m,  [1;36m2[0m, [1;36m17[0m,  [1;36m8[0m,  [1;36m4[0m,  [1;36m6[0m, [1;36m15[0m, [1;36m12[0m,  [1;36m5[0m,
        [1;36m3[0m, [1;36m13[0m, [1;36m14[0m,  [1;36m2[0m,  [1;36m4[0m,  [1;36m3[0m, [1;36m15[0m,  [1;36m6[0m, [1;36m17[0m,  [1;36m7[0m, [1;36m16[0m,  [1;36m8[0m,  [1;36m9[0m, [1;36m11[0m, [1;36m10[0m, [1;36m12[0m,  [1;36m5[0m,
        [1;36m1[0m,  [1;36m0[0m,  [1;36m4[0m,  [1;36m8[0m,  [1;36m0[0m,  [1;36m2[0m,  [1;36m9[0m, [1;36m17[0m, [1;36m10[0m,  [1;36m3[0m, [1;36m14[0m,  [1;36m7[0m, [1;36m16[0m, [1;36m12[0m, [1;36m11[0m,  [1;36m1[0m,  [1;36m5[0m,
        [1;36m6[0m, [1;36m15[0m, [1;36m13[0m,  [1;36m1[0m, [1;36m10[0m,  [1;36m6[0m,  [1;36m9[0m,  [1;36m7[0m,  [1;36m4[0m, [1;36m12[0m, [1;36m14[0m, 

In [5]:
# %reload_kedro
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from modelbuilder import FootballModel

model = FootballModel()
model_input = catalog.load("model_input_table")
# The team-id columns are passed to fit() as X, the goal columns as the two targets.
x_data = model_input[["home_id", "away_id"]]
y_data = model_input[["home_goals", "away_goals"]]
# Row-wise (home_goals, away_goals) tuples; not used by the fit() call below.
y_data_series = y_data.apply(lambda row: (row["home_goals"], row["away_goals"]), axis=1)

model.fit(X=x_data, home_goals=y_data["home_goals"], away_goals=y_data["away_goals"])

In [4]:
# Display the stored model input table as loaded from the catalog.
catalog.load("model_input_table")

Unnamed: 0,home_id,away_id,home_goals,away_goals,toto
0,16,1,0,4,2
1,0,11,4,4,0
2,9,7,1,2,2
3,10,13,3,2,1
4,14,2,5,0,1
...,...,...,...,...,...
301,10,0,2,1,1
302,14,11,4,0,1
303,15,7,2,1,1
304,16,2,4,1,1


In [11]:
# Pair up the goals of each match as (home_goals, away_goals) tuples.
# Zipping the two columns avoids the slow row-wise DataFrame.apply(axis=1)
# while keeping the original row index.
y_data_series = pd.Series(
    list(zip(y_data["home_goals"], y_data["away_goals"])),
    index=y_data.index,
)

y_data_series


[1;36m0[0m      [1m([0m[1;36m0[0m, [1;36m4[0m[1m)[0m
[1;36m1[0m      [1m([0m[1;36m4[0m, [1;36m4[0m[1m)[0m
[1;36m2[0m      [1m([0m[1;36m1[0m, [1;36m2[0m[1m)[0m
[1;36m3[0m      [1m([0m[1;36m3[0m, [1;36m2[0m[1m)[0m
[1;36m4[0m      [1m([0m[1;36m5[0m, [1;36m0[0m[1m)[0m
        [33m...[0m  
[1;36m301[0m    [1m([0m[1;36m2[0m, [1;36m1[0m[1m)[0m
[1;36m302[0m    [1m([0m[1;36m4[0m, [1;36m0[0m[1m)[0m
[1;36m303[0m    [1m([0m[1;36m2[0m, [1;36m1[0m[1m)[0m
[1;36m304[0m    [1m([0m[1;36m4[0m, [1;36m1[0m[1m)[0m
[1;36m305[0m    [1m([0m[1;36m1[0m, [1;36m3[0m[1m)[0m
Length: [1;36m306[0m, dtype: object

In [26]:
import pandas as pd

In [32]:
# Unpack the Series of tuples back into two separate Series.
# zip(*...) transposes the (home, away) pairs into one tuple of home goals and
# one tuple of away goals; it cannot be unpacked if y_data_series is empty.
# The new Series get a fresh default index, not the index of y_data_series.
home_goals_series, away_goals_series = zip(*y_data_series)
home_goals_series = pd.Series(home_goals_series, name="home_goals")
away_goals_series = pd.Series(away_goals_series, name="away_goals")

In [35]:
# Raw object array of the (home_goals, away_goals) tuples.
y_data_series.values.flatten()


[1;35marray[0m[1m([0m[1m[[0m[1m([0m[1;36m0[0m, [1;36m4[0m[1m)[0m, [1m([0m[1;36m4[0m, [1;36m4[0m[1m)[0m, [1m([0m[1;36m1[0m, [1;36m2[0m[1m)[0m, [1m([0m[1;36m3[0m, [1;36m2[0m[1m)[0m, [1m([0m[1;36m5[0m, [1;36m0[0m[1m)[0m, [1m([0m[1;36m2[0m, [1;36m0[0m[1m)[0m, [1m([0m[1;36m1[0m, [1;36m0[0m[1m)[0m, [1m([0m[1;36m4[0m, [1;36m1[0m[1m)[0m,
       [1m([0m[1;36m1[0m, [1;36m0[0m[1m)[0m, [1m([0m[1;36m5[0m, [1;36m1[0m[1m)[0m, [1m([0m[1;36m1[0m, [1;36m1[0m[1m)[0m, [1m([0m[1;36m1[0m, [1;36m4[0m[1m)[0m, [1m([0m[1;36m1[0m, [1;36m2[0m[1m)[0m, [1m([0m[1;36m1[0m, [1;36m0[0m[1m)[0m, [1m([0m[1;36m2[0m, [1;36m3[0m[1m)[0m, [1m([0m[1;36m0[0m, [1;36m3[0m[1m)[0m,
       [1m([0m[1;36m1[0m, [1;36m1[0m[1m)[0m, [1m([0m[1;36m3[0m, [1;36m1[0m[1m)[0m, [1m([0m[1;36m2[0m, [1;36m2[0m[1m)[0m, [1m([0m[1;36m2[0m, [1;36m2[0m[1m)[0m, [1m([0m[1;36m3[0m, [1;36m1