# Libraries

In [None]:
import shutup
import pandas as pd
import numpy as np
import os as os
import matplotlib.pyplot as plt
import re
from sklearn.linear_model import PoissonRegressor

# Load train data

- iterate through train data folder 
- feature engineer the **country** and **league** from parent folder name
- join the loaded csv files by rows
- removed quotation marks that would in some rows merge two columns into one

In [None]:
def replace_last(string, old, new):
    return new.join(string.rsplit(old, 1))

def delete_quotation_marks(path):
    with open(path, "r") as f:
        raw_file = f.readlines()
        count=0
        for i in range(len(raw_file)):
            match = re.search("\"(\+|-)?\d+\.?\d*,(\+|-)?\d+\.?\d*\"", raw_file[i])
            if match:
                count+=1
                raw_file[i] = raw_file[i].replace("\"","")
                raw_file[i] = replace_last(raw_file[i], ",","\n")
    with open(path, "w") as f:
        f.writelines(raw_file)

In [None]:
### Load train data
df = pd.DataFrame()
for root, directory, files in os.walk("data/train", topdown=False):
    if files:
        for file in files:
            tmp = pd.read_csv(f"{root}/{file}")
            # Remove empty rows and columns
            tmp = tmp.dropna(how='all', axis=0)
            tmp = tmp.dropna(how='all', axis=1)
            # Derive additional columns
            tmp["league"] = int(root.split("\\")[2])
            tmp["country"] = root.split("\\")[1]
            tmp["season"] = int(file[:2]) # no. of season - 00/01 - 0th season, 21/22 - 21st season
            df = pd.concat([df, tmp], axis = 0)

### Load test data
df_test = pd.DataFrame()
for root, directory, files in os.walk("data/test", topdown=False):
    if files:
        for file in files:
            tmp = pd.read_csv(f"{root}/{file}")
            # Remove empty rows and columns
            tmp = tmp.dropna(how='all', axis=0)
            tmp = tmp.dropna(how='all', axis=1)
            # Derive additional columns
            tmp["league"] = int(root.split("\\")[2])
            tmp["country"] = root.split("\\")[1]
            tmp["season"] = 22 # no. of season - 00/01 - 0th season, 21/22 - 21st season
            df_test = pd.concat([df_test, tmp], axis = 0)

df = pd.concat([df, df_test], axis=0, join="outer")

In [None]:
# df[df["season"] == 22 ]["FTAG"]

In [None]:
df.to_csv("combined.csv")

In [None]:
delete_quotation_marks("combined.csv")

In [None]:
shutup.please()
df = pd.read_csv("combined.csv")

# Data validation

In some cases, the data is wrong. This section corrects the loaded data.

## Same name for different things

In some cases, columns are named differently. We will standardize to use column names that are in *'notes.txt'*


In [None]:
for root, directory, files in os.walk("data/train", topdown=False):
    if files:
        for file in files:
            tmp = pd.read_csv(f"{root}/{file}")
            if "HT" in tmp.columns:
                print(f"Cases for HT: {root}, {file}")
            if "AT" in tmp.columns:
                print(f"Cases for AT: {root}, {file}")

The only affected data are in Greece.

In [None]:
df["HomeTeam"] = df["HomeTeam"].mask(df["HT"].notnull(), df["HT"])
df["AwayTeam"] = df["AwayTeam"].mask(df["AT"].notnull(), df["AT"])
df = df.drop(columns = ["HT", "AT"])

In [None]:
df.drop(df[df["HomeTeam"] == df["AwayTeam"]].index, inplace=True) # remove cases where teams play against themselves

In [None]:
# df["Date"] = pd.to_datetime(df["Date"], format = '%d/%m/%Y', dayfirst=True)

## Not imputable data

In some cases, missing data cause the data to be unusable and can't be computed. This applies to variables

- HomeTeam
- AwayTeam

In [None]:
crucial_cols = ['Date', 'HomeTeam', 'AwayTeam']
df = df.dropna(subset = crucial_cols)

In [None]:
df[["HFKC", "AFKC"]].describe()

## Goal validation

Some goals are incorrect and they need to be fixed.



In [None]:
goal_cols = ["FTHG", "FTAG", "HTHG", "HTAG"]
df[goal_cols].describe()

There seem to be several issues:

- the max value for **FTHG** is absurd
- the min value for **FTAG** does not make sense
- some values are missing

In [None]:
df["FTHG"].value_counts().head(15)

First, we will remove all rows where the Full Time Home Goals are greater than 15.

In [None]:
df_22 = df[df["season"] == 22] # this split is necessary as goal values are unknown for test season
df = df[df["FTHG"] <= 15] # this line gets also rid of all NaNs
df = pd.concat([df, df_22])

Next, all Full Time Away goals that are less than zero are inspected. Since we are not sure which values are good and bad, all of them are removed.

In [None]:
# df_22 = df[df["season"] == 22] # keep season 22
df = df[df["FTAG"] >= 0]
df = pd.concat([df, df_22])

In [None]:
df["FTHG"].isna().sum() # there should be 7277, which is the count of rows in season 22

In [None]:
df["FTAG"].isna().sum()

What's left are the missing values in 'HTHG' and 'HTAG' columns. We have no way of recomputing these as well and these rows are again excluded.

In [None]:
cols = ["HTHG", "HTAG"]
df_22 = df[df["season"] == 22]
# df_pre22 = df[df["season"] != 22]
df = df[df[cols].isna().sum(axis=1) == 0]
df = pd.concat([df, df_22], axis=0)
# df["season"].unique()

If we run the describe() function again, we can see that the counts in all columns match and min/max statistics make sense.

In [None]:
df[goal_cols].describe()

### Shots on target imputation

In [None]:
lm = PoissonRegressor()

y_train = df[df["season"] != 22][["AST", "HST"]]
X_train = df[df["season"] != 22][["AST", "HST", "FTAG", "FTHG"]]
X_train.dropna(subset=["AST", "HST"],axis=0, inplace=True, how='any')
y_train.dropna(axis=0, inplace=True, how='any')

In [None]:
X_train = pd.concat([X_train["FTAG"],X_train["FTHG"]], axis=0)
y_train = pd.concat([y_train["AST"], y_train["HST"]], axis=0)

X_train = np.array(X_train).reshape(-1,1)
y_train = np.array(y_train).reshape(-1,1)

In [None]:
lm.fit(X_train, y_train)

In [None]:
X_pred_HST = df[(df["season"] != 22) & (df["HST"].isna())][["HST", "FTHG"]]

X_pred_AST = df[(df["season"] != 22) & (df["AST"].isna())][["AST", "FTAG"]]

In [None]:
X_pred_AST =   np.array(X_pred_AST["FTAG"]).reshape(-1,1)
X_pred_HST =  np.array(X_pred_HST["FTHG"]).reshape(-1,1)


In [None]:
y_pred_HST = lm.predict(X_pred_HST)
y_pred_AST = lm.predict(X_pred_AST)

In [None]:
X_pred_HST = df[df["season"]!= 22][["HST", "FTHG"]]
X_pred_AST = df[df["season"]!= 22][["AST", "FTAG"]]
X_pred_HST = X_pred_HST[X_pred_HST.isna().any(axis=1)]
X_pred_AST = X_pred_AST[X_pred_AST.isna().any(axis=1)]

In [None]:
X_pred_AST["AST"] = y_pred_AST
X_pred_HST["HST"] = y_pred_HST

In [None]:
X_pred_HST

In [None]:
df[(df["season"] != 22) & (df["HST"].isna())]["HST"]

In [None]:
df.loc[(df["season"] != 22) & (df["AST"].isna()), "AST"] = X_pred_AST["AST"]
df.loc[(df["season"] != 22) & (df["HST"].isna()), "HST"] = X_pred_HST["HST"]

In [None]:
df[df["season"] != 22]["HST"]

## Result validation

Another thing that needs to be validated is the result classification, which are re-classified - this is the easiest data validation process.

In [None]:
half_conds = [df["HTHG"] > df["HTAG"], df["HTHG"] < df["HTAG"], df["HTHG"] == df["HTAG"]]
half_choic = ["H"                    , "A"                    , "D"]
df.loc[:, "HTR"] = np.select(half_conds, half_choic)
full_conds = [df["FTHG"] > df["FTAG"], df["FTHG"] < df["FTAG"], df["FTHG"] == df["FTAG"]]
full_choic = ["H"                    , "A"                    , "D"]
df.loc[:, "FTR"] = np.select(full_conds, full_choic)

## Validation of other statistics

First step is to look at simple descriptive statistics.

In [None]:
stat_cols = [
    'Attendance', 'HS', 'AS', 'HST', 'AST', 'HHW', 'AHW',
    'HC', 'AC', 'HF', 'AF', 'HFKC', 'AFKC', 'HO', 'AO', 'HY', 'AY', 'HR', 'AR', "HBP", "ABP"
]

df[stat_cols].describe()

Then we take a look at boxplots for variables where at least one variable falls out of the interval

$$
(\text{q}_{0.25} - 3 * \text{IQR} ; \text{q}_{0.75} + 3 * \text{IQR})
$$

In [None]:
def outliers(x, multi = 3):
    q25 = x.quantile(0.25)
    q75 = x.quantile(0.75)
    iqr = q75 - q25
    outliers = x[(x < q25 - multi * iqr) | (x > q75 + multi * iqr)]
    return len(outliers)

stat_outliers = df[stat_cols].apply(lambda x: outliers(x))
stat_outliers = stat_outliers[stat_outliers != 0]
stat_outliers

In [None]:
fig, axes = plt.subplots(nrows=3, ncols = 6, figsize = (12, 10))
axes = axes.ravel()

for i, col in enumerate(stat_outliers.index):
    # Data
    df[col].plot(kind='box', ax = axes[i])
    # Styling
    axes[i].set_title(col)
    axes[i].axes.get_xaxis().set_visible(False)

plt.show()


We can se that some outliers are not that extreme. For example variables *HO*, *AO* or *HY* have extreme values, but won't remove them. This is because even tho they can be considered as extreme, there seems to be a natural way how they occured and tehre are no huge jumps between them.

On the other hand, variables *AHW*, *HF*, *AF* and *AR* seem to have some variables that are far away from the other data. Note that we do not consider variables in *HR* as outliers because it looks like it's a discrete variable with mostly zeroes.

In [None]:
stat_jump = [
    "AS", "AHW", "HF", "AF", "AR", "ABP"
]
stats_cut = [
    38, 6, 100, 60, 6, 130
]

print("Number of extreme values:")
for i, col in enumerate(stat_jump):
    cutoff = stats_cut[i]
    x = df[col].to_numpy()
    extreme = x[x > cutoff]
    print(f"{col}: {extreme}, (count: {len(extreme)})")
    
plt.show()

Based on the analysis, we see that these extreme values occur at most twice. This is why we chose to remove them.

In [None]:
criteria = dict(zip(stat_jump, stats_cut))
for column, value in criteria.items():
    # .isna() is important because otherwise all na rows are dropped
    df = df[df[column].lt(value) | df[column].isna()]

Finally, we will double-check with boxplots.

In [None]:
fig, axes = plt.subplots(nrows=2, ncols = 3, figsize = (8, 8))
axes = axes.ravel()

for i, col in enumerate(stat_jump):
    df[col].plot(kind='box', ax = axes[i])
    axes[i].set_title(col)
    axes[i].axes.get_xaxis().set_visible(False)

plt.show()

IT looks like variable *HF* contains another outlier, but this process should be done only once. Hence, no outliers are removed.

## Random unnamed data

File *'data/train/portugal/1/0304.csv'* contains random data in columns *'Unnamed: 33'* and *'Unnamed: 34'*.

In [None]:
unnamed_cols_df = df[['Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34']]
unnamed_cols_df[unnamed_cols_df.notnull().any(axis=1)]

Since we do not know what these columns represent, and it is only one non-NA row from the whole dataset, this column is removed.

In [None]:
df = df.drop(columns = unnamed_cols_df.columns)

## Wrong betting odds names

File *'data/train/germany/2/0405.csv'* contains columns **LB**, **LB.1** and **LB.2**, which are unique only to this file. After further investigation, they represent the betting odds data for Ladbrokers. After looking at the data more thoroughly, it can be guessed that all three columns represent odds for home win, away win, and draw.

In [None]:
tmp = pd.read_csv("data/train/germany/2/0405.csv")
# Remove empty rows and columns
tmp = tmp.dropna(how='all', axis=0)
tmp = tmp.dropna(how='all', axis=1)
tmp = tmp.loc[:, ~tmp.columns.str.startswith('Unnamed:')]
tmp = tmp[tmp[['LB', 'LB.1', 'LB.2']].notnull().any(axis=1)]
tmp.filter(regex='[HDAB12]$').iloc[:, -12:]

Based on column similarity, we can make an edjucated guess that

- **LB** should be **LBH**,
- **LB.1** should be **LBD**, and
- **LB.2** should be **LBA**

In [None]:
# If LB is not null, use that value and replace it in LBH
df["LBH"] = df["LBH"].mask(df["LB"  ].notnull(), df["LB"])
df["LBD"] = df["LBD"].mask(df["LB.1"].notnull(), df["LB.1"])
df["LBA"] = df["LBA"].mask(df["LB.2"].notnull(), df["LB.2"])
df = df.drop(columns = ["LB", "LB.1", "LB.2"])

## Date normalization

Date is not consistent and it needs to be unified in order to format it as date.

In [None]:
potential_fixes = pd.to_datetime(df['Date'], format='mixed', dayfirst=True)
equal = potential_fixes[potential_fixes.isnull()].index.equals(df[df["Date"].isnull()].index)
# if True, all non-na dates have been converted
if equal:
    df["Date"] = potential_fixes
else:
   print("Date indexes are not equal. Something wrong with the conversion??")

## Bookies analysis

In [None]:
#minor warning suppression
shutup.please()

away_odds = df[["B365A", "BSA", "BWA", "GBA", "IWA", "LBA", "PSA", "SOA", "SBA", "SJA", "SYA", "VCA", "WHA"]]
home_odds = df[["B365H", "BSH", "BWH", "GBH", "IWH", "LBH", "PSH", "SOH", "SBH", "SJH", "SYH", "VCH", "WHH"]]
draw_odds = df[["B365D", "BSD", "BWD", "GBD", "IWD", "LBD", "PSD", "SOD", "SBD", "SJD", "SYD", "VCD", "WHD"]]

#Average of away/home/draw odds
df["Avg_away_odds"] = away_odds.mean(axis=1)
df["Avg_home_odds"] = home_odds.mean(axis=1)
df["Avg_draw_odds"] = draw_odds.mean(axis=1)

#Predcition based on averages - Odds with smallest average have the highest probability -> returns A/H/D
df["Avg_bookie_prediction"] = df[["Avg_away_odds", "Avg_home_odds", "Avg_draw_odds"]].idxmin(axis=1).fillna("").astype(str).str[4]
df["Avg_bookie_prediction"] = df["Avg_bookie_prediction"].str.upper()

#Certainity of odds, the smaller variance implies, that bookies are more "sure"
df["Var_away_odds"] = away_odds.var(axis=1)
df["Var_home_odds"] = home_odds.var(axis=1)
df["Var_draw_odds"] = draw_odds.var(axis=1)

print(df.loc[:, "Avg_away_odds":"Var_draw_odds"])

In [None]:
#Columns for predictions of all bookies
bookies_predictions = pd.DataFrame(columns = ["B365P", "BSP", "BWP", "GBP", "IWP", "LBP", "PSP", "SOP", "SBP", "SJP", "SYP", "VCP", "WHP"])

#dataframe with away/home/draw odds for each bookie
df_bookies_accuracy = pd.concat([away_odds, home_odds, draw_odds, bookies_predictions], axis = 1).sort_index(axis = 1)
df_bookies_accuracy["Outcome"] = df[["FTR"]]

seq = list(range(3, 53, 4))
#"prediction" of each bookie based on their odds
for i in seq:
    df_bookies_accuracy.iloc[:, i] = df_bookies_accuracy.iloc[:, i-3:i].idxmin(axis=1).fillna("").astype(str).str[-1]

print(df_bookies_accuracy)

In [None]:
#Accuracy of each bookie - all around 50% - accuracy of a bookie cannot be used as a weight for prediction
for bookie in bookies_predictions:
    filter_df = df_bookies_accuracy[df_bookies_accuracy[bookie].notna()]
    matching_values = (filter_df[bookie] == filter_df['Outcome']).sum()
    total_values = len(filter_df)
    percent= (matching_values/total_values) *100
    print("Percentage of matching values for " + bookie[:-1] + f": {percent:.2f}%")

- - -

In [None]:
df.shape

**Handicap values fix**
Handicaps are written in quarters - divisible by 0.25

In [None]:
handicap_cols = ['GBAH', 'B365AH', 'BbAHh', 'AHh', 'LBAH', 'AHCh']
df[handicap_cols].describe()

In [None]:
# handicaps are usually counted in quarters (ending with .0, .25, .50, .75)
for col in handicap_cols:
    df = df[(df[col] % 0.25 == 0) | df[col].isna()] # drop rows with numeric value indivisible by 0.25
for i in handicap_cols:
    print(
        "number of invalid rows left: ", # difference between # of all rows and validated rows
        len(df[i]) - len(df[((df[i] % 0.25 == 0) | (df[i].isna()))])
    )

Betting odds validation (betting odds have to be positive)

In [None]:
odds_cols = ['GBH', 'GBD', 'GBA', 'IWH', 'IWD', 'IWA', 'SBH', 'SBD', 'SBA', 'WHH', 'WHD', 'WHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA', 'IWCH', 'IWCD', 'IWCA', 'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH','AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5', 'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA']
df[odds_cols].describe()

In [None]:
# only column 'AHCh' has negative values -> no need to run on all 50 columns
df["AHCh"] = abs(df["AHCh"])

Drop all completely empty columns and rows which there is a lot of, dropped 41 empty columns in total.

In [None]:
df.dropna(how='all', axis=1, inplace=True)

In [None]:
df.shape

In [None]:
len(df[df["season"]==22])

# Data Exploration

Total overview of all variables.

In [None]:
df.describe()

Percentage of missing values.

In [None]:
na_vals = df[df["season"] != 22].isna().sum()       # AVOID DATA LEAKAGE ;)
na_vals = na_vals/df[df["season"] != 22].shape[0]
na_vals.sort_values(ascending = False).head(20)

In [None]:
df = df.reset_index(drop = True)

# Get first and last occurrences
first_occurrences = df.apply(lambda x: x.first_valid_index())
last_occurrences = df.apply(lambda x: x.last_valid_index())

missing_counts = {}

for column in df.columns:
    # Indexes of the first and last occurrences
    first_idx = df[column].first_valid_index()
    last_idx = df[column].last_valid_index()

    # Select the range between first and last occurrence, count missing values
    missing_count = df[column][first_idx:last_idx].isnull().sum()
    missing_counts[column] = missing_count

result_df = pd.DataFrame({
    'Variable':          df.columns,
    'First Occurrence':  df["Date"].to_numpy()[first_occurrences.values],
    'Last Occurrence':   df["Date"].to_numpy()[last_occurrences.values],
    "Missing within": missing_counts.values()
})

result_df.sort_values(by="Missing within", ascending=False)

# Feature Engineering
We decided to add several features to the dataset.
These features are:
- Result of the last match between the two contending teams
- Goal score during the last match between the two contending teams
- Average amount of goals scored in the current season
- Average amount of goals received in the current season

We believe that these features will prove useful in the training of our model as they can reveal things such as momentum and strenghts/weaknesses against certain teams.

### Result of the last match between the two contending teams

Create a string of participating teams, append them alphabetically behind each other so it is easier to slice them.

In [None]:
df["Index"] = df.index
df["MatchTeams"] = df[["HomeTeam","AwayTeam"]].values.tolist()
df["MatchTeams"] = df["MatchTeams"].sort_values().apply(lambda x: sorted(x))

In [None]:
df = df.assign(MatchTeams=df["MatchTeams"].apply(lambda l: "_".join(l)))

In [None]:
df.sort_values(['MatchTeams','Date'],ascending=True).groupby('MatchTeams').shift()

Use groupby to group by same matches, create ["LastMatchIndex","LastMatchAwayGoals", "LastMatchHomeGoals"] columns with unsorted values.
First match of two teams gets empty column index as LastMatchIndex

In [None]:
df[["LastMatchIndex","LastMatchAwayGoals", "LastMatchHomeGoals"]] = df.sort_values(['MatchTeams','Date'],ascending=True).groupby('MatchTeams').shift()[["Index","FTAG", "FTHG"]]
df.loc[np.isnan(df["LastMatchIndex"]), "LastMatchIndex"] = len(df.index)-1
df.loc[len(df.index)] = [np.nan for _ in range(df.shape[1])]
df.loc[len(df.index)-1]
df["LastMatchIndex"] = df["LastMatchIndex"].replace(np.nan, len(df.index)-1)

df["LastMatchIndex"].fillna(len(df.index)-1)
arr = df["LastMatchIndex"]

Switch LastMatchHomeGoals and LastMatchAwayGoals if they do not correspond to the teams accordingly, calculate who won the match

In [None]:
df["SameHomeTeam"] = (df.iloc[arr]["HomeTeam"].values == df["HomeTeam"].values)
df.loc[df["SameHomeTeam"],['LastMatchHomeGoals','LastMatchAwayGoals']] = df.loc[df["SameHomeTeam"],['LastMatchHomeGoals','LastMatchAwayGoals']].values
df["LastMatchAwayWin"] = (df["LastMatchAwayGoals"] > df["LastMatchHomeGoals"]).astype(int)
df["LastMatchHomeWin"] = (df["LastMatchAwayGoals"] < df["LastMatchHomeGoals"]).astype(int)
df["LastMatchDraw"] = (df["LastMatchAwayGoals"] == df["LastMatchHomeGoals"]).astype(int)

In [None]:
df.drop(["SameHomeTeam", "LastMatchIndex", "Index"], axis=1)

In [None]:
df[df["MatchTeams"] == "Chelsea_Liverpool"][["Date","FTHG", "FTAG", "LastMatchHomeGoals", "LastMatchAwayGoals", "LastMatchHomeWin","LastMatchAwayWin", "LastMatchDraw"]]

### Split data back to individual files based on countries

In [None]:
df[df["season"] == 22]

In [None]:
dfs = {}
land_list = df["country"].unique()[:-1]
for country in land_list:
    dfs[f"df_{country}"] = df[df["country"] == country]
    dfs[f"df_{country}"].dropna(axis=1, inplace=True, how="all")

### Create dummies from teams

In [None]:
for country in dfs:
    tmp = dfs[country][["HomeTeam", "AwayTeam"]]
    dfs[country] = pd.get_dummies(dfs[country], columns=["HomeTeam", "AwayTeam", "Div", "league"])
    try:
        dfs[country] = pd.get_dummies(dfs[country], columns=["Referee"])
    except KeyError:
        pass
    dfs[country] = pd.concat([dfs[country], tmp], axis=1)

# Feature Average amount of goals scored/received in the earlier matches

HomeTeamAvgScored, AwayTeamAvgScored, HomeTeamAvgReceived, AwayTeamAvgReceived




In [None]:
# ty prumery mozna nesedi (koukala jsem na sezonu 22, ale mozna jsem koukala spatne) - PLS CHECK
def get_avg_team(df):
    lastmatch_home = pd.Series()
    df_sub =  df[df["season"] != 22]
    lastmatch_home["HomeTeamAvgScored"] = df_sub["HomeTeamAvgScored"].dropna().mean()
    lastmatch_home["AwayTeamAvgScored"] = df_sub["AwayTeamAvgScored"].dropna().mean()
    lastmatch_home["HomeTeamAvgReceived"] = df_sub["HomeTeamAvgReceived"].dropna().mean()
    lastmatch_home["AwayTeamAvgReceived"] = df_sub["AwayTeamAvgReceived"].dropna().mean()
    lastmatch_home["HomeTeamAvgShotsOnTarget"] = df_sub["HomeTeamAvgShotsOnTarget"].dropna().mean()
    lastmatch_home["AwayTeamAvgShotsOnTarget"] = df_sub["AwayTeamAvgShotsOnTarget"].dropna().mean()
    lastmatch_home["HomeWinRatio"] = df_sub["HomeWinRatio"].dropna().mean()
    lastmatch_home["HomeLossRatio"] = df_sub["HomeLossRatio"].dropna().mean()
    lastmatch_home["HomeDrawRatio"] = df_sub["HomeDrawRatio"].dropna().mean()
    lastmatch_home["AwayWinRatio"] = df_sub["HomeWinRatio"].dropna().mean()
    lastmatch_home["AwayLossRatio"] = df_sub["HomeLossRatio"].dropna().mean()
    lastmatch_home["AwayDrawRatio"] = df_sub["HomeDrawRatio"].dropna().mean()
    return lastmatch_home

def get_goals_stats(df):
    df.sort_values(by="Date", inplace=True)
    df["HomeTeamAvgScored"] = 0
    df["AwayTeamAvgScored"] = 0
    df["HomeTeamAvgReceived"] = 0
    df["AwayTeamAvgReceived"] = 0
    
    df["HomeWinRatio"] = 0
    df["HomeLossRatio"] = 0
    df["HomeDrawRatio"] = 0
    
    df["AwayWinRatio"] = 0
    df["AwayLossRatio"] = 0
    df["AwayDrawRatio"] = 0
  
    df["HomeTeamAvgShotsOnTarget"] = 0
    df["AwayTeamAvgShotsOnTarget"] = 0
   
    home_wins = 0
    home_losses = 0
    home_draws = 0

    away_wins = 0
    away_losses = 0
    away_draws = 0
    
    team_list = set.union(set(df["HomeTeam"]), set(df["AwayTeam"]))
    team_list_received = [f"{team}_received" for team in team_list]
    team_list_res = [f"{team}_res" for team in team_list]
    team_list_shots = [f"{team}_shots" for team in team_list]

    df = df.reindex(df.columns.tolist() + list(team_list) + list(team_list_received) + list(team_list_res) + list(team_list_shots),axis=1)
    
    nancount = 0
    
    df.reset_index(inplace=True)
    for i, row in df.iterrows():
        home = row["HomeTeam"]
        away = row["AwayTeam"]

        filtered_df = df[(df["MatchTeams"] == row["MatchTeams"]) & (df["season"] != 22)]
        if not filtered_df.empty and (np.isnan(row["LastMatchAwayGoals"]) or np.isnan(row["LastMatchHomeGoals"])):
            # print(filtered_df["LastMatchAwayGoals"].iloc[-1])
            last_match_away_goals = filtered_df["LastMatchAwayGoals"].iloc[-1]
            last_match_home_goals = filtered_df["LastMatchHomeGoals"].iloc[-1]

            # print(f"{last_match_away_goals = }, {last_match_home_goals = }")
            df.loc[i, "LastMatchAwayGoals"] = last_match_away_goals if row["SameHomeTeam"] else last_match_home_goals
            df.loc[i, "LastMatchHomeGoals"] = last_match_home_goals if row["SameHomeTeam"] else last_match_away_goals
            # print(f"{df.loc[i, 'LastMatchAwayGoals'] = }")
            # df.loc[i, "LastMatchAwayGoals"] = df[(df["MatchTeams"] == row["MatchTeams"]) & (df["season"] != 22)]["LastMatchAwayGoals"].iloc[-1] if row["SameHomeTeam"] else df[(df["MatchTeams"] == row["MatchTeams"]) & (df["season"] != 22)]["LastMatchHomeGoals"].iloc[-1]
            # 
            # df.loc[i, "LastMatchHomeGoals"] = df[(df["MatchTeams"] == row["MatchTeams"]) & (df["season"] != 22)]["LastMatchHomeGoals"].iloc[-1] if row["SameHomeTeam"] else df[(df["MatchTeams"] == row["MatchTeams"]) & (df["season"] != 22)]["LastMatchAwayGoals"].iloc[-1]
        elif filtered_df.empty:
            nancount += 1

        ### Pokud testing data -> musime dat posledni dostupna data
        if row["season"] == 22:
            # print("found season 22 :) in row ", i)
            
            
            lastmatch = df[df["season"] != 22] 
            try:
                lastmatch_home = lastmatch[((df["HomeTeam"] == home) | (lastmatch["AwayTeam"] == home))].iloc[-1]
                hometeam = "Home" if lastmatch_home["HomeTeam"] == home else "Away"
                
            except IndexError:
                lastmatch_home = get_avg_team(df)
                hometeam = "Home"
                
            try:
                lastmatch_away = lastmatch[((df["HomeTeam"] == away) | (lastmatch["AwayTeam"] == away))].iloc[-1]
                awayteam = "Away" if lastmatch_away["AwayTeam"] == away else "Home"
                
            except IndexError:
                lastmatch_away = get_avg_team(df)
                awayteam = "Away"            
            
            
        
            df.loc[i, "HomeTeamAvgScored"] = lastmatch_home[f"{hometeam}TeamAvgScored"]
            df.loc[i, "AwayTeamAvgScored"] = lastmatch_away[f"{awayteam}TeamAvgScored"]
            df.loc[i, "HomeTeamAvgReceived"] = lastmatch_home[f"{hometeam}TeamAvgReceived"]
            df.loc[i, "AwayTeamAvgReceived"] = lastmatch_away[f"{awayteam}TeamAvgReceived"]

            df.loc[i, "HomeTeamAvgShotsOnTarget"] = lastmatch_home[f"{hometeam}TeamAvgShotsOnTarget"]
            df.loc[i, "AwayTeamAvgShotsOnTarget"] = lastmatch_away[f"{awayteam}TeamAvgShotsOnTarget"]


            df.loc[i,"HomeWinRatio"] = lastmatch_home[f"{hometeam}WinRatio"]
            df.loc[i,"HomeLossRatio"] = lastmatch_home[f"{hometeam}LossRatio"]
            df.loc[i,"HomeDrawRatio"] = lastmatch_home[f"{hometeam}DrawRatio"]

            df.loc[i,"AwayWinRatio"] = lastmatch_home[f"{awayteam}WinRatio"]
            df.loc[i,"AwayLossRatio"] = lastmatch_home[f"{awayteam}LossRatio"]
            df.loc[i,"AwayDrawRatio"] = lastmatch_home[f"{awayteam}DrawRatio"]

            continue

        df.loc[i, "HomeTeamAvgScored"] = df[home].dropna().mean()
        df.loc[i, "AwayTeamAvgScored"] = df[away].dropna().mean()
        df.loc[i, "HomeTeamAvgReceived"] = df[f"{home}_received"].dropna().mean()
        df.loc[i, "AwayTeamAvgReceived"] = df[f"{away}_received"].dropna().mean()

        df.loc[i, "HomeTeamAvgShotsOnTarget"] = df[f"{home}_shots"].dropna().mean()
        df.loc[i, "AwayTeamAvgShotsOnTarget"] = df[f"{away}_shots"].dropna().mean()
        
        df.loc[i, home] = row["FTHG"]
        df.loc[i, f"{home}_received"] = row["FTAG"]
        df.loc[i, away] = row["FTAG"]
        df.loc[i, f"{away}_received"] = row["FTHG"]
        
        df.loc[i, f"{home}_shots"] = row["HST"]
        df.loc[i, f"{away}_shots"] = row["AST"]
        
        try:
            home_wins = df[f"{home}_res"].value_counts()["W"]
            home_losses = df[f"{home}_res"].value_counts()["L"]
            home_draws = df[f"{home}_res"].value_counts()["D"]
    
            away_wins = df[f"{away}_res"].value_counts()["W"]
            away_losses = df[f"{away}_res"].value_counts()["L"]
            away_draws = df[f"{away}_res"].value_counts()["D"]
        except KeyError:
            pass
        
        try:
            df.loc[i,"HomeWinRatio"] = home_wins / (home_wins + home_draws + home_losses)
            df.loc[i,"HomeLossRatio"] = home_losses / (home_wins + home_draws + home_losses)
            df.loc[i,"HomeDrawRatio"] = home_draws / (home_wins + home_draws + home_losses)
        except ZeroDivisionError:
            pass

        try:
            df.loc[i,"AwayWinRatio"] = away_wins / (away_wins + away_draws + away_losses)
            df.loc[i,"AwayLossRatio"] = away_losses / (away_wins + away_draws + away_losses)
            df.loc[i,"AwayDrawRatio"] = away_draws / (away_wins + away_draws + away_losses)
        except ZeroDivisionError:
            pass
        
        if row["FTR"] == "A":
            df.loc[i, f"{home}_res"] = "L"
            df.loc[i, f"{away}_res"] = "W"
        elif row["FTR"] == "H":
            df.loc[i, f"{home}_res"] = "W"
            df.loc[i, f"{away}_res"] = "L"
        else:
            df.loc[i, f"{home}_res"] = "D"
            df.loc[i, f"{away}_res"] = "D"

    display(df[df["LastMatchHomeGoals"].isna()][["MatchTeams", "LastMatchHomeGoals", "LastMatchAwayGoals"]])
    df[df["HomeTeamAvgScored"].isna()]["HomeTeamAvgScored"] = 0
    df[df["AwayTeamAvgScored"].isna()]["AwayTeamAvgScored"] = 0
    df[df["HomeTeamAvgReceived"].isna()]["HomeTeamAvgReceived"] = 0
    df[df["HomeTeamAvgReceived"].isna()]["HomeTeamAvgReceived"] = 0
    df.drop(team_list, inplace=True, axis=1)
    df.drop(team_list_received, inplace=True, axis=1)
    df.drop(team_list_res, inplace=True, axis=1)
    df.drop(team_list_shots, inplace=True, axis=1)

    df.drop(["index", "HomeTeam", "AwayTeam", "LastMatchIndex"], inplace=True, axis=1)
    display(df[df["LastMatchHomeGoals"].isna()][["MatchTeams", "LastMatchHomeGoals", "LastMatchAwayGoals"]])
    display(nancount)
    return df

In [None]:
for country in dfs:
    dfs[country] = get_goals_stats(dfs[country])
    # don't get scared if you get "only" belgium done, england takes *LONG* (10 min), rest is faster 
    print(f"done {country}")
    # break ## UNCOMMENT THIS LINE IF TESTING

### Add scored/received ratio for each team

Problem with NaN and inf values for the first instances where teams have scored 0 or received 0 goals -> we could drop the first season maybe?

In [None]:
for country in dfs: # this will need fix for season 22? 
    cnt = dfs[country]
    cnt["HomeTeamScoredRatio"] = cnt["HomeTeamAvgScored"]/(cnt["HomeTeamAvgReceived"] + cnt["HomeTeamAvgScored"])
    cnt["AwayTeamScoredRatio"] = cnt["AwayTeamAvgScored"]/(cnt["AwayTeamAvgReceived"] + cnt["AwayTeamAvgScored"])

In [None]:
blg = dfs["df_belgium"]

In [None]:
blg[blg["MatchTeams"] == "Antwerp_Westerlo"]

In [None]:
# check if all seasons got through
blg[blg["season"] == 22]

In [None]:
df_turkey = dfs["df_turkey"]

# For HomeTeam_Gaziantep
teams = ["Hatayspor", "Gaziantep"]

for team in teams:
    for location in ["Home", "Away"]:
        home_col_prefix = f"HomeTeam_{team}" if location == "Home" else f"AwayTeam_{team}"

        # For mean values
        df_turkey.loc[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey[home_col_prefix]), f"Avg_home_odds"] = df_turkey[df_turkey[home_col_prefix]]["Avg_home_odds"].mean()
        df_turkey.loc[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey[home_col_prefix]), f"Avg_away_odds"] = df_turkey[df_turkey[home_col_prefix]]["Avg_away_odds"].mean()
        df_turkey.loc[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey[home_col_prefix]), f"Avg_draw_odds"] = pd.concat([df_turkey[df_turkey[home_col_prefix]], df_turkey[df_turkey[f"HomeTeam_{team}"]], df_turkey[df_turkey[f"AwayTeam_{team}"]]])["Avg_draw_odds"].mean()

        # For Var values
        df_turkey.loc[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey[home_col_prefix]), f"Var_home_odds"] = df_turkey[df_turkey[home_col_prefix]]["Var_home_odds"].mean()
        df_turkey.loc[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey[home_col_prefix]), f"Var_away_odds"] = df_turkey[df_turkey[home_col_prefix]]["Var_away_odds"].mean()
        df_turkey.loc[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey[home_col_prefix]), f"Var_draw_odds"] = pd.concat([df_turkey[df_turkey[home_col_prefix]], df_turkey[df_turkey[f"HomeTeam_{team}"]], df_turkey[df_turkey[f"AwayTeam_{team}"]]])["Var_draw_odds"].mean()


teams = ["Bradford", "Mansfield"]
df_england = dfs["df_england"]

for team in teams:
    for location in ["Home", "Away"]:
        home_col_prefix = f"HomeTeam_{team}" if location == "Home" else f"AwayTeam_{team}"

        # For mean values
        df_england.loc[(df_england["Avg_bookie_prediction"].isna()) & (df_england[home_col_prefix]), f"Avg_home_odds"] = df_england[df_england[home_col_prefix]]["Avg_home_odds"].mean()
        df_england.loc[(df_england["Avg_bookie_prediction"].isna()) & (df_england[home_col_prefix]), f"Avg_away_odds"] = df_england[df_england[home_col_prefix]]["Avg_away_odds"].mean()
        df_england.loc[(df_england["Avg_bookie_prediction"].isna()) & (df_england[home_col_prefix]), f"Avg_draw_odds"] = pd.concat([df_england[df_england[home_col_prefix]], df_england[df_england[f"HomeTeam_{team}"]], df_england[df_england[f"AwayTeam_{team}"]]])["Avg_draw_odds"].mean()

        # For Var values
        df_england.loc[(df_england["Avg_bookie_prediction"].isna()) & (df_england[home_col_prefix]), f"Var_home_odds"] = df_england[df_england[home_col_prefix]]["Var_home_odds"].mean()
        df_england.loc[(df_england["Avg_bookie_prediction"].isna()) & (df_england[home_col_prefix]), f"Var_away_odds"] = df_england[df_england[home_col_prefix]]["Var_away_odds"].mean()
        df_england.loc[(df_england["Avg_bookie_prediction"].isna()) & (df_england[home_col_prefix]), f"Var_draw_odds"] = pd.concat([df_england[df_england[home_col_prefix]], df_england[df_england[f"HomeTeam_{team}"]], df_england[df_england[f"AwayTeam_{team}"]]])["Var_draw_odds"].mean()



df_england["Avg_bookie_prediction"] = df_england[["Avg_away_odds", "Avg_home_odds", "Avg_draw_odds"]].idxmin(axis=1).fillna("").astype(str).str[4]
df_england["Avg_bookie_prediction"] = df_england["Avg_bookie_prediction"].str.upper()

df_turkey["Avg_bookie_prediction"] = df_turkey[["Avg_away_odds", "Avg_home_odds", "Avg_draw_odds"]].idxmin(axis=1).fillna("").astype(str).str[4]
df_turkey["Avg_bookie_prediction"] = df_turkey["Avg_bookie_prediction"].str.upper()

dfs["df_england"] = df_england
dfs["df_turkey"] = df_turkey

In [None]:
df_turkey[(df_turkey["Avg_bookie_prediction"].isna()) & (df_turkey["HomeTeam_Gaziantep"])]["Avg_home_odds"]

In [None]:
df_turkey[df_turkey["HomeTeam_Gaziantep"]]["Avg_home_odds"].mean()

In [None]:
display(df_turkey[df_turkey["Avg_away_odds"].isna()])

### Drop unnecessary columns

In [None]:
combined_labels = [
    "HS", "AS", "HST", "AST", "HHW", "AHW",
    "HC", "AC", "HF", "AF", "HFKC", "AFKC",
    "HO", "AO", "HY", "AY", "HR", "AR",
    "HBP", "ABP",
    "B365H", "B365D", "B365A",
    "BSH", "BSD", "BSA",
    "BWH", "BWD", "BWA",
    "GBH", "GBD", "GBA",
    "IWH", "IWD", "IWA",
    "LBH", "LBD", "LBA",
    "PSH", "PSD", "PSA",
    "SOH", "SOD", "SOA",
    "SBH", "SBD", "SBA",
    "SJH", "SJD", "SJA",
    "SYH", "SYD", "SYA",
    "VCH", "VCD", "VCA",
    "WHH", "WHD", "WHA",
    "Bb1X2", "BbMxH", "BbAvH", "BbMxD", "BbAvD", "BbMxA", "BbAvA",
    "MaxH", "MaxD", "MaxA", "AvgH", "AvgD", "AvgA",
    "BbOU", "BbMx>2.5", "BbAv>2.5", "BbMx<2.5", "BbAv<2.5",
    "GB>2.5", "GB<2.5", "B365>2.5", "B365<2.5", "P>2.5", "P<2.5",
    "Max>2.5", "Max<2.5", "Avg>2.5", "Avg<2.5",
    "BbAH", "BbAHh", "AHh", "BbMxAHH", "BbAvAHH", "BbMxAHA", "BbAvAHA",
    "GBAHH", "GBAHA", "GBAH", "LBAHH", "LBAHA", "LBAH",
    "B365AHH", "B365AHA", "B365AH", "PAHH", "PAHA",
    "MaxAHH", "MaxAHA", "AvgAHH", "AvgAHA", "Unnamed: 0", "Date", "HTHG", "FTR", "HTHG", "HTAG", "HTR", "Index", "country",
    "PSCH", "PSCD", "PSCA", "Time", "B365CH", "B365CD", "B365CA",
    "BWCH", "BWCD", "BWCA", "IWCH", "IWCD", "IWCA", "WHCH", "WHCD",
    "WHCA", "VCCH", "VCCD", "VCCA", "MaxCH", "MaxCD", "MaxCA", "AvgCH",
    "AvgCD", "AvgCA", "B365C>2.5", "B365C<2.5", "PC>2.5", "PC<2.5",
    "MaxC>2.5", "MaxC<2.5", "AvgC>2.5", "AvgC<2.5", "AHCh", "B365CAHH",
    "B365CAHA", "PCAHH", "PCAHA", "MaxCAHH", "MaxCAHA", "AvgCAHH", "AvgCAHA", "Attendance"
]

In [99]:
for country in dfs:
    for col in combined_labels:
        dfs[country].drop(col,axis=1, inplace=True, errors='ignore')
    dfs[country]["Target_regr"] = dfs[country]["FTHG"] + dfs[country]["FTAG"]
    dfs[country]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs[country]['FTAG'], dfs[country]['FTHG'])]
    dfs[country] = dfs[country][dfs[country]["season"] != 0]
    dfs[country] = pd.get_dummies(df['Avg_bookie_prediction'], prefix='Bookie_Prediction')

    print(f"done {country}")

done df_belgium
done df_england
done df_france
done df_germany
done df_greece
done df_italy
done df_netherlands
done df_portugal
done df_scotland
done df_spain
done df_turkey


In [103]:
dfs[country].head()

Unnamed: 0,Bookie_Prediction_A,Bookie_Prediction_D,Bookie_Prediction_H
0,True,False,False
1,True,False,False
2,True,False,False
3,False,False,True
4,False,False,True


# Export Dataset as .csv file

In [101]:
for country in dfs:
    dfs[country][dfs[country]["season"] != 22].to_csv(f"data/train_preprocessed/{country}.csv")
    dfs[country][dfs[country]["season"] == 22].to_csv(f"data/test_preprocessed/{country}.csv")

KeyError: 'season'