In [1]:
import pandas as pd
import numpy as np
import os as os
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

### Load data

In [2]:
dfs_train = {}
dfs_test = {}

In [3]:
for root, directory, files in os.walk("data/train_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_train[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_train[file[:-4]] = pd.get_dummies(dfs_train[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
            except KeyError:
                pass
for root, directory, files in os.walk("data/test_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_test[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_test[file[:-4]] = pd.get_dummies(dfs_test[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
                
            except KeyError:
                pass
            dfs_test[file[:-4]].drop(columns=["Target", "Target_clas", "Target_regr"], inplace=True)




In [4]:
def impute_nan_values(dfs):
    for df in dfs.values():
        for col in df.columns:
            if df[col].dtype == np.float64 or df[col].dtype == np.int64:
                df[col] = df.groupby("season")[col].transform(lambda x: x.fillna(x.mean()))
        df.dropna(inplace=True)
impute_nan_values(dfs_train)
impute_nan_values(dfs_test)

In [26]:
# validation set
dfs_valid_reg_X = {}
dfs_valid_reg_y = {}
dfs_train_reg_X = {}
dfs_train_reg_y = {}
dfs_valid_clas_X = {}
dfs_valid_clas_y = {}
dfs_train_clas_X = {}
dfs_train_clas_y = {}


cols_to_drop = ['FTHG', 'FTAG', 'MatchTeams', 'SameHomeTeam', 'Target', 'Target_regr', 'Target_clas', "Unnamed: 0"]

for country in dfs_train:
    # dfs_train[country] =   dfs_train[country][  dfs_train[country]["season"] > 12]
    dfs_train[country]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_train[country]['FTAG'], dfs_train[country]['FTHG'])]

    dfs_valid_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_regr"]
    dfs_valid_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_regr"]
    dfs_train_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_clas"]
    dfs_train_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

    dfs_valid_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_clas"]
    dfs_valid_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')
    

# Classification task

### Random Forest Baseline Model

In [27]:
rfm = RandomForestClassifier(random_state=42, max_depth=10) ### Cesta je snizit hloubku stromu

In [25]:
dfs_train_clas_X["belgium"]

Unnamed: 0.1,Unnamed: 0,season,Avg_away_odds,Avg_home_odds,Avg_draw_odds,Var_away_odds,Var_home_odds,Var_draw_odds,LastMatchAwayGoals,LastMatchHomeGoals,...,HomeDrawRatio,AwayWinRatio,AwayLossRatio,AwayDrawRatio,HomeTeamAvgShotsOnTarget,AwayTeamAvgShotsOnTarget,HomeTeamScoredRatio,AwayTeamScoredRatio,Avg_bookie_prediction_A,Avg_bookie_prediction_H
0,300,1.0,4.666667,1.556667,3.533333,0.083333,0.000133,0.043333,0.0,1.0,...,0.250000,0.264706,0.500000,0.235294,4.756882,4.309364,0.478261,0.319149,False,True
1,301,1.0,8.600000,1.233333,4.900000,2.680000,0.003333,0.280000,0.0,2.0,...,0.235294,0.323529,0.470588,0.205882,6.264428,4.510234,0.778761,0.457831,False,True
2,302,1.0,2.966667,2.050000,3.166667,0.043333,0.032500,0.063333,4.0,1.0,...,0.187500,0.468750,0.250000,0.281250,4.734209,5.034806,0.472527,0.571429,False,True
3,303,1.0,5.733333,1.416667,3.833333,0.463333,0.000833,0.043333,1.0,0.0,...,0.272727,0.264706,0.500000,0.235294,5.121079,4.826950,0.552381,0.448980,False,True
4,304,1.0,3.916667,1.706667,3.383333,0.270833,0.008133,0.110833,1.0,0.0,...,0.272727,0.264706,0.500000,0.235294,5.015620,5.024576,0.501487,0.505637,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5188,5488,20.0,1.758333,4.306667,3.868333,0.000537,0.052947,0.035617,3.0,0.0,...,0.260692,0.468413,0.288136,0.243451,4.772837,5.113590,0.497591,0.571584,True,False
5189,5489,20.0,5.336667,1.625000,3.916667,0.043067,0.000270,0.013667,3.0,0.0,...,0.258015,0.406250,0.437500,0.156250,5.143005,4.218750,0.608815,0.482759,False,True
5190,5490,20.0,12.416667,1.205000,6.708333,0.241667,0.000070,0.110417,0.0,0.0,...,0.193199,0.337838,0.443694,0.218468,5.602570,4.640432,0.674631,0.479649,False,True
5191,5491,20.0,3.541667,1.845000,4.268333,0.022417,0.000710,0.072417,3.0,1.0,...,0.306122,0.232210,0.494382,0.273408,4.668479,4.398972,0.439294,0.398943,False,True


In [29]:
dfs_train_predict = {}
for country in dfs_train_clas_X.keys():
    rfm.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = rfm.predict(dfs_train_clas_X[country])

    # Calculate the average depth of all decision trees
    average_depth = sum(tree.get_depth() for tree in rfm.estimators_) / len(rfm.estimators_)

    # print(f"Average Depth of Decision Trees for {country}: {average_depth}")

    # print(f"Accuracy for baseline model on validation data for {country}: {round(accuracy_score(dfs_valid_clas_y[country], dfs_train_predict[country])*100, ndigits=4)}%")
    print(f"F1-score for baseline model on validation data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")

F1-score for baseline model on validation data for belgium: 0.6521335968954519
F1-score for baseline model on validation data for england: 0.3448263620952428
F1-score for baseline model on validation data for france: 0.4728276632531063
F1-score for baseline model on validation data for germany: 0.4032306609257694
F1-score for baseline model on validation data for greece: 0.7179190699038669
F1-score for baseline model on validation data for italy: 0.4745922430474163
F1-score for baseline model on validation data for netherlands: 0.6408420423490157
F1-score for baseline model on validation data for portugal: 0.6450986011586589
F1-score for baseline model on validation data for scotland: 0.39488667281153794
F1-score for baseline model on validation data for spain: 0.4354203559769306
F1-score for baseline model on validation data for turkey: 0.5759979992451613


### Random Search

In [8]:
param_grid = {
    'max_depth': [i for i in range(1, 300)],
    'min_samples_split': [i for i in range(1, 300)],
    'min_samples_leaf': [i for i in range(1, 200)]
}
rfm = RandomForestClassifier(random_state=42)
dfs_train_predict_random = {}

random_search = RandomizedSearchCV(estimator=rfm, param_distributions=param_grid, n_iter=2,
                                   cv=5, random_state=42, n_jobs=-1)
dfs_train_predict = {}
for country in dfs_train_clas_X.keys():
    random_search.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    best_params = random_search.best_params_
    best_random_forest = random_search.best_estimator_
    dfs_train_predict_random[country] = best_random_forest.predict(dfs_valid_clas_X[country])

    print(f"F1-score for best random search model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_train_predict_random[country], average='macro')}")

F1-score for best random search model on validation data for belgium: 0.3667028592110026
F1-score for best random search model on validation data for england: 0.3443651480133092
F1-score for best random search model on validation data for france: 0.38087113995476174
F1-score for best random search model on validation data for germany: 0.3787230479265418
F1-score for best random search model on validation data for greece: 0.38423579517048295
F1-score for best random search model on validation data for italy: 0.3935444095357095
F1-score for best random search model on validation data for netherlands: 0.42759268136784917
F1-score for best random search model on validation data for portugal: 0.4132132132132133
F1-score for best random search model on validation data for scotland: 0.39276795524922586
F1-score for best random search model on validation data for spain: 0.3661081493930391
F1-score for best random search model on validation data for turkey: 0.3636725016681573
