In [114]:
import pandas as pd
import numpy as np
import os as os
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB,CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

### Load data

In [2]:
dfs_train = {}
dfs_test = {}

In [3]:
for root, directory, files in os.walk("data/train_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_train[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_train[file[:-4]] = pd.get_dummies(dfs_train[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
            except KeyError:
                pass
for root, directory, files in os.walk("data/test_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_test[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_test[file[:-4]] = pd.get_dummies(dfs_test[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
                
            except KeyError:
                pass
            dfs_test[file[:-4]].drop(columns=["Target_clas", "Target_regr"], inplace=True)

In [4]:
def impute_nan_values(dfs):
    for df in dfs.values():
        for col in df.columns:
            if df[col].dtype == np.float64 or df[col].dtype == np.int64:
                df[col] = df.groupby("season")[col].transform(lambda x: x.fillna(x.mean()))
        df.dropna(inplace=True)
impute_nan_values(dfs_train)
impute_nan_values(dfs_test)

In [33]:
# validation set
dfs_valid_reg_X = {}
dfs_valid_reg_y = {}
dfs_train_reg_X = {}
dfs_train_reg_y = {}
dfs_valid_clas_X = {}
dfs_valid_clas_y = {}
dfs_train_clas_X = {}
dfs_train_clas_y = {}


cols_to_drop = ['FTHG', 'FTAG', 'MatchTeams', 'SameHomeTeam', 'Target', 'Target_regr', 'Target_clas', "Unnamed: 0"]

for country in dfs_train:
    # dfs_train[country] =   dfs_train[country][  dfs_train[country]["season"] > 12]
    dfs_train[country]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_train[country]['FTAG'], dfs_train[country]['FTHG'])]

    dfs_valid_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_regr"]
    dfs_valid_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_regr"]
    dfs_train_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_clas"]
    dfs_train_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

    dfs_valid_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_clas"]
    dfs_valid_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')


In [66]:
train_results = pd.DataFrame(columns=["country"])

In [67]:
train_results["country"] = dfs_train_clas_X.keys()

In [68]:
train_results

Unnamed: 0,country
0,belgium
1,england
2,france
3,germany
4,greece
5,italy
6,netherlands
7,portugal
8,scotland
9,spain


# Classification task
We decided to go with Voting Classifier consisting of 3 classification algorithms -  Gaussian Naive Bayes, RandomForestClassifier & Logistic Regression

### Logistic Regression

In [37]:
lr = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=10000) ### Cesta je snizit hloubku stromu

In [69]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    lr.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = lr.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = lr.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "lr"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')
    print(f"F1-score for baseline model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline model on train data for belgium: 0.43014606492079216
F1-score for baseline model on validation data for belgium: 0.4176354911238633
 --------
F1-score for baseline model on train data for england: 0.3949700342699738
F1-score for baseline model on validation data for england: 0.4100840644217642
 --------
F1-score for baseline model on train data for france: 0.39493113940696506
F1-score for baseline model on validation data for france: 0.3980616384466553
 --------
F1-score for baseline model on train data for germany: 0.3958911765311015
F1-score for baseline model on validation data for germany: 0.3719856092675347
 --------
F1-score for baseline model on train data for greece: 0.5008292035211275
F1-score for baseline model on validation data for greece: 0.42837095194268276
 --------
F1-score for baseline model on train data for italy: 0.4426181011698844
F1-score for baseline model on validation data for italy: 0.4140664996093013
 --------
F1-score for baseline model

### Random Forest Baseline Model

In [109]:
rfm = RandomForestClassifier(random_state=42, max_depth=12) ### Cesta je snizit hloubku stromu

In [112]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    rfm.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = rfm.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = rfm.predict(dfs_valid_clas_X[country])
    # Calculate the average depth of all decision trees
    average_depth = sum(tree.get_depth() for tree in rfm.estimators_) / len(rfm.estimators_)

    # print(f"Average Depth of Decision Trees for {country}: {average_depth}")
    train_results.loc[train_results["country"] == country, "rfm"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    # print(f"Accuracy for baseline model on validation data for {country}: {round(accuracy_score(dfs_valid_clas_y[country], dfs_train_predict[country])*100, ndigits=4)}%")
    print(f"F1-score for baseline Random Forest model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline Random model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline Random Forest model on train data for belgium: 0.4958541003148133
F1-score for baseline Random model on validation data for belgium: 0.4075616480613878
 --------
F1-score for baseline Random Forest model on train data for england: 0.34021077501126856
F1-score for baseline Random model on validation data for england: 0.3708075467071283
 --------
F1-score for baseline Random Forest model on train data for france: 0.38344945989267903
F1-score for baseline Random model on validation data for france: 0.36346769506877824
 --------
F1-score for baseline Random Forest model on train data for germany: 0.3646404892872359
F1-score for baseline Random model on validation data for germany: 0.3517197540320865
 --------
F1-score for baseline Random Forest model on train data for greece: 0.6013034629047252
F1-score for baseline Random model on validation data for greece: 0.4062224616213304
 --------
F1-score for baseline Random Forest model on train data for italy: 0.412891454523

### MultinomialNaive Bayes

In [11]:
nb = MultinomialNB()

In [71]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    nb.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = nb.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = nb.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "nb"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline Multinomial Naive Bayes model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline Multinomial Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline Multinomial Naive Bayes model on train data for belgium: 0.44618249390671066
F1-score for baseline Multinomial Naive Bayes model on validation data for belgium: 0.4628873279475689
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for england: 0.4027666304679926
F1-score for baseline Multinomial Naive Bayes model on validation data for england: 0.3966624859491937
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for france: 0.4326559199079685
F1-score for baseline Multinomial Naive Bayes model on validation data for france: 0.4023597767272586
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for germany: 0.42889144141673646
F1-score for baseline Multinomial Naive Bayes model on validation data for germany: 0.43488045924524615
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for greece: 0.5137807834450298
F1-score for baseline Multinomial Naive Bayes model on val

### KNeighborsClassifier

In [13]:
kn = KNeighborsClassifier()

In [72]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    kn.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = kn.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = kn.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "kn"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline KNeighbors model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline KNeighbors Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline KNeighbors model on train data for belgium: 0.5989689080352577
F1-score for baseline KNeighbors Naive Bayes model on validation data for belgium: 0.39529002686897413
 --------
F1-score for baseline KNeighbors model on train data for england: 0.5836088667204372
F1-score for baseline KNeighbors Naive Bayes model on validation data for england: 0.37459397413994583
 --------
F1-score for baseline KNeighbors model on train data for france: 0.5867507611254664
F1-score for baseline KNeighbors Naive Bayes model on validation data for france: 0.42872444831200207
 --------
F1-score for baseline KNeighbors model on train data for germany: 0.5871107727382849
F1-score for baseline KNeighbors Naive Bayes model on validation data for germany: 0.4256512457558741
 --------
F1-score for baseline KNeighbors model on train data for greece: 0.6377260226943732
F1-score for baseline KNeighbors Naive Bayes model on validation data for greece: 0.4101284958427816
 --------
F1-score for bas

### Gradient Boosting Classifier

In [15]:
gbc = GradientBoostingClassifier()

In [73]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    gbc.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = gbc.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = gbc.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "gbc"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline GradientBoostingClassifier model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline GradientBoostingClassifier model on train data for belgium: 0.5444803763080198
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for belgium: 0.4085501936968092
 --------
F1-score for baseline GradientBoostingClassifier model on train data for england: 0.3666579764213906
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for england: 0.3867912056451798
 --------
F1-score for baseline GradientBoostingClassifier model on train data for france: 0.4273899958663092
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for france: 0.3808062020538685
 --------
F1-score for baseline GradientBoostingClassifier model on train data for germany: 0.408162703355235
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for germany: 0.3538068811884982
 --------
F1-score for baseline GradientBoostingClassifier model on train data for greece: 0.6152

### Random Search
Use random search to get best parameters for each country

In [132]:
param_grid = {
    'max_depth': [i for i in range(1, 30)],
    'min_samples_split': [i for i in range(1, 300)],
    'min_samples_leaf': [i for i in range(1, 200)]
}
rfm = RandomForestClassifier(random_state=42)
dfs_train_predict_random = {}
dfs_valid_predict_random = {}
dfs_best_params = {}
random_search = RandomizedSearchCV(estimator=rfm, param_distributions=param_grid, n_iter=40,
                                   cv=5, random_state=42, n_jobs=-1)
dfs_train_predict = {}
for country in dfs_train_clas_X.keys():
    
    random_search.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    
    best_params = random_search.best_params_
    best_random_forest = random_search.best_estimator_
    
    dfs_best_params[country] = best_params
    dfs_train_predict_random[country] = best_random_forest.predict(dfs_train_clas_X[country])
    dfs_valid_predict_random[country] = best_random_forest.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "rfm_rs"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')
    train_results.loc[train_results["country"] == country, "best_params"] = best_params
    print(f"F1-score for best random search model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict_random[country], average='macro')}")
    print(f"F1-score for best random search model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict_random[country], average='macro')}\n --------")

F1-score for best random search model on train data for belgium: 0.4206357122521287
F1-score for best random search model on validation data for belgium: 0.4073565874678029
 --------
F1-score for best random search model on train data for england: 0.33841253088314566
F1-score for best random search model on validation data for england: 0.372513431312572
 --------
F1-score for best random search model on train data for france: 0.3339542025743902
F1-score for best random search model on validation data for france: 0.35498291399955023
 --------
F1-score for best random search model on train data for germany: 0.34908613087321444
F1-score for best random search model on validation data for germany: 0.3560462321727856
 --------
F1-score for best random search model on train data for greece: 0.4516067839539473
F1-score for best random search model on validation data for greece: 0.4051450833905467
 --------
F1-score for best random search model on train data for italy: 0.3829512535730468
F1-sc

### Grid Search

In [153]:
train_results

Unnamed: 0,country,lr,rfm,nb,kn,gbc,rfm_rs,best_models,best_params
0,belgium,0.417635,0.407562,0.462887,0.39529,0.40855,0.407562,,
1,england,0.410084,0.370808,0.396662,0.374594,0.386791,0.370808,,
2,france,0.398062,0.363468,0.40236,0.428724,0.380806,0.363468,,
3,germany,0.371986,0.35172,0.43488,0.425651,0.353807,0.35172,,
4,greece,0.428371,0.406222,0.434389,0.410128,0.431096,0.406222,,
5,italy,0.414066,0.38997,0.459913,0.442976,0.401518,0.38997,,
6,netherlands,0.411464,0.398587,0.47692,0.431415,0.402,0.398587,,
7,portugal,0.420148,0.403393,0.546046,0.516181,0.466707,0.403393,,
8,scotland,0.367647,0.363087,0.456391,0.427188,0.40409,0.363087,,
9,spain,0.411035,0.369943,0.417607,0.42329,0.408056,0.369943,,


### Implementing Voting Classifier

In [149]:
best_models = []
for i, row in train_results.iterrows():
    # display(row[["lr", "rfm", "nb", "kn", "gbc", "rfm_rs"]])
    best_models.append(pd.to_numeric(row[["lr", "rfm", "nb", "kn", "gbc", "rfm_rs"]]).nlargest(3).index.tolist())

In [151]:
models = {
    "lr": lr,
    "rfm": rfm,
    "nb": nb,
    "kn": kn,
    "gbc": gbc,
    "rfm_rs": RandomForestClassifier()
}

def voting_classifier(best_models_country, X_train, y_train, X_val, y_val, country: str):
    """"""
    print(best_models_country)
    clf1 = models[best_models_country[0]] if best_models_country[0] != "rfm_rs" else models[best_models_country[0]].set_params(**dfs_best_params[country])
    clf2 = models[best_models_country[1]] if best_models_country[1] != "rfm_rs" else models[best_models_country[1]].set_params(**dfs_best_params[country])
    clf3 = models[best_models_country[2]] if best_models_country[2] != "rfm_rs" else models[best_models_country[2]].set_params(**dfs_best_params[country])
    
    eclf = VotingClassifier(
        estimators=[(best_models_country[0], clf1 ), (best_models_country[1], clf2 ),(best_models_country[2], clf3 )],
        voting='hard'
    ) 
    eclf.fit(X_train,y_train)
    
    y_train_predict = eclf.predict(X_train)
    y_val_predict = eclf.predict(X_val)
    
    
    print(f"F1-score for best random search model on train data for {country}: {f1_score(y_train, y_train_predict, average='macro')}")
    print(f"F1-score for best random search model on validation data for {country}: {f1_score(y_val, y_val_predict, average='macro')}\n --------")
    

In [152]:
for i, country in enumerate(dfs_train_clas_X.keys()):
    voting_classifier(best_models[i], dfs_train_clas_X[country], dfs_train_clas_y[country], dfs_valid_clas_X[country], dfs_valid_clas_y[country],country)

['nb', 'lr', 'gbc']
F1-score for best random search model on train data for belgium: 0.4641168582003948
F1-score for best random search model on validation data for belgium: 0.4402185037214237
 --------
['lr', 'nb', 'gbc']
F1-score for best random search model on train data for england: 0.39075727212292793
F1-score for best random search model on validation data for england: 0.4080574744021903
 --------
['kn', 'nb', 'lr']
F1-score for best random search model on train data for france: 0.4694452448749935
F1-score for best random search model on validation data for france: 0.44201874916052386
 --------
['nb', 'kn', 'lr']
F1-score for best random search model on train data for germany: 0.4726499134054887
F1-score for best random search model on validation data for germany: 0.4437529251146402
 --------
['nb', 'gbc', 'lr']
F1-score for best random search model on train data for greece: 0.5616674354550465
F1-score for best random search model on validation data for greece: 0.4455895996614998