In [186]:
import pandas as pd
import numpy as np
import os as os
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB,CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

### Load data

In [187]:
dfs_train = {}
dfs_test = {}

In [188]:
for root, directory, files in os.walk("data/train_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_train[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_train[file[:-4]] = pd.get_dummies(dfs_train[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
            except KeyError:
                pass
lens_test = 0
for root, directory, files in os.walk("data/test_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_test[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_test[file[:-4]] = pd.get_dummies(dfs_test[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
            except KeyError:
                pass
            
            lens_test += dfs_test[file[:-4]].shape[0]
print("---")
lens_orig = 0
df_test_y = pd.DataFrame()
for root, directory, files in os.walk("data/orig_data", topdown=False):
    if files:
        for file in files:
            # print(pd.read_csv(f"{root}/{file}").shape)
            tmp = pd.read_csv(f"{root}/{file}")
            lens_orig += tmp.shape[0]
            tmp["country"] = file[:-5]
            df_test_y = pd.concat([df_test_y, tmp], axis=0)

---


In [189]:
df_test_y["Date"] = pd.to_datetime(df_test_y["Date"], format = '%d/%m/%Y', dayfirst=True)

df_test_y.sort_values(by="Date", inplace=True, ascending=True)

In [190]:
df_test_y[df_test_y["country"]=="england"]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,country,Referee
0,E1,2022-07-29,20:00,Huddersfield,Burnley,0,1,A,0.0,1.0,...,2.09,1.81,2.10,1.82,2.14,1.83,2.09,1.78,england,J Linington
6,E3,2022-07-30,15:00,Rochdale,Crewe,1,2,A,0.0,2.0,...,1.70,2.10,1.78,2.12,1.85,2.17,1.75,2.07,england,A Kitchen
5,E3,2022-07-30,15:00,Northampton,Colchester,3,2,H,1.0,1.0,...,2.00,1.85,2.02,1.87,2.02,1.92,1.96,1.85,england,M Woods
4,E3,2022-07-30,15:00,Leyton Orient,Grimsby,2,0,H,0.0,0.0,...,1.80,2.05,1.79,2.10,1.86,2.10,1.80,2.02,england,C Pollard
11,E2,2022-07-30,15:00,Wycombe,Burton,3,0,H,3.0,0.0,...,2.00,1.85,2.00,1.88,2.03,1.92,1.96,1.85,england,G Ward
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,2023-05-28,16:30,Everton,Bournemouth,1,0,H,0.0,0.0,...,2.02,1.77,2.10,1.81,2.17,1.92,2.03,1.83,england,S Attwell
372,E0,2023-05-28,16:30,Brentford,Man City,1,0,H,0.0,0.0,...,1.93,1.97,2.05,1.86,2.28,1.97,2.01,1.85,england,J Brooks
377,E0,2023-05-28,16:30,Leicester,West Ham,2,1,H,1.0,0.0,...,1.75,2.05,1.85,2.06,1.90,2.16,1.82,2.04,england,S Hooper
378,E0,2023-05-28,16:30,Man United,Fulham,2,1,H,1.0,1.0,...,1.98,1.92,1.98,1.93,2.07,1.98,1.97,1.89,england,R Jones


In [191]:
for country in df_test_y["country"].unique():
    col1 = df_test_y[df_test_y["country"] == country]["FTHG"].reset_index()
    col2 = df_test_y[df_test_y["country"] == country]["FTAG"].reset_index()

    target_values = col1["FTHG"] + col2["FTAG"]

    dfs_test[f"df_{country}"]["Target_regr"] = target_values
    dfs_test[f"df_{country}"]["FTHG"] = col1["FTHG"]
    dfs_test[f"df_{country}"]["FTAG"] = col2["FTAG"]
    dfs_test[f"df_{country}"]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_test[f"df_{country}"]['FTAG'], dfs_test[f"df_{country}"]['FTHG'])]
    dfs_test[f"df_{country}"].drop(columns=["FTHG", "FTAG", "Unnamed: 0"], inplace=True)


In [192]:
dfs_train["df_belgium"]

Unnamed: 0.1,Unnamed: 0,FTHG,FTAG,season,Avg_away_odds,Avg_home_odds,Avg_draw_odds,Var_away_odds,Var_home_odds,Var_draw_odds,...,AwayLossRatio,AwayDrawRatio,HomeTeamAvgShotsOnTarget,AwayTeamAvgShotsOnTarget,HomeTeamScoredRatio,AwayTeamScoredRatio,Target_regr,Target_clas,Bookie_Prediction_A,Bookie_Prediction_H
0,300,5.0,1.0,1.0,4.666667,1.556667,3.533333,0.083333,0.000133,0.043333,...,0.500000,0.235294,4.756882,4.309364,0.478261,0.319149,6.0,1,False,True
1,301,3.0,0.0,1.0,8.600000,1.233333,4.900000,2.680000,0.003333,0.280000,...,0.470588,0.205882,6.264428,4.510234,0.778761,0.457831,3.0,1,False,True
2,302,1.0,1.0,1.0,2.966667,2.050000,3.166667,0.043333,0.032500,0.063333,...,0.250000,0.281250,4.734209,5.034806,0.472527,0.571429,2.0,-1,False,True
3,303,4.0,2.0,1.0,5.733333,1.416667,3.833333,0.463333,0.000833,0.043333,...,0.500000,0.235294,5.121079,4.826950,0.552381,0.448980,6.0,1,False,True
4,304,1.0,0.0,1.0,3.916667,1.706667,3.383333,0.270833,0.008133,0.110833,...,0.500000,0.235294,,,,,1.0,1,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5487,5787,3.0,0.0,21.0,3.914000,1.870000,3.574000,0.010480,0.001150,0.013380,...,0.249637,0.258345,4.455075,5.083699,0.436352,0.600632,3.0,1,False,True
5488,5788,2.0,0.0,21.0,7.970000,1.330000,5.290000,0.092000,0.000350,0.045500,...,0.426195,0.237006,5.613602,4.621183,0.673558,0.445467,2.0,1,False,True
5489,5789,2.0,3.0,21.0,1.392000,7.320000,4.752000,0.001270,0.130750,0.026770,...,0.156342,0.227139,4.648310,5.431673,0.486462,0.683400,5.0,0,True,False
5490,5790,5.0,0.0,21.0,11.128000,1.228000,6.252000,2.046920,0.000370,0.062520,...,0.424581,0.312849,5.128811,4.660707,0.574391,0.441392,5.0,1,False,True


In [193]:
def impute_nan_values(dfs):
    for df in dfs.values():
        for col in df.columns:
            if df[col].dtype == np.float64 or df[col].dtype == np.int64:
                df[col] = df.groupby("season")[col].transform(lambda x: x.fillna(x.mean()))
        df.dropna(inplace=True)
impute_nan_values(dfs_train)
impute_nan_values(dfs_test)

In [194]:
# validation set
dfs_valid_reg_X = {}
dfs_valid_reg_y = {}
dfs_train_reg_X = {}
dfs_train_reg_y = {}
dfs_valid_clas_X = {}
dfs_valid_clas_y = {}
dfs_train_clas_X = {}
dfs_train_clas_y = {}
dfs_test_clas_X = {}
dfs_test_clas_y = {}
dfs_test_reg_X = {}
dfs_test_reg_y = {}

cols_to_drop = ['FTHG', 'FTAG', 'MatchTeams', 'SameHomeTeam', 'Target', 'Target_regr', 'Target_clas', "Unnamed: 0", "index", "season"]

for country in dfs_train:
    dfs_train[country] =   dfs_train[country][dfs_train[country]["season"] > 17]
    dfs_train[country]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_train[country]['FTAG'], dfs_train[country]['FTHG'])]

    dfs_valid_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_regr"]
    dfs_valid_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_regr"]
    dfs_train_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_clas"]
    dfs_train_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

    dfs_valid_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_clas"]
    dfs_valid_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')

for country in dfs_test:
    dfs_test_reg_y[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21]["Target_regr"]
    dfs_test_reg_X[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

    dfs_test_clas_y[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21]["Target_clas"]
    dfs_test_clas_X[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs_train[country]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_train[country]['FTAG'], dfs_train[country]['FTHG'])]


In [195]:
train_results = pd.DataFrame(columns=["country"])
test_results = pd.DataFrame(columns=["country"])

In [196]:
train_results["country"] = dfs_train_clas_X.keys()
test_results["country"] = dfs_train_clas_X.keys()


In [197]:
train_results

Unnamed: 0,country
0,belgium
1,england
2,france
3,germany
4,greece
5,italy
6,netherlands
7,portugal
8,scotland
9,spain


# Classification task
We decided to go with Voting Classifier consisting of 3 classification algorithms -  Gaussian Naive Bayes, RandomForestClassifier & Logistic Regression

### Logistic Regression

In [198]:
lr = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=10000) ### Cesta je snizit hloubku stromu

In [199]:
dfs_train_predict = {}
dfs_valid_predict = {}
dfs_test_predict = {}
for country in dfs_train_clas_X.keys():
    lr.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = lr.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = lr.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = lr.predict(dfs_test_clas_X[country])
    train_results.loc[train_results["country"] == country, "lr"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')
    test_results.loc[test_results["country"] == country, "lr"] = f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')
    
    print(f"Accuracy for baseline model on validation data for {country}: {accuracy_score(dfs_valid_clas_y[country], dfs_valid_predict[country])}")
    print(f"F1-score for baseline model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for baseline model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")
    

Accuracy for baseline model on validation data for belgium: 0.5250836120401338
F1-score for baseline model on train data for belgium: 0.4727805105667286
F1-score for baseline model on validation data for belgium: 0.4300656102874103
F1-score for baseline model on test data for belgium: 0.32740886607856273
 --------
Accuracy for baseline model on validation data for england: 0.4506265664160401
F1-score for baseline model on train data for england: 0.49817595686333976
F1-score for baseline model on validation data for england: 0.4009770162096624
F1-score for baseline model on test data for england: 0.3379346465524318
 --------
Accuracy for baseline model on validation data for france: 0.44519621109607577
F1-score for baseline model on train data for france: 0.4931800029005838
F1-score for baseline model on validation data for france: 0.39574803157157595
F1-score for baseline model on test data for france: 0.3803300321503376
 --------
Accuracy for baseline model on validation data for germ

In [200]:
display(train_results[["country", "lr"]])
display(test_results)

Unnamed: 0,country,lr
0,belgium,0.430066
1,england,0.400977
2,france,0.395748
3,germany,0.390315
4,greece,0.406559
5,italy,0.428808
6,netherlands,0.413005
7,portugal,0.469899
8,scotland,0.420993
9,spain,0.407204


Unnamed: 0,country,lr
0,belgium,0.327409
1,england,0.337935
2,france,0.38033
3,germany,0.359231
4,greece,0.402704
5,italy,0.324591
6,netherlands,0.382111
7,portugal,0.389403
8,scotland,0.362295
9,spain,0.307611


### Random Forest Baseline Model

In [201]:
rfm = RandomForestClassifier(random_state=42, max_depth=4) ### Cesta je snizit hloubku stromu

In [202]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    rfm.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = rfm.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = rfm.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = rfm.predict(dfs_test_clas_X[country])

    # Calculate the average depth of all decision trees
    average_depth = sum(tree.get_depth() for tree in rfm.estimators_) / len(rfm.estimators_)

    # print(f"Average Depth of Decision Trees for {country}: {average_depth}")
    train_results.loc[train_results["country"] == country, "rfm"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    # print(f"Accuracy for baseline model on validation data for {country}: {round(accuracy_score(dfs_valid_clas_y[country], dfs_train_predict[country])*100, ndigits=4)}%")
    print(f"F1-score for baseline Random Forest model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline Random Forest model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for baseline Random Forest model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")


F1-score for baseline Random Forest model on train data for belgium: 0.4378488347502432
F1-score for baseline Random Forest model on validation data for belgium: 0.40367993153615744
F1-score for baseline Random Forest model on test data for belgium: 0.3310089877775468
 --------
F1-score for baseline Random Forest model on train data for england: 0.32743610288464803
F1-score for baseline Random Forest model on validation data for england: 0.32822042822042824
F1-score for baseline Random Forest model on test data for england: 0.25990710678210677
 --------
F1-score for baseline Random Forest model on train data for france: 0.399439097532346
F1-score for baseline Random Forest model on validation data for france: 0.35889580164665885
F1-score for baseline Random Forest model on test data for france: 0.31746807433157204
 --------
F1-score for baseline Random Forest model on train data for germany: 0.372131441854511
F1-score for baseline Random Forest model on validation data for germany: 0.3

### MultinomialNaive Bayes

In [203]:
nb = MultinomialNB()

In [204]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    nb.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = nb.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = nb.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = nb.predict(dfs_test_clas_X[country])

    train_results.loc[train_results["country"] == country, "nb"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline Multinomial Naive Bayes model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline Multinomial Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for baseline Multinomial Naive Bayes model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")


F1-score for baseline Multinomial Naive Bayes model on train data for belgium: 0.4255711292491478
F1-score for baseline Multinomial Naive Bayes model on validation data for belgium: 0.40595500070221185
F1-score for baseline Multinomial Naive Bayes model on test data for belgium: 0.3616061357996842
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for england: 0.47621455325225986
F1-score for baseline Multinomial Naive Bayes model on validation data for england: 0.44302221161866573
F1-score for baseline Multinomial Naive Bayes model on test data for england: 0.3425640810954196
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for france: 0.48709385058962384
F1-score for baseline Multinomial Naive Bayes model on validation data for france: 0.44166874510691706
F1-score for baseline Multinomial Naive Bayes model on test data for france: 0.38679821408619536
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for ger

### KNeighborsClassifier

In [205]:
kn = KNeighborsClassifier()

In [206]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    kn.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = kn.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = kn.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = kn.predict(dfs_test_clas_X[country])
    train_results.loc[train_results["country"] == country, "kn"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline KNeighbors model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline KNeighbors model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for baseline KNeighbors model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")

F1-score for baseline KNeighbors model on train data for belgium: 0.5710600107944851
F1-score for baseline KNeighbors model on validation data for belgium: 0.46310744352754857
F1-score for baseline KNeighbors model on test data for belgium: 0.3701209330737993
 --------
F1-score for baseline KNeighbors model on train data for england: 0.5827749869996115
F1-score for baseline KNeighbors model on validation data for england: 0.3933019717107375
F1-score for baseline KNeighbors model on test data for england: 0.31704960062302684
 --------
F1-score for baseline KNeighbors model on train data for france: 0.6045559302159741
F1-score for baseline KNeighbors model on validation data for france: 0.4070633684926858
F1-score for baseline KNeighbors model on test data for france: 0.3567566253218599
 --------
F1-score for baseline KNeighbors model on train data for germany: 0.5979028682142488
F1-score for baseline KNeighbors model on validation data for germany: 0.43189945723011075
F1-score for basel

### Gradient Boosting Classifier

In [207]:
gbc = GradientBoostingClassifier()

In [209]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    gbc.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = gbc.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = gbc.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = gbc.predict(dfs_test_clas_X[country])

    train_results.loc[train_results["country"] == country, "gbc"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline GradientBoostingClassifier model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline GradientBoostingClassifier model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for baseline GradientBoostingClassifier model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")

F1-score for baseline GradientBoostingClassifier model on train data for belgium: 0.8814600471442763
F1-score for baseline GradientBoostingClassifier model on validation data for belgium: 0.4155311657373229
F1-score for baseline GradientBoostingClassifier model on test data for belgium: 0.35122323033677416
 --------
F1-score for baseline GradientBoostingClassifier model on train data for england: 0.5295075625406082
F1-score for baseline GradientBoostingClassifier model on validation data for england: 0.3946094169515673
F1-score for baseline GradientBoostingClassifier model on test data for england: 0.3031707783141805
 --------
F1-score for baseline GradientBoostingClassifier model on train data for france: 0.7090153120670831
F1-score for baseline GradientBoostingClassifier model on validation data for france: 0.3685820972938547
F1-score for baseline GradientBoostingClassifier model on test data for france: 0.3360432708936869
 --------
F1-score for baseline GradientBoostingClassifier mo

## Random Search
Use random search to get best parameters for each country

### Random Forrest 

In [210]:
param_grid = {
    'max_depth': [i for i in range(1, 30)],
    'min_samples_split': [i for i in range(1, 300)],
    'min_samples_leaf': [i for i in range(1, 200)]
}
rfm = RandomForestClassifier(random_state=42)
dfs_train_predict_random = {}
dfs_valid_predict_random = {}
dfs_best_params_rfm = {}
random_search = RandomizedSearchCV(estimator=rfm, param_distributions=param_grid, n_iter=40,
                                   cv=5, random_state=42, n_jobs=-1)
dfs_train_predict = {}
for country in dfs_train_clas_X.keys():
    
    random_search.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    
    best_params = random_search.best_params_
    best_random_forest = random_search.best_estimator_
    
    dfs_best_params_rfm[country] = best_params
    dfs_train_predict_random[country] = best_random_forest.predict(dfs_train_clas_X[country])
    dfs_valid_predict_random[country] = best_random_forest.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = best_random_forest.predict(dfs_test_clas_X[country])

    train_results.loc[train_results["country"] == country, "rfm_rs"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')
    train_results.loc[train_results["country"] == country, "best_params"] = best_params
    print(f"F1-score for best Random Forest model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict_random[country], average='macro')}")
    print(f"F1-score for best Random Forest model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict_random[country], average='macro')}")
    print(f"F1-score for baseline GradientBoostingClassifier model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")

F1-score for best Random Forest model on train data for belgium: 0.40116661703963286
F1-score for best Random Forest model on validation data for belgium: 0.40099324419354687
F1-score for baseline GradientBoostingClassifier model on test data for belgium: 0.32663673774784885
 --------
F1-score for best Random Forest model on train data for england: 0.35904938284710397
F1-score for best Random Forest model on validation data for england: 0.3712762598664237
F1-score for baseline GradientBoostingClassifier model on test data for england: 0.28339852727358006
 --------
F1-score for best Random Forest model on train data for france: 0.36262657049763297
F1-score for best Random Forest model on validation data for france: 0.3540200681211452
F1-score for baseline GradientBoostingClassifier model on test data for france: 0.32003788008727807
 --------
F1-score for best Random Forest model on train data for germany: 0.37735064070560825
F1-score for best Random Forest model on validation data for g

### LogReg

In [211]:
param_grid = {
    'penalty': ['l1', 'l2'],                 # Regularization penalty ('l1' or 'l2')
    'C': [0.001, 0.01, 0.1, 1, 10, 100],     # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],        # Algorithm to use in the optimization problem
    'max_iter': [8000]               # Maximum number of iterations for optimization
}

lr = LogisticRegression()
dfs_train_predict_random = {}
dfs_valid_predict_random = {}
dfs_best_params_lr = {}
random_search = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, n_iter=10,
                                   cv=5, random_state=42, n_jobs=-1)


fs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    random_search.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])

    best_params = random_search.best_params_


    dfs_best_params_lr[country] = best_params

    best_knn = random_search.best_estimator_
    dfs_train_predict[country] = best_knn.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = best_knn.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = best_knn.predict(dfs_test_clas_X[country])

    train_results.loc[train_results["country"] == country, "lr_rc"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for best LogReg model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for best LogReg model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for best LogReg model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")


F1-score for best LogReg model on train data for belgium: 0.39246176906886615
F1-score for best LogReg model on validation data for belgium: 0.39807389937106924
F1-score for best LogReg model on test data for belgium: 0.34056982647571415
 --------


KeyboardInterrupt: 

### Grid Search

In [None]:
train_results

### Implementing Voting Classifier

In [212]:
best_models = []
for i, row in train_results.iterrows():
    # display(row[["lr", "rfm", "nb", "kn", "gbc", "rfm_rs"]])
    best_models.append(pd.to_numeric(row[["lr", "rfm", "nb", "kn", "gbc", "rfm_rs"]]).nlargest(3).index.tolist())

In [213]:


def voting_classifier(best_models_country, X_train, y_train, X_val, y_val, country: str, X_test, y_test):
    """"""
    models = {
        "lr": LogisticRegression(random_state=42, multi_class='multinomial', max_iter=10000),
        "rfm": rfm,
        "nb": nb,
        "kn": kn,
        "gbc": gbc,
        "rfm_rs": RandomForestClassifier(random_state=42),
        "lr_rc": LogisticRegression(random_state=42)
        
    }
    print(best_models_country)
    clf1 = models[best_models_country[0]] if best_models_country[0] != "rfm_rs" else models[best_models_country[0]].set_params(**dfs_best_params_rfm[country]) if best_models_country[0] != "lr_rc" else models[best_models_country[0]].set_params(**dfs_best_params_lr[country])
    clf2 = models[best_models_country[1]] if best_models_country[1] != "rfm_rs" else models[best_models_country[1]].set_params(**dfs_best_params_rfm[country]) if best_models_country[0] != "lr_rc" else models[best_models_country[0]].set_params(**dfs_best_params_lr[country])
    clf3 = models[best_models_country[2]] if best_models_country[2] != "rfm_rs" else models[best_models_country[2]].set_params(**dfs_best_params_rfm[country]) if best_models_country[0] != "lr_rc" else models[best_models_country[0]].set_params(**dfs_best_params_lr[country])
    
    eclf = VotingClassifier(
        estimators=[(best_models_country[0], clf1 ), (best_models_country[1], clf2 ),(best_models_country[2], clf3 )],
        voting='hard'
    ) 
    eclf.fit(X_train,y_train)
    
    y_train_predict = eclf.predict(X_train)
    y_val_predict = eclf.predict(X_val)
    
    y_test_predict = eclf.predict(X_test)
    
    
    print(f"F1-score for voting classifier model on train data for {country}: {f1_score(y_train, y_train_predict, average='macro')}")
    print(f"F1-score for voting classifier model on validation data for {country}: {f1_score(y_val, y_val_predict, average='macro')}")
    print(f"F1-score for voting classifier model on test data for {country}: {f1_score(y_test, y_test_predict, average='macro')}\n --------")
    

In [214]:
for i, country in enumerate(dfs_train_clas_X.keys()):
    voting_classifier(best_models[i], dfs_train_clas_X[country], dfs_train_clas_y[country], dfs_valid_clas_X[country], dfs_valid_clas_y[country],country, dfs_test_clas_X[country], dfs_test_clas_y[country])

['kn', 'lr', 'gbc']
F1-score for voting classifier model on train data for belgium: 0.6952013209310076
F1-score for voting classifier model on validation data for belgium: 0.46049255036491443
F1-score for voting classifier model on test data for belgium: 0.3560412816336987
 --------
['nb', 'lr', 'gbc']
F1-score for voting classifier model on train data for england: 0.512238975196527
F1-score for voting classifier model on validation data for england: 0.42991395307775315
F1-score for voting classifier model on test data for england: 0.335940104983608
 --------
['nb', 'kn', 'lr']
F1-score for voting classifier model on train data for france: 0.5333761722346051
F1-score for voting classifier model on validation data for france: 0.4248510662873288
F1-score for voting classifier model on test data for france: 0.37622863055785283
 --------
['kn', 'nb', 'lr']
F1-score for voting classifier model on train data for germany: 0.5225956197823206
F1-score for voting classifier model on validation d