In [2]:
import pandas as pd
import numpy as np
import os as os
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB,CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score
from matplotlib import pyplot as plt
from sklearn.tree import plot_tree

### Load data

In [3]:
dfs_train = {}
dfs_test = {}

In [81]:
for root, directory, files in os.walk("data/train_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_train[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_train[file[:-4]] = pd.get_dummies(dfs_train[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
            except KeyError:
                pass
lens_test = 0
for root, directory, files in os.walk("data/test_preprocessed", topdown=False):
    if files:
        for file in files:
            dfs_test[file[:-4]] = pd.read_csv(f"{root}/{file}")
            try:
                dfs_test[file[:-4]] = pd.get_dummies(dfs_test[file[:-4]], columns=["Avg_bookie_prediction"], prefix='Bookie_Prediction')
            except KeyError:
                pass
            
            lens_test += dfs_test[file[:-4]].shape[0]
print("---")
lens_orig = 0
df_test_y = pd.DataFrame()
for root, directory, files in os.walk("data/orig_data", topdown=False):
    if files:
        for file in files:
            # print(pd.read_csv(f"{root}/{file}").shape)
            tmp = pd.read_csv(f"{root}/{file}")
            lens_orig += tmp.shape[0]
            tmp["country"] = file[:-5]
            df_test_y = pd.concat([df_test_y, tmp], axis=0)

---


In [82]:
df_test_y["Date"] = pd.to_datetime(df_test_y["Date"], format = '%d/%m/%Y', dayfirst=True)

df_test_y.sort_values(by="Date", inplace=True, ascending=True)

In [83]:
df_test_y[df_test_y["country"]=="england"]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,country,Referee
0,E1,2022-07-29,20:00,Huddersfield,Burnley,0,1,A,0.0,1.0,...,2.09,1.81,2.10,1.82,2.14,1.83,2.09,1.78,england,J Linington
6,E3,2022-07-30,15:00,Rochdale,Crewe,1,2,A,0.0,2.0,...,1.70,2.10,1.78,2.12,1.85,2.17,1.75,2.07,england,A Kitchen
5,E3,2022-07-30,15:00,Northampton,Colchester,3,2,H,1.0,1.0,...,2.00,1.85,2.02,1.87,2.02,1.92,1.96,1.85,england,M Woods
4,E3,2022-07-30,15:00,Leyton Orient,Grimsby,2,0,H,0.0,0.0,...,1.80,2.05,1.79,2.10,1.86,2.10,1.80,2.02,england,C Pollard
11,E2,2022-07-30,15:00,Wycombe,Burton,3,0,H,3.0,0.0,...,2.00,1.85,2.00,1.88,2.03,1.92,1.96,1.85,england,G Ward
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,2023-05-28,16:30,Everton,Bournemouth,1,0,H,0.0,0.0,...,2.02,1.77,2.10,1.81,2.17,1.92,2.03,1.83,england,S Attwell
372,E0,2023-05-28,16:30,Brentford,Man City,1,0,H,0.0,0.0,...,1.93,1.97,2.05,1.86,2.28,1.97,2.01,1.85,england,J Brooks
377,E0,2023-05-28,16:30,Leicester,West Ham,2,1,H,1.0,0.0,...,1.75,2.05,1.85,2.06,1.90,2.16,1.82,2.04,england,S Hooper
378,E0,2023-05-28,16:30,Man United,Fulham,2,1,H,1.0,1.0,...,1.98,1.92,1.98,1.93,2.07,1.98,1.97,1.89,england,R Jones


In [84]:
for country in df_test_y["country"].unique():
    col1 = df_test_y[df_test_y["country"] == country]["FTHG"].reset_index()
    col2 = df_test_y[df_test_y["country"] == country]["FTAG"].reset_index()

    target_values = col1["FTHG"] + col2["FTAG"]

    dfs_test[f"df_{country}"]["Target_regr"] = target_values
    dfs_test[f"df_{country}"]["FTHG"] = col1["FTHG"]
    dfs_test[f"df_{country}"]["FTAG"] = col2["FTAG"]
    dfs_test[f"df_{country}"]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_test[f"df_{country}"]['FTAG'], dfs_test[f"df_{country}"]['FTHG'])]
    dfs_test[f"df_{country}"].drop(columns=["FTHG", "FTAG", "Unnamed: 0"], inplace=True)


In [73]:
dfs_train["df_belgium"]

Unnamed: 0.1,Unnamed: 0,FTHG,FTAG,season,Avg_away_odds,Avg_home_odds,Avg_draw_odds,Var_away_odds,Var_home_odds,Var_draw_odds,...,AwayLossRatio,AwayDrawRatio,HomeTeamAvgShotsOnTarget,AwayTeamAvgShotsOnTarget,HomeTeamScoredRatio,AwayTeamScoredRatio,Target_regr,Target_clas,Bookie_Prediction_A,Bookie_Prediction_H
3275,3575,2.0,0.0,13.0,8.76250,1.32500,4.92000,1.036964,0.000543,0.077629,...,0.438961,0.231169,5.476707,4.584037,0.657729,0.438647,2.0,1,False,True
3276,3576,1.0,0.0,13.0,3.90500,1.89000,3.44125,0.033914,0.002400,0.024184,...,0.438596,0.333333,4.576918,4.727606,0.483425,0.422222,1.0,1,False,True
3277,3577,3.0,0.0,13.0,6.72375,1.42125,4.47375,0.826227,0.001270,0.088855,...,0.575758,0.272727,5.218518,4.326821,0.580016,0.326087,3.0,1,False,True
3278,3578,1.0,1.0,13.0,3.81625,1.88375,3.50750,0.075541,0.001370,0.026736,...,0.445141,0.225705,4.648457,4.629278,0.438108,0.449309,2.0,-1,False,True
3279,3579,1.0,1.0,13.0,1.86875,3.93750,3.48500,0.001870,0.063393,0.017229,...,0.297821,0.242131,4.348466,4.981359,0.363636,0.559594,2.0,-1,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5487,5787,3.0,0.0,21.0,3.91400,1.87000,3.57400,0.010480,0.001150,0.013380,...,0.249637,0.258345,4.455075,5.083699,0.436352,0.600632,3.0,1,False,True
5488,5788,2.0,0.0,21.0,7.97000,1.33000,5.29000,0.092000,0.000350,0.045500,...,0.426195,0.237006,5.613602,4.621183,0.673558,0.445467,2.0,1,False,True
5489,5789,2.0,3.0,21.0,1.39200,7.32000,4.75200,0.001270,0.130750,0.026770,...,0.156342,0.227139,4.648310,5.431673,0.486462,0.683400,5.0,0,True,False
5490,5790,5.0,0.0,21.0,11.12800,1.22800,6.25200,2.046920,0.000370,0.062520,...,0.424581,0.312849,5.128811,4.660707,0.574391,0.441392,5.0,1,False,True


In [85]:
def impute_nan_values(dfs):
    for df in dfs.values():
        for col in df.columns:
            if df[col].dtype == np.float64 or df[col].dtype == np.int64:
                df[col] = df.groupby("season")[col].transform(lambda x: x.fillna(x.mean()))
        df.dropna(inplace=True)
impute_nan_values(dfs_train)
impute_nan_values(dfs_test)

In [86]:
# validation set
dfs_valid_reg_X = {}
dfs_valid_reg_y = {}
dfs_train_reg_X = {}
dfs_train_reg_y = {}
dfs_valid_clas_X = {}
dfs_valid_clas_y = {}
dfs_train_clas_X = {}
dfs_train_clas_y = {}
dfs_test_clas_X = {}
dfs_test_clas_y = {}
dfs_test_reg_X = {}
dfs_test_reg_y = {}

cols_to_drop = ['FTHG', 'FTAG', 'MatchTeams', 'SameHomeTeam', 'Target', 'Target_regr', 'Target_clas', "Unnamed: 0", "index"]

for country in dfs_train:
    dfs_train[country] =   dfs_train[country][  dfs_train[country]["season"] > 1]
    dfs_train[country]['Target_clas'] = [0 if a > h else 1 if h > a else -1 for a, h in zip(dfs_train[country]['FTAG'], dfs_train[country]['FTHG'])]

    dfs_valid_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_regr"]
    dfs_valid_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_reg_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_regr"]
    dfs_train_reg_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')
    
    dfs_train_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21]["Target_clas"]
    dfs_train_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

    dfs_valid_clas_y[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21]["Target_clas"]
    dfs_valid_clas_X[country[3:]] = dfs_train[country][dfs_train[country]["season"] == 21].drop(columns=cols_to_drop, errors='ignore')

for country in dfs_test:
    dfs_test_reg_y[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21]["Target_regr"]
    dfs_test_reg_X[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

    dfs_test_clas_y[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21]["Target_clas"]
    dfs_test_clas_X[country[3:]] = dfs_test[country][dfs_test[country]["season"] != 21].drop(columns=cols_to_drop, errors='ignore')

In [92]:
train_results = pd.DataFrame(columns=["country"])
test_results = pd.DataFrame(columns=["country"])

In [93]:
train_results["country"] = dfs_train_clas_X.keys()
test_results["country"] = dfs_train_clas_X.keys()


In [69]:
train_results

Unnamed: 0,country
0,belgium
1,england
2,france
3,germany
4,greece
5,italy
6,netherlands
7,portugal
8,scotland
9,spain


# Classification task
We decided to go with Voting Classifier consisting of 3 classification algorithms -  Gaussian Naive Bayes, RandomForestClassifier & Logistic Regression

### Logistic Regression

In [89]:
lr = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=10000) ### Cesta je snizit hloubku stromu

In [95]:
dfs_train_predict = {}
dfs_valid_predict = {}
dfs_test_predict = {}
for country in dfs_train_clas_X.keys():
    lr.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = lr.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = lr.predict(dfs_valid_clas_X[country])
    dfs_test_predict[country] = lr.predict(dfs_test_clas_X[country])
    train_results.loc[train_results["country"] == country, "lr"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')
    test_results.loc[test_results["country"] == country, "lr"] = f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')
    
    print(f"F1-score for baseline model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}")
    print(f"F1-score for baseline model on test data for {country}: {f1_score(dfs_test_clas_y[country], dfs_test_predict[country], average='macro')}\n --------")

F1-score for baseline model on train data for belgium: 0.43324213514260723
F1-score for baseline model on validation data for belgium: 0.40667163551580465
F1-score for baseline model on test data for belgium: 0.35256625206326925
 --------
F1-score for baseline model on train data for england: 0.3878445995512461
F1-score for baseline model on validation data for england: 0.40085214827836646
F1-score for baseline model on test data for england: 0.3040747837058228
 --------
F1-score for baseline model on train data for france: 0.3960376185319634
F1-score for baseline model on validation data for france: 0.3793925942828063
F1-score for baseline model on test data for france: 0.34674629852675815
 --------
F1-score for baseline model on train data for germany: 0.38280615248837274
F1-score for baseline model on validation data for germany: 0.381335522714833
F1-score for baseline model on test data for germany: 0.3464547514586684
 --------
F1-score for baseline model on train data for greece: 

In [97]:
display(train_results[["country", "lr"]])
display(test_results)

Unnamed: 0,country,lr
0,belgium,0.352566
1,england,0.304075
2,france,0.346746
3,germany,0.346455
4,greece,0.399244
5,italy,0.33841
6,netherlands,0.382817
7,portugal,0.419729
8,scotland,0.340007
9,spain,0.328382


Unnamed: 0,country
0,belgium
1,england
2,france
3,germany
4,greece
5,italy
6,netherlands
7,portugal
8,scotland
9,spain


### Random Forest Baseline Model

In [43]:
rfm = RandomForestClassifier(random_state=42, max_depth=12) ### Cesta je snizit hloubku stromu

In [44]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    rfm.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = rfm.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = rfm.predict(dfs_valid_clas_X[country])
    # Calculate the average depth of all decision trees
    average_depth = sum(tree.get_depth() for tree in rfm.estimators_) / len(rfm.estimators_)

    # print(f"Average Depth of Decision Trees for {country}: {average_depth}")
    train_results.loc[train_results["country"] == country, "rfm"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    # print(f"Accuracy for baseline model on validation data for {country}: {round(accuracy_score(dfs_valid_clas_y[country], dfs_train_predict[country])*100, ndigits=4)}%")
    print(f"F1-score for baseline Random Forest model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline Random model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline Random Forest model on train data for belgium: 0.9588011524407691
F1-score for baseline Random model on validation data for belgium: 0.39641823130666937
 --------
F1-score for baseline Random Forest model on train data for england: 0.3966153174713389
F1-score for baseline Random model on validation data for england: 0.37411725066326
 --------
F1-score for baseline Random Forest model on train data for france: 0.7360234892340599
F1-score for baseline Random model on validation data for france: 0.3957340717952769
 --------
F1-score for baseline Random Forest model on train data for germany: 0.7086838610863483
F1-score for baseline Random model on validation data for germany: 0.35646305565893766
 --------
F1-score for baseline Random Forest model on train data for greece: 0.9525638619682809
F1-score for baseline Random model on validation data for greece: 0.42765271615554656
 --------
F1-score for baseline Random Forest model on train data for italy: 0.72388087933908

### MultinomialNaive Bayes

In [45]:
nb = MultinomialNB()

In [46]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    nb.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = nb.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = nb.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "nb"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline Multinomial Naive Bayes model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline Multinomial Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline Multinomial Naive Bayes model on train data for belgium: 0.44324675869002944
F1-score for baseline Multinomial Naive Bayes model on validation data for belgium: 0.47122532022420777
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for england: 0.43686657947350005
F1-score for baseline Multinomial Naive Bayes model on validation data for england: 0.4453726950951576
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for france: 0.4657072628643342
F1-score for baseline Multinomial Naive Bayes model on validation data for france: 0.4265940415167743
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for germany: 0.45486702518673355
F1-score for baseline Multinomial Naive Bayes model on validation data for germany: 0.42009765221039846
 --------
F1-score for baseline Multinomial Naive Bayes model on train data for greece: 0.4968840620068689
F1-score for baseline Multinomial Naive Bayes model on v

### KNeighborsClassifier

In [47]:
kn = KNeighborsClassifier()

In [48]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    kn.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = kn.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = kn.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "kn"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline KNeighbors model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline KNeighbors Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline KNeighbors model on train data for belgium: 0.5827771341903464
F1-score for baseline KNeighbors Naive Bayes model on validation data for belgium: 0.39529002686897413
 --------
F1-score for baseline KNeighbors model on train data for england: 0.5880131199462958
F1-score for baseline KNeighbors Naive Bayes model on validation data for england: 0.37459397413994583
 --------
F1-score for baseline KNeighbors model on train data for france: 0.5991169941551583
F1-score for baseline KNeighbors Naive Bayes model on validation data for france: 0.42872444831200207
 --------
F1-score for baseline KNeighbors model on train data for germany: 0.5928079694344451
F1-score for baseline KNeighbors Naive Bayes model on validation data for germany: 0.4256512457558741
 --------
F1-score for baseline KNeighbors model on train data for greece: 0.6317739931833501
F1-score for baseline KNeighbors Naive Bayes model on validation data for greece: 0.4101284958427816
 --------
F1-score for bas

### Gradient Boosting Classifier

In [49]:
gbc = GradientBoostingClassifier()

In [50]:
dfs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    gbc.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    dfs_train_predict[country] = gbc.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = gbc.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "gbc"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for baseline GradientBoostingClassifier model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

F1-score for baseline GradientBoostingClassifier model on train data for belgium: 0.682812682426516
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for belgium: 0.3473273269899056
 --------
F1-score for baseline GradientBoostingClassifier model on train data for england: 0.43290020067473717
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for england: 0.40375494180210986
 --------
F1-score for baseline GradientBoostingClassifier model on train data for france: 0.5436839244825307
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for france: 0.41419574311996116
 --------
F1-score for baseline GradientBoostingClassifier model on train data for germany: 0.563480932282584
F1-score for baseline GradientBoostingClassifier Naive Bayes model on validation data for germany: 0.39950312603804194
 --------
F1-score for baseline GradientBoostingClassifier model on train data for greece: 0.7

## Random Search
Use random search to get best parameters for each country

### Random Forrest 

In [51]:
param_grid = {
    'max_depth': [i for i in range(1, 30)],
    'min_samples_split': [i for i in range(1, 300)],
    'min_samples_leaf': [i for i in range(1, 200)]
}
rfm = RandomForestClassifier(random_state=42)
dfs_train_predict_random = {}
dfs_valid_predict_random = {}
dfs_best_params_rfm = {}
random_search = RandomizedSearchCV(estimator=rfm, param_distributions=param_grid, n_iter=40,
                                   cv=5, random_state=42, n_jobs=-1)
dfs_train_predict = {}
for country in dfs_train_clas_X.keys():
    
    random_search.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])
    
    best_params = random_search.best_params_
    best_random_forest = random_search.best_estimator_
    
    dfs_best_params_rfm[country] = best_params
    dfs_train_predict_random[country] = best_random_forest.predict(dfs_train_clas_X[country])
    dfs_valid_predict_random[country] = best_random_forest.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "rfm_rs"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')
    train_results.loc[train_results["country"] == country, "best_params"] = best_params
    print(f"F1-score for best random search model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict_random[country], average='macro')}")
    print(f"F1-score for best random search model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict_random[country], average='macro')}\n --------")

F1-score for best random search model on train data for belgium: 0.4013348326540933
F1-score for best random search model on validation data for belgium: 0.4097130242825607
 --------
F1-score for best random search model on train data for england: 0.35561755228684744
F1-score for best random search model on validation data for england: 0.37083351270954307
 --------
F1-score for best random search model on train data for france: 0.3555419801702601
F1-score for best random search model on validation data for france: 0.3558850805059682
 --------
F1-score for best random search model on train data for germany: 0.371584221333324
F1-score for best random search model on validation data for germany: 0.35823950599676196
 --------
F1-score for best random search model on train data for greece: 0.43080812944451435
F1-score for best random search model on validation data for greece: 0.36575415995705846
 --------
F1-score for best random search model on train data for italy: 0.3697237669491424
F1-

### LogReg

In [None]:
param_grid = {
    'penalty': ['l1', 'l2'],                 # Regularization penalty ('l1' or 'l2')
    'C': [0.001, 0.01, 0.1, 1, 10, 100],     # Inverse of regularization strength
    'solver': ['liblinear', 'saga'],        # Algorithm to use in the optimization problem
    'max_iter': [8000]               # Maximum number of iterations for optimization
}

lr = LogisticRegression()
dfs_train_predict_random = {}
dfs_valid_predict_random = {}
dfs_best_params_lr = {}
random_search = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, n_iter=10,
                                   cv=5, random_state=42, n_jobs=-1)


fs_train_predict = {}
dfs_valid_predict = {}
for country in dfs_train_clas_X.keys():
    random_search.fit(dfs_train_clas_X[country], dfs_train_clas_y[country])

    best_params = random_search.best_params_


    dfs_best_params_lr[country] = best_params

    best_knn = random_search.best_estimator_
    dfs_train_predict[country] = best_knn.predict(dfs_train_clas_X[country])
    dfs_valid_predict[country] = best_knn.predict(dfs_valid_clas_X[country])
    train_results.loc[train_results["country"] == country, "lr_rc"] = f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')

    print(f"F1-score for best LogReg model on train data for {country}: {f1_score(dfs_train_clas_y[country], dfs_train_predict[country], average='macro')}")
    print(f"F1-score for best LogReg model on validation data for {country}: {f1_score(dfs_valid_clas_y[country], dfs_valid_predict[country], average='macro')}\n --------")

### Grid Search

In [65]:
train_results

Unnamed: 0,country,lr,rfm,nb,kn,gbc,rfm_rs,best_params,lr_rc
0,belgium,0.416161,0.396418,0.471225,0.39529,0.347327,0.347327,,0.409721
1,england,0.40666,0.374117,0.445373,0.374594,0.403755,0.403755,,0.372328
2,france,0.39484,0.395734,0.426594,0.428724,0.414196,0.414196,,0.355784
3,germany,0.411588,0.356463,0.420098,0.425651,0.399503,0.399503,,0.360561
4,greece,0.435277,0.427653,0.41087,0.410128,0.399721,0.399721,,0.470566
5,italy,0.410251,0.409695,0.464104,0.442976,0.42555,0.42555,,0.384639
6,netherlands,0.410742,0.430036,0.475385,0.431415,0.484872,0.484872,,0.394598
7,portugal,0.439967,0.44663,0.549858,0.516181,0.454269,0.454269,,0.398209
8,scotland,0.390106,0.374633,0.449061,0.427188,0.438818,0.438818,,0.36209
9,spain,0.411941,0.399577,0.401609,0.42329,0.423832,0.423832,,0.378776


### Implementing Voting Classifier

In [61]:
best_models = []
for i, row in train_results.iterrows():
    # display(row[["lr", "rfm", "nb", "kn", "gbc", "rfm_rs"]])
    best_models.append(pd.to_numeric(row[["lr", "rfm", "nb", "kn", "gbc", "rfm_rs", "lr_rc"]]).nlargest(3).index.tolist())

In [62]:
models = {
    "lr": lr,
    "rfm": rfm,
    "nb": nb,
    "kn": kn,
    "gbc": gbc,
    "rfm_rs": RandomForestClassifier(random_state=42)
}

def voting_classifier(best_models_country, X_train, y_train, X_val, y_val, country: str):
    """"""
    models = {
        "lr": lr,
        "rfm": rfm,
        "nb": nb,
        "kn": kn,
        "gbc": gbc,
        "rfm_rs": RandomForestClassifier(random_state=42),
        "lr_rc": LogisticRegression(random_state=42)
        
    }
    print(best_models_country)
    clf1 = models[best_models_country[0]] if best_models_country[0] != "rfm_rs" else models[best_models_country[0]].set_params(**dfs_best_params_rfm[country]) if best_models_country[0] != "lr_rc" else models[best_models_country[0]].set_params(**dfs_best_params_lr[country])
    clf2 = models[best_models_country[1]] if best_models_country[1] != "rfm_rs" else models[best_models_country[1]].set_params(**dfs_best_params_rfm[country]) if best_models_country[0] != "lr_rc" else models[best_models_country[0]].set_params(**dfs_best_params_lr[country])
    clf3 = models[best_models_country[2]] if best_models_country[2] != "rfm_rs" else models[best_models_country[2]].set_params(**dfs_best_params_rfm[country]) if best_models_country[0] != "lr_rc" else models[best_models_country[0]].set_params(**dfs_best_params_lr[country])
    
    eclf = VotingClassifier(
        estimators=[(best_models_country[0], clf1 ), (best_models_country[1], clf2 ),(best_models_country[2], clf3 )],
        voting='hard'
    ) 
    eclf.fit(X_train,y_train)
    
    y_train_predict = eclf.predict(X_train)
    y_val_predict = eclf.predict(X_val)
    
    
    print(f"F1-score for best random search model on train data for {country}: {f1_score(y_train, y_train_predict, average='macro')}")
    print(f"F1-score for best random search model on validation data for {country}: {f1_score(y_val, y_val_predict, average='macro')}\n --------")
    

In [63]:
for i, country in enumerate(dfs_train_clas_X.keys()):
    voting_classifier(best_models[i], dfs_train_clas_X[country], dfs_train_clas_y[country], dfs_valid_clas_X[country], dfs_valid_clas_y[country],country)

['nb', 'lr', 'lr_rc']
F1-score for best random search model on train data for belgium: 0.4373195170022232
F1-score for best random search model on validation data for belgium: 0.4479299321271508
 --------
['nb', 'lr', 'gbc']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt