In [1]:
import pandas as pd
import numpy as np
import os as os
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error as MSE

### Load data

In [2]:
# Containers for the per-country DataFrames, keyed by CSV file name without extension.
dfs_train, dfs_test = {}, {}

In [3]:
def load_preprocessed_csvs(directory, drop_columns=()):
    """Read every CSV file below ``directory`` into a dict of DataFrames.

    Parameters
    ----------
    directory : str
        Root folder; it is walked recursively with ``os.walk``.
    drop_columns : sequence of str, optional
        Columns removed from every loaded frame. A column that is absent
        raises ``KeyError`` (the default behaviour of ``DataFrame.drop``).

    Returns
    -------
    dict
        Loaded frames keyed by file name without the ``.csv`` extension.
    """
    frames = {}
    for root, _dirs, files in os.walk(directory, topdown=False):
        # Sorted so the dict order does not depend on the file system.
        for file in sorted(files):
            stem, ext = os.path.splitext(file)
            # Only CSV files are data; skip anything else found in the folder
            # instead of slicing a fixed four characters off its name.
            if ext.lower() != ".csv":
                continue
            frame = pd.read_csv(os.path.join(root, file))
            if drop_columns:
                frame = frame.drop(columns=list(drop_columns))
            frames[stem] = frame
    return frames


dfs_train = load_preprocessed_csvs("data/train_preprocessed")
# The target columns are removed from the test frames right after loading.
dfs_test = load_preprocessed_csvs(
    "data/test_preprocessed",
    drop_columns=["Target", "Target_clas", "Target_regr"],
)

In [4]:
def impute_nan_values(dfs):
    """Fill missing numeric values with the per-season mean, in place.

    For every DataFrame in ``dfs``, each numeric column that contains missing
    values is filled with the mean of that column within the same ``season``.
    Rows that still hold a missing value afterwards (gaps in non-numeric
    columns, a missing ``season``, or a season whose values are all missing)
    are dropped.

    Parameters
    ----------
    dfs : dict
        Mapping whose values are DataFrames with a ``season`` column. The
        frames are modified in place; nothing is returned.

    Raises
    ------
    KeyError
        If a frame with missing numeric values has no ``season`` column.
    """
    for df in dfs.values():
        # Accept every numeric dtype (float32, int32, ...), not only
        # float64/int64. "season" is the grouping key and is not imputed.
        numeric_cols = df.select_dtypes(include="number").columns
        cols_with_gaps = [
            col for col in numeric_cols
            if col != "season" and df[col].isna().any()
        ]
        if cols_with_gaps:
            # One vectorised group mean per frame instead of a Python lambda
            # evaluated per column and per group.
            season_means = df.groupby("season")[cols_with_gaps].transform("mean")
            df[cols_with_gaps] = df[cols_with_gaps].fillna(season_means)
        df.dropna(inplace=True)
# Apply the same in-place imputation to the training and the test frames.
for frames_by_country in (dfs_train, dfs_test):
    impute_nan_values(frames_by_country)

In [5]:
# Validation set: one season per country is held out, all other seasons are
# used for training.

VALID_SEASON = 21  # value of the "season" column that forms the validation split

dfs_valid_reg_X = {}
dfs_valid_reg_y = {}
dfs_train_reg_X = {}
dfs_train_reg_y = {}
dfs_valid_clas_X = {}
dfs_valid_clas_y = {}
dfs_train_clas_X = {}
dfs_train_clas_y = {}

# England is removed from the training dict. Passing a default keeps the cell
# re-runnable: a second execution no longer raises KeyError.
dfs_train.pop('df_england', None)

# Targets and match-result columns that must not be used as features.
cols_to_drop = ['FTHG', 'FTAG', 'MatchTeams', 'SameHomeTeam', 'Target', 'Target_regr', 'Target_clas']

for country, frame in dfs_train.items():
    name = country[3:]  # strip the "df_" prefix of the dict key
    is_valid = frame["season"] == VALID_SEASON

    dfs_valid_reg_y[name] = frame.loc[is_valid, "Target_regr"]
    dfs_valid_reg_X[name] = frame.loc[is_valid].drop(columns=cols_to_drop)

    # NOTE: an earlier `season > 5` training filter was overwritten right away
    # by this split and never had any effect, so it has been removed.
    dfs_train_reg_y[name] = frame.loc[~is_valid, "Target_regr"]
    dfs_train_reg_X[name] = frame.loc[~is_valid].drop(columns=cols_to_drop)

# TODO: the dfs_*_clas_* dicts are not populated yet. The classification split
# would mirror the regression one with "Target_clas" as the target column.

### Regression task


baseline model


In [6]:
# Baseline random-forest regressors, one entry per country.
bl_forests = dict()

In [7]:
# One baseline forest per country. The fixed seed makes the baseline MSEs
# reproducible across kernel restarts, like the seeded random-search estimators.
for country in dfs_train_reg_X:
    bl_forests[country] = RandomForestRegressor(
        n_estimators=100,
        criterion="squared_error",
        max_depth=10,
        random_state=42,
    )

In [8]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [9]:
# Remove the averaged bookmaker prediction from the feature matrices.
# errors="ignore" covers frames that do not have the column, and each split is
# handled on its own, so a missing column in one split can no longer prevent
# the drop in the other (the previous bare `except` hid exactly that case).
bookie_col = 'Avg_bookie_prediction'
for country in bl_forests:
    for features in (dfs_valid_reg_X, dfs_train_reg_X):
        features[country] = features[country].drop(columns=[bookie_col], errors="ignore")

hmm


In [10]:
# dfs_train_reg_X['england'].head()

KeyError: 'england'

In [11]:
# Train every baseline forest on the training split of its own country.
for country, forest in bl_forests.items():
    print("fitting ", country)
    forest.fit(
        X=dfs_train_reg_X[country],
        y=dfs_train_reg_y[country],
    )

fitting  belgium
fitting  france
fitting  germany
fitting  greece
fitting  italy
fitting  netherlands
fitting  portugal
fitting  scotland
fitting  spain
fitting  turkey


In [12]:
# Validation MSE of each baseline forest.
for country, forest in bl_forests.items():
    prediction = forest.predict(dfs_valid_reg_X[country])
    # sklearn metrics expect (y_true, y_pred); same order as the random-search evaluation.
    print("baseline forest mse of ", country, ": ", MSE(dfs_valid_reg_y[country], prediction))
    

baseline forest mse of  belgium :  3.542512144980251
baseline forest mse of  france :  2.681936417224585
baseline forest mse of  germany :  2.8143159878508173
baseline forest mse of  greece :  2.675688838868358
baseline forest mse of  italy :  2.5602504240410373
baseline forest mse of  netherlands :  2.691732597484649
baseline forest mse of  portugal :  2.212194706192593
baseline forest mse of  scotland :  2.4932141477037533
baseline forest mse of  spain :  2.552807045578344
baseline forest mse of  turkey :  2.8315313408901357


random search


In [13]:
# Candidate values sampled by the randomized hyper-parameter search.
param_grid = dict(
    n_estimators=list(range(50, 200)),
    min_samples_split=list(range(2, 50)),
    max_depth=list(range(3, 100)),
)

# Per-country containers: unfitted estimators, search objects,
# best parameters and best fitted estimators.
rs_estimators, rs_forests, rs_best, rs_best_estims = {}, {}, {}, {}

In [14]:
# One seeded, untuned forest per country to serve as the search's base estimator.
rs_estimators.update(
    (country, RandomForestRegressor(random_state=42)) for country in bl_forests
)

In [15]:
# Build (but do not fit) one randomized search per country:
# 3 sampled parameter sets, each scored with 3-fold cross-validation.
for country in dfs_train_reg_X:
    print("random searching", country)
    base_estimator = rs_estimators[country]
    rs_forests[country] = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=param_grid,
        n_iter=3,
        cv=3,
        random_state=42,
        error_score='raise',
    )

random searching belgium
random searching france
random searching germany
random searching greece
random searching italy
random searching netherlands
random searching portugal
random searching scotland
random searching spain
random searching turkey


In [None]:
# Run each country's search, then record its best parameters and best estimator.
for country, search in rs_forests.items():
    search.fit(X=dfs_train_reg_X[country], y=dfs_train_reg_y[country])
    best_params = search.best_params_
    rs_best[country] = best_params
    print("best params for country", country, ": ", best_params)
    rs_best_estims[country] = search.best_estimator_

best params for country belgium :  {'n_estimators': 132, 'min_samples_split': 17, 'max_depth': 21}
best params for country france :  {'n_estimators': 58, 'min_samples_split': 47, 'max_depth': 19}
best params for country germany :  {'n_estimators': 58, 'min_samples_split': 47, 'max_depth': 19}
best params for country greece :  {'n_estimators': 58, 'min_samples_split': 47, 'max_depth': 19}
best params for country italy :  {'n_estimators': 58, 'min_samples_split': 47, 'max_depth': 19}
best params for country netherlands :  {'n_estimators': 58, 'min_samples_split': 47, 'max_depth': 19}
best params for country portugal :  {'n_estimators': 58, 'min_samples_split': 47, 'max_depth': 19}


In [None]:
# Validation MSE of the tuned forests. Iterating over the stored best
# estimators (rather than over every search object) means a search that was
# never fitted is skipped instead of raising KeyError.
for country, best_forest in rs_best_estims.items():
    preds = best_forest.predict(dfs_valid_reg_X[country])
    print("Validation MSE of country ", country, " with random search params: ", MSE(dfs_valid_reg_y[country], preds))

grid search


train on best params


*repeat with boosting?*

In [None]:
# Gradient-boosting regressors, one entry per country.
gb_forests = dict()

In [None]:
# One histogram-based gradient-boosting model per country. The seed is fixed
# for reproducibility, consistent with the seeded random-search estimators.
for country in dfs_train_reg_X:
    gb_forests[country] = HistGradientBoostingRegressor(max_depth=10, random_state=42)

In [None]:
# Train every boosting model on the training split of its own country.
for country, booster in gb_forests.items():
    print("fitting ", country)
    booster.fit(
        X=dfs_train_reg_X[country],
        y=dfs_train_reg_y[country],
    )

In [None]:
# Validation MSE of each boosting model. Iterate over the boosting models
# themselves rather than the baseline forests, whose keys were only
# coincidentally the same.
for country, booster in gb_forests.items():
    prediction = booster.predict(dfs_valid_reg_X[country])
    # sklearn metrics expect (y_true, y_pred).
    print("gradient boost forest mse of ", country, ": ", MSE(dfs_valid_reg_y[country], prediction))