In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.base import clone
from pickle import dump

In [None]:
# Read the raw dataset from disk and preview its first rows.
dataset_csv = '../data/dataset/dataset.csv'
df = pd.read_csv(dataset_csv)
df.head()

In [None]:
# Preview the last rows of the loaded dataset.
df.tail()

In [None]:
## Drop the columns that are not needed
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so re-running this cell does not raise KeyError on the second execution.
df = df.drop(columns=["year"], errors="ignore")

In [None]:
## Split the dataset into train and test
from sklearn.model_selection import train_test_split

# Separate the regression target from the feature columns.
y = df['net_score']
X = df.drop(columns=['net_score'])

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Fit a single label encoder over every team label that can appear: both team
# columns of the dataset plus the 2022 World Cup group list.
le = preprocessing.LabelEncoder()

fifa2022_countries = pd.read_csv("../data/world_cup_2022_data/groups_of_2022.csv")["country"].values.tolist()
countries_set = set(list(X["2nd_team"]) + list(X["1st_team"]) + fifa2022_countries)

countries_encoder = le.fit(list(countries_set))

# Persist the fitted encoder. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('../models/encoders/countries_encoder.pkl', 'wb') as encoder_file:
    dump(countries_encoder, encoder_file)

# train_test_split hands back row selections of X; take explicit copies so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace the team labels with their integer codes in both splits.
for team_col in ("2nd_team", "1st_team"):
    X_train[team_col] = countries_encoder.transform(X_train[team_col])
    X_test[team_col] = countries_encoder.transform(X_test[team_col])

In [None]:
## Set up the scaling and PCA steps (they are only instantiated here, not fitted)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardisation step, used as the 'scaler' stage of each pipeline.
scaler = StandardScaler()
# A fractional n_components asks PCA to keep as many components as needed to
# reach that share of explained variance (0.9 here).
pca = PCA(n_components=0.9)



In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Candidate regressors mapped to the hyper-parameter grid searched for each.
# Grid keys carry the "est__" prefix because every regressor is tuned as the
# 'est' step of a Pipeline.
# Estimators that draw random numbers are seeded (same seed as the train/test
# split and the KFold) so tuning and cross-validation results are reproducible.
estimators_dict = {
    RandomForestRegressor(random_state=42): {
        "est__n_estimators": [100, 300, 500, 1000],
        "est__max_depth": [None, 3, 5, 7],
        "est__oob_score": [True],
        "est__min_samples_split": [0.05, 0.15, 0.3, 0.5],
    },
    AdaBoostRegressor(random_state=42): {
        "est__n_estimators": [100, 300, 500, 1000],
        "est__learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
        "est__loss": ["linear", "square", "exponential"],
    },
    GradientBoostingRegressor(random_state=42): {
        "est__n_estimators": [100, 300, 500, 1000],
        "est__learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
        "est__loss": ["squared_error", "absolute_error", "huber", "quantile"],
        "est__max_depth": [None, 3, 5, 7],
    },
    # random_state only matters for the stochastic 'sag' / 'saga' solvers.
    Ridge(random_state=42): {
        "est__alpha": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
        "est__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
    },
    Lasso(): {
        "est__alpha": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
        "est__max_iter": [1000, 2000, 5000, 10000],
    },
    SVR(): {
        "est__kernel": ["linear", "poly", "rbf", "sigmoid"],
        "est__C": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
        "est__gamma": ["scale", "auto"],
    },
    KNeighborsRegressor(): {
        "est__n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
        "est__weights": ["uniform", "distance"],
        "est__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        "est__leaf_size": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    },
}

In [None]:
from sklearn.model_selection import GridSearchCV

def get_best_estimator(estimator, params, cv, X=None, y=None):
    """Grid-search ``params`` for ``estimator`` and return the best refit model.

    Parameters
    ----------
    estimator : estimator or pipeline handed to ``GridSearchCV``.
    params : dict
        Hyper-parameter grid to search.
    cv : cross-validation splitter (or fold count) forwarded to ``GridSearchCV``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which keeps existing three-argument
        calls working unchanged.

    Returns
    -------
    The ``best_estimator_`` of the fitted search, selected by
    negative mean squared error.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid = GridSearchCV(estimator, params, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1)
    grid.fit(X, y)
    return grid.best_estimator_


In [None]:
# Display the training features to check the team columns after encoding.
X_train

In [None]:
### Get tuned pipeline
from sklearn.model_selection import cross_validate, KFold

# Metrics reported for every tuned pipeline, and the keys cross_validate
# uses for them in its result dict.
scorers = ["neg_mean_absolute_error", "r2", "neg_mean_absolute_percentage_error"]
train_mae = "train_neg_mean_absolute_error"
test_mae = "test_neg_mean_absolute_error"

train_r2 = "train_r2"
test_r2 = "test_r2"

train_mape = "train_neg_mean_absolute_percentage_error"
test_mape = "test_neg_mean_absolute_percentage_error"
cv = KFold(n_splits=5, shuffle=True, random_state=42)

result_columns = ["estimator", "params",
                  "train_MAE", "test_MAE",
                  "train_r2", "test_r2",
                  "train_MAPE", "test_MAPE"]

# One record per estimator; the DataFrame is built once after the loop.
# Previously the loop appended to `valid_results`, which was never initialised
# (NameError on a fresh kernel), via DataFrame.append, which pandas 2.0 removed.
result_rows = []

for est, params in estimators_dict.items():
    print("Getting best estimator for {}".format(est))
    pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('est', est)])
    tunned_estimator = get_best_estimator(pipeline, params, cv)

    scores = cross_validate(tunned_estimator, X_train, y_train, cv=cv, return_train_score=True, scoring=scorers, n_jobs=-1)

    # The "neg_*" scorers return negated errors; flip the sign so the table
    # shows plain (positive) error values averaged over the folds.
    result_rows.append({
        "estimator": tunned_estimator,
        # The tuned final pipeline step, i.e. the regressor with its chosen settings.
        "params": tunned_estimator.get_params()["est"],
        "train_MAE": np.mean(scores[train_mae] * -1),
        "test_MAE": np.mean(scores[test_mae] * -1),
        "train_r2": np.mean(scores[train_r2]),
        "test_r2": np.mean(scores[test_r2]),
        "train_MAPE": np.mean(scores[train_mape] * -1),
        "test_MAPE": np.mean(scores[test_mape] * -1),
    })

valid_results = pd.DataFrame(result_rows, columns=result_columns)
valid_results

In [None]:
# Refit every tuned pipeline on the full dataset and persist it to disk.
# X still holds the raw team labels (only X_train / X_test were encoded), so
# encode a copy with the same fitted encoder before fitting; X itself is left untouched.
X_full = X.copy()
for team_col in ("1st_team", "2nd_team"):
    X_full[team_col] = countries_encoder.transform(X_full[team_col])

for tuned_model in valid_results["estimator"].values:
    # clone() yields an unfitted copy with the same hyper-parameters.
    model = clone(tuned_model)
    print(model)
    model.fit(X_full, y)

    # One pickle per regressor, named after the class of the final pipeline step.
    model_path = "../models/regression/{}.pkl".format(model["est"].__class__.__name__)
    with open(model_path, "wb") as model_file:
        dump(model, model_file)
    
