In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.base import clone
from pickle import dump
import xgboost as xgb


In [None]:
## Load the dataset from CSV and preview its first rows
dataset_path = '../data/dataset/dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Preview the last rows of the dataset (5 is the pandas default).
df.tail(n=5)

In [None]:
## Drop the columns that are not needed.
## Reassigning (instead of `inplace=True`) together with `errors="ignore"` keeps
## this cell safe to re-run: a second execution no longer raises KeyError
## because "year" has already been removed.
df = df.drop(columns="year", errors="ignore")

## Drop rows whose net score is 0; only non-zero scores map to a class label.
df = df[df['net_score'] != 0]

In [None]:
def class_maper(score):
    """Map a net score to a binary class label.

    Returns 0 (home win) for a positive score, 1 (away win) for a negative
    score, and None when the score is neither positive nor negative
    (for example exactly 0).
    """
    if score > 0:
        return 0  # Home win
    # Away win when negative; anything else falls through to None.
    return 1 if score < 0 else None
    

In [None]:
## Split the dataset into train and test
from sklearn.model_selection import train_test_split

# The target is the class label derived from net_score by class_maper;
# the features are every remaining column.
y = df['net_score'].apply(class_maper)
X = df.drop(columns='net_score')

# Hold out 15% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.15,
)

In [None]:
# Fit a single label encoder over every country name that appears in either
# team column of X or in the "country" column of groups_of_2022.csv, so the
# same encoder can transform all of them.
le = preprocessing.LabelEncoder()

fifa2022_countries = pd.read_csv("../data/world_cup_2022_data/groups_of_2022.csv")["country"].values.tolist()
countries_set = set(list(X["2nd_team"]) + list(X["1st_team"]) + fifa2022_countries)

countries_encoder = le.fit(list(countries_set))

# Persist the fitted encoder. The context manager guarantees the file handle
# is flushed and closed; a bare open() inside dump() leaves it dangling.
with open('../models/encoders/countries_encoder.pkl', 'wb') as encoder_file:
    dump(countries_encoder, encoder_file)

# Replace team names with their integer codes in the training split.
X_train["2nd_team"] = countries_encoder.transform(X_train["2nd_team"])
X_train["1st_team"] = countries_encoder.transform(X_train["1st_team"])

# Same encoding for the test split.
X_test["2nd_team"] = countries_encoder.transform(X_test["2nd_team"])
X_test["1st_team"] = countries_encoder.transform(X_test["1st_team"])

In [None]:
## Build the scaling and PCA steps (they are only constructed here, not fitted)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise features before the projection.
scaler = StandardScaler()
# A fractional n_components asks PCA to keep enough components to explain
# that share (90%) of the variance.
pca = PCA(n_components=0.9)



In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


# Values shared by both LogisticRegression sub-grids below.
logreg_c_values = [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]
logreg_max_iters = [100, 200, 500, 1000, 2000, 5000, 10000]

# Maps each candidate estimator to the hyper-parameter grid searched for it.
# Keys are prefixed with "est__" because the estimator is the pipeline step
# named "est". A grid may be a dict or a list of dicts (GridSearchCV accepts both).
estimators_dict = {
# xgb.XGBClassifier():
# {
#     "est__n_estimators": [ 200, 500, 1500],
#     "est__booster": ["gbtree", "gblinear"],
#     "est__eta": [0.01, 0.1, 1.0],
#     "est__alpha": [0, 0.5, 1],
#     "est__lambda": [0, 1 , 2]

# },
# RandomForestClassifier():  
# {
#     "est__n_estimators": [10, 50, 100, 200, 500],
#     "est__max_depth": [3, 5, 10, 20, 55],
#     "est__min_samples_split": [2, 5, 10],
#     "est__min_samples_leaf": [1, 2, 4],
#     "est__bootstrap": [True, False],
#     "est__criterion": ["gini", "entropy"]
# },
# AdaBoostClassifier():
# {
#     "est__n_estimators": [10, 50, 100, 200, 500],
#     "est__learning_rate": [0.01, 0.05, 0.1, 0.5, 1.0],
#     "est__algorithm": ["SAMME", "SAMME.R"]
# },

# KNeighborsClassifier():
# {
#     "est__n_neighbors": [3, 5, 10, 20, 50, 100],
#     "est__weights": ["uniform", "distance"],
#     "est__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
#     "est__leaf_size": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# },
# The LogisticRegression grid is split by penalty: "newton-cg" and "sag" do
# not support the "l1" penalty, so crossing every penalty with every solver
# produced fits that always fail (scored NaN) and wasted search time.
LogisticRegression():
[
    {
        "est__penalty": ["l1"],
        "est__C": logreg_c_values,
        "est__solver": ["liblinear", "saga"],
        "est__max_iter": logreg_max_iters
    },
    {
        "est__penalty": ["l2"],
        "est__C": logreg_c_values,
        "est__solver": ["newton-cg", "liblinear", "sag", "saga"],
        "est__max_iter": logreg_max_iters
    }
]


}


In [None]:
from sklearn.model_selection import GridSearchCV

def get_best_estimator(estimator, params, cv):
    """Grid-search `params` for `estimator` and return the best estimator found.

    Args:
        estimator: The estimator (here, a pipeline) to tune.
        params: Parameter grid handed to GridSearchCV.
        cv: Cross-validation splitter or strategy used by the search.

    Returns:
        The `best_estimator_` of the fitted search.

    The search is scored with "f1", runs with n_jobs=-1, and is fitted on the
    notebook-level `X_train` / `y_train`.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring="f1",
        cv=cv,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so the lookup can be chained.
    return search.fit(X_train, y_train).best_estimator_


In [None]:
### Get tuned pipeline
from sklearn.model_selection import cross_validate, KFold

scorers = ["f1"]
train_f1 = "train_f1"
test_f1 = "test_f1"

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Collect one record per estimator and build the frame once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
validation_records = []

for est, params in estimators_dict.items():
    print("Getting best estimator for {}".format(est))
    # Scale, project with PCA, then fit the candidate estimator as step "est".
    pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('est', est)])
    tunned_estimator = get_best_estimator(pipeline, params, cv)

    # Re-score the tuned pipeline with the same folds, keeping train scores
    # so train and validation F1 can be compared.
    scores = cross_validate(tunned_estimator, X_train, y_train, cv=cv, return_train_score=True, scoring=scorers, n_jobs=8)
    validation_records.append({
        "estimator": tunned_estimator,
        "params": tunned_estimator.get_params(),
        train_f1: scores["train_f1"].mean(),
        test_f1: scores["test_f1"].mean(),
    })

# Passing the columns explicitly keeps the expected layout even when no
# estimator was evaluated.
valid_results = pd.DataFrame(validation_records, columns=["estimator", "params", "train_f1", "test_f1"])

valid_results

In [None]:
## Get the estimator with the highest cross-validated test score.
## The scores are cast to float first: a score column built by row-wise appends
## to an empty frame can end up with object dtype, on which idxmax may fail.
best_row_label = valid_results[test_f1].astype(float).idxmax()
## Single .loc lookup (row label, column) instead of chained indexing.
best_estimator = valid_results.loc[best_row_label, "estimator"]
best_estimator

In [None]:
# Encode both team columns of the full feature matrix with the fitted country encoder.
for team_column in ("1st_team", "2nd_team"):
    X[team_column] = countries_encoder.transform(X[team_column])


In [None]:
# Refit a fresh clone of every tuned pipeline on the full X / y and pickle it.
for model in valid_results["estimator"].values:
    model = clone(model)
    print(model)
    model.fit(X, y)
    # The file is named after the class of the pipeline's final "est" step.
    model_path = "../models/classification/{}_binary.pkl".format(model["est"].__class__.__name__)
    # The context manager closes the file handle once the pickle is written;
    # a bare open() inside dump() never closes it explicitly.
    with open(model_path, "wb") as model_file:
        dump(model, model_file)
    


In [None]:
# Test classifiers on test data

from sklearn.metrics import f1_score

# Collect one record per model and build the frame once after the loop:
# DataFrame.append was removed in pandas 2.0.
test_records = []

for model in valid_results["estimator"].values:
    # Train a fresh clone on the training split only, then score the held-out split.
    model = clone(model)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_records.append({
        "estimator": model,
        "params": model.get_params(),
        # NOTE(review): average="micro" equals accuracy for single-label
        # predictions, so this value is not directly comparable with the
        # "f1" scoring used during cross-validation - confirm which is intended.
        "test_f1": f1_score(y_test, y_pred, average="micro")
    })

test_results = pd.DataFrame(test_records, columns=["estimator", "params", "test_f1"])

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Draw one confusion matrix per tuned pipeline, evaluated on the test split.
for model in valid_results["estimator"].values:
    # Label the output with the class name of the pipeline's final "est" step.
    estimator_name = model["est"].__class__.__name__
    print(estimator_name)

    disp = ConfusionMatrixDisplay.from_estimator(
        estimator=model,
        X=X_test,
        y=y_test,
        # display_labels=class_names,
        cmap=plt.cm.Blues,
    )
    confusion_axes = disp.ax_
    confusion_axes.set_title("Confusion matrix")
    plt.show()
    

    # print(disp.confusion_matrix)

