In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, classification_report


In [2]:
X_train = pd.read_csv("../data/X_train.csv", index_col=0).drop(columns=["Div", "Date", "HomeTeam", "AwayTeam", "HomeTeam_ShotOnTarget", "AwayTeam_ShotOnTarget"])
X_test = pd.read_csv("../data/X_test.csv", index_col=0).drop(columns=["Div", "Date", "HomeTeam", "AwayTeam", "HomeTeam_ShotOnTarget", "AwayTeam_ShotOnTarget"])
y_train = pd.read_csv("../data/y_train.csv", index_col=0)
y_test = pd.read_csv("../data/y_test.csv", index_col=0)

y_train = y_train.squeeze()
y_test = y_test.squeeze()

# <h1 style='font-size:30px;'>Model</h1>

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score

def run_model(classifier, param_grid, X_train, y_train, X_test, y_test):

    grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, scoring='balanced_accuracy',
                               cv=5, verbose=1)
    grid_search.fit(X_train, y_train)
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)
    print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    return grid_search.best_params_

# <h1 style='font-size:30px;'>Random Forest Classifier</h1>

In [30]:
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_estimators': [100, 500, 1000],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    }
classifier = RandomForestClassifier(random_state=42)
best_params_rf = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)
best_params_rf

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Test Set Accuracy: 0.52
              precision    recall  f1-score   support

           0       0.42      0.36      0.39        47
           1       0.48      0.24      0.32        50
           2       0.57      0.78      0.66        83

    accuracy                           0.52       180
   macro avg       0.49      0.46      0.46       180
weighted avg       0.50      0.52      0.49       180



{'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}

# <h1 style='font-size:30px;'>Gradient Boosting Classifier</h1>

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=1000, max_depth=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))

Test Set Accuracy: 0.50
              precision    recall  f1-score   support

           0       0.42      0.38      0.40        47
           1       0.38      0.20      0.26        50
           2       0.56      0.75      0.64        83

    accuracy                           0.50       180
   macro avg       0.45      0.44      0.43       180
weighted avg       0.47      0.50      0.47       180



# <h1 style='font-size:30px;'>Naive Bayes</h1>

In [34]:
from sklearn.naive_bayes import GaussianNB

param_grid = {
    'var_smoothing': np.logspace(0, -12, num=10),
    'priors': [[0.3, 0.4, 0.3]]
}
classifier = GaussianNB()

best_params_nb = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)
best_params_nb

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Test Set Accuracy: 0.49
              precision    recall  f1-score   support

           0       0.46      0.55      0.50        47
           1       0.34      0.46      0.39        50
           2       0.70      0.47      0.56        83

    accuracy                           0.49       180
   macro avg       0.50      0.49      0.48       180
weighted avg       0.54      0.49      0.50       180



{'priors': [0.3, 0.4, 0.3], 'var_smoothing': 2.1544346900318868e-11}

# <h1 style='font-size:30px;'>Stacking Classifier</h1>

# <h1 style='font-size:15px;'>By Using Stacking Classifier, we can have a more balanced result which have a better performance in predicting results for Draw</h1>

In [35]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

classifier_1 = GradientBoostingClassifier(n_estimators=1000, criterion='friedman_mse', learning_rate=0.1, subsample=0.5)
classifier_2 = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_leaf_nodes=5)
classifier_3 = GaussianNB(var_smoothing=1e-09)
sclf = StackingClassifier(estimators = [('rf', classifier_2), ('gb', classifier_1), ('gnb', classifier_3)],
                          final_estimator = classifier_3
                          )

model = sclf.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))

Test Set Accuracy: 0.53
              precision    recall  f1-score   support

           0       0.46      0.62      0.53        47
           1       0.42      0.30      0.35        50
           2       0.64      0.63      0.63        83

    accuracy                           0.53       180
   macro avg       0.51      0.51      0.50       180
weighted avg       0.53      0.53      0.53       180



# <h1 style='font-size:30px;'>CatBoost</h1>

In [3]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test, y_test)

In [10]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=1000, loss_function="MultiClass", 
                                eval_metric="Accuracy", learning_rate=0.3, l2_leaf_reg=9, class_weights=[1, 1.5, 1])

model.fit(X_train,
          y_train,
          eval_set=(X_test, y_test),
          verbose=False)

y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))
print(model.get_best_score())

Test Set Accuracy: 0.58
              precision    recall  f1-score   support

           0       0.60      0.45      0.51        47
           1       0.50      0.46      0.48        50
           2       0.61      0.72      0.66        83

    accuracy                           0.58       180
   macro avg       0.57      0.54      0.55       180
weighted avg       0.58      0.58      0.57       180

{'learn': {'Accuracy': 0.9829928334581238, 'MultiClass': 0.2874334984834144}, 'validation': {'Accuracy': 0.5634146341463414, 'MultiClass': 1.0087578860801056}}


# <h1 style='font-size:30px;'>XGBoost</h1>

In [37]:
from xgboost import XGBClassifier

grid_params = {
    'max_depth': [3,6,9],
    'min_child_weight': [1,3,5],
    'learning_rate': [0.1, 0.5, 1],
    'objective': ['multi:softmax'],
    'n_estimators': [1000]
}

classifier = XGBClassifier()
best_params_gb = run_model(classifier, grid_params, X_train, y_train, X_test, y_test)
best_params_gb

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Test Set Accuracy: 0.52
              precision    recall  f1-score   support

           0       0.48      0.43      0.45        47
           1       0.37      0.20      0.26        50
           2       0.57      0.76      0.65        83

    accuracy                           0.52       180
   macro avg       0.47      0.46      0.45       180
weighted avg       0.49      0.52      0.49       180



{'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 1000,
 'objective': 'multi:softmax'}

# <h1 style='font-size:30px;'>Save Model</h1>

# <h1 style='font-size:15px;'>Catboost is the most efficient model such that it has best balanced prediction result in all 3 possible outcomes</h1>

In [19]:
import pickle

catboost = CatBoostClassifier(iterations=1000, loss_function="MultiClass", 
                                eval_metric="Accuracy", learning_rate=0.3, l2_leaf_reg=9, class_weights=[1, 1.5, 1])

catboost.fit(X_train,
          y_train,
          eval_set=(X_test, y_test),
          verbose=False)

pickle.dump(catboost, open('catboost.pkl', 'wb'))