In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, classification_report


In [3]:
X_train = pd.read_csv("../data/X_train.csv", index_col=0).drop(columns=["Div", "Date", "HomeTeam", "AwayTeam", "HomeTeam_ShotOnTarget", "AwayTeam_ShotOnTarget"])
X_test = pd.read_csv("../data/X_test.csv", index_col=0).drop(columns=["Div", "Date", "HomeTeam", "AwayTeam", "HomeTeam_ShotOnTarget", "AwayTeam_ShotOnTarget"])
y_train = pd.read_csv("../data/y_train.csv", index_col=0)
y_test = pd.read_csv("../data/y_test.csv", index_col=0)

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train['FTR_encoded'].to_numpy()
y_test = y_test['FTR_encoded'].to_numpy()

# <h1 style='font-size:30px;'>Model</h1>

In [80]:
from sklearn.model_selection import GridSearchCV
def run_model(classifier, param_grid, X_train, y_train, X_test, y_test):

    grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)
    grid_search.fit(X_train, y_train)
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)
    print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    return y_pred

In [11]:
X_train

array([[0.84615385, 0.73333333, 0.57692308, ..., 0.        , 1.        ,
        0.        ],
       [0.80769231, 0.6       , 0.65384615, ..., 0.        , 0.        ,
        0.        ],
       [0.57692308, 0.46666667, 0.34615385, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.53846154, 0.46666667, 0.84615385, ..., 1.        , 0.        ,
        0.        ],
       [0.57692308, 0.46666667, 0.65384615, ..., 0.        , 0.        ,
        0.        ],
       [0.53846154, 0.46666667, 0.53846154, ..., 0.        , 0.        ,
        0.        ]])

# <h1 style='font-size:30px;'>Random Forest Classifier</h1>

In [81]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=1000, random_state=42)
result = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))


Test Set Accuracy: 0.53
              precision    recall  f1-score   support

           0       0.56      0.40      0.46       692
           1       0.30      0.12      0.17       520
           2       0.56      0.82      0.66      1065

    accuracy                           0.53      2277
   macro avg       0.47      0.45      0.43      2277
weighted avg       0.50      0.53      0.49      2277



In [85]:
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_estimators': [1000],
    'max_depth': [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    }
classifier = RandomForestClassifier(random_state=42)
y_pred = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Test Set Accuracy: 0.55
              precision    recall  f1-score   support

           0       0.59      0.41      0.48       692
           1       0.32      0.09      0.14       520
           2       0.56      0.86      0.68      1065

    accuracy                           0.55      2277
   macro avg       0.49      0.45      0.43      2277
weighted avg       0.51      0.55      0.50      2277



# <h1 style='font-size:30px;'>Decision Tree Classifier</h1>

In [84]:
from sklearn.tree import DecisionTreeClassifier
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]

    }
classifier = DecisionTreeClassifier()
y_pred = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Test Set Accuracy: 0.46
              precision    recall  f1-score   support

           0       0.42      0.37      0.39       692
           1       0.25      0.28      0.27       520
           2       0.59      0.60      0.59      1065

    accuracy                           0.46      2277
   macro avg       0.42      0.42      0.42      2277
weighted avg       0.46      0.46      0.46      2277



# <h1 style='font-size:30px;'>Gradient Boosting Classifier</h1>

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred, zero_division=0))

Test Set Accuracy: 0.50
              precision    recall  f1-score   support

           0       0.52      0.38      0.44       692
           1       0.26      0.22      0.24       520
           2       0.57      0.72      0.64      1065

    accuracy                           0.50      2277
   macro avg       0.45      0.44      0.44      2277
weighted avg       0.49      0.50      0.49      2277



In [90]:

param_grid = {
    'n_estimators': [1000],
    'criterion': ['friedman_mse', 'squared_error'],
    'subsample': [0, 0.5, 1],
    'learning_rate': [0.1, 0.5, 1]
    }
classifier = GradientBoostingClassifier()
y_pred = run_model(classifier, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ducduongto/Soccer Prediction/soccerprediction/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ducduongto/Soccer Prediction/soccerprediction/lib/python3.13/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/ducduongto/Soccer Prediction/soccerprediction/lib/python3.13/site-packages/sklearn/base.py", line 666, in _validate_

Test Set Accuracy: 0.51
              precision    recall  f1-score   support

           0       0.53      0.41      0.46       692
           1       0.27      0.21      0.24       520
           2       0.57      0.72      0.63      1065

    accuracy                           0.51      2277
   macro avg       0.46      0.44      0.44      2277
weighted avg       0.49      0.51      0.49      2277

