In [1018]:
# Standard library imports
import os
import sys
import re
import warnings
import random
import hashlib

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

# Specific models and tools
from xgboost import XGBClassifier
import xgboost as xgb

# Encoding and feature selection
from category_encoders import TargetEncoder  # Fixed the import based on usage
from scipy.stats import randint, uniform

# Model persistence
from joblib import dump, load

# Miscellaneous settings
%matplotlib inline
warnings.filterwarnings('ignore')


In [1019]:
comps = [
        
            'E0', 'E1', 'E2', 
            'SC0', 'SC1', 'SC2',

            'B1',
            'D1', 'D2',
            'F1', 'F2',
            'I1', 'I2',
            'SP1', 'SP2',

            'G1',
            'N1',
            'P1',
            'T1',           
            
            ]
         
         
seasons = [2324, 2223, 2122, 2021, 1920, 1819, 1718, 1617]

In [1020]:
# Load all filepaths for the competitions and seasons into a list
matches_files = []

for season in seasons:    
    for comp in comps:  
        matches_files.append('data/zip/%s/%s.csv' % (season, comp))

In [1021]:
# Load and concatenate matches data into a single DataFrame
df = pd.DataFrame()

for file in matches_files:

    try:
        df_temp = pd.read_csv(file)
        df = pd.concat([df, df_temp], ignore_index=True)
    except:
        # print an error message
        print(f'Error: {file} not found')

# print the amount of data loaded
print(f"Data loaded: {df.shape[0]} matches")

Error: data/zip/1819/SC0.csv not found
Error: data/zip/1819/I2.csv not found
Data loaded: 49677 matches


In [1022]:
# Sort the df by date
df = df.sort_values(by='Date', ascending=True)

In [1023]:
# Print all rows where AvgH and AvgA are higher than 1.8
#df = df[(df['AvgH'] > 1.8) & (df['AvgA'] > 1.8)]

In [1024]:
def parse_date_to_int(date_str):
    for fmt in ('%d/%m/%Y', '%d/%m/%y'):  # Add more formats here as needed
        try:
            # Parse the date
            dt = pd.to_datetime(date_str, format=fmt)
            # Format as 'YYYYMMDD' and convert to int
            return int(dt.strftime('%Y%m%d'))
        except ValueError:
            continue
    return None  # Return None if all formats fail

df['Date'] = df['Date'].apply(parse_date_to_int)

In [1025]:
# Connvert 'Time', which is now in HH:MM format to a 4 digit integer
# Assuming a default time of 00:00 for missing values
df['Time'] = df['Time'].fillna('00:00').str.replace(':', '').astype(int)

In [1026]:
# Drop every row where 'FTR' is not 'H', 'D', or 'A'
df = df[df['FTR'].isin(['H', 'D', 'A'])]

# Map 'H', 'D', and 'A' to 1, 0, and 0 respectively
df['FTR'] = df['FTR'].map({'H': 1, 'D': 0, 'A': 0}).astype(int)

In [1027]:
def encode_teams(df):
    # Ensure we don't alter the original DataFrame
    df = df.copy()
    
    # Combine unique values from both columns
    unique_teams = pd.unique(df[['HomeTeam', 'AwayTeam']].values.ravel('K'))
    
    # Create a dictionary mapping team names to an index
    team_to_index = {team: index for index, team in enumerate(unique_teams)}
    
    # Replace team names in the DataFrame with their corresponding index
    df['HomeTeam'] = df['HomeTeam'].map(team_to_index)
    df['AwayTeam'] = df['AwayTeam'].map(team_to_index)
    
    return df, team_to_index

def decode_teams(df, team_to_index):
    # Ensure we don't alter the original DataFrame
    df_decoded = df.copy()
    
    # Reverse the dictionary to map indices back to team names
    index_to_team = {index: team for team, index in team_to_index.items()}
    
    # Replace indices in the DataFrame with the original team names
    df_decoded['HomeTeam'] = df_decoded['HomeTeam'].map(index_to_team)
    df_decoded['AwayTeam'] = df_decoded['AwayTeam'].map(index_to_team)
    
    return df_decoded


In [1028]:
df, team_to_index = encode_teams(df)

In [1029]:
from sklearn.base import BaseEstimator, TransformerMixin

class TeamEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        unique_teams = pd.unique(X[['HomeTeam', 'AwayTeam']].values.ravel('K'))
        self.team_to_index_ = {team: index for index, team in enumerate(unique_teams)}
        return self
    
    def transform(self, X):
        X_encoded = X.copy()
        X_encoded['HomeTeam'] = X_encoded['HomeTeam'].map(self.team_to_index_)
        X_encoded['AwayTeam'] = X_encoded['AwayTeam'].map(self.team_to_index_)
        return X_encoded

# Note: The TeamDecoder is not typically used in the training pipeline
class TeamDecoder(BaseEstimator, TransformerMixin):
    def __init__(self, team_to_index):
        self.team_to_index_ = team_to_index
        
    def fit(self, X, y=None):
        return self  # Nothing to do here
    
    def transform(self, X):
        X_decoded = X.copy()
        index_to_team = {index: team for team, index in self.team_to_index_.items()}
        X_decoded['HomeTeam'] = X_decoded['HomeTeam'].map(index_to_team)
        X_decoded['AwayTeam'] = X_decoded['AwayTeam'].map(index_to_team)
        return X_decoded


In [1030]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.linear_model import LogisticRegression

# Initialize the custom team encoder
team_encoder = TeamEncoder()

# Initialize target encoder for 'Div' (and 'Time' if you choose to include it)
target_enc = TargetEncoder(cols=['Div'])

# Initialize the classifier
clf = LogisticRegression(solver='liblinear', random_state=42)

# Create the preprocessing pipeline
preprocessing = ColumnTransformer(
    transformers=[
        ('teams', team_encoder, ['HomeTeam', 'AwayTeam']),
        ('div', target_enc, ['Div'])
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Combine preprocessing with classifier in a pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('classifier', clf)
])


In [1031]:
df = df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR', 'AvgH', 'AvgD', 'AvgA']]

In [1032]:
from sklearn.model_selection import train_test_split

# Assuming 'FTR' is the target variable
X = df.drop(columns=['FTR'])
y = df['FTR']

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [1033]:
def generate_sliding_windows(X, window_size, step):
    n_samples = len(X)
    windows = []
    for start_idx in range(0, n_samples - window_size + 1, step):
        end_idx = start_idx + window_size
        if end_idx > n_samples:
            break  # Avoid going beyond the dataset
        train_indices = list(range(max(0, start_idx - window_size), start_idx))
        test_indices = list(range(start_idx, end_idx))
        windows.append((train_indices, test_indices))
    return windows

negative_count = len(df[df['FTR'] == 0])
positive_count = len(df[df['FTR'] == 1])
scale_pos_weight_value = negative_count / positive_count

# Define the hyperparameter search space
param_dist = {
    
    'xgb__clf__max_depth': [1,2,3],
    'xgb__clf__learning_rate': [0.001, 0.01, 0.1],
    'xgb__clf__lambda': [1, 1.5, 2],  # L2 regularization term on weights
    'xgb__clf__alpha': [0, 0.5, 1],  # L1 regularization term on weights
    'xgb__clf__n_estimators': [1, 5, 100],

    'rf__clf__max_depth': [None, 4, 6],
    'rf__clf__min_samples_split': [2, 5],
    'rf__clf__min_samples_leaf': [1, 2],
    'rf__clf__bootstrap': [True, False],
    'rf__clf__n_estimators': [50, 100, 200],

    'lr__clf__C': [0.1, 1, 10],  # Inverse of regularization strength; smaller values specify stronger regularization.
    'lr__clf__penalty': ['l1', 'l2', 'elasticnet'],  # Specify the norm of the penalty.
    'lr__clf__solver': ['saga'],  # Algorithm to use in the optimization problem, 'saga' supports all penalties.
    'lr__clf__l1_ratio': [0.5],  # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'.

    'cat__clf__depth': [1,2,3,4],
    'cat__clf__learning_rate': [0.01, 0.05, 0.1],
    'cat__clf__iterations': [50, 100, 200],
    'cat__clf__l2_leaf_reg': [1, 3, 5],

    'gb__clf__learning_rate': [0.01, 0.1, 0.2],
    'gb__clf__n_estimators': [50, 100, 200],
    'gb__clf__max_depth': [3, 5, 7],
    'gb__clf__min_samples_split': [2, 5],
    'gb__clf__min_samples_leaf': [1, 2],

}


param_test = {
    
    'xgb__clf__max_depth': [1,2,3],
    'xgb__clf__learning_rate': [0.001, 0.01, 0.1],
    'xgb__clf__lambda': [1, 1.5, 2],  # L2 regularization term on weights
    'xgb__clf__alpha': [0, 0.5, 1],  # L1 regularization term on weights
    'xgb__clf__n_estimators': [1, 5, 100],

}

In [1034]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import make_scorer, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as ImbPipeline
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# LightGBM
from lightgbm import LGBMClassifier

# naive bayes
from sklearn.naive_bayes import GaussianNB

#catboost
from catboost import CatBoostClassifier

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Define pipelines for each classifier with SMOTE and TargetEncoder
pipeline_xgb = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', XGBClassifier(random_state=42, verbose=0))
])

pipeline_gb = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', GradientBoostingClassifier(random_state=42, verbose=0))
])

# pipeline for logistic regression
pipeline_lr = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(random_state=42, verbose=0))
])

# pipeline for catboost classifier
pipeline_cat = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', CatBoostClassifier(random_state=42, verbose=0))
])

# pipeline for random forest
pipeline_rf = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42, verbose=0))
])

# LightGBM pipeline
pipeline_lgbm = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LGBMClassifier(random_state=42, force_col_wise='true', verbose=0))
])

# Adaboost pipeline
pipeline_ada = ImbPipeline([
    ('target_encoder', TargetEncoder()),
    ('smote', SMOTE(random_state=42)),
    ('clf', AdaBoostClassifier(random_state=42))
])

# Combine them into an ensemble classifier
ensemble_clf = VotingClassifier(estimators=[
    ('xgb', pipeline_xgb),
    #('gb', pipeline_gb),
    ('lr', pipeline_lr),
    #('cat', pipeline_cat),
    #
    #('rf', pipeline_rf),
    #('lgbm', pipeline_lgbm),
    ('ada', pipeline_ada)
], voting='soft')

In [1035]:
# Define the F1 score for the '1' class
f1_scorer = make_scorer(f1_score, pos_label=1)

In [1036]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

# Initialize the custom and standard encoders
team_encoder = TeamEncoder()
target_encoder = TargetEncoder()

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('team', team_encoder, ['HomeTeam', 'AwayTeam']),
        ('div_target', target_encoder, ['Div']),
    ],
    remainder='passthrough'
)


clf = RandomizedSearchCV(
    estimator=ensemble_clf,
    param_distributions=param_test,
    n_iter=5,
    scoring=f1_scorer,
    cv=TimeSeriesSplit(n_splits=3),
    random_state=42,
    n_jobs=-1,
    verbose=0
) 


# Create the pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])


In [1037]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier



# Define the hyperparameters

param_distributions = {

}


# Create a RandomizedSearchCV object
search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='f1',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
search.fit(X_train, y_train)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_search.py", line 970, in fit
    self._run_search(evaluate_candidates)
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_search.py", line 1914, in _run_search
    evaluate_candidates(
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_search.py", line 947, in evaluate_candidates
    _warn_or_raise_about_fit_failures(out, self.error_score)
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 536, in _warn_or_raise_about_fit_failures
    raise ValueError(all_fits_failed_message)
ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_voting.py", line 366, in fit
    return super().fit(X, transformed_y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_voting.py", line 89, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\parallel.py", line 1792, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\parallel.py", line 129, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_base.py", line 36, in _fit_single_estimator
    estimator.fit(X, y)
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\pipeline.py", line 258, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\pipeline.py", line 1050, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\base.py", line 106, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\base.py", line 161, in _check_X_y
    X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1049, in check_array
    _assert_all_finite(
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 126, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\vermeerbergenj\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 175, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values



In [None]:
# Print the classification report
print(classification_report(y_test, search.predict(X_test)))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      5649
           1       0.55      0.53      0.54      4287

    accuracy                           0.61      9936
   macro avg       0.60      0.60      0.60      9936
weighted avg       0.61      0.61      0.61      9936

