In [1]:
# Standard library imports
import os
import sys
import re
import warnings
import random
import hashlib

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # Assuming you might need it

# Specific models and tools
from xgboost import XGBClassifier
import xgboost as xgb

# Encoding and feature selection
from category_encoders import TargetEncoder  # Fixed the import based on usage
from scipy.stats import randint, uniform

# Model persistence
from joblib import dump, load

# Miscellaneous settings
%matplotlib inline
warnings.filterwarnings('ignore')


In [2]:
comps = [
        
            'E0', 'E1', 
            'SC0', 'SC1', 
            
            'D1', 'D2',
            'F1', 'F2',
            'I1', 'I2',
            'SP1', 'SP2',

            'B1',
            'G1',
            'N1',
            'P1',
            'T1',           
            
        ]
         
         
seasons = [2324, 2223, 2122]

In [3]:
# Load all filepaths for the competitions and seasons into a list
matches_files = []

for season in seasons:    
    for comp in comps:  
        matches_files.append('data/zip/%s/%s.csv' % (season, comp))

In [4]:
# Load and concatenate matches data into a single DataFrame
df = pd.DataFrame()

for file in matches_files:

    try:
        df_temp = pd.read_csv(file)
        df = pd.concat([df, df_temp], ignore_index=True)
    except:
        # print an error message
        print(f'Error: {file} not found')

# print the amount of data loaded
print(f"Data loaded: {df.shape[0]} matches")

Data loaded: 16297 matches


In [5]:
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,Unnamed: 105
0,E0,11/08/2023,20:00,Burnley,Man City,0,3,A,0.0,2.0,...,1.5,1.95,1.98,1.95,1.97,,,1.92,1.95,
1,E0,12/08/2023,12:30,Arsenal,Nott'm Forest,2,1,H,2.0,0.0,...,-2.0,1.95,1.98,1.93,1.97,2.01,2.09,1.95,1.92,
2,E0,12/08/2023,15:00,Bournemouth,West Ham,1,1,D,0.0,0.0,...,0.0,2.02,1.91,2.01,1.92,2.06,1.96,1.96,1.91,
3,E0,12/08/2023,15:00,Brighton,Luton,4,1,H,1.0,0.0,...,-1.75,2.01,1.92,2.0,1.91,2.14,1.93,2.0,1.86,
4,E0,12/08/2023,15:00,Everton,Fulham,0,1,A,0.0,0.0,...,-0.25,2.06,1.87,2.04,1.88,2.08,1.99,1.98,1.88,


In [6]:
# Convert 'Div' to a categorical type, a numeric representation of the division
df['Div'] = df['Div'].astype('category').cat.codes

In [7]:
# Assuming 'teams' is a list of team names
teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()
teams.sort()

# Creating a dictionary from team names to an incremental index number
teams_dict = {team: index for index, team in enumerate(teams)}

In [8]:
# Create a unique list of HomeTeam and AwayTeam names combined, and add an index to each team
teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

# Sort the teams alphabetically
teams.sort()

# Convert to an array of dictionaries
teams = [{'team': team, 'index': index} for index, team in enumerate(teams)]

df['Team_ID'] = df['HomeTeam'].map(teams_dict)
df['Opp_ID'] = df['AwayTeam'].map(teams_dict)

In [9]:
df.columns

Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH',
       'AvgCAHA', 'Unnamed: 105', 'Team_ID', 'Opp_ID'],
      dtype='object', length=109)

In [10]:
# Create a unique list of Referees, and add an index to each Referee
referees = pd.concat([df['Referee']]).unique()

# Convert to an array of dictionaries
referees = [{'referee': referee, 'index': index} for index, referee in enumerate(referees)]

In [11]:
# Map the team names to the index values in the 'teams' list
#df['Team_ID'] = df['HomeTeam'].map({team['team']: team['index'] for team in teams})
#df['Opp_ID'] = df['AwayTeam'].map({team['team']: team['index'] for team in teams})



In [12]:
# Correcting the creation of a unique list of Referees and adding an index to each Referee
referees = df['Referee'].unique()  # This should directly refer to the 'Referee' column

# Convert to a dictionary with referee names as keys and their indices as values
referee_dict = {referee: index for index, referee in enumerate(referees)}

# Now map the 'Referee' column to these indices
df['Ref_ID'] = df['Referee'].map(referee_dict)

In [13]:
def parse_date_to_int(date_str):
    for fmt in ('%d/%m/%Y', '%d/%m/%y'):  # Add more formats here as needed
        try:
            # Parse the date
            dt = pd.to_datetime(date_str, format=fmt)
            # Format as 'YYYYMMDD' and convert to int
            return int(dt.strftime('%Y%m%d'))
        except ValueError:
            continue
    return None  # Return None if all formats fail

# First, ensure the Date column is in a datetime format if it's not already
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)


df['Date_temp'] = df['Date'].apply(parse_date_to_int)

In [14]:

# Declare Date_temp as a temporary column, an 8 digit integer representation of the date
#df['Date_temp'] = df['Date'].dt.year * 10000 + df['Date'].dt.month * 100 + df['Date'].dt.day

# Parse Date_temp to an 8 digit integer
#df['Date_temp'] = df['Date_temp'].astype(int)

# Connvert 'Time', which is now in HH:MM format to a 4 digit integer
# Assuming a default time of 00:00 for missing values
df['Time'] = df['Time'].fillna('00:00').str.replace(':', '').astype(int)

In [15]:
df.columns = [re.sub(r'[<]', '_st_', str(col)) for col in df.columns]
df.columns = [re.sub(r'[>]', '_gt_', str(col)) for col in df.columns]

In [16]:
# Sort df by Date_temp and Time
df = df.sort_values(['Date_temp', 'Time'])

In [17]:
def history_vs_opponent_weighted(df, row, team_column):
    # Initialize the total weighted score
    weighted_score = 0
    opponent_column = 'Opp_ID'

    row_date_temp = row['Date'].year * 10000 + row['Date'].month * 100 + row['Date'].day

    
    # Filter the DataFrame for matches between the specified team and opponent from the same season, excluding the current match
    filtered_matches = df[(df[team_column] == row[team_column]) & 
                          (df[opponent_column] == row[opponent_column]) &
                          (df['Date_temp'] < row_date_temp)]
    
    recent_matches = filtered_matches.sort_values(by='Date', ascending=False).head(5)
    
    # Calculate weights - newer matches have higher weights
    weights = range(len(recent_matches), 0, -1)  # Descending list based on the number of matches
    
    # Calculate score based on the match result
    for match, weight in zip(recent_matches.itertuples(), weights):
        if getattr(match, 'FTR') == 'H' and getattr(match, team_column) == getattr(match, 'Team_ID') or \
           getattr(match, 'FTR') == 'A' and getattr(match, team_column) != getattr(match, 'Team_ID'):
            weighted_score += 3 * weight  # Team won
        elif getattr(match, 'FTR') == 'A':
            weighted_score += 1 * weight  # Draw
        
    # Normalize the weighted score by the sum of weights
    normalized_weighted_score = weighted_score / sum(weights) if weights else 0

    #print(f"Weighted score: {weighted_score}, Normalized weighted score: {normalized_weighted_score}")

    return normalized_weighted_score

In [18]:
# Apply the modified function to create new columns
df['team_hist_vs'] = df.apply(lambda x: history_vs_opponent_weighted(df, x, 'Team_ID'), axis=1)
#df['opp_hist_vs'] = df.apply(lambda x: history_vs_opponent_weighted(df, x, 'Opp_ID'), axis=1)

In [19]:
import pandas as pd

# Function adapted for DataFrame application
def convert_odds(row):
    odds_win, odds_draw, odds_lose = row['AvgH'], row['AvgD'], row['AvgA']
    prob_win = 1 / odds_win
    prob_draw = 1 / odds_draw
    prob_lose = 1 / odds_lose
    prob_not_win = prob_draw + prob_lose
    return pd.Series([prob_win, prob_not_win], index=['probs_win', 'probs_not_win'])

# Apply the function and create new columns
#df[['probs_win', 'probs_not_win']] = df.apply(convert_odds, axis=1)

#df = df.drop(columns=['AvgH', 'AvgD', 'AvgA'])

In [20]:
# Drop multiple columns
#df = df.drop(['Date'], axis=1)

In [21]:
df.tail(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,Unnamed: 105,Team_ID,Opp_ID,Ref_ID,Date_temp,team_hist_vs
2135,6,2024-04-01,1945,Ajaccio,Auxerre,0,1,A,0.0,1.0,...,1.85,2.12,1.76,2.06,,5,37,80,20240401,0.666667
2434,8,2024-04-01,1945,Inter,Empoli,2,0,H,1.0,0.0,...,2.0,2.0,1.95,1.91,,166,107,80,20240401,1.666667
769,4,2024-04-01,2000,Leeds,Hull,3,1,H,1.0,1.0,...,2.14,1.86,2.05,1.8,,188,163,18,20240401,0.0
3043,14,2024-04-01,2000,Villarreal,Ath Madrid,1,2,A,0.0,1.0,...,2.12,1.95,1.99,1.84,,342,34,80,20240401,0.0
4332,11,2024-04-01,2015,Portimonense,Sp Braga,3,5,A,1.0,2.0,...,1.99,1.96,1.93,1.9,,262,303,80,20240401,1.0


In [22]:
def team_form(df, row, perspective):
    # Determine the team ID based on the perspective ('Team' or 'Opp')
    if perspective == 'Team':
        team_id = row['Team_ID']
    elif perspective == 'Opp':
        team_id = row['Opp_ID']
    else:
        raise ValueError("Perspective must be 'Team' or 'Opp'")
    
    # Get the current match date
    current_date = row['Date_temp']
    
    # Filter past matches for the team
    past_matches = df[((df['Team_ID'] == team_id) | (df['Opp_ID'] == team_id)) &
                      (df['Date_temp'] < current_date)].sort_values(by='Date_temp', ascending=False).head(5)
    
    # Initialize points
    points = 0
    
    # Weights for the matches (most recent match has the highest weight)
    weights = [5, 4, 3, 2, 1]
    
    # Calculate points with weights
    weighted_points_sum = 0
    total_weights = sum(weights[:len(past_matches)])  # Adjust the total weight in case of less than 5 matches
    
    for match, weight in zip(past_matches.itertuples(), weights):
        if (match.Team_ID == team_id and match.FTR == 'H') or (match.Opp_ID == team_id and match.FTR == 'A'):
            points += 3
        elif match.FTR == 'D':
            points += 1
        else:
            points += 0

        weighted_points_sum += points * weight
    
    if total_weights > 0:
        return weighted_points_sum / total_weights
    else:
        return 0  # Return 0 if no past matches found

In [23]:
# Applying the function to each row for 'Team'
#df['team_form_team'] = df.apply(lambda row: team_form(df, row, 'Team'), axis=1)

# Applying the function to each row for 'Opp'
#df['team_form_opp'] = df.apply(lambda row: team_form(df, row, 'Opp'), axis=1)

In [24]:
def rolling_avgs(df, row, perspective, home_column, away_column):
    # Determine the team ID based on the perspective ('Team' or 'Opp')
    if perspective == 'Team':
        team_id = row['Team_ID']
    elif perspective == 'Opp':
        team_id = row['Opp_ID']
    else:
        raise ValueError("Perspective must be 'Team' or 'Opp'")
    
    # Get the current match date
    current_date = row['Date_temp']
    
    # Filter past 5 matches for the team
    past_matches = df[((df['Team_ID'] == team_id) | (df['Opp_ID'] == team_id)) &
                      (df['Date_temp'] < current_date)].sort_values(by='Date_temp', ascending=False).head(5)
    
    # Weights for the matches (most recent match has the highest weight)
    weights = [5, 4, 3, 2, 1]
    values = []
    
    # Determine which column to use and collect the values
    for match in past_matches.itertuples():
        if match.Team_ID == team_id:
            values.append(getattr(match, home_column))  # Use home_column for home team
        else:
            values.append(getattr(match, away_column))  # Use away_column for away team
    
    # Calculate the weighted average of the values
    weighted_sum = sum(value * weight for value, weight in zip(values, weights))
    total_weights = sum(weights[:len(values)])  # Adjust total weight if there are less than 5 matches
    
    if total_weights > 0:
        return weighted_sum / total_weights
    else:
        return 0  # Return 0 if no past matches found


In [25]:
#df['team_shots'] = df.apply(lambda row: rolling_avgs(df, row, 'Team', 'HS', 'AS'), axis=1)
#df['opp_shots'] = df.apply(lambda row: rolling_avgs(df, row, 'Opp', 'HS', 'AS'), axis=1)

#df['team_shots_target'] = df.apply(lambda row: rolling_avgs(df, row, 'Team', 'HST', 'AST'), axis=1)
#df['opp_shots_target'] = df.apply(lambda row: rolling_avgs(df, row, 'Opp', 'HST', 'AST'), axis=1)

In [26]:
from datetime import timedelta

def avg_games_played(df, row, team_column):
    team = row[team_column]
    # Ensure current_match_date is a Timestamp for comparison
    current_match_date = pd.to_datetime(row['Date'], dayfirst=True)  # Assuming 'Date' format is 'dd/mm/yy'

    delta = 30
    start_date = current_match_date - timedelta(days=delta)

    # Ensure 'Date' column is in datetime format for comparison
    #df['Date_temp'] = pd.to_datetime(df['Date'], dayfirst=True)  # Convert 'Date' column to datetime if not already done

    # Filter the DataFrame for matches within the last 30 days
    if team_column == 'Team_ID':
        past_matches = df[((df[team_column] == team) | (df['Opp_ID'] == team)) &
                          (df['Date'] >= start_date) & (df['Date'] < current_match_date)]
    else:
        past_matches = df[((df['Team_ID'] == team) | (df[team_column] == team)) &
                          (df['Date'] >= start_date) & (df['Date'] < current_match_date)]

    # If no matches were played in the last 30 days
    if past_matches.empty:
        return 0

    # Calculate weights based on the recency of each match
    weights = (current_match_date - past_matches['Date']).dt.days
    weighted_count = sum(delta - weights + 1)  # '+ 1' to include the match day in the weight

    # Normalize weights to sum to 1 and calculate the weighted average
    total_weight = sum(delta - weights + 1)
    weighted_avg = weighted_count / total_weight

    return weighted_avg


In [27]:
# Apply the function for each team and opponent
#df['team_avg_games'] = df.apply(lambda x: avg_games_played(df, x, 'Team_ID'), axis=1)
#df['opp_avg_games'] = df.apply(lambda x: avg_games_played(df, x, 'Opp_ID'), axis=1)

In [28]:
# Calculate means only for numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
means = df[numeric_cols].mean()

# Fill missing values in numeric columns with their respective means
df[numeric_cols] = df[numeric_cols].fillna(means)

In [29]:
# Drop every row where 'FTR' is not 'H', 'D', or 'A'
df = df[df['FTR'].isin(['H', 'D', 'A'])]

# Map 'H', 'D', and 'A' to 1, 0, and 0 respectively
df['FTR'] = df['FTR'].map({'H': 1, 'D': 0, 'A': 0}).astype(int)

In [30]:
df.tail()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,Unnamed: 105,Team_ID,Opp_ID,Ref_ID,Date_temp,team_hist_vs
2135,6,2024-04-01,1945,Ajaccio,Auxerre,0,1,0,0.0,1.0,...,1.85,2.12,1.76,2.06,,5,37,80,20240401,0.666667
2434,8,2024-04-01,1945,Inter,Empoli,2,0,1,1.0,0.0,...,2.0,2.0,1.95,1.91,,166,107,80,20240401,1.666667
769,4,2024-04-01,2000,Leeds,Hull,3,1,1,1.0,1.0,...,2.14,1.86,2.05,1.8,,188,163,18,20240401,0.0
3043,14,2024-04-01,2000,Villarreal,Ath Madrid,1,2,0,0.0,1.0,...,2.12,1.95,1.99,1.84,,342,34,80,20240401,0.0
4332,11,2024-04-01,2015,Portimonense,Sp Braga,3,5,0,1.0,2.0,...,1.99,1.96,1.93,1.9,,262,303,80,20240401,1.0


In [31]:
df = df[[
    
        'Div', 'Date_temp', 'Time', 'Team_ID', 'Opp_ID', 'Ref_ID', 'FTR', 
        'team_hist_vs', 
        #'opp_hist_vs',

        #'probs_win',         
        #'probs_not_win', 
        
        #'team_form_team', 
        #'team_form_opp',
         
        #'team_shots', 'opp_shots',
        #'team_shots_target', 'opp_shots_target',

        #'team_avg_games', 'opp_avg_games',

        'AvgH', 'AvgD', 'AvgA'
         
         
         ]]

In [32]:
# Sort the DataFrame by Date_temp and separate the 200 most recent matches into a validation set
df.sort_values('Date_temp', inplace=True)

# Set the 'Date_temp' column as the index
df.set_index('Date_temp', inplace=True)

# Drop the 'Date_temp' column
#df.drop('Date_temp', axis=1, inplace=True)

df_val = df.tail(250)
df = df.iloc[:-250]

In [33]:
#import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into X and y
X = df.drop('FTR', axis=1)
y = df['FTR']

X.columns = [re.sub(r'[<]', '_st_', str(col)) for col in X.columns]
X.columns = [re.sub(r'[>]', '_gt_', str(col)) for col in X.columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
len(X_train), len(X_test)

(12837, 3210)

In [35]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier


# Create a pipeline
pipeline = Pipeline([
    ('target_encoder', TargetEncoder()),
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier())
])

# Define the hyperparameters

param_distributions = {

    'target_encoder__smoothing': randint(1, 100),
    'xgb__n_estimators': randint(100, 1000),
    'xgb__max_depth': randint(3, 10),
    'xgb__learning_rate': uniform(0.01, 0.6),
    'xgb__subsample': uniform(0.3, 0.7),
    'xgb__colsample_bytree': uniform(0.3, 0.7),
    'xgb__gamma': randint(0, 5),
    'xgb__reg_alpha': uniform(0, 1),
    'xgb__reg_lambda': uniform(0, 1),
    'xgb__min_child_weight': randint(1, 10),
    'xgb__scale_pos_weight': uniform(0.5, 1.5)
  
}

# Create a RandomizedSearchCV object
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='f1',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
search.fit(X_train, y_train)


Fitting 5 folds for each of 5 candidates, totalling 25 fits




In [36]:
# print the classification report
print(classification_report(y_test, search.predict(X_test)))

              precision    recall  f1-score   support

           0       0.63      0.57      0.60      1772
           1       0.53      0.59      0.56      1438

    accuracy                           0.58      3210
   macro avg       0.58      0.58      0.58      3210
weighted avg       0.59      0.58      0.58      3210



In [37]:
def generate_sliding_windows(X, window_size, step):
    n_samples = len(X)
    windows = []
    for start_idx in range(0, n_samples - window_size + 1, step):
        end_idx = start_idx + window_size
        if end_idx > n_samples:
            break  # Avoid going beyond the dataset
        train_indices = list(range(max(0, start_idx - window_size), start_idx))
        test_indices = list(range(start_idx, end_idx))
        windows.append((train_indices, test_indices))
    return windows

negative_count = len(df[df['FTR'] == 0])
positive_count = len(df[df['FTR'] == 1])
scale_pos_weight_value = negative_count / positive_count

# Define the hyperparameter search space
param_dist = {
    
    'xgb__clf__max_depth': [1,2,3],
    'xgb__clf__learning_rate': [0.001, 0.01, 0.1],
    'xgb__clf__lambda': [1, 1.5, 2],  # L2 regularization term on weights
    'xgb__clf__alpha': [0, 0.5, 1],  # L1 regularization term on weights
    'xgb__clf__n_estimators': [1, 5, 100],

    'rf__clf__max_depth': [None, 4, 6],
    'rf__clf__min_samples_split': [2, 5],
    'rf__clf__min_samples_leaf': [1, 2],
    'rf__clf__bootstrap': [True, False],
    'rf__clf__n_estimators': [50, 100, 200],

    'lr__clf__C': [0.1, 1, 10],  # Inverse of regularization strength; smaller values specify stronger regularization.
    'lr__clf__penalty': ['l1', 'l2', 'elasticnet'],  # Specify the norm of the penalty.
    'lr__clf__solver': ['saga'],  # Algorithm to use in the optimization problem, 'saga' supports all penalties.
    'lr__clf__l1_ratio': [0.5],  # The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1. Only used if penalty='elasticnet'.

    'cat__clf__depth': [1,2,3,4],
    'cat__clf__learning_rate': [0.01, 0.05, 0.1],
    'cat__clf__iterations': [50, 100, 200],
    'cat__clf__l2_leaf_reg': [1, 3, 5],

    'gb__clf__learning_rate': [0.01, 0.1, 0.2],
    'gb__clf__n_estimators': [50, 100, 200],
    'gb__clf__max_depth': [3, 5, 7],
    'gb__clf__min_samples_split': [2, 5],
    'gb__clf__min_samples_leaf': [1, 2],

}


param_test = {
    
    'xgb__clf__max_depth': [1,2,3],
    'xgb__clf__learning_rate': [0.001, 0.01, 0.1],
    'xgb__clf__lambda': [1, 1.5, 2],  # L2 regularization term on weights
    'xgb__clf__alpha': [0, 0.5, 1],  # L1 regularization term on weights
    'xgb__clf__n_estimators': [1, 5, 100],


}

In [38]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Define a custom scoring function
def xgb_early_stopping_score(y, estimator, X, y_true, sample_weight=None):
    """
    Custom scorer that uses early stopping.
    """
    # Split X into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y_true, test_size=0.2, random_state=42)
    
    # Fit with early stopping
    eval_set = [(X_val, y_val)]
    estimator.fit(X_train, y_train, early_stopping_rounds=10, eval_set=eval_set, verbose=False)
    
    # Predict on the validation set
    y_pred = estimator.predict(X_val)
    
    # Return the F1 score
    return f1_score(y_val, y_pred, pos_label=1)

In [39]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import make_scorer, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as ImbPipeline
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# LightGBM
from lightgbm import LGBMClassifier

# naive bayes
from sklearn.naive_bayes import GaussianNB

#catboost
from catboost import CatBoostClassifier

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Define the F1 score for the '1' class
f1_scorer = make_scorer(f1_score, pos_label=1)

best_f1_score = 0
best_f1_params = None
best_window_size = None
best_precision = 0
best_model = None 
f1_scores = []
precision_scores = []



# Make the custom scorer
custom_scorer = make_scorer(xgb_early_stopping_score, greater_is_better=True, needs_proba=False, X=X, y_true=y)

# Set the window_size and step to 5% of the dataset
window_size = int(len(X) * 0.2)
step = int(len(X) * 0.2)

# Initialize an empty list to store precision scores
precision_scores = []

# Initialize an empty dataframe to store misclassified samples
misclassified_samples = pd.DataFrame(columns=X.columns)

# Generate windows
window_splits = generate_sliding_windows(X, window_size, step)

# Initialize training indices with the first window
train_end_index = window_size

# Iterate over each sliding window
for i, (train_index, test_index) in enumerate(window_splits):

    # Update training indices to include the next window
    train_index = list(range(train_end_index))
    train_end_index += window_size

    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    print(f"Iteration {i+1} Training Data Shape: {X_train.shape}")

    # Combine misclassified samples from previous iterations with current training data
    if not misclassified_samples.empty:
        X_train_combined = pd.concat([X_train, misclassified_samples[X_train.columns]], axis=0)
        y_train_combined = pd.concat([y_train, misclassified_samples['FTR']], axis=0)
    else:
        X_train_combined = X_train
        y_train_combined = y_train

    # Calculate misclassification frequency
    misclassified_freq = y_train_combined.value_counts(normalize=True)

    # Define class weights based on misclassification frequency
    class_weights = {0: 1, 1: max(0.6, 1 - misclassified_freq.get(1, 0.5))}  # Adjust dynamically to penalize misclassification of class 1 more heavily

    # Define pipelines for each classifier with SMOTE and TargetEncoder
    pipeline_xgb = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', XGBClassifier(random_state=42, verbose=0))
    ])

    pipeline_gb = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', GradientBoostingClassifier(random_state=42, verbose=0))
    ])

    # pipeline for logistic regression
    pipeline_lr = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', LogisticRegression(random_state=42, verbose=0))
    ])

    # pipeline for catboost classifier
    pipeline_cat = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', CatBoostClassifier(random_state=42, verbose=0))
    ])

    # pipeline for random forest
    pipeline_rf = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', RandomForestClassifier(random_state=42, verbose=0))
    ])

    # LightGBM pipeline
    pipeline_lgbm = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', LGBMClassifier(random_state=42, force_col_wise='true', verbose=0))
    ])

    # Adaboost pipeline
    pipeline_ada = ImbPipeline([
        ('target_encoder', TargetEncoder()),
        ('smote', SMOTE(random_state=42)),
        ('clf', AdaBoostClassifier(random_state=42))
    ])

    # Combine them into an ensemble classifier
    ensemble_clf = VotingClassifier(estimators=[
        ('xgb', pipeline_xgb),
        #('gb', pipeline_gb),
        ('lr', pipeline_lr),
        #('cat', pipeline_cat),
        #
        #('rf', pipeline_rf),
        #('lgbm', pipeline_lgbm),
        ('ada', pipeline_ada)
    ], voting='soft')

    # Setup RandomizedSearchCV
    clf = RandomizedSearchCV(
        estimator=ensemble_clf,
        param_distributions=param_test,
        n_iter=5,
        scoring=f1_scorer,
        cv=TimeSeriesSplit(n_splits=3),
        random_state=42,
        n_jobs=-1,
        verbose=0
    )  

    # Fit RandomizedSearchCV
    clf.fit(X_train_combined, y_train_combined)

    # Get the best parameters
    best_params = clf.best_params_
    print("Best Parameters:", best_params)

    # Use the best estimator
    best_pipe = clf.best_estimator_

    # Make predictions
    y_proba = best_pipe.predict_proba(X_test)

    # Apply threshold
    threshold = 0.5  # You can adjust this threshold as needed
    y_pred = (y_proba[:, 1] >= threshold).astype(int)

    current_f1_score = f1_score(y_test, y_pred, pos_label=1)

    if current_f1_score > best_f1_score:
        best_f1_score = current_f1_score
        best_f1_params = clf.best_params_
        #best_model = clf.best_estimator_ 

    # ------------------------------------------------

    best_model = clf.best_estimator_ 

    # Calculate precision score
    precision = np.mean(y_test == y_pred)
    precision_scores.append(precision)
    print("Precision:", precision)

    print()

# Print the best F1 score and its corresponding parameters
print()
print("Best F1 Score:", best_f1_score)

# print the classification report of the best model on the full dataset
print(classification_report(y_test, y_pred))

Iteration 1 Training Data Shape: (3209, 9)
Best Parameters: {'xgb__clf__n_estimators': 1, 'xgb__clf__max_depth': 3, 'xgb__clf__learning_rate': 0.001, 'xgb__clf__lambda': 1, 'xgb__clf__alpha': 0}
Precision: 0.6319725771268307

Iteration 2 Training Data Shape: (6418, 9)
Best Parameters: {'xgb__clf__n_estimators': 5, 'xgb__clf__max_depth': 2, 'xgb__clf__learning_rate': 0.1, 'xgb__clf__lambda': 2, 'xgb__clf__alpha': 1}
Precision: 0.6463072608289187

Iteration 3 Training Data Shape: (9627, 9)
Best Parameters: {'xgb__clf__n_estimators': 100, 'xgb__clf__max_depth': 3, 'xgb__clf__learning_rate': 0.01, 'xgb__clf__lambda': 2, 'xgb__clf__alpha': 0.5}
Precision: 0.6463072608289187

Iteration 4 Training Data Shape: (12836, 9)
Best Parameters: {'xgb__clf__n_estimators': 100, 'xgb__clf__max_depth': 3, 'xgb__clf__learning_rate': 0.01, 'xgb__clf__lambda': 2, 'xgb__clf__alpha': 0.5}
Precision: 0.6360236833904643

Iteration 5 Training Data Shape: (16045, 9)
Best Parameters: {'xgb__clf__n_estimators': 100

In [40]:
# Correlation matrix
corr = df.corr()

# Put the target column to the front
cols = list(corr.columns)
cols.insert(0, cols.pop(cols.index('FTR')))
corr = corr.loc[cols, cols]

# Plot the correlation matrix
#plt.figure(figsize=(10, 8))
#sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=2)
#plt.show()



In [41]:
# Print the feature importances
importances = search.best_estimator_.named_steps['xgb'].feature_importances_
features = X_train.columns
importances_df = pd.DataFrame({'features': features, 'importances': importances})
importances_df = importances_df.sort_values('importances', ascending=False)

# Plot the feature importances
#plt.figure(figsize=(10, 8))
#sns.barplot(x='importances', y='features', data=importances_df)
#plt.title('Feature Importances')
#plt.show()



In [42]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 20240316 to 20240401
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Div           250 non-null    int8   
 1   Time          250 non-null    int32  
 2   Team_ID       250 non-null    int64  
 3   Opp_ID        250 non-null    int64  
 4   Ref_ID        250 non-null    int64  
 5   FTR           250 non-null    int32  
 6   team_hist_vs  250 non-null    float64
 7   AvgH          250 non-null    float64
 8   AvgD          250 non-null    float64
 9   AvgA          250 non-null    float64
dtypes: float64(4), int32(2), int64(3), int8(1)
memory usage: 17.8 KB


### Validation

In [43]:
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Presuming best_model and df_val are already correctly defined and available

# Calculate the predicted probabilities for the validation set
y_val_proba = best_model.predict_proba(df_val.drop(columns=['FTR']))

# Initialize variables to track the best threshold and its corresponding accuracy
best_threshold = 0.5
best_accuracy = 0

# Iterate over potential threshold values
for threshold in np.arange(0.58, 0.8, 0.001):
    # Apply the current threshold to generate predictions
    y_val_pred = (y_val_proba[:, 1] >= threshold).astype(int)
    
    # Evaluate accuracy for the current set of predictions
    accuracy = accuracy_score(df_val['FTR'], y_val_pred)
    
    # Update the best threshold and accuracy as needed
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold 

# Print the best threshold and its accuracy
print(f"Best Threshold: {best_threshold}")
print(f"Best Accuracy: {best_accuracy}")

# Apply the best threshold to generate final predictions
# Now considering both HomeTeam win probability (index 1) and AwayTeam win probability (index 2)
df_val['proba_home_win'] = y_val_proba[:, 1]  # Probability of home win
df_val['proba_away_win'] = y_val_proba[:, 2]  # Probability of away win

# Initialize prediction column with 0 (which we'll use to indicate no prediction above threshold/draw)
df_val['Prediction'] = 0

# Update Prediction to 1 if HomeTeam win probability is above the threshold
df_val.loc[df_val['proba_home_win'] > best_threshold, 'Prediction'] = 1

# Update Prediction to 2 if AwayTeam win probability is above the threshold
df_val.loc[df_val['proba_away_win'] > best_threshold, 'Prediction'] = 2

# Filter out draws and rows where neither team's win probability exceeds the threshold
filtered_df_val = df_val[(df_val['Prediction'] == 1) | (df_val['Prediction'] == 2)].copy()

# Reset index and map team IDs back to team names for clarity
filtered_df_val.reset_index(inplace=True)
filtered_df_val['Team'] = filtered_df_val['Team_ID'].map(index_to_team)
filtered_df_val['Opponent'] = filtered_df_val['Opp_ID'].map(index_to_team)

# Determine whether the prediction was correct
filtered_df_val['Actual Result'] = df_val['FTR']
filtered_df_val['Correct Prediction'] = ((filtered_df_val['Prediction'] == 1) & (filtered_df_val['Actual Result'] == 1)) | \
                                        ((filtered_df_val['Prediction'] == 2) & (filtered_df_val['Actual Result'] == 0))

# Define columns to display
display_columns = [
    'Date_temp', 'Team', 'Opponent', 'proba_home_win', 'proba_away_win', 'Prediction',
    'Actual Result', 'Correct Prediction', 'AvgH', 'AvgD', 'AvgA'
]



Best Threshold: 0.583
Best Accuracy: 0.644


IndexError: index 2 is out of bounds for axis 1 with size 2

In [None]:
output = filtered_df_val[display_columns]
output = output.sort_values('Date_temp', ascending=False)

output = output[output['proba_1'] > best_threshold].copy()

In [None]:
display(output)

Unnamed: 0,Date_temp,Team,Opponent,proba_1,Prediction,Actual Result,Correct Prediction,AvgH,AvgD,AvgA
47,20240401,Coventry,Cardiff,0.603776,1,0,False,1.53,4.28,6.0
46,20240401,Bologna,Salernitana,0.685998,1,1,True,1.29,5.44,11.25
45,20240401,Leicester,Norwich,0.584141,1,1,True,1.65,4.17,4.83
44,20240401,Cremonese,FeralpiSalo,0.589177,1,0,False,1.56,4.0,5.48
43,20240401,Inter,Empoli,0.715984,1,1,True,1.2,7.04,13.25
42,20240401,Leeds,Hull,0.637767,1,1,True,1.43,4.75,6.98
36,20240331,Bochum,Darmstadt,0.583614,1,0,False,1.65,4.18,5.06
33,20240331,Almere City,Volendam,0.624196,1,0,False,1.44,4.59,6.88
34,20240331,Stuttgart,Heidenheim,0.677108,1,0,False,1.27,6.35,10.32
35,20240331,Oviedo,Villarreal B,0.596605,1,1,True,1.52,3.98,6.1


In [None]:
# Display the Correct Prediction True / False ratio, and ther percentage of correct predictions
correct_predictions = output['Correct Prediction'].sum()
total_predictions = len(output)
correct_ratio = correct_predictions / total_predictions

print(f"Best Threshold: {best_threshold:.2f}")
print(f"Best Accuracy: {best_accuracy:.2f}")
print()
print(f"Total Predictions: {total_predictions}")
print(f"Total Correct Predictions: {correct_predictions}")
print(f"Percentage of Correct Predictions: {correct_ratio * 100:.2f}%")

Best Threshold: 0.58
Best Accuracy: 0.64

Total Predictions: 48
Total Correct Predictions: 35
Percentage of Correct Predictions: 72.92%


In [None]:
# save filtered_df_val[display_columns] to a CSV file
output.to_csv('filtered_predictions_dual.csv', index=False)

In [None]:
import winsound
frequency = 400  # Set Frequency To 2500 Hertz
duration = 200  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)