In [12]:
# Standard library imports
import os
import sys
import re
import warnings
import random
import hashlib

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # Assuming you might need it

# Specific models and tools
from xgboost import XGBClassifier
import xgboost as xgb

# Encoding and feature selection
from category_encoders import TargetEncoder  # Fixed the import based on usage
from scipy.stats import randint, uniform

# Model persistence
from joblib import dump, load

# Miscellaneous settings
%matplotlib inline
warnings.filterwarnings('ignore')


In [13]:
competitions = [

    # Code, Seasons
    ['E0', [2324, 2223, 2122, 2021, 1920]],
    ['D1', [2324, 2223, 2122, 2021, 1920]],
    ['I1', [2324, 2223, 2122, 2021, 1920]],



]

In [14]:
matches_files = []

In [15]:
for comp in competitions:

    for season in comp[1]:

        matches_files.append(f"data/matches/{comp[0]}_{season}.csv")

In [16]:
# Load and concatenate matches data into a single DataFrame
df = pd.DataFrame()

for file in matches_files:

    try:
        df_temp = pd.read_csv(file)
        df = pd.concat([df, df_temp], ignore_index=True)
    except:
        # print an error message
        print(f'Error: {file} not found')

# print the amount of data loaded
print(f"Data loaded: {df.shape[0]} matches")

Error: data/matches/E0_2324.csv not found
Error: data/matches/E0_2223.csv not found
Error: data/matches/E0_2122.csv not found
Error: data/matches/E0_2021.csv not found
Error: data/matches/E0_1920.csv not found
Data loaded: 3277 matches


In [17]:
# Convert all columns to lowercase
df.columns = df.columns.str.lower()

In [18]:
# Rename 'HomeTeam' to 'Team' in df
df.rename(columns={'hometeam': 'team'}, inplace=True)
df.rename(columns={'awayteam': 'opponent'}, inplace=True)

In [19]:
df.head()

Unnamed: 0,div,date,time,team,opponent,fthg,ftag,ftr,hthg,htag,...,bbmx>2.5,bbav>2.5,bbmx<2.5,bbav<2.5,bbah,bbahh,bbmxahh,bbavahh,bbmxaha,bbavaha
0,D1,18/08/2023,19:30,Werder Bremen,Bayern Munich,0,4,A,0,1,...,,,,,,,,,,
1,D1,19/08/2023,14:30,Augsburg,M'gladbach,4,4,D,3,3,...,,,,,,,,,,
2,D1,19/08/2023,14:30,Hoffenheim,Freiburg,1,2,A,0,2,...,,,,,,,,,,
3,D1,19/08/2023,14:30,Leverkusen,RB Leipzig,3,2,H,2,1,...,,,,,,,,,,
4,D1,19/08/2023,14:30,Stuttgart,Bochum,5,0,H,2,0,...,,,,,,,,,,


In [20]:
# Convert 'Div' to a categorical type, a numeric representation of the division
df['div'] = df['div'].astype('category').cat.codes

In [21]:
# Create a unique list of HomeTeam and AwayTeam names combined, and add an index to each team
teams = pd.concat([df['team'], df['opponent']]).unique()

# Sort the teams alphabetically
teams.sort()

# Convert to an array of dictionaries
teams = [{'team': team, 'index': index} for index, team in enumerate(teams)]

In [22]:
# Create a unique list of Referees, and add an index to each Referee
referees = pd.concat([df['referee']]).unique()

# Remove any missing values
referees = referees[~pd.isnull(referees)]

# Sort the teams alphabetically
referees.sort()

# Convert to an array of dictionaries
referees = [{'referee': referee, 'index': index} for index, referee in enumerate(referees)]

KeyError: 'referee'

In [None]:
# Map the referee names to the index
df['referee_id'] = df['referee'].map({referee['referee']: referee['index'] for referee in referees})

In [None]:
df['ftr_code'] = df['ftr']

# Map the FTR column to a binary outcome
#df['FTR'] = df['FTR'].map({'H': 1, 'D': 0, 'A': 2}).astype(int)
#df['HTR'] = df['HTR'].map({'H': 1, 'D': 0, 'A': 2}).astype(int)

In [None]:
# Map the team names to the index values in the 'teams' list
df['team_id'] = df['team'].map({team['team']: team['index'] for team in teams})
df['opp_id'] = df['opponent'].map({team['team']: team['index'] for team in teams})

In [None]:
# Date is in DD/MM/YYYY format, convert it to a datetime object
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

# Declare Date_temp as a temporary column, an 8 digit integer representation of the date
df['date_temp'] = df['date'].dt.year * 10000 + df['date'].dt.month * 100 + df['date'].dt.day

# Connvert 'Time', which is now in HH:MM format to a 4 digit integer
# Assuming a default time of 00:00 for missing values
df['time'] = df['time'].fillna('00:00').str.replace(':', '').astype(int)

In [None]:
df.columns = [re.sub(r'[<]', '_st_', str(col)) for col in df.columns]
df.columns = [re.sub(r'[>]', '_gt_', str(col)) for col in df.columns]

In [None]:
# Apply the modified function to create new columns

#df['opp_hist_vs'] = df.apply(lambda x: history_vs_opponent_weighted(df, x, 'AwayTeam_ID'), axis=1)

In [None]:
df = df[['div', 'date', 'date_temp', 'time', 'team_id', 'opp_id', 
         
         #'referee_id',          
         
#'fthg',
#'ftag',
'ftr',
#'hthg',
#'htag',
#'htr',
#'hs',
#'as',
#'hst',
#'ast',
#'hf',
#'af',
#'hc',
#'ac',
#'hy',
#'ay',
#'hr',
#'ar',

'avgh',
'avgd',
'avga',  

#'team_hist_vs',
#'opp_hist_vs'
         
         
         
         
         ]]

In [None]:
df['venue'] = 1

In [None]:
# Drop multiple columns
df = df.drop(['date'], axis=1)

In [None]:
df.columns

In [None]:
import pandas as pd

# assuming df is your original dataframe
# duplicate the dataframe
df_duplicate = df.copy()

# switch the values of hometeam_id and awayteam_id
df_duplicate[['team_id', 'opp_id']] = df_duplicate[['opp_id', 'team_id']].values

# switch the values of b365h and b365a
df_duplicate[['avgh', 'avga']] = df_duplicate[['avga', 'avgh']].values

# switch the values of probs_win and probs_not_win
#df_duplicate[['probs_win', 'probs_not_win']] = df_duplicate[['probs_not_win', 'probs_win']].values

df_duplicate['venue'] = 0

In [None]:
# Concatenate the original dataframe with the modified duplicate
df = pd.concat([df, df_duplicate], ignore_index=True)

In [None]:
df['ftr'] = df['ftr'].map({'H': 1, 'D': 0, 'A': 0}).astype(int)

In [None]:
def history_vs_opponent_weighted(df, row, team_column):
    """
    Calculate the weighted average of points scored by a team against a specific opponent
    in their 5 most recent matches before the date of the current match. Points are awarded 
    based on the match result: 3 points for a win, and 1 point for a draw. The most recent match
    has more weight than the older matches.
    
    Parameters:
    - df (pd.DataFrame): The DataFrame containing match data.
    - row (pd.Series): The current row being evaluated, used to identify the team and opponent.
    - team_column (str): The column name in df that identifies the team.
    
    Returns:
    float: The normalized weighted score for the team against the specified opponent.
    """
    
    opponent_column = 'opp_id'

    # Filter DataFrame for relevant matches before the current date_temp
    df_temp = df[
        (df['date_temp'] < row['date_temp']) & 
        ((df['team_id'] == row[team_column]) | (df['opp_id'] == row[team_column]))
    ].sort_values(by='date_temp', ascending=False).head(5)

    # Initialize the weighted score and the total possible weight
    weighted_score = 0
    total_weight = sum(range(1, len(df_temp) + 1))  # The sum of weights from 1 to n (number of matches)

    # Iterate through the matches with a weight from 5 (most recent) to 1 (least recent)
    for weight, (index, match) in zip(range(len(df_temp), 0, -1), df_temp.iterrows()):
        # Determine points based on match result
        points = 0
        if match[team_column] == row['team_id']:
            if match['ftr'] == 'H':
                points = 3
            elif match['ftr'] == 'D':
                points = 1
        else:
            if match['ftr'] == 'A':
                points = 3
            elif match['ftr'] == 'D':
                points = 1

        # Multiply the points by the match's weight
        weighted_score += points * weight

    # Normalize the weighted score by the sum of the weights
    normalized_weighted_score = weighted_score / total_weight if total_weight > 0 else 0
    
    return weighted_score


df['team_hist_vs'] = df.apply(lambda x: history_vs_opponent_weighted(df, x, 'team_id'), axis=1)




In [None]:
# Sort the DataFrame by Date_temp and separate the 200 most recent matches into a validation set
df.sort_values('date_temp', inplace=True)
df_val = df.tail(100)
df = df.iloc[:-100]

In [None]:
#import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into X and y
X = df.drop('ftr', axis=1)
y = df['ftr']

X.columns = [re.sub(r'[<]', '_st_', str(col)) for col in X.columns]
X.columns = [re.sub(r'[>]', '_gt_', str(col)) for col in X.columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
len(X_train), len(X_test)

In [None]:
# Create a pipeline
pipeline = Pipeline([
    ('target_encoder', TargetEncoder()),
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier())
])

# Define the hyperparameters

param_distributions = {

 
}

# Create a RandomizedSearchCV object
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    cv=TimeSeriesSplit(n_splits=15),
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
search.fit(X_train, y_train)


In [None]:
# print the classification report
print(classification_report(y_test, search.predict(X_test)))

In [None]:
# Show the 20 most important features
importances = search.best_estimator_['xgb'].feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
sns.barplot(x=importances[indices][:20], y=X_train.columns[indices][:20])
plt.title('Feature Importances')
plt.show()


### Validation

In [None]:
# Apply the model to the validation set
y_val = df_val['ftr']
X_val = df_val.drop('ftr', axis=1)
y_pred = search.predict(X_val)

# print the classification report
print(classification_report(y_val, y_pred))
