In [171]:
# Standard library imports
import os
import sys
import re
import warnings
import random
import hashlib

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # Assuming you might need it

# Specific models and tools
from xgboost import XGBClassifier
import xgboost as xgb

# Encoding and feature selection
from category_encoders import TargetEncoder  # Fixed the import based on usage
from scipy.stats import randint, uniform

# Model persistence
from joblib import dump, load

# Miscellaneous settings
%matplotlib inline
warnings.filterwarnings('ignore')


In [172]:
seasons = [
    '2324', 
    '2223', 
    '2122', 
    '2021',
    '1920', 
    '1819', 
    #'1718', 
    #'1617',
    #'1516', '1415', '1314', '1213',
    #'1112', '1011', 
    #'0910', '0809',
    #'0708', '0607', '0506', '0405',
    #'0304', '0203', '0102', '0001',
]

countries = [

    "ARG", "AUT", "BRA", "CHN",
    "DNK", "FIN", "IRL", "JPN",
    "MEX", "NOR", "POL", "ROU",
    "RUS", "SWE", "SWZ", "USA",
]

fixtures = [
    "fixtures_world",
]


In [173]:
# Set the dataprep_start_date to the date the data preparation should start
# If None, the data preparation will start from the beginning of the data

# Make sure the file below already exists if you want to start from a specific date
# file should be in the format "processed_data_<content>.csv"
content = "world_all"

#dataprep_start_date = None
dataprep_start_date = pd.Timestamp(year=2024, month=4, day=23)

In [174]:
# Load all filepaths into a list
matches_files = []
fixtures_files = []

In [175]:
for fixture in fixtures:    
    fixtures_files.append(f'data/fixtures/{fixture}.csv')
    continue

In [176]:
for country in countries:
    matches_files.append('data/cleaned/%s.csv' % (country))
    continue

In [177]:
def load_data(files):
    df = pd.DataFrame()

    for file in files:
        try:
            print(f'Loading {file}')

            # Try to read with default utf-8 encoding
            try:
                df_temp = pd.read_csv(file, encoding='utf-8')
            except UnicodeDecodeError:
                # If utf-8 decoding fails, try reading with ISO-8859-1
                df_temp = pd.read_csv(file, encoding='ISO-8859-1')

            #df_temp['Season'] = year
            df = pd.concat([df, df_temp], ignore_index=True)
        except FileNotFoundError:
            print(f'Error: {file} not found')
        except Exception as e:
            print(f"An error occurred while loading {file}: {e}")

    return df


In [178]:
# Load data into DataFrames
df = load_data(matches_files)
df_fixtures = load_data(fixtures_files)

Loading data/cleaned/ARG.csv
Loading data/cleaned/AUT.csv
Loading data/cleaned/BRA.csv
Loading data/cleaned/CHN.csv
Loading data/cleaned/DNK.csv
Loading data/cleaned/FIN.csv
Loading data/cleaned/IRL.csv
Loading data/cleaned/JPN.csv
Loading data/cleaned/MEX.csv
Loading data/cleaned/NOR.csv
Loading data/cleaned/POL.csv
Loading data/cleaned/ROU.csv
Loading data/cleaned/RUS.csv
Loading data/cleaned/SWE.csv
Loading data/cleaned/SWZ.csv
Loading data/cleaned/USA.csv
Loading data/fixtures/fixtures_world.csv


In [179]:
len(df), len(df_fixtures)

(51981, 125)

In [180]:
df_fixtures['Div'].value_counts()

Div
new_league_fixtures    125
Name: count, dtype: int64

In [181]:
def parse_date_to_int(date_str):
    # Split the date_str by the "/" character into day, month, year
    components = date_str.split('/')
    
    # If split was successful but not in expected format, try splitting by absence of separator for '%d%m%Y' or '%d%m%y'
    if len(components) == 1:
        if len(date_str) in [6, 8]:  # Length 6 for '%d%m%y', 8 for '%d%m%Y'
            day, month = int(date_str[:2]), int(date_str[2:4])
            year = int(date_str[4:])
        else:
            return 19000101  # Return default if format does not match expected
    else:
        day, month = int(components[0]), int(components[1])
        year = int(components[2])
    
    # Adjust the year if it was only 2 characters long
    if year < 100:
        year += 2000
    
    # Create a date variable by using the day, month, year integers
    # Note: Direct creation of date variable skipped to avoid unnecessary complexity,
    # directly formatting to YYYYMMDD integer format instead.
    date_int = int(f"{year:04d}{month:02d}{day:02d}")
    
    return date_int

In [182]:
df_fixtures['Div'].value_counts()

Div
new_league_fixtures    125
Name: count, dtype: int64

In [183]:
if len(df_fixtures) > 0:

    # Parse the 'Date' column to a datetime object
    df_fixtures['Date_temp'] = pd.to_datetime(df_fixtures['Date'], format='%d/%m/%Y')

    # Convert the datetime object to an integer in the format YYYYMMDD
    df_fixtures['Date_temp'] = df_fixtures['Date_temp'].apply(
        lambda x: int(x.strftime('%Y%m%d')) if pd.notnull(x) else 19000101)

    # Replace all values with -1 in FTR column
    df_fixtures['FTR'].fillna('X', inplace=True)

    # Find the lowest fixture date
    # This is the date where the data preparation will start
    fixture_cutoff = df_fixtures['Date_temp'].min()

    # Remove all the rows in df that are after the fixture_cutoff date
    #df = df[df['Date'] < fixture_cutoff]

    # Concatenate the matches and fixtures dataframes
    df = pd.concat([df, df_fixtures], ignore_index=True)


In [184]:
# Check for duplicate column names
print(df.columns[df.columns.duplicated()])

Index([], dtype='object')


In [185]:
len(df)

52106

In [186]:
# Remove all the rows in the dataframe where the 'Div' is not in the list of countries
df = df[df['Div'].isin(countries)]

In [187]:
len(df)

51981

In [188]:
# Create a dictionary for all competitions

file_path = f"data/comps_dict_{content}.txt"

# Check if the file exists
if os.path.exists(file_path):
    # Load the dictionary from the file
    with open(file_path, 'r') as file:
        comps_dict = eval(file.read())  # Using eval to convert string back to dictionary
    # Find the maximum index currently in the dictionary
    max_index = max(comps_dict.values())

    print(f"max index: {max_index}")
else:
    comps_dict = {}
    max_index = -1  

# Get all unique divisions from DataFrame
all_comps = df['Div'].dropna().unique()
all_comps.sort()

# Create a dictionary of new divisions alone
new_comps = {div: index for index, div in enumerate(all_comps, start=max_index + 1) if div not in comps_dict}

# Update dictionary only with new divisions
comps_dict.update(new_comps)

# Save the updated dictionary to a file
with open(file_path, 'w') as file:
    file.write(str(comps_dict))

# Add division ID column to DataFrame
df['Div'] = df['Div'].map(comps_dict)

max index: 15


In [189]:
# Create a dictionary for all teams

file_path = f"data/teams_dict_{content}.txt"

# Check if the file exists
if os.path.exists(file_path):
    # Load the dictionary from the file
    with open(file_path, 'r') as file:
        teams_dict = eval(file.read())  
    max_index = max(teams_dict.values())

    print(f"max index: {max_index}")
else:
    teams_dict = {}
    max_index = -1 

# Get all teams from DataFrame
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).dropna().unique()
all_teams.sort()

# Create a dictionary of new teams alone
new_teams = {team: index for index, team in enumerate(all_teams) if team not in teams_dict}

# Update dictionary only with new teams, starting indices from max_index + 1
start_index = max_index + 1
teams_dict.update({team: index + start_index for index, team in enumerate(new_teams) if team not in teams_dict})

# Save the updated dictionary to a file
with open(file_path, 'w') as file:
    file.write(str(teams_dict))

# Add team ID columns to DataFrame
df['Team_ID'] = df['HomeTeam'].map(teams_dict)
df['Opp_ID'] = df['AwayTeam'].map(teams_dict)

max index: 480


In [190]:
def clean_duplicates(df):
    # Sort the DataFrame so that rows with 'FTR' == -1 come first
    df.sort_values(by=['Date', 'Team_ID', 'Opp_ID', 'FTR'], ascending=[True, True, True, False], inplace=True)
    
    # Drop duplicates based on 'Date', 'Team_ID', and 'Opp_ID' keeping the first occurrence (where 'FTR' is -1)
    df = df.drop_duplicates(subset=['Date', 'Team_ID', 'Opp_ID'], keep='first')
    
    return df

df = clean_duplicates(df)

### Feature Engineering

In [191]:
# Calculate ELO ratings for each team

# Initialize ratings dictionary
teams = pd.concat([df['Team_ID'], df['Opp_ID']]).unique()
ratings = {team: 1500 for team in teams}

def calculate_expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(rating, actual_score, expected_score, k=30):

    rating = rating + k * (actual_score - expected_score)

    # Parse the rating as an integer with no decimal points
    return int(rating)

# Iterate over the DataFrame and update ELO ratings after each match
elo_team = []
elo_opp = []

for index, row in df.iterrows():
    home_team, away_team, home_score, away_score = row['Team_ID'], row['Opp_ID'], row['FTHG'], row['FTAG']
    home_rating = ratings[home_team]
    away_rating = ratings[away_team]
    
    # Calculate expected scores
    expected_home = calculate_expected_score(home_rating, away_rating)
    expected_away = calculate_expected_score(away_rating, home_rating)
    
    # Calculate actual scores
    actual_home = 1 if home_score > away_score else 0.5 if home_score == away_score else 0
    actual_away = 1 - actual_home
    
    # Update ratings
    new_home_rating = update_elo(home_rating, actual_home, expected_home)
    new_away_rating = update_elo(away_rating, actual_away, expected_away)
    
    # Store updated ratings in the ratings dictionary
    ratings[home_team] = new_home_rating
    ratings[away_team] = new_away_rating
    
    # Append current ratings to list
    elo_team.append(new_home_rating)
    elo_opp.append(new_away_rating)


In [192]:
# Assign new ELO ratings to the DataFrame
df['team_elo'] = elo_team
df['opp_elo'] = elo_opp

In [193]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)

# Apply the modified function
df['Date_temp'] = df['Date'].apply(lambda x: parse_date_to_int(x.strftime('%d/%m/%Y')) if pd.notnull(x) else 19000101)

# Day of the week as an integer
df['DayOTW'] = df['Date'].dt.dayofweek

df['Time'] = df['Time'].fillna('00:00').str.replace(':', '').astype(int)

# Only keep the first 2 digits of the Time column, no decimals
df['Time'] = df['Time'] // 100

# Sort df by Date_temp and Time
df = df.sort_values(['Date_temp', 'Time'])

In [194]:
df.columns = [re.sub(r'[<]', '_st_', str(col)) for col in df.columns]
df.columns = [re.sub(r'[>]', '_gt_', str(col)) for col in df.columns]

In [195]:
def points(df, row, team_column):

    # Season of the current match
    current_season = row['Season']
    
    # Date of the current match
    current_date = row['Date']

    # Define the opponent column based on the team column
    opponent_column = 'Opp_ID' if team_column == 'Team_ID' else 'Team_ID'

    # Filter DataFrame for matches from the same season before the current date
    past_matches = df[
        (df['Season'] == current_season) & 
        (df['Date'] < current_date) &
        ((df['Team_ID'] == row[team_column]) | (df['Opp_ID'] == row[team_column]))
    ]

    # Initialize total points
    total_points = 0

    # Calculate points for each past match
    for match in past_matches.itertuples():
        if getattr(match, 'Team_ID') == row[team_column]:
            if getattr(match, 'FTR') == 'H':
                total_points += 3  # Home win
            elif getattr(match, 'FTR') == 'D':
                total_points += 1  # Draw
        elif getattr(match, 'Opp_ID') == row[team_column]:
            if getattr(match, 'FTR') == 'A':
                total_points += 3  # Away win
            elif getattr(match, 'FTR') == 'D':
                total_points += 1  # Draw

    # Calculate average points if there are any matches played
    matches_played = len(past_matches)
    if matches_played > 0:
        avg_points = total_points / matches_played
    else:
        avg_points = 0

    # Round to 3 decimal places
    return round(avg_points, 3) 

In [196]:
df['team_points'] = df.apply(lambda x: points(df, x, 'Team_ID') 
    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else None, axis=1)
df['opp_points'] = df.apply(lambda x: points(df, x, 'Opp_ID') 
    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else None, axis=1)

KeyboardInterrupt: 

In [None]:
def history_vs_opponent_weighted(df, row, team_column):
    # Determine opponent column based on team column
    opponent_column = 'Team_ID' if team_column == 'Opp_ID' else 'Opp_ID'

    # Combine year, month, and day into an integer 'Date_temp'
    row_date_temp = row['Date'].year * 10000 + row['Date'].month * 100 + row['Date'].day

    # Filter for matches between specified teams, excluding current match
    mask = (
        ((df[team_column] == row[team_column]) & (df[opponent_column] == row[opponent_column])) |
        ((df[team_column] == row[opponent_column]) & (df[opponent_column] == row[team_column]))
    ) & (df['Date_temp'] < row_date_temp)

    filtered_matches = df[mask]
    
    if filtered_matches.empty:
        return 0  # Return early if no matches found

    # Sort by date and select top 5 recent matches
    recent_matches = filtered_matches.sort_values(by='Date', ascending=False).head(5)
    weights = list(range(len(recent_matches), 0, -1))  # Descending weights

    # Calculate weighted score based on match results
    weighted_score = sum(
        (3 * weight if match.FTR == 'H' and match.__getattribute__(team_column) == match.Team_ID or
                      match.FTR == 'A' and match.__getattribute__(team_column) != match.Team_ID else
         1 * weight if match.FTR == 'D' else 0)
        for match, weight in zip(recent_matches.itertuples(), weights)
    )



    # Normalize the weighted score by the sum of weights
    return round(weighted_score / sum(weights), 3) if weights else 0


In [None]:
df['team_hist_vs'] = df.apply(lambda x: history_vs_opponent_weighted(df, x, 'Team_ID') 
    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else None, axis=1)
df['opp_hist_vs'] = df.apply(lambda x: history_vs_opponent_weighted(df, x, 'Opp_ID') 
    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else None, axis=1)

KeyboardInterrupt: 

In [None]:
def team_form(df, row, perspective):
    # Determine the team ID based on the perspective ('Team' or 'Opp')
    if perspective == 'Team':
        team_id = row['Team_ID']
    elif perspective == 'Opp':
        team_id = row['Opp_ID']
    else:
        raise ValueError("Perspective must be 'Team' or 'Opp'")
    
    # Get the current match date
    current_date = row['Date_temp']
    
    # Filter past matches for the team
    past_matches = df[((df['Team_ID'] == team_id) | (df['Opp_ID'] == team_id)) &
                      (df['Date_temp'] < current_date)].sort_values(by='Date_temp', ascending=False).head(5)
    
    # Initialize points
    points = 0
    
    # Weights for the matches (most recent match has the highest weight)
    weights = [5, 4, 3, 2, 1]
    
    # Calculate points with weights
    weighted_points_sum = 0
    total_weights = sum(weights[:len(past_matches)])  # Adjust the total weight in case of less than 5 matches
    
    for match, weight in zip(past_matches.itertuples(), weights):
        if (match.Team_ID == team_id and match.FTR == 'H') or (match.Opp_ID == team_id and match.FTR == 'A'):
            points += 3
        elif match.FTR == 'D':
            points += 1
        else:
            points += 0

        weighted_points_sum += points * weight
    
    if total_weights > 0:

        team_form = round(weighted_points_sum / total_weights, 2)

        return team_form
    else:
        return 0  # Return 0 if no past matches found

In [None]:
df['team_form'] = df.apply(lambda x: team_form(df, x, 'Team') if dataprep_start_date is None or x['Date'] >= dataprep_start_date else None, axis=1)
df['opp_form'] = df.apply(lambda x: team_form(df, x, 'Opp') if dataprep_start_date is None or x['Date'] >= dataprep_start_date else None, axis=1)

In [None]:
def rolling_avgs_combined(df, row, perspective):
    # Determine the team ID based on the perspective ('Team' or 'Opp')
    if perspective == 'Team':
        team_id = row['Team_ID']
    elif perspective == 'Opp':
        team_id = row['Opp_ID']
    else:
        raise ValueError("Perspective must be 'Team' or 'Opp'")
    
    # Get the current match date
    current_date = row['Date_temp']
    
    # Filter past 5 matches for the team
    past_matches = df[((df['Team_ID'] == team_id) | (df['Opp_ID'] == team_id)) &
                      (df['Date_temp'] < current_date)].sort_values(by='Date_temp', ascending=False).head(5)
    
    # Weights for the matches (most recent match has the highest weight)
    weights = [5, 4, 3, 2, 1]
    
    # Initialize sums and weighted sums
    shots = []
    shots_target = []
    
    # Determine which columns to use and collect the values
    for match in past_matches.itertuples():
        if match.Team_ID == team_id:
            shots.append(getattr(match, 'HS'))  # Home shots
            shots_target.append(getattr(match, 'HST'))  # Home shots on target
        else:
            shots.append(getattr(match, 'AS'))  # Away shots
            shots_target.append(getattr(match, 'AST'))  # Away shots on target
    
    # Calculate the weighted averages of the values
    weighted_shots = sum(s * w for s, w in zip(shots, weights))
    weighted_shots_target = sum(st * w for st, w in zip(shots_target, weights))
    total_weights = sum(weights[:len(shots)])  # Adjust total weight if there are less than 5 matches
    
    avg_shots = weighted_shots / total_weights if total_weights > 0 else 0
    avg_shots_target = weighted_shots_target / total_weights if total_weights > 0 else 0

    # Round the averages to 2 decimal places
    avg_shots = round(avg_shots, 2)
    avg_shots_target = round(avg_shots_target, 2)
    
    return avg_shots, avg_shots_target

In [None]:
df.head()

Unnamed: 0,Div,Season,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,AvgH,...,Opp_ID,team_elo,opp_elo,DayOTW,team_points,opp_points,team_hist_vs,opp_hist_vs,team_form,opp_form
19083,6,2012,2012-03-02,19,Derry City,Bohemians,1.0,0.0,H,1.73,...,51,1532,1480,4,,,,,,
19082,6,2012,2012-03-02,19,Drogheda,Shamrock Rovers,1.0,2.0,A,9.8,...,358,1476,1618,4,,,,,,
19084,6,2012,2012-03-02,19,St. Patricks,Bray,1.0,0.0,H,1.44,...,56,1536,1499,4,,,,,,
19085,6,2012,2012-03-02,19,UC Dublin,Cork City,1.0,0.0,H,4.02,...,92,1403,1469,4,,,,,,
19086,6,2012,2012-03-02,20,Monaghan,Dundalk,0.0,0.0,D,2.9,...,113,1487,1566,4,,,,,,


In [None]:
#df['team_shots'], df['team_shots_target'] = zip(*df.apply(lambda x: rolling_avgs_combined(df, x, 'Team') 
#    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else (0, 0), axis=1))
#df['opp_shots'], df['opp_shots_target'] = zip(*df.apply(lambda x: rolling_avgs_combined(df, x, 'Opp') 
#    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else (0, 0), axis=1))

In [None]:
def avg_goals(df, row, team_column):
    # Season and date of the current match
    current_season = row['Season']
    current_date = row['Date']

    # Determine the columns for goals scored and conceded based on perspective
    if team_column == 'Team_ID':
        goals_scored_column = 'FTHG'  # Assuming FTHG is the column for home team goals
        goals_conceded_column = 'FTAG'  # Assuming FTAG is the column for away team goals
    else:
        goals_scored_column = 'FTAG'  # Flip the columns if we are looking from the opponent's perspective
        goals_conceded_column = 'FTHG'

    # Filter matches from the same season and before the current date
    past_matches = df[
        (df['Season'] == current_season) & 
        (df['Date'] < current_date) & 
        ((df['Team_ID'] == row[team_column]) | (df['Opp_ID'] == row[team_column]))
    ]

    # Calculate the average goals scored and conceded
    goals_scored = 0
    goals_conceded = 0
    total_matches = len(past_matches)

    for match in past_matches.itertuples():
        if getattr(match, 'Team_ID') == row[team_column]: 
            goals_scored += getattr(match, goals_scored_column)
            goals_conceded += getattr(match, goals_conceded_column)
        else:  # Team is playing away
            goals_scored += getattr(match, goals_scored_column)
            goals_conceded += getattr(match, goals_conceded_column)

    avg_goals_for = goals_scored / total_matches if total_matches > 0 else 0
    avg_goals_against = goals_conceded / total_matches if total_matches > 0 else 0

    avg_goals_for = round(avg_goals_for, 2)
    avg_goals_against = round(avg_goals_against, 2)

    return avg_goals_for, avg_goals_against

In [None]:
# Apply the function and create new columns
df['team_avg_goals_for'], df['team_avg_goals_against'] = zip(*df.apply(lambda x: avg_goals(df, x, 'Team_ID') 
    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else (0, 0), axis=1))
df['opp_avg_goals_for'], df['opp_avg_goals_against'] = zip(*df.apply(lambda x: avg_goals(df, x, 'Opp_ID') 
    if dataprep_start_date is None or x['Date'] >= dataprep_start_date else (0, 0), axis=1))

In [None]:
# Calculate means only for numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
means = df[numeric_cols].mean()

# Fill missing values in numeric columns with their respective means
df[numeric_cols] = df[numeric_cols].fillna(means)

In [None]:
# Set the FTR to 'X' where the value is currently NaN
df['FTR'] = df['FTR'].fillna('X')

In [None]:
# Drop every row where 'FTR' is not 'H', 'D', or 'A', or 'X' (if future matches are included)
df = df[df['FTR'].isin(['H', 'D', 'A', 'X'])]

# Map 'H', 'D', and 'A' to 0, 1, and 2 respectively
df['FTR'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2, 'X': -1}).astype(int)

In [None]:
def calculate_xg(df, row, team_column):
    # Initialize the expected goals (xg)
    xg_total = 0
    count_matches = 0

    # Season of the current match
    current_season = row['Season']

    # Date of the current match
    current_date = pd.to_datetime(row['Date'], dayfirst=True)  # Ensure the date format is correct

    # Define the opponent column based on the team column
    if team_column == 'Team_ID':
        goals_col = 'FTHG'
        shots_on_target_col = 'HST'
    else:
        goals_col = 'FTAG'
        shots_on_target_col = 'AST'

    # Filter DataFrame for matches from the same season before the current date
    past_matches = df[
        (df['Season'] == current_season) &
        (pd.to_datetime(df['Date'], dayfirst=True) < current_date) &
        (df[team_column] == row[team_column])
    ]

    # Calculate efficiency and xg
    for match in past_matches.itertuples():
        goals = getattr(match, goals_col)
        shots_on_target = getattr(match, shots_on_target_col)
        if shots_on_target > 0:
            efficiency = goals / shots_on_target
            xg_total += efficiency
            count_matches += 1

    # Calculate average xg
    if count_matches > 0:
        avg_xg = xg_total / count_matches
    else:
        avg_xg = 0

    return avg_xg

In [None]:
#df['team_xg'] = df.apply(lambda x: calculate_xg(df, x, 'Team_ID'), axis=1)
#df['opp_xg'] = df.apply(lambda x: calculate_xg(df, x, 'Opp_ID'), axis=1)

In [None]:
df.shape

(51981, 28)

In [None]:
df = df[[
    
        'Div', 'Season', 'Date_temp', 'Time', 'DayOTW', 'Team_ID', 'Opp_ID', 'FTR',

        'team_elo', 'opp_elo',

        #'team_xg', 'opp_xg',
        
        'team_hist_vs', 
        'opp_hist_vs',

        'team_points',
        'opp_points',
        
        'team_form', 
        'opp_form',

        'team_avg_goals_for', 
        'team_avg_goals_against',
        'opp_avg_goals_for',
        'opp_avg_goals_against',
         
        #'team_shots', 'opp_shots',
        #'team_shots_target', 'opp_shots_target',

        'AvgH', 'AvgD', 'AvgA'
         
         
         ]]

In [None]:
# Print the value counts of the Date_temp column where FTR is -1
print(df[df['FTR'] == -1]['Date_temp'].value_counts())

Date_temp
20161211    1
Name: count, dtype: int64


In [None]:
# Rename 'Date_temp' to 'Date'
df.rename(columns={'Date_temp': 'Date'}, inplace=True)

In [None]:
import pandas as pd

try:
   
    if dataprep_start_date is not None:
        # Convert date columns to datetime
        df['Date_temp'] = pd.to_datetime(df['Date'], format='%Y%m%d')
        
        # Filter new data based on start date
        df_new = df[df['Date_temp'] >= dataprep_start_date].copy()

        # Load existing data
        df_existing = pd.read_csv(f'data/processed/processed_data_{content}.csv')

        df_existing['Date_temp'] = pd.to_datetime(df_existing['Date'])
        
        # Filter existing data to remove overlap with new data
        df_existing = df_existing[df_existing['Date_temp'] < dataprep_start_date]

        # Combine and sort data
        df_final = pd.concat([df_existing, df_new], ignore_index=True)
        df_final.sort_values(['Date_temp', 'Time'], inplace=True)

        # Clean up temporary columns
        df_final.drop(columns='Date_temp', inplace=True)
    else:
        df_final = df.copy()

    # Save the final DataFrame
    df_final.to_csv(f'data/processed/processed_data_{content}.csv', index=False)
    print(f"Data saved: {df_final.shape[0]} matches")

except Exception as e:
    print(f"Error: {e}")


Error: [Errno 13] Permission denied: 'data/processed/processed_data_world_all.csv'


In [None]:
import winsound
frequency = 400  # Set Frequency To 2500 Hertz
duration = 200  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)