In [138]:
# Standard library imports
import os
import sys
import re
import warnings
import random
import hashlib

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and preprocessing
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # Assuming you might need it

# Specific models and tools
from xgboost import XGBClassifier
import xgboost as xgb

# Encoding and feature selection
from category_encoders import TargetEncoder  # Fixed the import based on usage
from scipy.stats import randint, uniform

# Model persistence
from joblib import dump, load

# Miscellaneous settings
%matplotlib inline
warnings.filterwarnings('ignore')


In [139]:
# Competition codes: each one names a CSV that is looked up inside every season folder.
comps = [
    'E0', 'E1', 'E2', 'E3',
    'SC0', 'SC1',
    'D1', 'D2',
    'F1', 'F2',
    'I1', 'I2',
    'SP1', 'SP2',
    'B1', 'G1', 'N1', 'P1', 'T1',
]

# Season folders to load, newest first. The first entry is also the folder
# the fixtures file is read from. Uncomment older seasons to widen the history.
seasons = [
    '2324', '2223', '2122', '2021', '1920', '1819',
    #'1718',
    #'1617',
    #'1516', '1415', '1314', '1213',
    #'1112', '1011',
    #'0910', '0809',
    #'0708', '0607', '0506', '0405',
    #'0304', '0203', '0102', '0001',
]

# Country codes for the additional per-country files (their loading loop is currently disabled).
countries = [
    "ARG", "AUT", "BRA", "CHN", "DNK", "FIN", "IRL", "JPN",
    "MEX", "NOR", "POL", "ROU", "RUS", "SWE", "SWZ", "USA",
]

# Base names of the fixture files to append to the match list.
fixtures = [
    "fixtures",
    #"new_league_fixtures"
]

# Label embedded in the names of the files this notebook writes.
content = "euro_6s"

In [140]:
# Accumulates the path of every CSV that should be loaded
matches_files = list()

In [141]:
# One path per (season, competition) pair, with seasons as the outer ordering
for season, comp in [(s, c) for s in seasons for c in comps]:
    matches_files.append('data/scraped/%s/%s.csv' % (season, comp))

In [142]:
# Per-country files are currently disabled: the loop body is intentionally a no-op
for country in countries:
    #matches_files.append('data/scraped/other/%s.csv' % (country))
    pass

In [143]:
# Fixture files are read from the folder of the first entry in `seasons`
for fixture in fixtures:
    matches_files += [f'data/scraped/{seasons[0]}/{fixture}.csv']

In [144]:
# Load every match file and concatenate the results into a single DataFrame.
# Frames are collected first and concatenated once; concatenating inside the
# loop re-copies everything loaded so far on each iteration.
loaded_frames = []

for file in matches_files:

    # The first run of four digits in the path is the season folder (e.g. '2324')
    season_match = re.search(r'(\d{4})', file)
    if season_match is None:
        print(f'Error: no season code found in {file}')
        continue

    print(f'Loading {file}')

    try:
        df_temp = pd.read_csv(file)
    except FileNotFoundError:
        print(f'Error: {file} not found')
        continue
    except Exception as exc:
        # Loading stays best-effort, but the real cause is reported instead of
        # being mislabelled as a missing file. KeyboardInterrupt is no longer
        # swallowed because only Exception subclasses are caught.
        print(f'Error: could not read {file}: {exc}')
        continue

    # add the season code to the dataframe as a column 'Season'
    df_temp['Season'] = season_match.group(1)
    loaded_frames.append(df_temp)

df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# print the amount of data loaded
print(f"Data loaded: {df.shape[0]} matches")

Loading data/scraped/2324/E0.csv
Loading data/scraped/2324/E1.csv
Loading data/scraped/2324/E2.csv
Loading data/scraped/2324/E3.csv
Loading data/scraped/2324/SC0.csv
Loading data/scraped/2324/SC1.csv
Loading data/scraped/2324/D1.csv
Loading data/scraped/2324/D2.csv
Loading data/scraped/2324/F1.csv
Loading data/scraped/2324/F2.csv
Loading data/scraped/2324/I1.csv
Loading data/scraped/2324/I2.csv
Loading data/scraped/2324/SP1.csv
Loading data/scraped/2324/SP2.csv
Loading data/scraped/2324/B1.csv
Loading data/scraped/2324/G1.csv
Loading data/scraped/2324/N1.csv
Loading data/scraped/2324/P1.csv
Loading data/scraped/2324/T1.csv
Loading data/scraped/2223/E0.csv
Loading data/scraped/2223/E1.csv
Loading data/scraped/2223/E2.csv
Loading data/scraped/2223/E3.csv
Loading data/scraped/2223/SC0.csv
Loading data/scraped/2223/SC1.csv
Loading data/scraped/2223/D1.csv
Loading data/scraped/2223/D2.csv
Loading data/scraped/2223/F1.csv
Loading data/scraped/2223/F2.csv
Loading data/scraped/2223/I1.csv
Load

Loading data/scraped/2021/SP2.csv
Loading data/scraped/2021/B1.csv
Loading data/scraped/2021/G1.csv
Loading data/scraped/2021/N1.csv
Loading data/scraped/2021/P1.csv
Loading data/scraped/2021/T1.csv
Loading data/scraped/1920/E0.csv
Loading data/scraped/1920/E1.csv
Loading data/scraped/1920/E2.csv
Loading data/scraped/1920/E3.csv
Loading data/scraped/1920/SC0.csv
Loading data/scraped/1920/SC1.csv
Loading data/scraped/1920/D1.csv
Loading data/scraped/1920/D2.csv
Loading data/scraped/1920/F1.csv
Loading data/scraped/1920/F2.csv
Loading data/scraped/1920/I1.csv
Loading data/scraped/1920/I2.csv
Loading data/scraped/1920/SP1.csv
Loading data/scraped/1920/SP2.csv
Loading data/scraped/1920/B1.csv
Loading data/scraped/1920/G1.csv
Loading data/scraped/1920/N1.csv
Loading data/scraped/1920/P1.csv
Loading data/scraped/1920/T1.csv
Loading data/scraped/1819/E0.csv
Loading data/scraped/1819/E1.csv
Loading data/scraped/1819/E2.csv
Loading data/scraped/1819/E3.csv
Loading data/scraped/1819/SC0.csv
Erro

In [145]:
df.head(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,HFKC,AFKC
0,E0,11/08/2023,20:00,Burnley,Man City,0.0,3.0,A,0.0,2.0,...,,,,,,,,,,
1,E0,12/08/2023,12:30,Arsenal,Nott'm Forest,2.0,1.0,H,2.0,0.0,...,,,,,,,,,,
2,E0,12/08/2023,15:00,Bournemouth,West Ham,1.0,1.0,D,0.0,0.0,...,,,,,,,,,,
3,E0,12/08/2023,15:00,Brighton,Luton,4.0,1.0,H,1.0,0.0,...,,,,,,,,,,
4,E0,12/08/2023,15:00,Everton,Fulham,0.0,1.0,A,0.0,0.0,...,,,,,,,,,,


In [146]:
# Map alternative column headers onto the names used in the rest of the notebook;
# headers that are not present are simply left alone by rename().
df = df.rename(columns={
    'Country': 'Div',
    'Home': 'HomeTeam',
    'Away': 'AwayTeam',
    'Res': 'FTR',
})

In [147]:
# Show any column label that occurs more than once (first occurrence is not flagged)
print(df.columns[df.columns.duplicated(keep='first')])

Index([], dtype='object')


In [148]:
# Replace each division label with its integer category code
df['Div'] = pd.Categorical(df['Div']).codes

In [149]:
# Alphabetically sorted array of every distinct team name seen as home or away side
teams = np.sort(pd.concat([df['HomeTeam'], df['AwayTeam']]).dropna().unique())

# Team name -> position in the sorted array
teams_dict = dict(zip(teams, range(len(teams))))

In [150]:
# Persist the name -> index mapping as the textual form of the dict
with open(f'data/teams_dict_{content}.txt', 'w') as file:
    file.write(repr(teams_dict))

In [151]:
# Sorted distinct team names, then the same information as a list of
# {'team': name, 'index': position} records
teams = np.sort(pd.concat([df['HomeTeam'], df['AwayTeam']]).dropna().unique())
teams = [{'team': name, 'index': position} for position, name in enumerate(teams)]

# Numeric identifiers for both sides of each match, looked up in teams_dict
df['Team_ID'] = df['HomeTeam'].map(teams_dict)
df['Opp_ID'] = df['AwayTeam'].map(teams_dict)

In [152]:
# Guarantee a 'Referee' column: a constant 0 when the data has none,
# otherwise list the distinct referees as {'referee': name, 'index': position} records
if 'Referee' not in df.columns:
    df['Referee'] = 0
else:
    referees = df['Referee'].unique()
    referees = [{'referee': name, 'index': position} for position, name in enumerate(referees)]

### Feature Engineering

In [153]:
# Distinct referee values in order of first appearance
referees = df['Referee'].unique()

if len(referees) == 0:
    # No referee values at all: fall back to a constant column
    df['Ref_ID'] = 0
else:
    # Referee value -> position, then encode the column through that lookup
    referee_dict = dict(zip(referees, range(len(referees))))
    df['Ref_ID'] = df['Referee'].map(referee_dict)

In [154]:
# Calculate ELO ratings for each team

# Every team id that appears on either side starts at a rating of 1500
teams = pd.concat([df['Team_ID'], df['Opp_ID']]).unique()
ratings = dict.fromkeys(teams, 1500)

def calculate_expected_score(rating_a, rating_b):
    """Return the Elo expected score of side A against side B.

    The value is 1 / (1 + 10 ** ((rating_b - rating_a) / 400)), so equal
    ratings give 0.5 and a higher rating_a gives a value above 0.5.
    """
    rating_gap = (rating_b - rating_a) / 400
    return 1 / (1 + 10 ** rating_gap)

def update_elo(rating, actual_score, expected_score, k=30):
    """Return the new integer Elo rating after one result.

    Parameters
    ----------
    rating : current rating of the side being updated.
    actual_score : result from that side's point of view (1 win, 0.5 draw, 0 loss).
    expected_score : expected score for that side before the match.
    k : update step size.

    The result is rounded to the nearest integer. Truncating with int()
    always rounds a positive rating down, so a winner would gain less than
    the loser loses and ratings would drift downward over many matches.
    """
    return int(round(rating + k * (actual_score - expected_score)))

# Walk the matches in chronological order and record each side's rating as it
# stood BEFORE the match.
#  - The frame is still in file-loading order here (newest season first), so
#    the dates are parsed locally and used to order the walk.
#  - Storing the pre-match rating keeps the result of the match out of its
#    own feature (the post-match rating already encodes who won).
#  - Rows without both scores (e.g. upcoming fixtures) or without both team
#    ids receive the current ratings but do not update them.
# Assumes df.index is unique (the load step concatenates with ignore_index=True).
elo_order = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True).sort_values(kind='mergesort').index
elo_inputs = df.loc[elo_order, ['Team_ID', 'Opp_ID', 'FTHG', 'FTAG']]

pre_match_home = {}
pre_match_away = {}

for match in elo_inputs.itertuples():
    home_team, away_team = match.Team_ID, match.Opp_ID

    # 1500 mirrors the starting value used when the ratings dict was built
    home_rating = ratings.get(home_team, 1500)
    away_rating = ratings.get(away_team, 1500)

    pre_match_home[match.Index] = home_rating
    pre_match_away[match.Index] = away_rating

    # Nothing to learn from a match that has no result or no identifiable teams
    if pd.isna(home_team) or pd.isna(away_team) or pd.isna(match.FTHG) or pd.isna(match.FTAG):
        continue

    # Calculate expected scores
    expected_home = calculate_expected_score(home_rating, away_rating)
    expected_away = calculate_expected_score(away_rating, home_rating)

    # Calculate actual scores
    if match.FTHG > match.FTAG:
        actual_home = 1
    elif match.FTHG == match.FTAG:
        actual_home = 0.5
    else:
        actual_home = 0
    actual_away = 1 - actual_home

    # Store updated ratings in the ratings dictionary
    ratings[home_team] = update_elo(home_rating, actual_home, expected_home)
    ratings[away_team] = update_elo(away_rating, actual_away, expected_away)

# Lists are emitted in the frame's current row order so that they can be
# assigned to it positionally.
elo_team = [pre_match_home[row_label] for row_label in df.index]
elo_opp = [pre_match_away[row_label] for row_label in df.index]


In [155]:
# Attach the per-match rating lists as two new columns
df = df.assign(team_elo=elo_team, opp_elo=elo_opp)

In [156]:
# Home field advantage: Add 100 to 'team_elo'
#df['team_elo'] = df['team_elo'] + 100

In [157]:
def parse_date_to_int(date_str):
    """Convert a day-first date string into an integer of the form YYYYMMDD.

    Accepted layouts are 'dd/mm/yy', 'dd/mm/yyyy', 'ddmmyy' and 'ddmmyyyy'.
    Two-digit years are interpreted as 20yy.

    Returns 19000101 for anything that cannot be read: a non-string value,
    a string with the wrong number of components, or non-numeric components.
    """
    default_date = 19000101

    if not isinstance(date_str, str):
        return default_date

    components = date_str.split('/')

    if len(components) >= 3:
        raw_day, raw_month, raw_year = components[:3]
    elif len(components) == 1 and len(date_str) in (6, 8):
        # No separator: length 6 is 'ddmmyy', length 8 is 'ddmmyyyy'
        raw_day, raw_month, raw_year = date_str[:2], date_str[2:4], date_str[4:]
    else:
        return default_date

    try:
        day, month, year = int(raw_day), int(raw_month), int(raw_year)
    except ValueError:
        return default_date

    # Adjust the year if it was only 2 characters long
    if year < 100:
        year += 2000

    # Format straight to YYYYMMDD instead of building a date object
    return int(f"{year:04d}{month:02d}{day:02d}")

In [158]:
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

# Integer YYYYMMDD key per row; rows whose date could not be parsed get 19000101
df['Date_temp'] = [
    19000101 if pd.isnull(match_day) else parse_date_to_int(match_day.strftime('%d/%m/%Y'))
    for match_day in df['Date']
]

In [159]:
# Weekday number of the match date (Monday is 0)
df['DayOTW'] = df['Date'].dt.weekday

In [160]:
# 'HH:MM' -> integer HHMM, with a missing kick-off time treated as 00:00
df['Time'] = (
    df['Time']
    .fillna('00:00')
    .str.replace(':', '', regex=False)
    .astype(int)
)

In [161]:
# Reduce HHMM to the hour by integer-dividing by 100
df['Time'] = df['Time'].floordiv(100)

In [162]:
# Replace the comparison signs in column labels with text tokens, '<' first and then '>'
df.columns = [
    re.sub(r'[>]', '_gt_', re.sub(r'[<]', '_st_', str(col)))
    for col in df.columns
]

In [163]:
# Order the matches by date key, then by kick-off hour within a day
df = df.sort_values(by=['Date_temp', 'Time'], ascending=True)

In [164]:
def points(df, row, team_column):
    """Return the league points a team has collected this season before `row`.

    Parameters
    ----------
    df : DataFrame with 'Season', 'Date', 'Team_ID' (home side), 'Opp_ID'
        (away side) and 'FTR' ('H', 'D' or 'A') columns.
    row : the current match, a row of `df`.
    team_column : 'Team_ID' to score the home side of `row`, 'Opp_ID' to
        score its away side.

    Returns
    -------
    int : 3 per win plus 1 per draw over the team's earlier matches in the
    same season, home and away.

    `team_column` only selects which team of `row` is scored. In past
    matches, 'Team_ID' is always the home side and 'Opp_ID' the away side,
    so 'H' is a win only when the team was at home and 'A' only when it was
    away. Swapping those roles for the 'Opp_ID' case would count that
    team's defeats as victories.
    """
    team_id = row[team_column]

    # Matches from the same season played before the current date
    earlier_this_season = (df['Season'] == row['Season']) & (df['Date'] < row['Date'])

    played_home = earlier_this_season & (df['Team_ID'] == team_id)
    played_away = earlier_this_season & (df['Opp_ID'] == team_id)

    wins = (played_home & (df['FTR'] == 'H')).sum() + (played_away & (df['FTR'] == 'A')).sum()
    draws = ((played_home | played_away) & (df['FTR'] == 'D')).sum()

    return int(3 * wins + draws)


In [165]:
# Season-to-date points for the home side and for the away side of every match
df['team_points'] = df.apply(lambda match: points(df, match, 'Team_ID'), axis=1)
df['opp_points'] = df.apply(lambda match: points(df, match, 'Opp_ID'), axis=1)

In [166]:
def history_vs_opponent_weighted(df, row, team_column):
    """Recency-weighted points per match of a team against the opponent in `row`.

    Parameters
    ----------
    df : DataFrame with 'Team_ID' (home side), 'Opp_ID' (away side), 'FTR',
        'Date' (datetime) and 'Date_temp' (integer YYYYMMDD) columns.
    row : the current match, a row of `df`.
    team_column : 'Opp_ID' to score the away side of `row`; any other value
        scores the side stored under that column against row['Opp_ID'].

    Returns
    -------
    0 when the two teams have no earlier meeting, otherwise a float between
    0 and 3: the weighted mean of 3/1/0 points over their last five meetings,
    with the most recent meeting carrying the largest weight.

    A meeting counts as a win only when the scored team was the home side of
    an 'H' result or the away side of an 'A' result.
    """
    # Determine opponent column based on team column
    opponent_column = 'Team_ID' if team_column == 'Opp_ID' else 'Opp_ID'
    team_id = row[team_column]
    opponent_id = row[opponent_column]

    # Combine year, month, and day into an integer comparable with 'Date_temp'
    row_date_temp = row['Date'].year * 10000 + row['Date'].month * 100 + row['Date'].day

    # Earlier meetings of the two teams, whichever of them was at home
    same_pairing = (
        ((df['Team_ID'] == team_id) & (df['Opp_ID'] == opponent_id)) |
        ((df['Team_ID'] == opponent_id) & (df['Opp_ID'] == team_id))
    )
    meetings = df[same_pairing & (df['Date_temp'] < row_date_temp)]

    if meetings.empty:
        return 0  # Return early if no matches found

    # Five most recent meetings, newest first, with descending weights
    recent_matches = meetings.sort_values(by='Date', ascending=False).head(5)
    weights = range(len(recent_matches), 0, -1)

    weighted_score = 0
    for match, weight in zip(recent_matches.itertuples(), weights):
        team_won = (
            (match.Team_ID == team_id and match.FTR == 'H') or
            (match.Opp_ID == team_id and match.FTR == 'A')
        )
        if team_won:
            weighted_score += 3 * weight
        elif match.FTR == 'D':
            weighted_score += weight

    # Normalize the weighted score by the sum of weights
    return weighted_score / sum(weights)


In [167]:
# Head-to-head record of each side against the other, from earlier meetings only
df['team_hist_vs'] = df.apply(lambda match: history_vs_opponent_weighted(df, match, 'Team_ID'), axis=1)
df['opp_hist_vs'] = df.apply(lambda match: history_vs_opponent_weighted(df, match, 'Opp_ID'), axis=1)

In [168]:
def convert_odds(row):
    """Turn the 'AvgH', 'AvgD' and 'AvgA' odds of a row into two reciprocals.

    Returns a Series indexed by 'probs_win' and 'probs_not_win':
    'probs_win' is 1 / AvgH and 'probs_not_win' is 1 / AvgD + 1 / AvgA.
    The values are raw reciprocals; they are not rescaled to sum to one.
    """
    inverse_home = 1 / row['AvgH']
    inverse_draw = 1 / row['AvgD']
    inverse_away = 1 / row['AvgA']
    return pd.Series(
        [inverse_home, inverse_draw + inverse_away],
        index=['probs_win', 'probs_not_win'],
    )

# Apply the function and create new columns
#df[['probs_win', 'probs_not_win']] = df.apply(convert_odds, axis=1)

#df = df.drop(columns=['AvgH', 'AvgD', 'AvgA'])

In [169]:
def team_form(df, row, perspective):
    """Recency-weighted points per match over a team's last five matches.

    Parameters
    ----------
    df : DataFrame with 'Team_ID' (home side), 'Opp_ID' (away side), 'FTR'
        and 'Date_temp' (integer YYYYMMDD) columns.
    row : the current match, a row of `df`.
    perspective : 'Team' for the home side of `row`, 'Opp' for its away side.

    Returns
    -------
    0 when the team has no earlier match, otherwise the weighted mean of
    3/1/0 points over up to five earlier matches, rounded to 2 decimals.
    Weights run 5, 4, 3, 2, 1 from the most recent match backwards.

    Raises
    ------
    ValueError : if `perspective` is neither 'Team' nor 'Opp'.

    Each match contributes only its own points; carrying a running total from
    one match into the next would credit an early result several times over.
    """
    if perspective == 'Team':
        team_id = row['Team_ID']
    elif perspective == 'Opp':
        team_id = row['Opp_ID']
    else:
        raise ValueError("Perspective must be 'Team' or 'Opp'")

    # Up to five earlier matches of the team, newest first
    involved = (df['Team_ID'] == team_id) | (df['Opp_ID'] == team_id)
    earlier = df['Date_temp'] < row['Date_temp']
    past_matches = df[involved & earlier].sort_values(by='Date_temp', ascending=False).head(5)

    # Trim the weights when fewer than five matches are available
    weights = [5, 4, 3, 2, 1][:len(past_matches)]
    if not weights:
        return 0  # Return 0 if no past matches found

    weighted_points_sum = 0
    for match, weight in zip(past_matches.itertuples(), weights):
        if (match.Team_ID == team_id and match.FTR == 'H') or (match.Opp_ID == team_id and match.FTR == 'A'):
            match_points = 3
        elif match.FTR == 'D':
            match_points = 1
        else:
            match_points = 0

        weighted_points_sum += match_points * weight

    return round(weighted_points_sum / sum(weights), 2)

In [170]:
# Form of the home side and of the away side going into each match
df['team_form'] = [team_form(df, match, 'Team') for _, match in df.iterrows()]
df['opp_form'] = [team_form(df, match, 'Opp') for _, match in df.iterrows()]

In [171]:
def rolling_avgs_combined(df, row, perspective):
    """Recency-weighted averages of shots and shots on target for one team.

    `perspective` selects the team of `row`: 'Team' reads row['Team_ID'],
    'Opp' reads row['Opp_ID']; anything else raises ValueError.

    Up to five matches with a 'Date_temp' below the row's are used, newest
    first, weighted 5, 4, 3, 2, 1. For each of them the 'HS'/'HST' values are
    taken when the team is under 'Team_ID', otherwise 'AS'/'AST'.

    Returns a tuple (avg_shots, avg_shots_target), each rounded to 2 decimals,
    or (0, 0) when no earlier match exists.
    """
    if perspective not in ('Team', 'Opp'):
        raise ValueError("Perspective must be 'Team' or 'Opp'")
    team_id = row['Team_ID'] if perspective == 'Team' else row['Opp_ID']

    # Earlier matches of the team, newest first, capped at five
    involved = (df['Team_ID'] == team_id) | (df['Opp_ID'] == team_id)
    earlier = df['Date_temp'] < row['Date_temp']
    last_five = df[involved & earlier].sort_values(by='Date_temp', ascending=False).head(5)

    recency_weights = [5, 4, 3, 2, 1]

    shots = []
    shots_target = []
    for game in last_five.itertuples():
        # Pick the home or the away columns depending on where the team played
        shot_col, target_col = ('HS', 'HST') if game.Team_ID == team_id else ('AS', 'AST')
        shots.append(getattr(game, shot_col))
        shots_target.append(getattr(game, target_col))

    # Only the weights that were actually paired with a match count
    weight_total = sum(recency_weights[:len(shots)])

    if weight_total > 0:
        avg_shots = sum(s * w for s, w in zip(shots, recency_weights)) / weight_total
        avg_shots_target = sum(st * w for st, w in zip(shots_target, recency_weights)) / weight_total
    else:
        avg_shots = 0
        avg_shots_target = 0

    return round(avg_shots, 2), round(avg_shots_target, 2)

In [172]:
# Apply the function and create new columns
#df['team_shots'], df['team_shots_target'] = zip(*df.apply(lambda x: rolling_avgs_combined(df, x, 'Team'), axis=1))
#df['opp_shots'], df['opp_shots_target'] = zip(*df.apply(lambda x: rolling_avgs_combined(df, x, 'Opp'), axis=1))

In [173]:
# Function: Average games played in the last 50 days

from datetime import timedelta

def avg_games_played(df, row, team_column, delta=50):
    """Recency-weighted match load of a team over the `delta` days before `row`.

    Parameters
    ----------
    df : DataFrame with 'Date' (datetime), 'Team_ID' and 'Opp_ID' columns.
    row : the current match, a row of `df`.
    team_column : column of `row` holding the team to evaluate
        ('Team_ID' or 'Opp_ID').
    delta : length of the look-back window in days (default 50).

    Returns
    -------
    0 when the team played no match in the window. Otherwise a float: each
    match played d days ago (1 <= d <= delta) weighs delta - d + 1, and the
    sum of those weights is divided by the total weight of a window with one
    match on every day, delta * (delta + 1) / 2. One match per day therefore
    gives 1.0.

    NOTE(review): dividing the weighted count by itself always yields 1, so
    the denominator above is a reconstruction of the intended normalisation;
    confirm that this is the desired definition.
    """
    team = row[team_column]
    # Ensure current_match_date is a Timestamp for comparison
    current_match_date = pd.to_datetime(row['Date'], dayfirst=True)
    start_date = current_match_date - pd.Timedelta(days=delta)

    # Matches of the team, home or away, inside the look-back window
    in_window = (
        ((df['Team_ID'] == team) | (df['Opp_ID'] == team)) &
        (df['Date'] >= start_date) & (df['Date'] < current_match_date)
    )
    past_matches = df[in_window]

    # If no matches were played in the window
    if past_matches.empty:
        return 0

    # More recent matches weigh more; '+ 1' keeps the oldest day of the window above zero
    days_ago = (current_match_date - past_matches['Date']).dt.days
    weighted_count = (delta - days_ago + 1).sum()

    # Weight of a fully booked window (one match on each of the delta days)
    total_weight = delta * (delta + 1) / 2

    return float(weighted_count / total_weight)


In [174]:
# Apply the function for each team and opponent
#df['team_avg_games'] = df.apply(lambda x: avg_games_played(df, x, 'Team_ID'), axis=1)
#df['opp_avg_games'] = df.apply(lambda x: avg_games_played(df, x, 'Opp_ID'), axis=1)

In [175]:
def avg_goals(df, row, team_column):
    """Average goals scored and conceded by a team this season before `row`.

    Parameters
    ----------
    df : DataFrame with 'Season', 'Date', 'Team_ID' (home side), 'Opp_ID'
        (away side), 'FTHG' (home goals) and 'FTAG' (away goals) columns.
    row : the current match, a row of `df`.
    team_column : 'Team_ID' to evaluate the home side of `row`, 'Opp_ID' to
        evaluate its away side.

    Returns
    -------
    (avg_goals_for, avg_goals_against), each rounded to 2 decimals, or (0, 0)
    when the team has no earlier match with a recorded score this season.

    Which goal column counts as "scored" is decided per past match from the
    side the team played on there: 'FTHG' when it was at home, 'FTAG' when it
    was away. Matches without both scores are left out of the averages.
    """
    team_id = row[team_column]

    # Earlier matches of the same season that have a recorded score
    usable = (
        (df['Season'] == row['Season']) &
        (df['Date'] < row['Date']) &
        df['FTHG'].notna() & df['FTAG'].notna()
    )
    home_games = df[usable & (df['Team_ID'] == team_id)]
    away_games = df[usable & (df['Opp_ID'] == team_id)]

    total_matches = len(home_games) + len(away_games)
    if total_matches == 0:
        return 0, 0

    goals_scored = home_games['FTHG'].sum() + away_games['FTAG'].sum()
    goals_conceded = home_games['FTAG'].sum() + away_games['FTHG'].sum()

    # Round the averages to 2 decimal places
    avg_goals_for = round(float(goals_scored / total_matches), 2)
    avg_goals_against = round(float(goals_conceded / total_matches), 2)

    return avg_goals_for, avg_goals_against


In [176]:
# Apply the function and create new columns
#df['team_avg_goals_for'], df['team_avg_goals_against'] = zip(*df.apply(lambda x: avg_goals(df, x, 'Team_ID'), axis=1))
#df['opp_avg_goals_for'], df['opp_avg_goals_against'] = zip(*df.apply(lambda x: avg_goals(df, x, 'Opp_ID'), axis=1))

In [177]:
# Column means of every numeric column
numeric_cols = df.select_dtypes(include='number').columns
means = df[numeric_cols].mean()

# Replace the gaps in each numeric column with that column's mean
df[numeric_cols] = df[numeric_cols].fillna(value=means)

In [178]:
# Rows without a result get the placeholder 'X'
df['FTR'] = df['FTR'].where(df['FTR'].notna(), 'X')

In [179]:
# Keep only rows whose 'FTR' is 'H', 'D', 'A' or the placeholder 'X'.
# .copy() makes the filtered frame independent, so the column assignment below
# writes to it directly instead of to a slice of the unfiltered frame.
df = df[df['FTR'].isin(['H', 'D', 'A', 'X'])].copy()

# Map 'H', 'D', 'A' and 'X' to 0, 1, 2 and -1 respectively
df['FTR'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2, 'X': -1}).astype(int)

In [182]:
# Reduce the frame to the identifier, target and feature columns that are exported.
# .copy() detaches the selection from the wider frame so that later in-place
# operations act on an independent object.
df = df[[

        'Div', 'Season', 'Date_temp', 'Time', 'DayOTW', 'Team_ID', 'Opp_ID', 'Ref_ID', 'FTR',

        'team_elo', 'opp_elo',

        'team_hist_vs',
        'opp_hist_vs',

        'team_points',
        'opp_points',

        'team_form',
        'opp_form',

        # Columns of the features whose computation is currently switched off
        #'team_avg_goals_for',
        #'team_avg_goals_against',
        #'opp_avg_goals_for',
        #'opp_avg_goals_against',

        #'team_shots', 'opp_shots',
        #'team_shots_target', 'opp_shots_target',

        #'team_avg_games', 'opp_avg_games',

        'AvgH', 'AvgD', 'AvgA'

         ]].copy()

In [183]:
# The integer date key is exported under the plain name 'Date'
df = df.rename(columns={'Date_temp': 'Date'})

In [184]:
# Save the df to a CSV file, creating the target folder first so that a fresh
# checkout without 'data/processed' does not fail at the very last step
os.makedirs('data/processed', exist_ok=True)
df.to_csv(f'data/processed/processed_data_{content}.csv', index=False)

In [185]:
# Audible "finished" signal. winsound exists only in Windows builds of Python,
# so the import is guarded and the beep is skipped on other platforms instead
# of failing the final cell.
try:
    import winsound
except ImportError:
    winsound = None

frequency = 400  # Tone frequency in hertz
duration = 200  # Tone length in milliseconds
if winsound is not None:
    winsound.Beep(frequency, duration)