In [48]:
#MERGE THE DATA
import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Step 1: Lay out the output tree merged_data/<season>/<position>
merged_data_dir = 'merged_data'
seasons = ['2021-22', '2022-23', '2023-24']
positions = ['FWD', 'MID', 'DEF', 'GK']

# One folder per season, then one sub-folder per position inside it.
# exist_ok=True makes re-running this cell harmless.
for season in seasons:
    season_dir = os.path.join(merged_data_dir, season)
    os.makedirs(season_dir, exist_ok=True)

    for position in positions:
        position_dir = os.path.join(merged_data_dir, season, position)
        os.makedirs(position_dir, exist_ok=True)

# Step 2: Input locations (one sub-folder per season is expected in each)
fpl_gw_data_dir = 'fpl_gw_data'
understat_data_dir = 'understat_data'

# Function to find the best match using fuzzy matching
def find_best_match(player_name, understat_files, threshold=65):
    """Return the Understat filename that best matches ``player_name``.

    Args:
        player_name: Name to look up (the caller passes it underscore-joined).
        understat_files: Candidate filenames to score against.
        threshold: Minimum ``fuzz.partial_ratio`` score a candidate must reach
            to be accepted. Defaults to 65, the previously hard-coded cut-off.

    Returns:
        The best-scoring filename, or ``None`` when no candidate is returned
        by the matcher or the best score is below ``threshold``.
    """
    best_match = process.extractOne(player_name, understat_files, scorer=fuzz.partial_ratio)
    if best_match is None:
        return None

    # extractOne yields (choice, score, ...); only the first two are needed.
    candidate, score = best_match[0], best_match[1]
    return candidate if score >= threshold else None

# Step 3: Merge each FPL gameweek CSV with its best-matching Understat CSV
for season in seasons:
    fpl_season_dir = os.path.join(fpl_gw_data_dir, season)
    understat_season_dir = os.path.join(understat_data_dir, season)

    if not (os.path.isdir(fpl_season_dir) and os.path.isdir(understat_season_dir)):
        print(f"Season folder {season} does not exist in both fpl_gw_data and understat_data.")
        continue

    # The candidate list is identical for every player in a season, so build it
    # once. Restricting it to CSVs keeps other directory entries from being
    # chosen as a "match" and then handed to read_csv.
    understat_files = [entry for entry in os.listdir(understat_season_dir) if entry.endswith('.csv')]

    for fpl_csv in os.listdir(fpl_season_dir):
        if not fpl_csv.endswith('.csv'):
            continue

        fpl_csv_path = os.path.join(fpl_season_dir, fpl_csv)

        # Read the fpl_gw_data CSV; a zero-byte file raises instead of
        # returning an empty frame.
        try:
            fpl_df = pd.read_csv(fpl_csv_path)
        except pd.errors.EmptyDataError:
            print(f"{fpl_csv} is empty. Skipping.")
            continue

        if fpl_df.empty:
            # Header-only file: .iloc[0] below would raise IndexError.
            print(f"{fpl_csv} has no rows. Skipping.")
            continue

        # Name and position are taken from the first row of the player's file
        player_name = fpl_df['name'].iloc[0]
        player_position = fpl_df['position'].iloc[0]

        # Underscore-join the name parts so the query resembles a filename
        player_name_capitalized = '_'.join([name.capitalize() for name in player_name.split()])

        # Use fuzzy matching to find the best match for the player
        best_match_file = find_best_match(player_name_capitalized, understat_files)
        if not best_match_file:
            print(f"No matching Understat file found for {player_name}. Skipping {fpl_csv}.")
            continue

        understat_csv_path = os.path.join(understat_season_dir, best_match_file)
        understat_df = pd.read_csv(understat_csv_path)

        # Reduce both timestamps to calendar dates so they can be joined on
        fpl_df['kickoff_time'] = pd.to_datetime(fpl_df['kickoff_time']).dt.date
        understat_df['date'] = pd.to_datetime(understat_df['date']).dt.date

        # Left join: keep every FPL row; unmatched Understat columns become NaN
        merged_df = pd.merge(fpl_df, understat_df, left_on='kickoff_time', right_on='date', how='left')

        # Save under merged_data/<season>/<position>. The folder is created on
        # demand in case the position value is not one of the pre-created ones.
        merged_output_dir = os.path.join(merged_data_dir, season, player_position)
        os.makedirs(merged_output_dir, exist_ok=True)
        merged_output_path = os.path.join(merged_output_dir, fpl_csv)

        merged_df.to_csv(merged_output_path, index=False)
        print(f"Merged and saved {fpl_csv} to {merged_output_dir}")




Merged and saved Miguel_Almirón.csv to merged_data/2021-22/MID
No matching Understat file found for Isaac Success Ajayi. Skipping Isaac_Success_Ajayi.csv.
Merged and saved Joseph_Hungbo.csv to merged_data/2021-22/MID
Merged and saved Patrik_Gunnarsson.csv to merged_data/2021-22/GK
Merged and saved Nathan_Aké.csv to merged_data/2021-22/DEF
Merged and saved Jamie_Vardy.csv to merged_data/2021-22/FWD
Merged and saved Lukas_Rupp.csv to merged_data/2021-22/MID
Merged and saved Edo_Kayembe.csv to merged_data/2021-22/MID
Merged and saved Vitaly_Janelt.csv to merged_data/2021-22/MID
Merged and saved Andreas_Christensen.csv to merged_data/2021-22/DEF
Merged and saved Morgan_Gibbs-White.csv to merged_data/2021-22/MID
Merged and saved Jonjo_Shelvey.csv to merged_data/2021-22/MID
No matching Understat file found for Dara Costelloe. Skipping Dara_Costelloe.csv.
Merged and saved Josh_Benson.csv to merged_data/2021-22/MID
Merged and saved Bryan_Gil_Salvatierra.csv to merged_data/2021-22/MID
Merged an

In [49]:
#RESTRUCTURING THE FWD FOLDER DATA
import os
import pandas as pd

# Define the path to the FWD folder in each season
merged_data_dir = 'merged_data'
seasons = ['2021-22', '2022-23', '2023-24']
fwd_folder = 'FWD'

# Columns to drop
columns_to_drop = ['position_x', 'kickoff_time', 'time']

# Column order as specified, including 'starts' as optional
desired_column_order = [
    'name', 'team', 'opponent_team', 'date', 'was_home', 'position_y', 'minutes', 'goals', 'xG', 'assists', 'xA',
    'total_points', 'shots', 'key_passes', 'ict_index', 'bonus', 'starts'
]


def restructure_frame(df, columns_to_drop, desired_column_order):
    """Return a trimmed, reordered copy of one merged player frame.

    Args:
        df: Frame read from a merged player CSV. It is not modified.
        columns_to_drop: Column names to remove when present.
        desired_column_order: Output column order. A ``'position_y'`` entry is
            treated as the slot for the final ``'position'`` column.

    Returns:
        A new frame containing only the desired columns that exist, in order,
        with ``position_y`` renamed to ``position`` and ``starts`` as booleans.
    """
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Rename BEFORE selecting columns. Selecting by 'position_y' first meant a
    # second run over an already-processed file (which only has 'position')
    # silently dropped the position column.
    if 'position_y' in df.columns:
        df = df.drop(columns=['position'], errors='ignore').rename(columns={'position_y': 'position'})

    if 'starts' in df.columns:
        df['starts'] = df['starts'].eq(1)

    ordered = ['position' if col == 'position_y' else col for col in desired_column_order]
    return df[[col for col in ordered if col in df.columns]]


# Loop through each season and process the FWD folder
for season in seasons:
    fwd_dir = os.path.join(merged_data_dir, season, fwd_folder)
    if not os.path.isdir(fwd_dir):
        continue

    for csv_file in os.listdir(fwd_dir):
        if not csv_file.endswith('.csv'):
            continue

        file_path = os.path.join(fwd_dir, csv_file)
        df = pd.read_csv(file_path)

        if 'starts' not in df.columns:
            print(f"'starts' column not found in {csv_file}, skipping the conversion.")

        df = restructure_frame(df, columns_to_drop, desired_column_order)

        # Save the modified CSV file back over the original
        df.to_csv(file_path, index=False)
        print(f"Processed and saved {csv_file} in {fwd_dir}")



'starts' column not found in Isaac_Success_Ajayi.csv, skipping the conversion.
Processed and saved Isaac_Success_Ajayi.csv in merged_data/2021-22/FWD
'starts' column not found in Jamie_Vardy.csv, skipping the conversion.
Processed and saved Jamie_Vardy.csv in merged_data/2021-22/FWD
'starts' column not found in Jordan_Ayew.csv, skipping the conversion.
Processed and saved Jordan_Ayew.csv in merged_data/2021-22/FWD
'starts' column not found in Joshua_King.csv, skipping the conversion.
Processed and saved Joshua_King.csv in merged_data/2021-22/FWD
'starts' column not found in Odsonne_Edouard.csv, skipping the conversion.
Processed and saved Odsonne_Edouard.csv in merged_data/2021-22/FWD
'starts' column not found in Pierre-Emerick_Aubameyang.csv, skipping the conversion.
Processed and saved Pierre-Emerick_Aubameyang.csv in merged_data/2021-22/FWD
'starts' column not found in Juan_Camilo_Hernández_Suárez.csv, skipping the conversion.
Processed and saved Juan_Camilo_Hernández_Suárez.csv in 

In [50]:
#RESTRUCTURING THE MID FOLDER 
import os
import pandas as pd

# Define the path to the MID folder in each season
merged_data_dir = 'merged_data'
seasons = ['2021-22', '2022-23', '2023-24']
mid_folder = 'MID'

# Columns to drop
columns_to_drop = ['position_x', 'kickoff_time', 'time']

# Column order as specified for the MID folder
desired_column_order = [
    'name', 'team', 'opponent_team', 'date', 'was_home', 'position_y', 'minutes', 'goals', 'xG', 'assists', 'xA',
    'total_points', 'shots', 'key_passes', 'ict_index', 'bonus', 'clean_sheets', 'goals_conceded', 'starts'
]


def restructure_frame(df, columns_to_drop, desired_column_order):
    """Return a trimmed, reordered copy of one merged player frame.

    Args:
        df: Frame read from a merged player CSV. It is not modified.
        columns_to_drop: Column names to remove when present.
        desired_column_order: Output column order. A ``'position_y'`` entry is
            treated as the slot for the final ``'position'`` column.

    Returns:
        A new frame containing only the desired columns that exist, in order,
        with ``position_y`` renamed to ``position`` and ``starts`` as booleans.
    """
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Rename BEFORE selecting columns. Selecting by 'position_y' first meant a
    # second run over an already-processed file (which only has 'position')
    # silently dropped the position column.
    if 'position_y' in df.columns:
        df = df.drop(columns=['position'], errors='ignore').rename(columns={'position_y': 'position'})

    if 'starts' in df.columns:
        df['starts'] = df['starts'].eq(1)

    ordered = ['position' if col == 'position_y' else col for col in desired_column_order]
    return df[[col for col in ordered if col in df.columns]]


# Loop through each season and process the MID folder
for season in seasons:
    mid_dir = os.path.join(merged_data_dir, season, mid_folder)
    if not os.path.isdir(mid_dir):
        continue

    for csv_file in os.listdir(mid_dir):
        if not csv_file.endswith('.csv'):
            continue

        file_path = os.path.join(mid_dir, csv_file)
        df = pd.read_csv(file_path)

        if 'starts' not in df.columns:
            print(f"'starts' column not found in {csv_file}, skipping the conversion.")

        df = restructure_frame(df, columns_to_drop, desired_column_order)

        # Save the modified CSV file back over the original
        df.to_csv(file_path, index=False)
        print(f"Processed and saved {csv_file} in {mid_dir}")


'starts' column not found in Miguel_Almirón.csv, skipping the conversion.
Processed and saved Miguel_Almirón.csv in merged_data/2021-22/MID
'starts' column not found in Joseph_Hungbo.csv, skipping the conversion.
Processed and saved Joseph_Hungbo.csv in merged_data/2021-22/MID
'starts' column not found in Lukas_Rupp.csv, skipping the conversion.
Processed and saved Lukas_Rupp.csv in merged_data/2021-22/MID
'starts' column not found in Edo_Kayembe.csv, skipping the conversion.
Processed and saved Edo_Kayembe.csv in merged_data/2021-22/MID
'starts' column not found in Vitaly_Janelt.csv, skipping the conversion.
Processed and saved Vitaly_Janelt.csv in merged_data/2021-22/MID
'starts' column not found in Morgan_Gibbs-White.csv, skipping the conversion.
Processed and saved Morgan_Gibbs-White.csv in merged_data/2021-22/MID
'starts' column not found in Jonjo_Shelvey.csv, skipping the conversion.
Processed and saved Jonjo_Shelvey.csv in merged_data/2021-22/MID
'starts' column not found in Dar

In [51]:
import os
import pandas as pd

# RESTRUCTURING THE DEF FOLDER
merged_data_dir = 'merged_data'
seasons = ['2021-22', '2022-23', '2023-24']
def_folder = 'DEF'

# Columns to drop
columns_to_drop = ['position_x', 'kickoff_time', 'time', 'own_goals']

# Column order as specified for the DEF folder
desired_column_order = [
    'name', 'team', 'opponent_team', 'date', 'was_home', 'position_y', 'minutes', 'goals', 'xG', 'assists', 'xA',
    'total_points', 'shots', 'key_passes', 'ict_index', 'bonus', 'clean_sheets', 'goals_conceded', 'expected_goals_conceded', 'starts'
]


def restructure_frame(df, columns_to_drop, desired_column_order):
    """Return a trimmed, reordered copy of one merged player frame.

    Args:
        df: Frame read from a merged player CSV. It is not modified.
        columns_to_drop: Column names to remove when present.
        desired_column_order: Output column order. A ``'position_y'`` entry is
            treated as the slot for the final ``'position'`` column.

    Returns:
        A new frame containing only the desired columns that exist, in order,
        with ``position_y`` renamed to ``position`` and ``starts`` as booleans.
    """
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Rename BEFORE selecting columns. Selecting by 'position_y' first meant a
    # second run over an already-processed file (which only has 'position')
    # silently dropped the position column.
    if 'position_y' in df.columns:
        df = df.drop(columns=['position'], errors='ignore').rename(columns={'position_y': 'position'})

    if 'starts' in df.columns:
        df['starts'] = df['starts'].eq(1)

    ordered = ['position' if col == 'position_y' else col for col in desired_column_order]
    return df[[col for col in ordered if col in df.columns]]


# Loop through each season and process the DEF folder
for season in seasons:
    def_dir = os.path.join(merged_data_dir, season, def_folder)
    if not os.path.isdir(def_dir):
        continue

    for csv_file in os.listdir(def_dir):
        if not csv_file.endswith('.csv'):
            continue

        file_path = os.path.join(def_dir, csv_file)
        df = pd.read_csv(file_path)

        if 'starts' not in df.columns:
            print(f"'starts' column not found in {csv_file}, skipping the conversion.")

        df = restructure_frame(df, columns_to_drop, desired_column_order)

        # Save the modified CSV file back over the original
        df.to_csv(file_path, index=False)
        print(f"Processed and saved {csv_file} in {def_dir}")


'starts' column not found in Nathan_Aké.csv, skipping the conversion.
Processed and saved Nathan_Aké.csv in merged_data/2021-22/DEF
'starts' column not found in Andreas_Christensen.csv, skipping the conversion.
Processed and saved Andreas_Christensen.csv in merged_data/2021-22/DEF
'starts' column not found in Pablo_Marí.csv, skipping the conversion.
Processed and saved Pablo_Marí.csv in merged_data/2021-22/DEF
'starts' column not found in Tariq_Lamptey.csv, skipping the conversion.
Processed and saved Tariq_Lamptey.csv in merged_data/2021-22/DEF
'starts' column not found in Samir_Caetano_de_Souza_Santos.csv, skipping the conversion.
Processed and saved Samir_Caetano_de_Souza_Santos.csv in merged_data/2021-22/DEF
'starts' column not found in Adam_Webster.csv, skipping the conversion.
Processed and saved Adam_Webster.csv in merged_data/2021-22/DEF
'starts' column not found in Zach_Awe.csv, skipping the conversion.
Processed and saved Zach_Awe.csv in merged_data/2021-22/DEF
'starts' colum

In [52]:
import os
import pandas as pd
#RESTRUCTURING THE GK DATA 
merged_data_dir = 'merged_data'
seasons = ['2021-22', '2022-23', '2023-24']
gk_folder = 'GK'

# Outfield-only and merge-artefact columns that goalkeeper files do not keep
columns_to_drop = ['position_x', 'kickoff_time', 'time', 'own_goals', 'goals', 'shots', 'xG', 'assists', 'key_passes', 'position_y']

# Final column layout for goalkeeper files (absent columns are skipped)
desired_column_order = [
    'name', 'team', 'opponent_team', 'date', 'was_home', 'minutes', 'goals_conceded', 'expected_goals_conceded',
    'saves', 'penalties_saved', 'total_points', 'bonus', 'clean_sheets', 'xA', 'starts'
]

# Rewrite every goalkeeper CSV in place, season by season
for season in seasons:
    gk_dir = os.path.join(merged_data_dir, season, gk_folder)
    if not os.path.isdir(gk_dir):
        continue

    for csv_file in os.listdir(gk_dir):
        if not csv_file.endswith('.csv'):
            continue

        file_path = os.path.join(gk_dir, csv_file)
        df = pd.read_csv(file_path)

        # Step 1: remove unwanted columns; names that are absent are ignored
        df = df.drop(columns=columns_to_drop, errors='ignore')

        # Step 2: turn the 0/1 flags into booleans (anything other than 1 -> False)
        if 'starts' not in df.columns:
            print(f"'starts' column not found in {csv_file}, skipping the conversion.")
        else:
            df['starts'] = df['starts'].apply(lambda flag: bool(flag == 1))

        if 'clean_sheets' not in df.columns:
            print(f"'clean_sheets' column not found in {csv_file}, skipping the conversion.")
        else:
            df['clean_sheets'] = df['clean_sheets'].apply(lambda flag: bool(flag == 1))

        # Step 3: keep only the desired columns that exist, in the desired order
        present = set(df.columns)
        df = df[[column for column in desired_column_order if column in present]]

        # Overwrite the source file with the restructured frame
        df.to_csv(file_path, index=False)
        print(f"Processed and saved {csv_file} in {gk_dir}")


'starts' column not found in Patrik_Gunnarsson.csv, skipping the conversion.
Processed and saved Patrik_Gunnarsson.csv in merged_data/2021-22/GK
'starts' column not found in Freddie_Woodman.csv, skipping the conversion.
Processed and saved Freddie_Woodman.csv in merged_data/2021-22/GK
'starts' column not found in Martin_Dubravka.csv, skipping the conversion.
Processed and saved Martin_Dubravka.csv in merged_data/2021-22/GK
'starts' column not found in Hugo_Lloris.csv, skipping the conversion.
Processed and saved Hugo_Lloris.csv in merged_data/2021-22/GK
'starts' column not found in James_Storer.csv, skipping the conversion.
Processed and saved James_Storer.csv in merged_data/2021-22/GK
'starts' column not found in Harry_Lewis.csv, skipping the conversion.
Processed and saved Harry_Lewis.csv in merged_data/2021-22/GK
'starts' column not found in Viljami_Sinisalo.csv, skipping the conversion.
Processed and saved Viljami_Sinisalo.csv in merged_data/2021-22/GK
'starts' column not found in 

In [53]:
### CALCULATING FORM FOR THE FWD & MID DATA
import os
import pandas as pd

# Path to merged_data directory
root_dir = 'merged_data'

# Function to calculate form over last five games (or as many as available)
def calculate_form(series, num_games=5):
    """Average of the previous ``num_games`` values for each position.

    The value at a position is never included in its own average. The first
    entry has no history and is 0; while fewer than ``num_games`` earlier
    values exist, all of them are averaged.

    Args:
        series: pandas Series of per-game values, in playing order.
        num_games: Size of the look-back window.

    Returns:
        A list with one entry per element of ``series``.
    """
    averages = []
    for game_idx in range(len(series)):
        if game_idx >= num_games:
            # Full window available: the num_games games just before this one
            window = series[game_idx - num_games:game_idx]
        elif game_idx > 0:
            # Warm-up: use every earlier game
            window = series[:game_idx]
        else:
            # No earlier games at all
            averages.append(0)
            continue
        averages.append(window.mean())
    return averages

# Function to get stats from last season
def get_last_season_stats(player_name, last_season_dir, position):
    """Sum a player's previous-season totals from their per-game CSV.

    Args:
        player_name: File name of the player's CSV inside the position folder
            (the caller passes the directory entry, not a display name).
        last_season_dir: Directory of the previous season.
        position: Position sub-folder to look in.

    Returns:
        ``(goals, assists, xG, xA, points_per_minute)``. All zeros when the
        file is missing, empty, or lacks one of the required columns.
    """
    zero_stats = (0, 0, 0, 0, 0)
    last_season_file = os.path.join(last_season_dir, position, player_name)

    print(f"Looking for {player_name} in {last_season_file}")  # Debugging

    if not os.path.exists(last_season_file):
        print(f"File {last_season_file} not found.")  # Debugging
        return zero_stats

    try:
        last_season_df = pd.read_csv(last_season_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise rather than return an empty
        # frame; treat it like "no history" instead of aborting the whole run.
        print(f"File {last_season_file} is empty or corrupted.")  # Debugging
        return zero_stats

    if last_season_df.empty:
        return zero_stats

    required_columns = ['goals', 'assists', 'xG', 'xA', 'total_points', 'minutes']
    if not all(col in last_season_df.columns for col in required_columns):
        print(f"Missing columns in {last_season_file}")  # Debugging
        return zero_stats

    total_goals = last_season_df['goals'].sum()
    total_assists = last_season_df['assists'].sum()
    total_xG = last_season_df['xG'].sum()
    total_xA = last_season_df['xA'].sum()
    total_points = last_season_df['total_points'].sum()
    total_minutes = last_season_df['minutes'].sum()
    # Guard against players who never got on the pitch
    points_per_minute = total_points / total_minutes if total_minutes > 0 else 0

    print(f"Stats found for {player_name}: Goals: {total_goals}, Assists: {total_assists}, xG: {total_xG}, xA: {total_xA}, Points/Min: {points_per_minute}")  # Debugging

    return total_goals, total_assists, total_xG, total_xA, points_per_minute

# Loop through both FWD and MID directories across all seasons
for season in ['2021-22', '2022-23', '2023-24']:
    for position in ['FWD', 'MID']:
        position_dir = os.path.join(root_dir, season, position)
        # Previous season's folder name: both year parts shifted back by one
        last_season_dir = os.path.join(root_dir, f'{int(season[:4])-1}-{int(season[5:7])-1}')

        # Check if position directory exists
        if not os.path.exists(position_dir):
            print(f"Directory {position_dir} does not exist. Skipping.")
            continue

        # Loop through each CSV file in the position directory
        for player_file in os.listdir(position_dir):
            # Only per-player CSVs are valid input; any other directory entry
            # (sub-folder, stray file) would make read_csv raise.
            if not player_file.endswith('.csv'):
                continue

            player_path = os.path.join(position_dir, player_file)

            # Read the CSV file and check for empty or corrupted files
            try:
                df = pd.read_csv(player_path)
            except pd.errors.EmptyDataError:
                print(f"File {player_path} is empty or corrupted. Skipping.")
                continue

            # Check if required columns exist before processing
            required_columns = ['total_points', 'xG', 'assists', 'minutes']
            if not all(col in df.columns for col in required_columns):
                print(f"File {player_path} is missing required columns. Skipping.")
                continue

            # 1. Create 'form' column (average points over the preceding games)
            df['form'] = calculate_form(df['total_points'])

            # 2. Create 'xG&A_form' column
            # NOTE(review): this sums xG with actual 'assists', not 'xA' —
            # confirm that mix is intended given the column name.
            df['xG&A_form'] = calculate_form(df[['xG', 'assists']].sum(axis=1))

            # 3. Create 'minutes per game' column (running average that
            #    includes the current row; assumes the default 0..n-1 index)
            df['minutes_per_game'] = df['minutes'].cumsum() / (df.index + 1)

            # 4. Get last season stats (only for 2022-23 and 2023-24 seasons)
            if season in ['2022-23', '2023-24']:
                last_season_goals, last_season_assists, last_season_xG, last_season_xA, last_season_ppm = get_last_season_stats(player_file, last_season_dir, position)
                df['last_season_goals'] = last_season_goals
                df['last_season_assists'] = last_season_assists
                df['last_season_xG'] = last_season_xG
                df['last_season_xA'] = last_season_xA
                df['last_season_points_per_minute'] = last_season_ppm

            # Round all numerical columns to two decimal places
            df = df.round(2)

            # Save the updated dataframe back to the CSV file
            df.to_csv(player_path, index=False)

print("All forward and midfielder CSV files have been processed and rounded to two decimal places.")





File merged_data/2021-22/FWD/Isaac_Success_Ajayi.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Jordan_Hugill.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Adedapo_Awokoya-Mebude.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Kayky_da_Silva_Chagas.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Gabriel_Fernando_de_Jesus.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Halil_Dervişoğlu.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Shaqai_Forde.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Mika_Biereth.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/João_Pedro_Junqueira_de_Jesus.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Joseph_McGlynn.csv is missing required columns. Skipping.
File merged_data/2021-22/FWD/Florin_Andone.csv is missing required columns. Skipping.
F

In [54]:
#CALCULATING FORM FOR THE DEFENDER FOLDER 
import os
import pandas as pd

# Path to merged_data directory
root_dir = 'merged_data'

# Function to calculate form over last five games (or as many as available)
def calculate_form(series, num_games=5):
    """Compute a trailing average that excludes the current game.

    Args:
        series: pandas Series of per-game values, in playing order.
        num_games: Maximum number of preceding games to average.

    Returns:
        A list the same length as ``series``. Entry 0 is 0 (no history);
        later entries average up to ``num_games`` preceding values.
    """
    def _previous_average(position):
        # During the warm-up phase every earlier game counts
        if position < num_games:
            return series[:position].mean() if position > 0 else 0
        return series[position - num_games:position].mean()

    return [_previous_average(position) for position in range(len(series))]

# Function to get stats from last season for defenders
def get_last_season_stats_defender(player_name, last_season_dir):
    """Summarise a defender's previous season from their per-game CSV.

    Args:
        player_name: File name of the player's CSV inside the ``DEF`` folder
            (the caller passes the directory entry, not a display name).
        last_season_dir: Directory of the previous season.

    Returns:
        ``(xG, xA, expected_goals_conceded, clean_sheet_probability)`` where
        the probability is the share of games with more than 60 minutes that
        were clean sheets. All zeros when the file is missing, empty, or
        lacks one of the required columns.
    """
    zero_stats = (0, 0, 0, 0)
    last_season_file = os.path.join(last_season_dir, 'DEF', player_name)

    print(f"Looking for {player_name} in {last_season_file}")  # Debugging

    if not os.path.exists(last_season_file):
        print(f"File {last_season_file} not found.")  # Debugging
        return zero_stats

    try:
        last_season_df = pd.read_csv(last_season_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise rather than return an empty
        # frame; treat it like "no history" instead of aborting the whole run.
        print(f"File {last_season_file} is empty or corrupted.")  # Debugging
        return zero_stats

    if last_season_df.empty:
        return zero_stats

    required_columns = ['xG', 'xA', 'expected_goals_conceded', 'minutes', 'clean_sheets']
    if not all(col in last_season_df.columns for col in required_columns):
        print(f"Missing required columns in {last_season_file}: {set(required_columns) - set(last_season_df.columns)}")  # Debugging
        return zero_stats

    total_xG = last_season_df['xG'].sum()
    total_xA = last_season_df['xA'].sum()
    total_expected_goals_conceded = last_season_df['expected_goals_conceded'].sum()

    # Only appearances longer than 60 minutes count towards the rate
    played = last_season_df['minutes'] > 60
    kept_clean = played & (last_season_df['clean_sheets'] == True)
    games_played = int(played.sum())
    last_season_clean_sheet_prob = int(kept_clean.sum()) / games_played if games_played else 0

    print(f"Stats found for {player_name}: xG: {total_xG}, xA: {total_xA}, Expected Goals Conceded: {total_expected_goals_conceded}, Clean Sheet Probability: {last_season_clean_sheet_prob}")  # Debugging

    return total_xG, total_xA, total_expected_goals_conceded, last_season_clean_sheet_prob

# Function to calculate clean sheet probability for current season
def calculate_clean_sheet_probability(df):
    """Share of qualifying games in ``df`` that ended as a clean sheet.

    A game qualifies when ``minutes`` is greater than 60. Returns 0 when no
    game qualifies.
    """
    played = df['minutes'] > 60
    games_played = int(played.sum())
    if games_played == 0:
        return 0

    kept_clean = played & (df['clean_sheets'] == True)
    return int(kept_clean.sum()) / games_played

# Loop through the DEF directories across all seasons
for season in ['2021-22', '2022-23', '2023-24']:
    position = 'DEF'
    position_dir = os.path.join(root_dir, season, position)
    # Previous season's folder name: both year parts shifted back by one
    last_season_dir = os.path.join(root_dir, f'{int(season[:4])-1}-{int(season[5:7])-1}')

    # Check if position directory exists
    if not os.path.exists(position_dir):
        print(f"Directory {position_dir} does not exist. Skipping.")
        continue

    # Loop through each CSV file in the DEF directory
    for player_file in os.listdir(position_dir):
        # Only per-player CSVs are valid input; any other directory entry
        # (sub-folder, stray file) would make read_csv raise.
        if not player_file.endswith('.csv'):
            continue

        player_path = os.path.join(position_dir, player_file)

        # Read the CSV file and check for empty or corrupted files
        try:
            df = pd.read_csv(player_path)
        except pd.errors.EmptyDataError:
            print(f"File {player_path} is empty or corrupted. Skipping.")
            continue

        # Every column used below is guaranteed by this guard, so the steps
        # that follow need no further per-column checks.
        required_columns = ['total_points', 'xG', 'assists', 'minutes', 'clean_sheets']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"File {player_path} is missing columns: {missing_columns}. Skipping.")
            continue

        # 1. Create 'form' column (average points over the preceding games)
        df['form'] = calculate_form(df['total_points'])

        # 2. Create 'xG&A_form' column
        # NOTE(review): this sums xG with actual 'assists', not 'xA' —
        # confirm that mix is intended given the column name.
        df['xG&A_form'] = calculate_form(df[['xG', 'assists']].sum(axis=1))

        # 3. Create 'minutes per game' column (running average that includes
        #    the current row; assumes the default 0..n-1 index)
        df['minutes_per_game'] = df['minutes'].cumsum() / (df.index + 1)

        # 4. Running clean-sheet rate up to and including each row: clean
        #    sheets in >60-minute games divided by >60-minute games so far,
        #    and 0 until the first such game. Same ratio that
        #    calculate_clean_sheet_probability gives for each prefix of df,
        #    computed in one pass.
        played = df['minutes'] > 60
        kept_clean = played & (df['clean_sheets'] == True)
        games_so_far = played.cumsum()
        df['clean_sheet_probability'] = (kept_clean.cumsum() / games_so_far).where(games_so_far > 0, 0)

        # 5. Get last season stats (only for 2022-23 and 2023-24 seasons)
        if season in ['2022-23', '2023-24']:
            last_season_xG, last_season_xA, last_season_expected_goals_conceded, last_season_clean_sheet_prob = get_last_season_stats_defender(player_file, last_season_dir)
            df['last_season_xG'] = last_season_xG
            df['last_season_xA'] = last_season_xA
            df['last_season_expected_goals_conceded'] = last_season_expected_goals_conceded
            df['last_season_clean_sheet_probability'] = last_season_clean_sheet_prob

        # Round all numerical columns to two decimal places
        df = df.round(2)

        # Save the updated dataframe back to the CSV file
        df.to_csv(player_path, index=False)

print("All defender CSV files have been processed.")




File merged_data/2021-22/DEF/Samir_Caetano_de_Souza_Santos.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Zach_Awe.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Jarell_Quansah.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Emerson_Palmieri_dos_Santos.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Vontae_Daley-Campbell.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Jonathan_Tomkinson.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Reece_Welch.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Robert_Kenedy_Nunes_do_Nascimento.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Emerson_Aparecido_Leite_de_Souza_Junior.csv is missing columns: ['xG', 'assists']. Skipping.
File merged_data/2021-22/DEF/Jonathan_Castro_Otto.csv

In [61]:
#CALCULATING THE FORM FOR THE GK FOLDER
import os
import pandas as pd

# Path to merged_data directory
root_dir = 'merged_data'

# Function to calculate form over last five games (or as many as available)
def calculate_form(series, num_games=5):
    """Return, per game, the mean of the values from the games before it.

    Args:
        series: pandas Series of per-game values, in playing order.
        num_games: How many preceding games the average may span.

    Returns:
        A list aligned with ``series``: 0 for the first game, the mean of all
        earlier games while fewer than ``num_games`` exist, and the mean of
        the last ``num_games`` earlier games afterwards.
    """
    total_games = len(series)
    form = []
    game = 0
    while game < total_games:
        in_warm_up = game < num_games
        if in_warm_up and game == 0:
            value = 0  # nothing played yet
        elif in_warm_up:
            value = series[:game].mean()
        else:
            value = series[game - num_games:game].mean()
        form.append(value)
        game += 1
    return form

# Function to get last season stats for goalkeepers (including total saves)
def get_last_season_stats_goalkeeper(player_name, last_season_dir):
    """Summarise a goalkeeper's previous season from their per-game CSV.

    Args:
        player_name: File name of the player's CSV inside the ``GK`` folder
            (the caller passes the directory entry, not a display name).
        last_season_dir: Directory of the previous season.

    Returns:
        ``(penalties_saved, expected_goals_conceded, clean_sheet_probability,
        saves)`` where the probability is the share of games with more than
        60 minutes that were clean sheets. All zeros when the file is
        missing, empty, or lacks one of the required columns.
    """
    zero_stats = (0, 0, 0, 0)
    last_season_file = os.path.join(last_season_dir, 'GK', player_name)

    print(f"Looking for {player_name} in {last_season_file}")  # Debugging

    if not os.path.exists(last_season_file):
        print(f"File {last_season_file} not found.")  # Debugging
        return zero_stats

    try:
        last_season_df = pd.read_csv(last_season_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file makes read_csv raise rather than return an empty
        # frame; treat it like "no history" instead of aborting the whole run.
        print(f"File {last_season_file} is empty or corrupted.")  # Debugging
        return zero_stats

    if last_season_df.empty:
        return zero_stats

    required_columns = ['penalties_saved', 'expected_goals_conceded', 'minutes', 'clean_sheets', 'saves']
    if not all(col in last_season_df.columns for col in required_columns):
        print(f"Missing required columns in {last_season_file}: {set(required_columns) - set(last_season_df.columns)}")  # Debugging
        return zero_stats

    total_penalties_saved = last_season_df['penalties_saved'].sum()
    total_expected_goals_conceded = last_season_df['expected_goals_conceded'].sum()
    total_saves = last_season_df['saves'].sum()

    # Only appearances longer than 60 minutes count towards the rate
    played = last_season_df['minutes'] > 60
    kept_clean = played & (last_season_df['clean_sheets'] == True)
    games_played = int(played.sum())
    last_season_clean_sheet_prob = int(kept_clean.sum()) / games_played if games_played else 0

    print(f"Stats found for {player_name}: Penalties Saved: {total_penalties_saved}, Expected Goals Conceded: {total_expected_goals_conceded}, Last Season Saves: {total_saves}, Last Season Clean Sheet Probability: {last_season_clean_sheet_prob}")  # Debugging

    return total_penalties_saved, total_expected_goals_conceded, last_season_clean_sheet_prob, total_saves

# Function to calculate clean sheet probability for current season
def calculate_clean_sheet_probability(df):
    """Return the share of qualifying rows that recorded a clean sheet.

    A row qualifies when its 'minutes' value is greater than 60. Qualifying
    rows whose 'clean_sheets' value compares equal to True count as clean
    sheets.

    Args:
        df: DataFrame with 'minutes' and 'clean_sheets' columns.

    Returns:
        Number of clean-sheet rows divided by the number of qualifying rows,
        or 0 when no row qualifies.
    """
    played_enough = df['minutes'] > 60
    qualifying_rows = df[played_enough]

    # Guard first: without a qualifying row there is nothing to divide by.
    if qualifying_rows.empty:
        return 0

    kept_clean_rows = df[played_enough & (df['clean_sheets'] == True)]
    return len(kept_clean_rows) / len(qualifying_rows)

# Function to calculate saves per game
def calculate_saves_per_game(df):
    """Return the running average of saves per row of ``df``.

    For each row, the cumulative sum of 'saves' up to and including that row
    is divided by the number of rows seen so far (1 for the first row, 2 for
    the second, and so on).

    Args:
        df: DataFrame with a numeric 'saves' column, one row per game.

    Returns:
        A Series indexed like ``df`` holding the running saves-per-row average.
    """
    # Count rows by position instead of using `df.index + 1`: index labels only
    # equal row positions for a default RangeIndex, so a filtered, re-indexed
    # or string-labelled frame would otherwise give wrong values or raise.
    rows_so_far = pd.Series(range(1, len(df) + 1), index=df.index)
    return df['saves'].cumsum() / rows_so_far

# Loop through the GK directories across all seasons
for season in ['2021-22', '2022-23', '2023-24']:
    position = 'GK'
    position_dir = os.path.join(root_dir, season, position)
    # Folder name of the preceding season, e.g. '2022-23' -> '2021-22'
    last_season_dir = os.path.join(root_dir, f'{int(season[:4])-1}-{int(season[5:7])-1}')

    # Check if position directory exists
    if not os.path.exists(position_dir):
        print(f"Directory {position_dir} does not exist. Skipping.")
        continue

    # Loop through each CSV file in the GK directory
    for player_file in os.listdir(position_dir):
        # Ignore anything that is not a player CSV (e.g. .DS_Store or the
        # .ipynb_checkpoints folder), which pd.read_csv cannot parse.
        if not player_file.endswith('.csv'):
            continue

        player_path = os.path.join(position_dir, player_file)

        # Read the CSV file and check for empty or corrupted files
        try:
            df = pd.read_csv(player_path)
        except pd.errors.EmptyDataError:
            print(f"File {player_path} is empty or corrupted. Skipping.")
            continue

        # Every feature below needs these columns, so skip the file if any is absent
        required_columns = ['total_points', 'minutes', 'clean_sheets', 'saves']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"File {player_path} is missing columns: {missing_columns}. Skipping.")
            continue

        # 1. Form, derived from the points column
        df['form'] = calculate_form(df['total_points'])

        # 2. Running clean sheet probability: clean sheets so far divided by
        #    games with more than 60 minutes so far (0 until the first such game).
        #    Cumulative sums give the same running ratio in one pass, without
        #    re-slicing the frame for every row.
        played_over_60 = df['minutes'] > 60
        clean_sheet_rows = played_over_60 & (df['clean_sheets'] == True)
        qualifying_so_far = played_over_60.cumsum()
        df['clean_sheet_probability'] = (
            clean_sheet_rows.cumsum() / qualifying_so_far
        ).where(qualifying_so_far > 0, 0)

        # 3. Running saves per game
        df['saves_per_game'] = calculate_saves_per_game(df)

        # 4. Get last season stats (only for 2022-23 and 2023-24 seasons)
        if season in ['2022-23', '2023-24']:
            last_season_penalties_saved, last_season_expected_goals_conceded, last_season_clean_sheet_prob, last_season_total_saves = get_last_season_stats_goalkeeper(player_file, last_season_dir)
            df['last_season_penalties_saved'] = last_season_penalties_saved
            df['last_season_expected_goals_conceded'] = last_season_expected_goals_conceded
            df['last_season_clean_sheet_probability'] = last_season_clean_sheet_prob
            df['last_season_total_saves'] = last_season_total_saves

        # Round all numerical columns to two decimal places
        df = df.round(2)

        # Save the updated dataframe back to the CSV file (overwrites in place)
        df.to_csv(player_path, index=False)

print("All goalkeeper CSV files have been processed.")



Looking for Martin_Dubravka.csv in merged_data/2021-22/GK/Martin_Dubravka.csv
Missing required columns in merged_data/2021-22/GK/Martin_Dubravka.csv: {'expected_goals_conceded'}
Looking for Matija_Šarkić.csv in merged_data/2021-22/GK/Matija_Šarkić.csv
File merged_data/2021-22/GK/Matija_Šarkić.csv not found.
Looking for Hugo_Lloris.csv in merged_data/2021-22/GK/Hugo_Lloris.csv
Missing required columns in merged_data/2021-22/GK/Hugo_Lloris.csv: {'expected_goals_conceded'}
Looking for Stefan_Ortega_Moreno.csv in merged_data/2021-22/GK/Stefan_Ortega_Moreno.csv
File merged_data/2021-22/GK/Stefan_Ortega_Moreno.csv not found.
Looking for Viljami_Sinisalo.csv in merged_data/2021-22/GK/Viljami_Sinisalo.csv
Missing required columns in merged_data/2021-22/GK/Viljami_Sinisalo.csv: {'expected_goals_conceded'}
Looking for Brandon_Austin.csv in merged_data/2021-22/GK/Brandon_Austin.csv
Missing required columns in merged_data/2021-22/GK/Brandon_Austin.csv: {'expected_goals_conceded'}
Looking for Illan

In [25]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Obtaining dependency information for fuzzywuzzy from https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0

[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python3 -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [62]:
#ADD THE NEXT WEEK GAMEWEEK POINTS TO EACH ROW
import os
import pandas as pd

# Define the base directory
base_dir = 'merged_data'

# Define the season subdirectories, in chronological order: the last row of a
# season is linked to the first row of the season that follows it in this list.
seasons = ['2022-23', '2023-24']

# Define the player position directories
positions = ['FWD', 'MID', 'GK', 'DEF']

# Loop through each season
for season_index, season in enumerate(seasons):
    for position in positions:
        # Get the path to the position folder in the current season
        position_dir = os.path.join(base_dir, season, position)

        # Ensure the position directory exists
        if not os.path.exists(position_dir):
            print(f"Position directory {position_dir} does not exist. Skipping.")
            continue

        # Loop through each player CSV file in the position directory
        for player_file in os.listdir(position_dir):
            # Ignore anything that is not a player CSV (e.g. .DS_Store or the
            # .ipynb_checkpoints folder), which pd.read_csv cannot parse.
            if not player_file.endswith('.csv'):
                continue

            # Load the CSV file for the player
            player_file_path = os.path.join(position_dir, player_file)
            try:
                df = pd.read_csv(player_file_path)
            except pd.errors.EmptyDataError:
                print(f"File {player_file_path} is empty or corrupted. Skipping.")
                continue

            # Check if the file has a 'total_points' column
            if 'total_points' not in df.columns:
                print(f"File {player_file_path} does not have 'total_points'. Skipping.")
                continue

            # Each row's target is the following row's points; the last row has
            # no successor inside this file and is left missing for now.
            df['next_week_points'] = df['total_points'].shift(-1)

            # Handle the case for the last row, only if it's not the last season
            if season_index < len(seasons) - 1 and len(df) > 0:
                # Look for the same player file in the next season, same position
                next_season = seasons[season_index + 1]
                next_season_dir = os.path.join(base_dir, next_season, position)
                next_season_player_file = os.path.join(next_season_dir, player_file)

                # Ensure the next season file exists and can be read
                if os.path.exists(next_season_player_file):
                    try:
                        next_season_df = pd.read_csv(next_season_player_file)
                    except pd.errors.EmptyDataError:
                        print(f"File {next_season_player_file} is empty or corrupted. Skipping last row addition.")
                    else:
                        # Only link seasons when the next file actually carries points
                        if len(next_season_df) > 0 and 'total_points' in next_season_df.columns:
                            df.loc[df.index[-1], 'next_week_points'] = next_season_df['total_points'].iloc[0]

            # Save the updated CSV file back to the same location
            df.to_csv(player_file_path, index=False)

print("Processing completed.")


Processing completed.


In [63]:
#ADDING THE TEAM DIFFICULTIES
import os
import pandas as pd
from fuzzywuzzy import process

# Input/output locations used by this cell
merged_data_dir = 'merged_data'
team_difficulty_ratings_dir = 'team_difficulty_ratings'

# Seasons (extend the list as more seasons become available) and positions to process
seasons = ['2022-23', '2023-24']
positions = ['GK', 'DEF', 'MID', 'FWD']

# Short team names and the longer names they are translated to before matching
team_name_mappings = {
    "Spurs": "Tottenham",
    "Man City": "Manchester City",
    "Man Utd": "Manchester United",
    "Nott'm Forest": "Nottingham Forest",
}


def map_team_name(team_name):
    """Translate a short team name through ``team_name_mappings``.

    Args:
        team_name: Team name to translate.

    Returns:
        The mapped name when ``team_name`` is a key of ``team_name_mappings``,
        otherwise ``team_name`` unchanged.
    """
    if team_name in team_name_mappings:
        return team_name_mappings[team_name]
    return team_name

# Function to find the closest match for a team name using fuzzywuzzy
def get_closest_match(team_name, options, threshold=75):
    """Return the option that best fuzzy-matches ``team_name``.

    Args:
        team_name: Query string to look up.
        options: Candidate strings to match against.
        threshold: Minimum score the best candidate must reach.

    Returns:
        The best-scoring option when its score is at least ``threshold``;
        otherwise None. None is also returned when ``options`` is empty.
    """
    # extractOne returns None (not a tuple) when there is nothing to choose
    # from, so unpacking its result directly would raise a TypeError.
    if options is None or len(options) == 0:
        return None

    best = process.extractOne(team_name, options)
    if best is None:
        return None

    closest_match, score = best[0], best[1]
    if score >= threshold:
        return closest_match
    return None

# Function to find the next week's fixture difficulty
# Difficulty tables already read in this run, keyed by CSV path. The lookup
# below is called once per player row, so without this every row would parse
# the same team file again.
_team_difficulty_cache = {}


def _load_team_difficulty(file_path):
    """Return the difficulty table at ``file_path``, reading it from disk at most once."""
    if file_path not in _team_difficulty_cache:
        _team_difficulty_cache[file_path] = pd.read_csv(file_path)
    return _team_difficulty_cache[file_path]


def get_next_fixture_difficulty(player_team, was_home, opponent_team):
    """Look up the difficulty rating of a fixture from the team's ratings file.

    Args:
        player_team: Name of the player's team; passed through map_team_name
            before being fuzzy-matched against the file names in
            ``team_difficulty_ratings_dir``.
        was_home: Truthy to read the 'Home Difficulty' column, falsy to read
            'Away Difficulty'.
        opponent_team: Opponent name, fuzzy-matched against the 'Opponent'
            column of the team's ratings file.

    Returns:
        The matching rating rounded to two decimals, or None when the team
        file, the opponent, or a rating row cannot be found.
    """
    # Map the player team name to a canonical version before matching
    mapped_team = map_team_name(player_team)

    # Find the closest team name match in the team_difficulty_ratings directory
    team_files = os.listdir(team_difficulty_ratings_dir)
    if not team_files:
        return None
    closest_team_file = get_closest_match(mapped_team, team_files)
    if not closest_team_file:
        return None

    team_df = _load_team_difficulty(os.path.join(team_difficulty_ratings_dir, closest_team_file))

    # Find the closest match for the opponent team in the difficulty ratings
    known_opponents = team_df['Opponent'].tolist()
    if not known_opponents:
        return None
    opponent = get_closest_match(opponent_team, known_opponents)
    if opponent is None:
        # The player's team is translated from its short form before matching
        # but the opponent was not. Retry with the translated name only when
        # the raw name found nothing, so existing matches are unaffected.
        # NOTE(review): assumes the 'Opponent' column may use the long names
        # from team_name_mappings -- confirm against the ratings files.
        mapped_opponent = map_team_name(opponent_team)
        if mapped_opponent != opponent_team:
            opponent = get_closest_match(mapped_opponent, known_opponents)
    if opponent is None:
        return None

    # Pick the rating column for a home or an away fixture
    difficulty_column = 'Home Difficulty' if was_home else 'Away Difficulty'
    difficulty = team_df.loc[team_df['Opponent'] == opponent, difficulty_column].values
    if len(difficulty) > 0:
        return round(difficulty[0], 2)  # Round difficulty to two decimal places
    return None

# Function to handle moving to the next season for the last row
def get_next_season_fixture(player_file, current_season_index, position):
    """Return the first fixture of a player's file in the following season.

    Args:
        player_file: File name of the player's CSV in the current season.
        current_season_index: Index of the current season in ``seasons``.
        position: Position sub-directory to look in.

    Returns:
        A ``(team, was_home, opponent_team)`` tuple taken from the first row of
        the player's file in the next season, or ``(None, None, None)`` when
        there is no next season, no matching file, or the file is empty,
        unreadable as CSV data, or lacks those columns.
    """
    no_fixture = (None, None, None)

    # Nothing to look up for the last configured season
    if current_season_index >= len(seasons) - 1:
        return no_fixture

    next_season = seasons[current_season_index + 1]
    next_season_dir = os.path.join(merged_data_dir, next_season, position)
    if not os.path.exists(next_season_dir):
        return no_fixture

    next_season_files = os.listdir(next_season_dir)
    if player_file in next_season_files:
        # Same file name in the next season: use it directly so a similarly
        # named file of another player can never be chosen instead.
        closest_player_file = player_file
    elif next_season_files:
        # Fall back to fuzzy matching, e.g. when the file name changed slightly
        closest_player_file = get_closest_match(player_file, next_season_files, threshold=75)
    else:
        closest_player_file = None

    if not closest_player_file:
        return no_fixture

    # Only the first row is needed, so avoid parsing the whole file
    next_player_path = os.path.join(next_season_dir, closest_player_file)
    try:
        next_df = pd.read_csv(next_player_path, nrows=1)
    except pd.errors.EmptyDataError:
        return no_fixture

    if next_df.empty or not {'team', 'was_home', 'opponent_team'}.issubset(next_df.columns):
        return no_fixture

    first_row = next_df.iloc[0]
    return first_row['team'], first_row['was_home'], first_row['opponent_team']

# Traverse the merged_data directory
def _difficulty_for_row(row):
    """Return the next fixture's difficulty for one row, or None when the fixture is unknown."""
    if pd.notnull(row['next_team']) and pd.notnull(row['next_opponent_team']):
        return get_next_fixture_difficulty(
            row['next_team'], row['next_was_home'], row['next_opponent_team']
        )
    return None


for season_index, season in enumerate(seasons):
    for position in positions:
        position_dir = os.path.join(merged_data_dir, season, position)

        # Check if the position directory exists
        if not os.path.exists(position_dir):
            print(f"Position directory {position_dir} does not exist for season {season}.")
            continue

        for player_file in os.listdir(position_dir):
            # Ignore anything that is not a player CSV (e.g. .DS_Store or the
            # .ipynb_checkpoints folder), which pd.read_csv cannot parse.
            if not player_file.endswith('.csv'):
                continue

            player_file_path = os.path.join(position_dir, player_file)

            # Load the player CSV
            try:
                df = pd.read_csv(player_file_path)
            except pd.errors.EmptyDataError:
                print(f"File {player_file_path} is empty or corrupted. Skipping.")
                continue

            # Check if the dataframe is empty
            if df.empty:
                print(f"{player_file_path} is empty. Skipping.")
                continue

            # Ensure the necessary columns exist; report the file instead of
            # skipping it without a trace.
            if not {'team', 'was_home', 'opponent_team'}.issubset(df.columns):
                print(f"{player_file_path} is missing fixture columns. Skipping.")
                continue

            # Shift the columns to get next week's fixture details
            df['next_team'] = df['team'].shift(-1)
            df['next_was_home'] = df['was_home'].shift(-1)
            df['next_opponent_team'] = df['opponent_team'].shift(-1)

            # The last row has no following row in this file; for every season
            # but the last, fill it from the first row of the next season.
            last_row_index = df.index[-1]
            if season_index < len(seasons) - 1:
                next_season_team, next_season_was_home, next_season_opponent_team = get_next_season_fixture(player_file, season_index, position)

                if next_season_team:
                    df.loc[last_row_index, 'next_team'] = next_season_team
                    df.loc[last_row_index, 'next_was_home'] = next_season_was_home
                    df.loc[last_row_index, 'next_opponent_team'] = next_season_opponent_team

            # Create a new column for next week's fixture difficulty
            df['next_week_fixture_difficulty'] = df.apply(_difficulty_for_row, axis=1)

            # Save the updated CSV back to file
            df.to_csv(player_file_path, index=False)
            print(f"Updated {player_file_path} with next_week_fixture_difficulty.")




Updated merged_data/2022-23/GK/Martin_Dubravka.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Matija_Šarkić.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Hugo_Lloris.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Stefan_Ortega_Moreno.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Viljami_Sinisalo.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Brandon_Austin.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Illan_Meslier.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Lukasz_Fabianski.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Willy_Caballero.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Daniel_Iversen.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Gavin_Bazunu.csv with next_week_fixture_difficulty.
Updated merged_data/2022-23/GK/Karl_Hein.csv with next_week_

In [64]:
## CREATING THE TRAINING DATA
import os
import pandas as pd

# Define the base directories
base_dir = 'merged_data'
train_data_dir = 'train_data'

# Create the train_data directory if it doesn't exist
os.makedirs(train_data_dir, exist_ok=True)

# Define the positions and corresponding output files
positions_mapping = {
    'FWD': 'forward.csv',
    'MID': 'midfielder.csv',
    'GK': 'goalkeeper.csv',
    'DEF': 'defender.csv'
}

# Collect the per-player frames of each position in a list and concatenate
# once at the end; concatenating inside the loop copies all earlier rows again
# for every file.
position_frames = {position: [] for position in positions_mapping}

# Loop through the seasons
for season in ['2022-23', '2023-24']:
    # Loop through each position (FWD, MID, GK, DEF)
    for position in positions_mapping:
        position_dir = os.path.join(base_dir, season, position)

        # Ensure the position directory exists
        if not os.path.exists(position_dir):
            print(f"Position directory {position_dir} does not exist. Skipping.")
            continue

        # Loop through each player CSV file in the position directory
        for player_file in os.listdir(position_dir):
            # Ignore anything that is not a player CSV (e.g. .DS_Store or the
            # .ipynb_checkpoints folder), which pd.read_csv cannot parse.
            if not player_file.endswith('.csv'):
                continue

            player_file_path = os.path.join(position_dir, player_file)

            try:
                df = pd.read_csv(player_file_path)
            except pd.errors.EmptyDataError:
                print(f"File {player_file_path} is empty or corrupted. Skipping.")
                continue

            # Files that never received a target column cannot contribute rows
            if 'next_week_points' not in df.columns:
                print(f"File {player_file_path} does not have 'next_week_points'. Skipping.")
                continue

            # Keep only rows where 'next_week_points' is defined (non-null)
            position_frames[position].append(df[df['next_week_points'].notnull()])

# One dataframe per position; an empty frame when no file contributed rows
position_dfs = {
    position: pd.concat(frames) if frames else pd.DataFrame()
    for position, frames in position_frames.items()
}

# Save each position's dataframe into the corresponding CSV in train_data
for position, output_file in positions_mapping.items():
    output_path = os.path.join(train_data_dir, output_file)
    position_dfs[position].to_csv(output_path, index=False)
    print(f"{output_file} saved with {len(position_dfs[position])} rows.")

print("Processing completed.")


forward.csv saved with 6037 rows.
midfielder.csv saved with 21402 rows.
goalkeeper.csv saved with 4755 rows.
defender.csv saved with 16472 rows.
Processing completed.
