In [4]:
import os
import pandas as pd
import re

# Base directories, all resolved against the current working directory:
#   train_data        - output root; process_gameweek writes gw<N>.csv files here
#   fpl_gw_data       - input root for per-player FPL gameweek CSVs
#   understat_gw_data - input root for per-player understat CSVs
train_data_directory = os.path.join(os.getcwd(), 'train_data')
fpl_gw_directory = os.path.join(os.getcwd(), 'fpl_gw_data')
understat_gw_directory = os.path.join(os.getcwd(), 'understat_gw_data')

# Seasons to build training data for; each gets its own output subdirectory.
seasons = ['2022-23', '2023-24']
for season in seasons:
    season_folder = os.path.join(train_data_directory, season)
    # exist_ok=True makes this idempotent without a separate exists() check,
    # which would leave a window between the check and the creation.
    os.makedirs(season_folder, exist_ok=True)

# Labels removed from every FPL gameweek row before it is added to the
# train data (see the drop() call in process_gameweek).
columns_to_exclude = [
    'kickoff_time',
    'round',
    'team_a_score',
    'team_h_score',
    'transfers_balance',
    'transfers_in',
    'transfers_out',
    'value',
    'was_home',
]

def get_player_gw_row(player_path, gw):
    """Return the row at position ``gw`` of the CSV at ``player_path``.

    ``gw`` is used directly as a zero-based positional index into the
    loaded DataFrame (index 0 is the first data row).

    Args:
        player_path: Path of the CSV file to read.
        gw: Zero-based row position to fetch.

    Returns:
        The row as a pandas Series, or None when the file has ``gw`` rows
        or fewer. Any exception raised while reading or indexing is printed
        and also results in None.
    """
    try:
        frame = pd.read_csv(player_path)
        row_count = len(frame)
        return frame.iloc[gw] if gw < row_count else None
    except Exception as e:
        print(f"Error reading player file {player_path}: {e}")
        return None

def get_next_gw_points(player_path, gw, next_season_player_path=None):
    """Return the ``total_points`` of the row that follows position ``gw``.

    For ``gw < 37`` the value comes from row ``gw + 1`` of ``player_path``.
    For ``gw >= 37`` (the last index the caller iterates over) it comes from
    the first row of ``next_season_player_path``, when one is given.

    Args:
        player_path: CSV holding the player's rows for the current season.
        gw: Zero-based row position of the current gameweek.
        next_season_player_path: Optional CSV for the same player in the
            following season; only consulted when ``gw >= 37``.

    Returns:
        The ``total_points`` value, or None when there is no following row
        (file too short, no next-season file supplied, empty next-season
        file). Read or column errors are printed and also yield None.
    """
    try:
        if gw < 37:  # Not the last gameweek
            df = pd.read_csv(player_path)
            next_index = gw + 1
            # A file that ends before row gw + 1 is an expected gap in the
            # data, not an error: answer None without raising and logging
            # an out-of-bounds IndexError for every such player.
            if 0 <= next_index < len(df):
                return df.iloc[next_index]['total_points']
            return None
        if next_season_player_path:  # Last gameweek: use next season's first row
            next_season_df = pd.read_csv(next_season_player_path)
            if len(next_season_df) > 0:
                return next_season_df.iloc[0]['total_points']
        return None
    except Exception as e:
        print(f"Error calculating next_gw_points for {player_path}: {e}")
        return None

def find_understat_file(understat_season_folder, player_name):
    """Locate a player's understat CSV inside ``understat_season_folder``.

    Understat files are matched as ``<Player_Name>_<3 digits>.csv``, where
    the player name has its spaces replaced by underscores.

    Args:
        understat_season_folder: Directory to search (not recursive).
        player_name: Player name with spaces between the name parts.

    Returns:
        Full path of the first matching file in sorted filename order, or
        None when no file matches.
    """
    player_name_prefix = player_name.replace(' ', '_')
    # Escape the name so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    pattern = re.compile(re.escape(player_name_prefix) + r"_\d{3}\.csv")
    # os.listdir order is arbitrary; sorting keeps the pick deterministic
    # should more than one file match.
    for filename in sorted(os.listdir(understat_season_folder)):
        if pattern.fullmatch(filename):
            return os.path.join(understat_season_folder, filename)
    return None

def calculate_last_five_stats(player_path, gw, stat_column, minutes_column):
    """Sum a stat over the last five rows up to ``gw`` and scale it per 90 minutes.

    The window covers row positions ``max(0, gw - 4)`` through ``gw``
    inclusive, i.e. at most five rows ending at the current gameweek.

    Args:
        player_path: CSV file with one row per gameweek.
        gw: Zero-based row position of the current gameweek.
        stat_column: Name of the column to total.
        minutes_column: Name of the column holding minutes played.

    Returns:
        ``(total_stat, stat_per_90)``. ``stat_per_90`` is 0 when no minutes
        were played in the window. Any error is printed and yields ``(0, 0)``.
    """
    try:
        df = pd.read_csv(player_path)
        start_index = max(0, gw - 4)  # Clamp so early gameweeks use a shorter window
        # The slice end is exclusive, so gw + 1 is needed to include row gw;
        # stopping at gw would give a four-row window (and none at gw == 0).
        last_five_gws = df.iloc[start_index:gw + 1]
        total_stat = last_five_gws[stat_column].sum()
        total_minutes = last_five_gws[minutes_column].sum()
        stat_per_90 = total_stat / (total_minutes / 90) if total_minutes > 0 else 0
        return total_stat, stat_per_90
    except Exception as e:
        print(f"Error calculating last five stats for {player_path}: {e}")
        return 0, 0

def calculate_form(player_path, gw):
    """Return the mean ``total_points`` over the last five rows up to ``gw``.

    The window covers row positions ``max(0, gw - 4)`` through ``gw``
    inclusive, i.e. at most five rows ending at the current gameweek.

    Args:
        player_path: CSV file with one row per gameweek.
        gw: Zero-based row position of the current gameweek.

    Returns:
        The mean of ``total_points`` over the window, or 0 when the window
        holds no rows. Any error is printed and also yields 0.
    """
    try:
        df = pd.read_csv(player_path)
        start_index = max(0, gw - 4)  # Clamp so early gameweeks use a shorter window
        # The slice end is exclusive, so gw + 1 is needed to include row gw;
        # stopping at gw would average four rows instead of five.
        last_five_gws = df.iloc[start_index:gw + 1]
        if last_five_gws.empty:
            # The mean of an empty selection is NaN; report 0 instead, in
            # line with the value returned on the error path.
            return 0
        return last_five_gws['total_points'].mean()
    except Exception as e:
        print(f"Error calculating form for {player_path}: {e}")
        return 0

def get_last_season_stats(player_path):
    """Read ``goals`` and ``assists`` from the second-to-last row of a CSV.

    Args:
        player_path: Path of the player's CSV for the previous season.

    Returns:
        ``(goals, assists)`` taken from the penultimate row, or ``(0, 0)``
        when the file has fewer than two rows. Any exception (missing file,
        missing column, ...) is printed and also yields ``(0, 0)``.
    """
    # NOTE(review): only the penultimate row is read, not a season total -
    # confirm that this row holds the intended figures. Files without a
    # 'goals' column end up on the error path below; verify the column name
    # against the input data.
    no_stats = (0, 0)
    try:
        history = pd.read_csv(player_path)
        if len(history) <= 1:
            return no_stats
        penultimate = history.iloc[-2]
        goals = penultimate['goals']
        assists = penultimate['assists']
        return goals, assists
    except Exception as e:
        print(f"Error fetching last season stats for {player_path}: {e}")
        return no_stats

def _adjacent_season(season, offset):
    """Return the 'YYYY-YY' label ``offset`` seasons away, or None if ``season`` is not in that form."""
    match = re.fullmatch(r"(\d{4})-(\d{2})", season)
    if match is None:
        return None
    start_year = int(match.group(1)) + offset
    return f"{start_year}-{(start_year + 1) % 100:02d}"


def process_gameweek(season, gw):
    """Build and save the training CSV for one gameweek of one season.

    For every player CSV in the season's FPL folder, the row at position
    ``gw`` is taken, the labels in ``columns_to_exclude`` are dropped, and
    these fields are added: ``next_gw_points``, understat
    ``expected_assists`` / ``expected_goals`` with their last-five totals
    and per-90 values, ``form``, and ``last_season_goals`` /
    ``last_season_assists``. Players without a row at ``gw`` or without an
    understat file are skipped. The result is written to
    ``<train_data>/<season>/gw<gw + 1>.csv``.

    Args:
        season: Season label in 'YYYY-YY' form, e.g. '2022-23'.
        gw: Zero-based gameweek index (0 produces gw1.csv).
    """
    fpl_season_folder = os.path.join(fpl_gw_directory, season)
    understat_season_folder = os.path.join(understat_gw_directory, season)
    train_season_folder = os.path.join(train_data_directory, season)

    gw_file = f"gw{gw + 1}.csv"  # Gameweek CSV filename (1-based)
    gw_data = []  # One processed row per player

    # Derive the neighbouring seasons from the label rather than
    # hard-coding them, so any 'YYYY-YY' season resolves without leaving
    # a path variable unassigned.
    previous_season = _adjacent_season(season, -1)
    next_season = _adjacent_season(season, 1)
    next_season_folder = (
        os.path.join(fpl_gw_directory, next_season) if next_season else None
    )
    # Only the final gameweek index looks ahead, and only when data for
    # the following season is present on disk.
    use_next_season = (
        gw == 37
        and next_season_folder is not None
        and os.path.isdir(next_season_folder)
    )

    # Sorted so the output row order does not depend on directory order
    for player_file in sorted(os.listdir(fpl_season_folder)):
        if not player_file.endswith('.csv'):
            continue  # Ignore stray non-CSV entries in the folder

        player_path = os.path.join(fpl_season_folder, player_file)
        # Strip only the trailing extension, then turn underscores into spaces
        player_name = player_file[:-len('.csv')].replace('_', ' ')

        # Get player's gameweek row from fpl_gw_data
        player_gw_row = get_player_gw_row(player_path, gw)
        if player_gw_row is None:
            continue  # Skip if no data for this player in the gameweek

        # Drop unwanted labels; tolerate files that lack some of them
        player_gw_row_filtered = player_gw_row.drop(columns_to_exclude, errors='ignore')

        # Calculate next_gw_points (first row of next season for the last gameweek)
        if use_next_season:
            next_season_player_path = os.path.join(next_season_folder, player_file)
        else:
            next_season_player_path = None
        player_gw_row_filtered['next_gw_points'] = get_next_gw_points(
            player_path, gw, next_season_player_path
        )

        # Find the correct understat file for the player
        understat_player_file = find_understat_file(understat_season_folder, player_name)
        if understat_player_file is None:
            print(f"Warning: No understat file found for player {player_name} in season {season}")
            continue

        # Fetch data from understat_gw_data (expected stats)
        understat_player_row = get_player_gw_row(understat_player_file, gw)
        if understat_player_row is not None:
            player_gw_row_filtered['expected_assists'] = understat_player_row['expected_assists']
            player_gw_row_filtered['expected_goals'] = understat_player_row['expected_goals']

            # Last-five totals and per-90 values for both expected stats
            last_five_assists, last_five_assists_p90 = calculate_last_five_stats(
                understat_player_file, gw, 'expected_assists', 'minutes'
            )
            last_five_goals, last_five_goals_p90 = calculate_last_five_stats(
                understat_player_file, gw, 'expected_goals', 'minutes'
            )
            player_gw_row_filtered['last_five_expected_assists'] = last_five_assists
            player_gw_row_filtered['last_five_expected_assists_p90'] = last_five_assists_p90
            player_gw_row_filtered['last_five_expected_goals'] = last_five_goals
            player_gw_row_filtered['last_five_expected_goals_p90'] = last_five_goals_p90

        # Form: average total_points over the recent gameweeks
        player_gw_row_filtered['form'] = calculate_form(player_path, gw)

        # Last season's goals and assists; 0/0 when the season label cannot
        # be mapped to a previous season
        if previous_season is not None:
            last_season_path = os.path.join(fpl_gw_directory, previous_season, player_file)
            last_goals, last_assists = get_last_season_stats(last_season_path)
        else:
            last_goals, last_assists = 0, 0
        player_gw_row_filtered['last_season_goals'] = last_goals
        player_gw_row_filtered['last_season_assists'] = last_assists

        # Append the player's processed row to the gameweek data list
        gw_data.append(player_gw_row_filtered)

    # Convert the list of rows into a DataFrame and write to CSV
    os.makedirs(train_season_folder, exist_ok=True)  # Safe for seasons not pre-created
    gw_df = pd.DataFrame(gw_data)
    gw_df.to_csv(os.path.join(train_season_folder, gw_file), index=False)
    print(f"Processed and saved {gw_file} for season {season}")

# Build the training files: every season in `seasons`, gameweek indices
# 0 through 37 (written as gw1.csv ... gw38.csv by process_gameweek).
gameweek_indices = range(38)
for season in seasons:
    for gw in gameweek_indices:
        process_gameweek(season, gw)

print("Train data has been successfully created and saved.")



Error fetching last season stats for /Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Ryan_Yates.csv: [Errno 2] No such file or directory: '/Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Ryan_Yates.csv'
Error fetching last season stats for /Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Gonçalo_Manuel_Ganchinho_Guedes.csv: [Errno 2] No such file or directory: '/Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Gonçalo_Manuel_Ganchinho_Guedes.csv'
Error fetching last season stats for /Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Nathan_Aké.csv: 'goals'
Error fetching last season stats for /Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Jamie_Vardy.csv: 'goals'
Error fetching last season stats for /Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Joe_Rothwell.csv: [Errno 2] No such file or directory: '/Users/advaitabrol/Desktop/fpl.bot/fpl_gw_data/2021-22/Joe_Rothwell.csv'
Error fetching last season stats for /Users/advaitabrol/Desktop/fpl.bo

KeyboardInterrupt: 