In [1]:
import pandas as pd
import csv
import re
import os
import glob


In [2]:
# Seasons handled by this notebook, written in the underscore form that the
# per-season file names are built from.
season_list = [
    '2021_2022',
    '2022_2023',
    '2023_2024',
]

In [3]:
# Intermediary solution for appending the raw player data tables.
#
# Every CSV below the working directory whose lower-cased path contains one of
# the keywords is read, the frames are concatenated per keyword, and the result
# is written to `league_data<keyword>.csv`. Source files are removed only after
# the combined file for their keyword has been written, so a failed read,
# concat or write cannot destroy the raw data.


def match_keyword(path, candidates):
    """Return the first entry of `candidates` found in the lower-cased `path`.

    Parameters
    ----------
    path : str
        File path to classify.
    candidates : iterable of str
        Keywords, tried in order; the first one contained in the path wins.

    Returns
    -------
    str or None
        The matching keyword, or None when no keyword occurs in the path.
    """
    lowered = path.lower()
    return next((candidate for candidate in candidates if candidate in lowered), None)


# List all CSV files in the directory tree
csv_files = glob.glob('**/*.csv', recursive=True)

# '_stats' is deliberately checked LAST: once lower-cased, every file named
# '<season>_Champions_League_Stats_<table>_raw.csv' contains '_stats', so testing
# it first would file every table under the '_stats' bucket.
keywords = ['_keepers', '_shooting', '_misc', '_passing', '_defense', '_stats']

# Per keyword: the DataFrames that were read and the paths they came from
dataframes = {keyword: [] for keyword in keywords}
source_paths = {keyword: [] for keyword in keywords}

# `csv_path`, not `csv`: the latter would shadow the imported csv module
for csv_path in csv_files:
    keyword = match_keyword(csv_path, keywords)
    if keyword is None:
        continue
    dataframes[keyword].append(pd.read_csv(csv_path))
    source_paths[keyword].append(csv_path)

# Concatenate the DataFrames of each keyword, save them, then clean up the sources
for keyword, dfs in dataframes.items():
    if not dfs:  # nothing was found for this keyword
        continue

    output_path = f'league_data{keyword}.csv'
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_path, index=False)

    for csv_path in source_paths[keyword]:
        # On a re-run the combined file itself matches the glob; never delete
        # the file that was just written.
        if os.path.abspath(csv_path) != os.path.abspath(output_path):
            os.remove(csv_path)



In [49]:
# Cleaning the scraped table: Stats
#
# Each '<season>_Champions_League_Stats_stats_raw.csv' file in the working
# directory is cleaned and its attacking columns are written to
# 'Champions_League_Stats_cleaned_<season>.csv'.

# Columns kept as text
STATS_TEXT_COLUMNS = ('Player', 'Nation', 'Position', 'Squad', 'League', 'Season')

# Columns holding whole numbers
STATS_INT_COLUMNS = ('Age', 'Year of birth', 'Year of Birth', 'Matches Played', 'Starts',
                     'Minutes', 'Goals', 'Assists', 'Goals + Assists', 'Non-Penalty Goals',
                     'Penalty Kicks Made', 'Penalty Kicks Attempted', 'Yellow Cards',
                     'Red Cards', 'Progressive Carries', 'Progressive Passes',
                     'Progressive Passes Rec')

# Columns that make up the fact table
STATS_FACT_COLUMNS = ['Player', 'Nation', 'Position', 'Squad', 'League', 'Age',
                      'Year of Birth', 'Matches Played', 'Starts', '90s PLayed',
                      'Yellow Cards', 'Red Cards', 'Season']

# Fact-table columns that are removed again to obtain the attack table
STATS_NON_ATTACK_COLUMNS = ['Position', 'Squad', 'League', 'Age', 'Year of Birth',
                            'Matches Played', 'Starts', '90s PLayed', 'Yellow Cards',
                            'Red Cards']


def clean_stats_frame(raw, season):
    """Clean one raw stats table and split it into a fact and an attack frame.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table as read from the raw CSV. It must contain the columns 'Matches',
        'Minutes', 'Squad', 'Nation', 'Year of birth' and every column listed
        in STATS_FACT_COLUMNS other than 'League', 'Season' and
        'Year of Birth', which are created here.
    season : str
        Value written to the new 'Season' column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `fact` holds STATS_FACT_COLUMNS; `attack` holds every cleaned column
        except STATS_NON_ATTACK_COLUMNS.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a column cannot be cast to its target type.
    """
    # Blank rows have to go BEFORE anything is cast to str, otherwise their NaN
    # turns into the text 'nan' and dropna(how='all') no longer sees them.
    cleaned = raw.dropna(how='all').drop(columns='Matches').copy()

    # Minutes may carry thousands separators ("1,234")
    cleaned['Minutes'] = pd.to_numeric(
        cleaned['Minutes'].astype(str).str.replace(',', '', regex=False)
    )

    # New columns. 'League' is taken from the first two characters of the
    # still-prefixed squad name.
    cleaned['League'] = cleaned['Squad'].str[:2]
    cleaned['Season'] = season
    cleaned['Year of Birth'] = cleaned['Year of birth']

    # Strip the leading lower-case prefix; .str.replace leaves NaN alone where
    # re.sub inside apply() would raise.
    for column in ('Squad', 'Nation'):
        cleaned[column] = cleaned[column].str.replace(r'^[a-z]+\s+', '', regex=True)

    # Text stays text, counters become int, every other column becomes float
    dtype_by_column = {}
    for header in cleaned.columns:
        if header in STATS_TEXT_COLUMNS:
            dtype_by_column[header] = str
        elif header in STATS_INT_COLUMNS:
            dtype_by_column[header] = int
        else:
            dtype_by_column[header] = float
    cleaned = cleaned.astype(dtype_by_column)

    fact = cleaned[STATS_FACT_COLUMNS]
    # drop() without inplace returns the new frame; with inplace=True it returns None
    attack = cleaned.drop(columns=STATS_NON_ATTACK_COLUMNS)
    return fact, attack


# Only the raw stats tables: the cleaned files written below would otherwise
# match a broader '*_Stats_*.csv' pattern and be re-read on the next run.
csv_files = glob.glob('*_Stats_stats_raw.csv')
print(csv_files)

# `stats_path`, not `csv`: the latter would shadow the imported csv module
for stats_path in csv_files:

    # The file name starts with the season, e.g. '2021_2022'
    season = stats_path[0:9]

    df_fact, df_attack = clean_stats_frame(pd.read_csv(stats_path), season)

    # One output per season so later seasons do not overwrite earlier ones.
    # df_fact is kept in the notebook namespace but is not written out here.
    df_attack.to_csv(f'Champions_League_Stats_cleaned_{season}.csv', index=False)

['2022_2023_Champions_League_Stats_stats_raw.csv', '2023_2024_Champions_League_Stats_stats_raw.csv', '2021_2022_Champions_League_Stats_stats_raw.csv']
2022_2023
2023_2024
2021_2022


In [None]:
# Cleaning the scraped table: Defence
#
# For every season in `season_list` the raw defence table is read, tidied and
# written to its own '<season>_Champions_League_Stats_defense.csv'.
#
# NOTE(review): the raw file is looked up with the "defense" spelling, matching
# the '_defense' / 'defense_raw' keywords used elsewhere in this notebook --
# confirm against the scraper's actual output names.

# Columns kept as text. 'Season' must be listed here: otherwise it reaches the
# float cast, where a value such as '2021_2022' is either rejected or silently
# parsed as the number 20212022.0.
DEFENCE_TEXT_COLUMNS = ('Player', 'Nation', 'Position', 'Squad', 'League', 'Season')

# Columns holding whole numbers (only those present in the table are used)
DEFENCE_INT_COLUMNS = ('Age', 'Year of birth', 'Matches Played', 'Starts', 'Minutes', 'Goals',
                       'Assists', 'Goals + Assists', 'Non-Penalty Goals', 'Penalty Kicks Made',
                       'Penalty Kicks Attempted', 'Yellow Cards', 'Red Cards',
                       'Progressive Carries', 'Progressive Passes', 'Progressive Passes Rec')

for season in season_list:
    # Reading in the csv. Blank rows are dropped BEFORE anything is cast to
    # str, otherwise their NaN becomes the text 'nan' and they survive.
    df_cleaning = (
        pd.read_csv(f'{season}_Champions_League_Stats_defense_raw.csv')
        .dropna(how='all')
        .drop('Matches', axis=1)
        .copy()
    )

    # Minutes may carry thousands separators ("1,234").
    # NOTE(review): guarded because it is not visible here whether the defence
    # table has a 'Minutes' column at all.
    if 'Minutes' in df_cleaning.columns:
        df_cleaning['Minutes'] = pd.to_numeric(
            df_cleaning['Minutes'].astype(str).str.replace(',', '', regex=False)
        )

    # Creating new, desired columns
    df_cleaning['League'] = df_cleaning['Squad'].str[:2]
    df_cleaning['Season'] = season

    # Strip the leading lower-case prefix; .str.replace leaves NaN alone where
    # re.sub inside apply() would raise.
    for column in ('Squad', 'Nation'):
        df_cleaning[column] = df_cleaning[column].str.replace(r'^[a-z]+\s+', '', regex=True)

    # Text stays text, counters become int, every other column becomes float
    for header in list(df_cleaning):
        if header in DEFENCE_TEXT_COLUMNS:
            df_cleaning[header] = df_cleaning[header].astype(str)
        elif header in DEFENCE_INT_COLUMNS:
            df_cleaning[header] = df_cleaning[header].astype(int)
        else:
            df_cleaning[header] = df_cleaning[header].astype(float)

    # One file per season; a fixed name would leave only the last season on disk
    df_cleaning.to_csv(f'{season}_Champions_League_Stats_defense.csv', index=False)

In [32]:
# Cleaning the player transfer value tables
#
# Each raw table is cleaned, then all of them are stacked into one lookup table
# that is written to 'uefa_player_value_lookup.csv'.

transfer_value_table_lst = ['uefa_atck_transfer_values_raw.csv',
                            'uefa_def_transfer_values_raw.csv',
                            'uefa_mid_transfer_values_raw.csv',
                            'uefa_gkp_transfer_values_raw.csv'
                            ]

# Cleaned tables are collected here and concatenated once after the loop. This
# replaces the first-iteration flag, which was reset before it was tested and so
# made the first pass read a `df_concat` that did not exist yet.
cleaned_tables = []

for table in transfer_value_table_lst:

    # Name of the table with the trailing 'raw.csv' swapped for 'cleaned.csv'
    # (only printed as progress output; no per-table file is written)
    table_name = f'{table[:-7]}cleaned.csv'
    print(table_name)

    # Reading in the csv and removing all rows with any missing values
    df = pd.read_csv(table).dropna()

    df = df.assign(**{
        # Convert the player name from 'first-last' to 'First Last'
        'Player': df['Player'].str.replace('-', ' ').str.title(),
        # Strip the '€' and 'm' characters and scale millions to units. round()
        # before the int cast so a product like 434999.99999999994 does not
        # truncate to one less than intended.
        'Market Value': (
            df['Market Value'].str.replace(r'[€m]', '', regex=True).astype(float) * 1e6
        ).round().astype(int),
    })

    # Renaming to the lookup-table column names
    df = df.rename(columns={'Player': 'player_id', 'Market Value': 'market_value'})

    cleaned_tables.append(df)

# Stack all the datasets for a complete transfer value table
df_concat = pd.concat(cleaned_tables, axis=0)

df_concat.to_csv('uefa_player_value_lookup.csv', index=False)


uefa_atck_transfer_values_cleaned.csv
uefa_def_transfer_values_cleaned.csv
uefa_mid_transfer_values_cleaned.csv
uefa_gkp_transfer_values_cleaned.csv


In [65]:
# Cleaning and transforming the club data
#
# Reads 'club_data.csv', reshapes the per-season rating columns into rows, adds
# rating totals and rankings, and writes the result to 'club_data_dim.csv'.
# (pandas comes from the notebook's import cell.)

# Keep only clubs that have a value in 'Overall'
df = pd.read_csv('club_data.csv').dropna(subset=['Overall'])

# Melt the DataFrame: one row per club and season instead of one column per season
df = pd.melt(df, id_vars=['Club Name', 'Country'], value_vars=['21/22', '22/23', '23/24'],
             var_name='Season', value_name='Uefa Rating')

# Map the season column to the long format.
# NOTE(review): these values use a hyphen ('2021-2022') while the season strings
# elsewhere in this notebook use an underscore ('2021_2022') -- confirm which
# form the consumers of this table expect.
season_mapping = {
    '21/22': '2021-2022',
    '22/23': '2022-2023',
    '23/24': '2023-2024'
}

df['Season'] = df['Season'].map(season_mapping)

# Convert 'Uefa Rating' to numeric, forcing unparseable values to NaN
df['Uefa Rating'] = pd.to_numeric(df['Uefa Rating'], errors='coerce')

# Overall rating of a club: the sum of its ratings over all its rows
df['Overall Rating'] = df.groupby('Club Name')['Uefa Rating'].transform('sum')

# Rank the clubs within each season by their rating (highest rating = rank 1)
df['Season Ranking'] = df.groupby('Season')['Uefa Rating'].rank(ascending=False, method='min')

# One row per club and season. The explicit copy makes this an independent
# frame, so adding a column below does not raise SettingWithCopyWarning.
df_unique = df.drop_duplicates(subset=['Season', 'Club Name']).copy()

# Rank the clubs within each season by their overall rating
df_unique['Overall Club Rank'] = (
    df_unique.groupby('Season')['Overall Rating'].rank(ascending=False, method='min')
)

# Merge the ranks back to the full DataFrame
df = df.merge(df_unique[['Season', 'Club Name', 'Overall Club Rank']],
              on=['Season', 'Club Name'], how='left')

df.to_csv('club_data_dim.csv', index=False)


In [None]:
# Stat tables collected in the initial web scrape
stats_lst = [
    'defense_raw', 'gca_raw', 'keepers_raw', 'misc_raw', 'stats_raw',
    'shooting_raw', 'passing_types_raw', 'passing_raw', 'possession_raw',
]

for stat in stats_lst:

    # Every CSV in the working directory whose name contains the stat keyword
    csv_files = glob.glob(f'*{stat}*.csv')

    # Load each matching file into its own DataFrame
    dataframes = [pd.read_csv(file) for file in csv_files]

    # New naming convention: first character upper-cased, the rest lower-cased
    stat = stat.capitalize()

    # Nothing matched: report it and move on to the next stat
    if len(dataframes) == 0:
        print(f'No CSVs match the naming convention: {stat}')
        continue

    # Stack all the frames and save them as a single CSV
    combined_df = pd.concat(dataframes, ignore_index=True, axis=0)
    combined_df.to_csv(f'Champions_League_{stat}.csv', index=False)

    # The originals are only removed once the combined file has been written
    for file in csv_files:
        os.remove(file)

    print("CSV files combined and original files deleted.")

In [76]:
# Cleaning and transforming competition data
#
# Every '*comp*.csv' file in the working directory is cleaned and written back
# over itself. Because the inputs are overwritten, the cleaning has to be safe
# to run on a file that has already been cleaned.


def clean_competition_frame(comp, is_international):
    """Clean one competition table.

    Parameters
    ----------
    comp : pandas.DataFrame
        Table as read from the CSV. Must contain a 'Gender' column and, unless
        `is_international` is true, a 'Country' column.
    is_international : bool
        When true the 'Country' column is left untouched (it is not required
        to exist).

    Returns
    -------
    pandas.DataFrame
        A new frame without all-empty columns and without 'Unnamed: 0', with
        the leading lower-case prefix stripped from 'Country', restricted to
        rows whose 'Gender' is 'M'. The input frame is not modified.
    """
    # Dropping all columns with no values
    cleaned = comp.dropna(axis=1, how='all')

    if not is_international:
        # .str.replace leaves NaN alone where re.sub inside apply() would raise
        cleaned = cleaned.assign(
            Country=cleaned['Country'].str.replace(r'^[a-z]+\s+', '', regex=True)
        )

    # The saved index column exists only on the first pass; once the file has
    # been rewritten with index=False it is gone, so its absence is not an error.
    cleaned = cleaned.drop(columns=['Unnamed: 0'], errors='ignore')

    # Keeping only the rows where column Gender equals M
    return cleaned[cleaned['Gender'] == 'M']


# Find all CSV files containing the 'comp' key word in their title
csv_files = glob.glob('*comp*.csv')

# Files that also carry 'intl' in their title; looked up once, not per file
intl_files = set(glob.glob('*intl*.csv'))

for file in csv_files:
    df = clean_competition_frame(pd.read_csv(file), is_international=file in intl_files)

    # Written back to the file it was read from
    df.to_csv(file, index=False)


In [80]:
# Creating Dimension tables for this use case
#
# For each season the league table is left-joined with the cup table on
# 'Country'; the per-season results are stacked and written to
# 'league_stats_dim.csv'.

seasons = ['2021_2022', '2022_2023', '2023_2024']

dfs = []
for season in seasons:

    # Reading the league and cup tables of this season
    df1 = pd.read_csv(f'comps_1_fa_club_league_senior_{season}_raw.csv')
    df2 = pd.read_csv(f'comps_fa_club_cup_{season}_raw.csv')

    # Every league row is kept; cup columns are NaN where a country has no cup
    # row. Non-key columns present in both tables get pandas' default '_x'
    # (league) / '_y' (cup) suffixes.
    # NOTE(review): a country with several cup rows yields several output rows
    # for its league -- confirm that this is intended.
    merged = pd.merge(df1, df2, on='Country', how='left')

    # The season is tagged once, AFTER the merge. Tagging both inputs first made
    # the merge emit two identical columns, 'season_x' and 'season_y'.
    merged['season'] = season

    dfs.append(merged)

combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('league_stats_dim.csv', index=False)


