In [117]:
import pandas as pd
import csv
import re
import os
import glob
import pycountry_convert as pc


In [3]:
season_list = ['2021_2022','2022_2023','2023_2024']

In [14]:
# Cleaning the scraped Stats tables

# Initialize an empty DataFrame
final_df = pd.DataFrame()

# Path to your CSV files
path = 'raw_data/player_raw_data/player_stats_data_raw/*_Stats_*.csv'

csv_files = glob.glob(path)
# print(csv_files)

for csv in csv_files:

    # Extracting the season
    season_match = re.search(r'(\d{4}_\d{4})', csv)
    season = season_match.group(1) if season_match else None

    # Extracting the league
    league_match = re.search(r'\d{4}_\d{4}_(.*?)_Stats', csv)
    league = league_match.group(1) if league_match else None

    # Reading in the csv
    df_cleaning = pd.read_csv(csv)

    # Specific Column Modifications
    df_cleaning['Minutes'] = df_cleaning['Minutes'].astype(str).str.replace(',', '')

    # Dropping unecessary columns
    df_cleaning = df_cleaning.dropna(how='all').drop('Matches', axis=1)

    #Creating new, desired columns
    df_cleaning['League'] = league
    df_cleaning['Season'] = season
    
    # Manipulating columns to be readable
    df_cleaning['Squad'] = df_cleaning['Squad'].apply(lambda x: re.sub(r'^[a-z]+\s+', '', x))
    df_cleaning['Nation'] = df_cleaning['Nation'].apply(lambda x: re.sub(r'^[a-z]+\s+', '', str(x)))

    # Creating a list of the headers
    header_lst = list(df_cleaning)
    
    # Ensuring headers are the correct type
    for header in header_lst:
        
        if header in ('Player', 'Nation', 'Position', 'Squad', 'League', 'Season'):
            df_cleaning[header] = df_cleaning[header].astype(str)
        elif header in ('Age', 'Year of birth', 'Matches Played', 'Starts', 'Minutes', 'Goals',
                        'Assists', 'Goals + Assists', 'Non-Penalty Goals', 'Penalty Kicks Made', 
                        'Penalty Kicks Attempted', 'Yellow Cards', 'Red Cards', 
                        'Progressive Carries', 'Progressive Passes', 'Progressive Passes Rec'):
            df_cleaning[header] = pd.to_numeric(df_cleaning[header], errors='coerce').astype('Int64')
        else:
            df_cleaning[header] = pd.to_numeric(df_cleaning[header], errors='coerce')

    # Function to convert sports code to ISO code
    def convert_sports_code(code):
        # if code in exceptions:
        #     code = exceptions[code]
        try:
            return pc.country_alpha2_to_country_name(pc.country_alpha3_to_country_alpha2(code))
        except KeyError:
            return None
        
        # Append the manipulated DataFrame to the final DataFrame
    final_df = pd.concat([final_df, df_cleaning], ignore_index=True, sort=False)
    print(f'csv: {csv} concatenated to a combined csv')
    final_df['full_nation_name'] = final_df['Nation'].apply(lambda x: convert_sports_code(x))


new_headers = [col.lower().replace(' ', '_') for col in final_df.columns]
final_df.columns = new_headers

final_df.to_csv(f'staging_data/combined_player_stats_cleaned.csv', index=False)

csv: raw_data/player_raw_data/player_stats_data_raw/2021_2022_Super_League_Greece_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2022_2023_Ekstraklasa_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2021_2022_Bulgarian_First_League_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2022_2023_Serie_A_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2023_2024_Premier_League_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2021_2022_La_Liga_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2022_2023_Ligue_1_Stats_Player.csv concatenated to a combined csv
csv: raw_data/player_raw_data/player_stats_data_raw/2023_2024_Bulgarian_First_League_Stats_Player.csv concatenated to a combined csv
csv

In [5]:
# Cleaning the scraped Player tables

keywords = ['keeper', 'shooting', 'misc', 'passing', 'defense']

for keyword in keywords:

    # Initialize an empty DataFrame
    final_df = pd.DataFrame()

    # Path to your CSV files
    path = f'raw_data/player_raw_data/player_{keyword}_data_raw/**.csv'

    # Print the path to check if it's correct
    print("Looking for files in:", path)

    csv_files = glob.glob(path)
    print("Found files:", csv_files)

    for csv in csv_files:

        # Extracting the season
        season_match = re.search(r'(\d{4}_\d{4})', csv)
        season = season_match.group(1) if season_match else None

        # Extracting the league
        league_match = re.search(rf'\d{{4}}_\d{{4}}_(.*?)_{keyword}', csv, re.IGNORECASE)
        print(league_match)
        league = league_match.group(1) if league_match else None    

        # Reading in the csv
        df_cleaning = pd.read_csv(csv)


        # Dropping unecessary columns
        df_cleaning = df_cleaning.dropna(how='all').drop('Matches', axis=1)

        #Creating new, desired columns
        df_cleaning['League'] = league
        df_cleaning['Season'] = season
        
        # Manipulating columns to be readable
        df_cleaning['Squad'] = df_cleaning['Squad'].apply(lambda x: re.sub(r'^[a-z]+\s+', '', x))
        df_cleaning['Nation'] = df_cleaning['Nation'].apply(lambda x: re.sub(r'^[a-z]+\s+', '', str(x)))

        # Creating a list of the headers
        header_lst = list(df_cleaning)

        # Ensuring headers are the correct type
        for header in header_lst:
    
            if header in ('Player', 'Nation', 'Position', 'Squad', 'League', 'Season'):
                df_cleaning[header] = df_cleaning[header].astype(str)
            if header in ('Age', 'Year of birth'):
                df_cleaning[header] = df_cleaning[header].astype('Int64')

        # Append the manipulated DataFrame to the final DataFrame
        final_df = pd.concat([final_df, df_cleaning], ignore_index=True, sort=False)
        print(f'csv: {csv} concatenated to a combined csv')

    # Creating desired column namings
    new_headers = [col.lower().replace(' ', '_') for col in final_df.columns]
    final_df.columns = new_headers

    final_df.to_csv(f'staging_data/combined_player_{keyword}_cleaned.csv', index=False)

Current working directory: /Users/michaelbutterfield/fb_dataeng_project
Looking for files in: raw_data/player_raw_data/player_keeper_data_raw/**.csv
Found files: ['raw_data/player_raw_data/player_keeper_data_raw/2022_2023_Austrian_Bundesliga_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2023_2024_Eredivisie_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2022_2023_Bundesliga_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2023_2024_Ligue_1_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2021_2022_Super_Lig_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2023_2024_La_Liga_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2022_2023_Bulgarian_First_League_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2021_2022_Super_League_Greece_Keepers_Player.csv', 'raw_data/player_raw_data/player_keeper_data_raw/2022_2023_Primeira_Liga_Keepers_Playe

csv: raw_data/player_raw_data/player_keeper_data_raw/2021_2022_Super_Lig_Keepers_Player.csv concatenated to a combined csv
<re.Match object; span=(48, 72), match='2023_2024_La_Liga_Keeper'>
csv: raw_data/player_raw_data/player_keeper_data_raw/2023_2024_La_Liga_Keepers_Player.csv concatenated to a combined csv
<re.Match object; span=(48, 87), match='2022_2023_Bulgarian_First_League_Keeper'>
csv: raw_data/player_raw_data/player_keeper_data_raw/2022_2023_Bulgarian_First_League_Keepers_Player.csv concatenated to a combined csv
<re.Match object; span=(48, 84), match='2021_2022_Super_League_Greece_Keeper'>
csv: raw_data/player_raw_data/player_keeper_data_raw/2021_2022_Super_League_Greece_Keepers_Player.csv concatenated to a combined csv
<re.Match object; span=(48, 78), match='2022_2023_Primeira_Liga_Keeper'>
csv: raw_data/player_raw_data/player_keeper_data_raw/2022_2023_Primeira_Liga_Keepers_Player.csv concatenated to a combined csv
<re.Match object; span=(48, 78), match='2023_2024_Primeira_

In [6]:
# Cleaning and Appending Team Data


keywords = ['stats', 'keeper', 'shooting', 'misc', 'passing', 'defense']

for keyword in keywords:

    # Initialize an empty DataFrame
    final_df = pd.DataFrame()

    # Path to your CSV files
    path = f'raw_data/team_raw_data/team_{keyword}_data_raw/**.csv'

        # Print the current working directory
    print("Current working directory:", os.getcwd())

    # Print the path to check if it's correct
    print("Looking for files in:", path)

    csv_files = glob.glob(path)
    print("Found files:", csv_files)

    for csv in csv_files:

        # Extracting the season
        season_match = re.search(r'(\d{4}_\d{4})', csv)
        season = season_match.group(1) if season_match else None

        # Extracting the league
        league_match = re.search(rf'\d{{4}}_\d{{4}}_(.*?)_{keyword}', csv, re.IGNORECASE)
        print(league_match)
        league = league_match.group(1) if league_match else None    

        # Reading in the csv
        df_cleaning = pd.read_csv(csv)

        # Dropping unecessary columns
        df_cleaning = df_cleaning.dropna(how='all')

        if keyword == 'stats':
                # Specific Column Modifications
                df_cleaning['Minutes'] = df_cleaning['Minutes'].astype(str).str.replace(',', '')

        #Creating new, desired columns
        df_cleaning['League'] = league
        df_cleaning['Season'] = season

        # Creating a list of the headers
        header_lst = list(df_cleaning)
        
        # Ensuring headers are the correct type
        for header in header_lst:
            
            if header == 'Season':
                df_cleaning[header] = df_cleaning[header].astype(str)
            else:
                def convert_value(value):
                 
                    try:
                        return int(value)
                    except ValueError:
                        try:
                            return float(value)
                        except ValueError:
                            return str(value)

                # Apply the function to the DataFrame column
                df_cleaning[header] = df_cleaning[header].apply(convert_value)

        # Append the manipulated DataFrame to the final DataFrame
        final_df = pd.concat([final_df, df_cleaning], ignore_index=True, sort=False)
        print(f'csv: {csv} concatenated to a combined csv')

    # Creating desired column namings
    new_headers = [col.lower().replace(' ', '_') for col in final_df.columns]
    final_df.columns = new_headers

    final_df.to_csv(f'staging_data/combined_team_{keyword}_cleaned.csv', index=False)

Current working directory: /Users/michaelbutterfield/fb_dataeng_project
Looking for files in: raw_data/team_raw_data/team_stats_data_raw/**.csv
Found files: ['raw_data/team_raw_data/team_stats_data_raw/2023_2024_Super_Lig_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Danish_Superliga_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2023_2024_Belgian_Pro_League_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Serie_A_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2023_2024_Primeira_Liga_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Bundesliga_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2021_2022_Danish_Superliga_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Bulgarian_First_League_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2021_2022_Belgian_Pro_League_Stats_Team.csv', 'raw_data/team_raw_data/team_stats_data_raw/2021_2022_Serbian_Sup

In [184]:

# Cleaning the player transfer value tables
file_path = 'raw_data/transfer_value_data_raw'

transfer_value_table_lst = ['atck_transfer_values_raw.csv',
                            'def_transfer_values_raw.csv',
                            'mid_transfer_values_raw.csv',
                            'gkp_transfer_values_raw.csv'
                            ]

df_concat = pd.DataFrame()

for table in transfer_value_table_lst:

    table_name = f'{table[:-7]}cleaned.csv'
    print(table_name)

    # Reading in the csv
    df = pd.read_csv(f'{file_path}/{table}')

    # Remove all rows with any missing values
    df.dropna(inplace=True)

    # Convert 'name' column from 'first-last' to 'First Last'
    df['Player'] = df['Player'].str.replace('-', ' ').str.title()

    def convert_date_to_season(date_str):
        date = pd.to_datetime(date_str, dayfirst=True)
        year = date.year
        month = date.month

        if month >= 6:
            season = f"{year}_{year + 1}"
        else:
            season = f"{year - 1}_{year}"
        
        return season

    # Apply function to Date column and rename it to Season
    df['season'] = df['Date'].apply(convert_date_to_season)
    df.drop('Date', axis=1, inplace=True)

    # Converting the market value column to int
    # Remove currency symbols and letters, then convert to float and scale
    df['Record MV'] = df['Record MV'].str.replace(r'[€m]', '', regex=True).astype(float) * 1e6
    df['Record MV'] = df['Record MV'].astype(int)

    df['Age'] = df['Age'].astype(int)
    # Append all the datasets for a complete transfer value table
    
    # Renaming value column
    df.rename(columns={'Player': 'player', 'Record MV': 'max_market_value'}, inplace=True)

    df_concat = pd.concat([df_concat, df], axis=0)

# Creating desired column namings
new_headers = [col.lower().replace(' ', '_') for col in df_concat.columns]
df_concat.columns = new_headers

df_concat.to_csv('staging_data/combined_player_max_value_cleaned.csv', index=False)


atck_transfer_values_cleaned.csv
def_transfer_values_cleaned.csv
mid_transfer_values_cleaned.csv
gkp_transfer_values_cleaned.csv


In [174]:
# Not needed at the minute, team names don't match other tables
# Cleaning and Transforming the club data
from fuzzywuzzy import process, fuzz
from difflib import SequenceMatcher

      
# Load your tables into pandas DataFrames
df = pd.read_csv('raw_data/club_raw_data/club_rank_data.csv')
df2 = pd.read_csv('staging_data/combined_team_stats_cleaned.csv')

# Mapping dictionary: add more leagues as needed
country_mapping = {
    'Austrian_Bundesliga': 'Austria',
    'Belgian_Pro_League': 'Belgium',
    'Bulgarian_First_League': 'Bulgaria',
    'Bundesliga': 'Germany',
    'Danish_Superliga': 'Denmark',
    'Ekstraklasa': 'Poland',
    'Eredivisie': 'Netherlands',
    'La_Liga': 'Spain',
    'Ligue_1': 'France',
    'Premier_League': 'England',
    'Primeira_Liga': 'Portugal',
    'Serbian_SuperLiga': 'Serbia',
    'Serie_A': 'Italy',
    'Super_League_Greece': 'Greece',
    'Super_Lig': 'Turkey'
}
country_list = list(country_mapping.values())

# Filter the DataFrame
df = df[df['Country'].isin(country_list)]

# Function to match club names using multiple algorithms
def match_club_name(club_name, choices):
    matches = []
    
    # FuzzyWuzzy matching
    fw_match = process.extractOne(club_name, choices, scorer=fuzz.token_sort_ratio)
    if fw_match:
        matches.append((fw_match[0], fw_match[1]))

    # Difflib matching
    seq_match = max(choices, key=lambda x: SequenceMatcher(None, club_name, x).ratio())
    seq_score = SequenceMatcher(None, club_name, seq_match).ratio() * 100
    matches.append((seq_match, seq_score))

    # Get the best match
    match, score = max(matches, key=lambda x: x[1])
    return match, score

# Applying the matching function
df['Match'], df['Score'] = zip(*df['Club Name'].apply(lambda x: match_club_name(x, df2['squad'])))

# Function to check if any string in Column B is in club_words
def check_word_inclusion(row, squad_list):
    club_words = [word for word in row['Club Name'].split() if len(word) > 2]
    match_words = row['Match'].split()
    score = row['Score']
    
    # Check word inclusion between Club Name and Match
    if any(word.lower() in row['Club Name'].lower() for word in match_words) or score > 75.0:
        return row['Match']
    
    # Check if any squad name is in club_words
    for squad in squad_list:
        squad_words = squad.split()
        if any(word.lower() in row['Club Name'].lower() for word in squad_words):
            return squad
    
    return 'No Match'

# Apply the function to create a new column
df['Compare'] = df.apply(lambda row: check_word_inclusion(row, df2['squad']), axis=1)

# Update 'club' column based on conditions
df['club'] = df.apply(lambda row: row['Match'] if row['Match'] in row['Club Name'] or row['Score'] > 75.0 else (row['Compare'] if any(word in row['Compare'].split() for word in row['Club Name'].split()) else 'No Match'), axis=1)

# Make manual adjustments
df.loc[df['Club Name'] == 'PAOK Thessaloniki', 'club'] = 'PAOK'
df.loc[df['Club Name'] == 'Borussia Mönchengladbach', 'club'] = 'Gladbach'
df.loc[df['Club Name'] == 'Stade Brestois 29', 'club'] = 'Brest'
df.loc[df['Club Name'] == 'Aarhus GF', 'club'] = 'AGF'
df.loc[df['Club Name'] == 'Wolverhampton Wanderers', 'club'] = 'Wolves'


df = df.dropna(subset=['Overall'])

# Melt the DataFrame to reshape it
df = pd.melt(df, id_vars=['club', 'Country'], value_vars=['21/22', '22/23', '23/24'], 
                    var_name='Season', value_name='Uefa Rating')

# Map the season column to the correct format
season_mapping = {
    '21/22': '2021_2022',
    '22/23': '2022_2023',
    '23/24': '2023_2024'
}

df['Season'] = df['Season'].map(season_mapping)
df['confederation'] = 'UEFA'

df.to_csv('test4.csv')
# Convert 'Uefa Rating' to numeric, forcing errors to NaN
df['Uefa Rating'] = pd.to_numeric(df['Uefa Rating'], errors='coerce')

# Calculate overall ranking for each club
df['Overall Uefa Rating'] = df.groupby('club')['Uefa Rating'].transform('sum')

# Rank the clubs within each season based on their UEFA ranking
df['Uefa Season Ranking'] = df.groupby('Season')['Uefa Rating'].rank(ascending=False, method='min')

# Drop duplicates based on 'Club' within each 'Season'
df_unique = df.drop_duplicates(subset=['Season', 'club'])

# Rank the clubs within each season
df_unique['Overall Uefa Club Rank'] = df_unique.groupby('Season')['Overall Uefa Rating'].rank(ascending=False, method='min')

# Merge the ranks back to the original DataFrame
df = df.merge(df_unique[['Season', 'club', 'Overall Uefa Club Rank']], on=['Season', 'club'], how='left')

df.rename(columns={'club': 'squad'}, inplace=True)

df['Uefa Rating'] = df['Uefa Rating'].fillna(-1).astype('int').astype('Int64')
df['Overall Uefa Rating'] = df['Overall Uefa Rating'].fillna(-1).astype('int').astype('Int64')
df['Uefa Season Ranking'] = df['Uefa Season Ranking'].fillna(-1).astype('int').astype('Int64')
df['Overall Uefa Club Rank'] = df['Overall Uefa Club Rank'].fillna(-1).astype('int').astype('Int64')
# Creating desired column namings
new_headers = [col.lower().replace(' ', '_') for col in df.columns]

df.columns = new_headers

df.to_csv('staging_data/combined_team_uefa_rank_cleaned.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['Overall Uefa Club Rank'] = df_unique.groupby('Season')['Overall Uefa Rating'].rank(ascending=False, method='min')


In [114]:
# Cleaning and transforming competition data

# Find all CSV files containing the stat key word in their title
csv_files = glob.glob(f'raw_data/competitions_raw_data/**.csv')

final_df_dom_comp = pd.DataFrame()
final_df_intl_comp = pd.DataFrame()

for file in csv_files:
    print(file)
    # Reading the csv
    df = pd.read_csv(file)
    
    match = re.search(r'(\d{4}_\d{4})_raw', file)
    season = match.group(1)

    df['Season'] = season

    # Dropping all columns with no values
    df.dropna(axis=1, how='all', inplace=True)    
    
    # Dropping all rows where column gender not equal to M
    df = df[df['Gender'] == 'M']

    # Dropping unwanted columns
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)

    if 'intl' not in file:

        print('not intl:', file)
        df['Country'] = df['Country'].apply(lambda x: re.sub(r'^[a-z]+\s+', '', x))

        # Append the manipulated DataFrame to the final DataFrame
        final_df_dom_comp = pd.concat([final_df_dom_comp, df], ignore_index=True, sort=False)
        print(f'file: concatenated to a combined csv')

    else:
        print('intl:', file)

        # Append the manipulated DataFrame to the final DataFrame
        final_df_intl_comp = pd.concat([final_df_intl_comp, df], ignore_index=True, sort=False)
        print(f'file: concatenated to a combined csv')


# Creating desired column namings
new_headers = [col.lower().replace(' ', '_') for col in final_df_dom_comp.columns]
final_df_dom_comp.columns = new_headers

final_df_dom_comp.to_csv(f'staging_data/combined_dom_comp_cleaned.csv', index=False)

# Creating desired column namings
new_headers = [col.lower().replace(' ', '_') for col in final_df_intl_comp.columns]
print(new_headers)
final_df_intl_comp.columns = new_headers

final_df_intl_comp.to_csv(f'staging_data/combined_intl_comp_cleaned.csv', index=False)



raw_data/competitions_raw_data/comps_1_fa_club_league_senior_2022_2023_raw.csv
not intl: raw_data/competitions_raw_data/comps_1_fa_club_league_senior_2022_2023_raw.csv
file: concatenated to a combined csv
raw_data/competitions_raw_data/comps_intl_club_cup_2021_2022_raw.csv
intl: raw_data/competitions_raw_data/comps_intl_club_cup_2021_2022_raw.csv
file: concatenated to a combined csv
raw_data/competitions_raw_data/comps_fa_club_cup_2021_2022_raw.csv
not intl: raw_data/competitions_raw_data/comps_fa_club_cup_2021_2022_raw.csv
file: concatenated to a combined csv
raw_data/competitions_raw_data/comps_intl_club_cup_2023_2024_raw.csv
intl: raw_data/competitions_raw_data/comps_intl_club_cup_2023_2024_raw.csv
file: concatenated to a combined csv
raw_data/competitions_raw_data/comps_fa_club_cup_2023_2024_raw.csv
not intl: raw_data/competitions_raw_data/comps_fa_club_cup_2023_2024_raw.csv
file: concatenated to a combined csv
raw_data/competitions_raw_data/comps_1_fa_club_league_senior_2023_2024_

In [115]:
# Appending nation data

# Function to check if a column can be converted to numeric
def can_convert_to_numeric(series):
    try:
        pd.to_numeric(series)
        return True
    except ValueError:
        return False

nation_df = pd.DataFrame()

csv_files = glob.glob(f'raw_data/nation_raw_data/*rank*.csv')

for file in csv_files:

    dt_search = re.search(r'data_(\d{4})-\d{2}-\d{2}_raw', file)
    year = int(dt_search.group(1))
    season = f"{year-1}_{year}"

    print(season)
    print(file)
    # Reading the csv
    df = pd.read_csv(file)

    df['season'] = season

    df['nation'] = df['nation'].str.replace('Türkiye', 'Turkey').replace('Bosnia-Herzegovina', 'Bosnia and Herzegovina').replace('Brunei Darussalam', 'Brunei').replace('The Gambia', 'Gambia')

        # Check if 'confederation' can be converted to numeric
    if can_convert_to_numeric(df['confederation']):

        # Join with nation_rank_data_2022-09-20_raw as it's known it has confederation data
         # Drop the confederation column from df
        df.drop(columns=['confederation'], inplace=True)
        
        # Read table b
        nat_df = pd.read_csv('raw_data/nation_raw_data/nation_rank_data_2022-09-20_raw.csv')
        nat_df['season'] = season
        nat_df['nation'] = nat_df['nation'].str.replace('Türkiye', 'Turkey').replace('Bosnia-Herzegovina', 'Bosnia and Herzegovina').replace('Brunei Darussalam', 'Brunei').replace('The Gambia', 'Gambia')
        
        # Take only the confederation column from b
        nat_df = nat_df[['nation', 'confederation']]
        
        # Join df with b on the nation column
        df = df.merge(nat_df, on='nation', how='left')

    nation_df = pd.concat([df, nation_df], ignore_index=True, sort=False)


# Creating desired column namings
new_headers = [col.lower().replace(' ', '_') for col in nation_df.columns]
nation_df.columns = new_headers

nation_df.to_csv(f'staging_data/combined_nation_data_cleaned.csv', index=False)

2021_2022
raw_data/nation_raw_data/nation_rank_data_2022-09-20_raw.csv
2023_2024
raw_data/nation_raw_data/nation_rank_data_2024-09-19_raw.csv
2022_2023
raw_data/nation_raw_data/nation_rank_data_2023-09-21_raw.csv


In [39]:
import pandas as pd
import glob
import re

# Initialize an empty DataFrame
final_df = pd.DataFrame()

# Path to your CSV files
path = f'raw_data/team_raw_data/team_stats_data_raw/*placing*.csv'

# Print the path to check if it's correct
print("Looking for files in:", path)

csv_files = glob.glob(path)
print("Found files:", csv_files)

for csv in csv_files:

        # Extracting the season
        season_match = re.search(r'(\d{4}_\d{4})', csv)
        season = season_match.group(1) if season_match else None
        print("Season:", season)

        # Extracting the league
        league_match = re.search(r'\d{4}_\d{4}_(.*?)_Stats', csv, re.IGNORECASE)
        league = league_match.group(1) if league_match else None

        # Reading in the csv
        df_cleaning = pd.read_csv(csv)

        # Dropping unnecessary columns
        df_cleaning = df_cleaning.dropna(how='all')

        def replace_values(note):
        
            if 'champions league' in str(note).lower():
                return 'CL'
            elif 'europa league' in str(note).lower():
                return 'EL'
            elif 'europa conference league' in str(note).lower():
                return 'ECL'
            elif 'relegation round' in str(note).lower():
                return 'RR'           
            elif 'relegated' in str(note).lower():
                return 'R'
            elif 'play off' in str(note).lower() or 'playoff' in str(note).lower() or 'play-off' in str(note).lower():
                return 'PO'
            elif 'championship round' in str(note).lower():
                return 'CR'
            else:
                return None

        # Creating new, desired columns
        df_cleaning['League'] = league
        df_cleaning['Season'] = season
        df_cleaning['Notes'] = df_cleaning['Notes'].apply(replace_values)
        df_cleaning['rank'] = range(1, len(df_cleaning) +1)
        df_cleaning['Attendance/Game'] = df_cleaning.get('Attendance/Game', pd.Series()).astype(str).str.replace(',', '')

        # Reset index to ensure unique index values
        df_cleaning = df_cleaning.reset_index(drop=True)

        # Remove duplicate columns
        df_cleaning = df_cleaning.loc[:, ~df_cleaning.columns.duplicated()]

        # Drop duplicate rows
        df_cleaning = df_cleaning.drop_duplicates()

        # Creating a list of the headers
        header_lst = list(df_cleaning)

        # Ensuring headers are the correct type
        for header in header_lst:
            
            if header in ('Top Team Scorer','Goalkeeper','Notes','League','Season','Squad'):
                df_cleaning[header] = df_cleaning[header].astype(str)
            elif header in ('Points/Match','xG','xG Allowed','xG Difference','xG Difference/90'):
                df_cleaning[header] = pd.to_numeric(df_cleaning[header], errors='coerce')
            else:
                df_cleaning[header] = pd.to_numeric(df_cleaning[header], errors='coerce').astype('Int64')

                
        
        # Append the manipulated DataFrame to the final DataFrame
        final_df = pd.concat([final_df, df_cleaning], ignore_index=True, sort=False)
        print(f'csv: {csv} concatenated to a combined csv')

# Creating desired column namings
new_headers = [col.lower().replace(' ', '_') for col in final_df.columns]
final_df.columns = new_headers

final_df.to_csv(f'staging_data/combined_team_placing_cleaned.csv', index=False)


Looking for files in: raw_data/team_raw_data/team_stats_data_raw/*placing*.csv
Found files: ['raw_data/team_raw_data/team_stats_data_raw/2023_2024_Bulgarian_First_League_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Primeira_Liga_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Serbian_SuperLiga_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Eredivisie_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Super_League_Greece_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2022_2023_Premier_League_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2021_2022_Super_Lig_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2023_2024_Super_Lig_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2023_2024_Serie_A_Stats_table_placings.csv', 'raw_data/team_raw_data/team_stats_data_raw/2021_

In [106]:
country_code_df = pd.read_csv('raw_data/nation_raw_data/fifa_country_codes.csv')

country_code_df.drop(columns=['Confederation'], inplace=True)
country_code_df['Country'] = country_code_df['Country'].str.replace('U.S.', 'United States').replace('East Timor', 'Timor-Leste').replace('DR Congo', 'Democratic Republic of the Congo').replace('Congo', 'Republic of the Congo')

# Function to clean the code column
def clean_code(code):
    return re.sub(r'\[\d+\]', '', code)

# Apply the function to the 'code' column
country_code_df['Code'] = country_code_df['Code'].apply(clean_code)

new_headers = [col.lower() for col in country_code_df.columns]

country_code_df.columns = new_headers

country_code_df.to_csv(f'staging_data/fifa_country_codes.csv', index=False)