# Aggregating all player data for in-depth analyses and comparisons

In [21]:
# libraries
import pandas as pd
import numpy as np
import time
import re

#### Getting the goal-specific stats for all players playing for clubs from the Top5 leagues in 2018/19

**First, I retrieve the data (1), then I rename the lower level index to club and delete the last column (matches) and delete the higher index row. The nation row gets cleaned too**

In [22]:
# creating list of urls for each top5 league club from the 18-19 season
url_list = open('../urls/top5_league_clubs.txt').read().splitlines()
#read the first url
table = pd.read_html(url_list[0])

HTTPError: HTTP Error 429: Too Many Requests

In [20]:
from urllib.parse import urlparse
# Initialize an empty list to store dataframes
dfs = []

# Set the delay between requests (in seconds)
delay_between_requests = 1  # Adjust this value as needed

for url in url_list:
    try:
        # Read HTML tables from the URL
        tables = pd.read_html(url, attrs={"id": "stats_gca_combined"})

        df = tables[0]
        print(tables[0].head())
        # Drop the last two rows
        df = df.iloc[:-2]
        
        # Extract club name from the URL using regex
        club_name_match = re.search(r"all_comps/(.*?)-Stats-All-Competitions", url)
        if club_name_match:
            club_name = club_name_match.group(1).replace('-', ' ')
        else:
            raise ValueError("Club name not found in URL pattern.")
        
        # Create a new column with the club name
        df['Club'] = club_name
        
        # Append the dataframe to the list
        dfs.append(df)
        
        # Introduce a delay before making the next request
        time.sleep(delay_between_requests)
    except Exception as e:
        print(f"Error reading data from {url}: {str(e)}")

# Concatenate all dataframes into one
total_df = pd.concat(dfs, ignore_index=True)

Error reading data from https://fbref.com/en/squads/b8fd03ef/2018-2019/all_comps/Manchester-City-Stats-All-Competitions: HTTP Error 429: Too Many Requests
Error reading data from https://fbref.com/en/squads/822bd0ba/2018-2019/all_comps/Liverpool-Stats-All-Competitions: HTTP Error 429: Too Many Requests
Error reading data from https://fbref.com/en/squads/cff3d9bb/2018-2019/all_comps/Chelsea-Stats-All-Competitions: HTTP Error 429: Too Many Requests
Error reading data from https://fbref.com/en/squads/361ca564/2018-2019/all_comps/Tottenham-Hotspur-Stats-All-Competitions: HTTP Error 429: Too Many Requests
Error reading data from https://fbref.com/en/squads/18bb7c10/2018-2019/all_comps/Arsenal-Stats-All-Competitions: HTTP Error 429: Too Many Requests
Error reading data from https://fbref.com/en/squads/19538871/2018-2019/all_comps/Manchester-United-Stats-All-Competitions: HTTP Error 429: Too Many Requests
Error reading data from https://fbref.com/en/squads/8cec06e1/2018-2019/all_comps/Wolverh

ValueError: No objects to concatenate

In [6]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2630 entries, 0 to 2629
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   (Unnamed: 0_level_0, Player)    2630 non-null   object 
 1   (Unnamed: 1_level_0, Nation)    2628 non-null   object 
 2   (Unnamed: 2_level_0, Pos)       2630 non-null   object 
 3   (Unnamed: 3_level_0, Age)       2630 non-null   float64
 4   (Unnamed: 4_level_0, 90s)       2630 non-null   float64
 5   (Touches, Touches)              2541 non-null   float64
 6   (Touches, Def Pen)              2541 non-null   float64
 7   (Touches, Def 3rd)              2541 non-null   float64
 8   (Touches, Mid 3rd)              2541 non-null   float64
 9   (Touches, Att 3rd)              2541 non-null   float64
 10  (Touches, Att Pen)              2541 non-null   float64
 11  (Touches, Live)                 2541 non-null   float64
 12  (Take-Ons, Att)                 25

In [5]:
total_df_saved = total_df.copy()

In [6]:
total_df = total_df_saved.copy()

In [4]:
if 'Club' in total_df.columns:
    total_df.insert(1, "Club", total_df.pop("Club"))
else:
    total_df.insert(1, "Club", "")

# remove the club column
# total_df.drop(columns=['Club'], inplace=True)

#drop all rows with that have NaN values in "Sh"
# total_df.dropna(subset=['Sh'], inplace=True)

In [176]:
total_df.info()
total_df.rename(columns={'': 'Club'}, inplace=True)
# aceesss the sot/90 column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2225 non-null   object 
 1           2225 non-null   object 
 2   Nation  2224 non-null   object 
 3   Pos     2225 non-null   object 
 4   Age     2225 non-null   float64
 5   90s     2225 non-null   float64
 6   Gls     511 non-null    float64
 7   Sh      402 non-null    float64
 8   SoT     402 non-null    float64
 9   SoT%    302 non-null    float64
 10  Sh/90   402 non-null    float64
 11  SoT/90  402 non-null    float64
 12  G/Sh    302 non-null    float64
 13  G/SoT   227 non-null    float64
 14  Dist    152 non-null    float64
dtypes: float64(11), object(4)
memory usage: 260.9+ KB


In [8]:
total_df = total_df.iloc[:, :-1]

In [9]:
total_df.columns = total_df.columns.get_level_values(1)

In [10]:
# cleaning Nation values so only the uppercase string will be kept
total_df['Nation'] = total_df['Nation'].str.split().str[1]

In [15]:
# print out all column names which are duplicate in the df
print(total_df.columns[total_df.columns.duplicated()])

Index(['PassLive', 'PassDead', 'TO', 'Sh', 'Fld', 'Def'], dtype='object')


In [216]:
# Drop rows where the value for "Min" column equals zero or is NaN
total_df = total_df[(total_df["Sh"] != 0) & (~total_df["Sh"].isna() == False)]

In [12]:
total_df.to_csv("t5_leagues_players_gca.csv", index=False)