# Aggregating all player data for in-depth analyses and comparisons

In [17]:
# libraries
import pandas as pd
import numpy as np
import time
import re

#### Getting the standard stats for all players playing for clubs from the Top5 leagues in 2018/19

In [18]:
# Load the club URLs for the top-5 leagues (2018-19 season); the text file is
# split into one list entry per line.
# A context manager is used so the file handle is closed as soon as it is read.
with open('../urls/top5_league_clubs.txt') as url_file:
    url_list = url_file.read().splitlines()

# Read every HTML table found at the first URL.
table = pd.read_html(url_list[0])

In [19]:
from urllib.parse import urlparse

# Scrape the "stats_gca_combined" table of every club page in `url_list`,
# tag each table with the club name parsed from its URL, and stack the
# results into a single dataframe `total_df`.

# One cleaned dataframe per successfully scraped URL.
dfs = []

# Delay between requests (in seconds); adjust this value as needed.
delay_between_requests = 1

for url in url_list:
    try:
        # Read the HTML table whose id is "stats_gca_combined" from the URL.
        tables = pd.read_html(url, attrs={"id": "stats_gca_combined"})
        print(tables[0].head())

        # Drop the last two rows. The explicit .copy() turns the slice into an
        # independent frame, so adding the "Club" column below does not write
        # into a view of the parsed table (avoids SettingWithCopyWarning).
        df = tables[0].iloc[:-2].copy()

        # Extract the club name from the URL using regex.
        club_name_match = re.search(r"all_comps/(.*?)-Stats-All-Competitions", url)
        if club_name_match is None:
            raise ValueError("Club name not found in URL pattern.")
        club_name = club_name_match.group(1).replace('-', ' ')

        # Create a new column with the club name and keep the frame.
        df['Club'] = club_name
        dfs.append(df)
    except Exception as e:
        # Best effort: report the failing URL and continue with the next one.
        print(f"Error reading data from {url}: {str(e)}")
    finally:
        # Wait after every attempt, successful or not, so a failing request is
        # not immediately followed by another one.
        time.sleep(delay_between_requests)

# Concatenate all dataframes into one. Fail with a clear message when nothing
# was scraped instead of pandas' generic "No objects to concatenate".
if not dfs:
    raise ValueError("No club tables could be read; nothing to concatenate.")
total_df = pd.concat(dfs, ignore_index=True)

In [6]:
# Print column names, non-null counts and dtypes of the aggregated frame.
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2630 entries, 0 to 2629
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   (Unnamed: 0_level_0, Player)    2630 non-null   object 
 1   (Unnamed: 1_level_0, Nation)    2628 non-null   object 
 2   (Unnamed: 2_level_0, Pos)       2630 non-null   object 
 3   (Unnamed: 3_level_0, Age)       2630 non-null   float64
 4   (Unnamed: 4_level_0, 90s)       2630 non-null   float64
 5   (Touches, Touches)              2541 non-null   float64
 6   (Touches, Def Pen)              2541 non-null   float64
 7   (Touches, Def 3rd)              2541 non-null   float64
 8   (Touches, Mid 3rd)              2541 non-null   float64
 9   (Touches, Att 3rd)              2541 non-null   float64
 10  (Touches, Att Pen)              2541 non-null   float64
 11  (Touches, Live)                 2541 non-null   float64
 12  (Take-Ons, Att)                 25

In [5]:
# Checkpoint: keep an independent copy of the aggregated frame so that
# `total_df` can be reset from it later.
total_df_saved = total_df.copy()

In [6]:
# Reset `total_df` to the checkpoint; a copy is taken so the checkpoint itself
# stays untouched by later edits.
total_df = total_df_saved.copy()

In [9]:
# Display (number of rows, number of columns).
total_df.shape


(2630, 29)

In [176]:
# Print column names, non-null counts and dtypes (shown before the rename below).
total_df.info()
# Relabel the column whose name is the empty string as "Club" (modifies the frame in place).
total_df.rename(columns={'': 'Club'}, inplace=True)
# access the SoT/90 column
# NOTE(review): no code follows the note above — TODO add the access or remove the note.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2225 non-null   object 
 1           2225 non-null   object 
 2   Nation  2224 non-null   object 
 3   Pos     2225 non-null   object 
 4   Age     2225 non-null   float64
 5   90s     2225 non-null   float64
 6   Gls     511 non-null    float64
 7   Sh      402 non-null    float64
 8   SoT     402 non-null    float64
 9   SoT%    302 non-null    float64
 10  Sh/90   402 non-null    float64
 11  SoT/90  402 non-null    float64
 12  G/Sh    302 non-null    float64
 13  G/SoT   227 non-null    float64
 14  Dist    152 non-null    float64
dtypes: float64(11), object(4)
memory usage: 260.9+ KB


In [8]:
# Keep every column except the last one (selected by position, not by name).
# NOTE(review): each re-run of this cell removes one more column — confirm
# which column is meant to be dropped and consider dropping it by name.
total_df = total_df.iloc[:, :-1]

In [16]:
# Preview the first 10 rows.
total_df.head(10)

Unnamed: 0,Player,Club,Nation,Pos,Age,90s,SCA,SCA90,PassLive,PassDead,...,Fld,Def,GCA,GCA90,PassLive.1,PassDead.1,TO,Sh,Fld.1,Def.1
0,Ederson,Manchester City,BRA,GK,24.0,55.3,7.0,0.15,4.0,3.0,...,0.0,0.0,2.0,0.04,1.0,1.0,0.0,0.0,0.0,0.0
1,Aymeric Laporte,Manchester City,ESP,DF,24.0,48.3,50.0,1.14,46.0,0.0,...,0.0,0.0,7.0,0.16,7.0,0.0,0.0,0.0,0.0,0.0
2,Kyle Walker,Manchester City,ENG,DF,28.0,48.1,75.0,1.91,69.0,0.0,...,2.0,1.0,5.0,0.13,5.0,0.0,0.0,0.0,0.0,0.0
3,Raheem Sterling,Manchester City,ENG,FW,23.0,45.8,198.0,4.89,133.0,3.0,...,14.0,1.0,34.0,0.84,21.0,1.0,6.0,0.0,6.0,0.0
4,Bernardo Silva,Manchester City,POR,"MF,FW",23.0,44.8,173.0,4.54,139.0,8.0,...,5.0,1.0,20.0,0.52,18.0,0.0,0.0,0.0,2.0,0.0
5,David Silva,Manchester City,ESP,MF,32.0,41.2,189.0,5.46,156.0,9.0,...,4.0,1.0,25.0,0.72,22.0,1.0,0.0,0.0,1.0,1.0
6,İlkay Gündoğan,Manchester City,GER,MF,27.0,38.0,125.0,4.13,89.0,27.0,...,2.0,0.0,16.0,0.53,15.0,1.0,0.0,0.0,0.0,0.0
7,Fernandinho,Manchester City,BRA,"MF,DF",33.0,37.4,88.0,2.62,77.0,0.0,...,4.0,4.0,11.0,0.33,9.0,0.0,0.0,0.0,1.0,1.0
8,Sergio Agüero,Manchester City,ARG,FW,30.0,37.2,107.0,3.25,73.0,1.0,...,7.0,2.0,22.0,0.67,17.0,0.0,2.0,2.0,1.0,0.0
9,John Stones,Manchester City,ENG,DF,24.0,31.7,20.0,0.82,19.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Flatten the two-level column header down to its second level.
# The guard makes the cell safe to re-run: once the columns are single-level,
# asking for level 1 would raise, so the assignment is skipped in that case.
# NOTE(review): names that repeat across first-level groups become duplicate
# labels after flattening — confirm this is acceptable downstream.
if total_df.columns.nlevels > 1:
    total_df.columns = total_df.columns.get_level_values(1)

In [7]:
# Relabel the column whose name is the empty string as "Club" (in place).
# NOTE(review): presumably the empty label is what the club column ends up
# with after the header is flattened — verify against the flattening cell.
total_df.rename(columns={'': 'Club'}, inplace=True)

In [15]:
# Preview the first 810 rows; the notebook display elides the middle of the frame.
total_df.head(810)

Unnamed: 0,Player,Club,Nation,Pos,Age,90s,Touches,Def Pen,Def 3rd,Mid 3rd,...,Carries,TotDist,PrgDist,PrgC,1/3,CPA,Mis,Dis,Rec,PrgR
0,Ederson,Manchester City,BRA,GK,24.0,55.3,1466.0,1234.0,1446.0,22.0,...,920.0,4269.0,2993.0,0.0,0.0,0.0,1.0,0.0,785.0,0.0
1,Aymeric Laporte,Manchester City,ESP,DF,24.0,48.3,4253.0,235.0,1235.0,2770.0,...,3213.0,17709.0,13281.0,118.0,103.0,2.0,16.0,7.0,3330.0,14.0
2,Kyle Walker,Manchester City,ENG,DF,28.0,48.1,3574.0,103.0,652.0,2186.0,...,2293.0,11207.0,5669.0,107.0,98.0,18.0,32.0,16.0,2625.0,114.0
3,Raheem Sterling,Manchester City,ENG,FW,23.0,45.8,2252.0,13.0,106.0,746.0,...,1741.0,10293.0,5519.0,209.0,133.0,114.0,94.0,118.0,1857.0,571.0
4,Bernardo Silva,Manchester City,POR,"MF,FW",23.0,44.8,2537.0,37.0,238.0,1088.0,...,1803.0,10898.0,5982.0,182.0,123.0,58.0,52.0,48.0,1876.0,325.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,Stefan Bell,Mainz 05,GER,DF,26.0,27.3,1497.0,152.0,685.0,757.0,...,853.0,4620.0,3015.0,24.0,17.0,0.0,11.0,11.0,892.0,8.0
806,Daniel Brosinski,Mainz 05,GER,DF,30.0,26.8,1573.0,73.0,388.0,681.0,...,665.0,3119.0,1818.0,47.0,46.0,8.0,25.0,10.0,793.0,112.0
807,Robin Quaison,Mainz 05,SWE,"FW,MF",24.0,25.6,663.0,6.0,37.0,299.0,...,472.0,2691.0,1033.0,36.0,35.0,15.0,38.0,58.0,512.0,122.0
808,Florian Müller,Mainz 05,GER,GK,20.0,25.0,909.0,758.0,907.0,2.0,...,454.0,2328.0,1440.0,0.0,0.0,0.0,0.0,0.0,377.0,0.0


In [4]:
# Make "Club" the second column (position 1). When the frame already has a
# "Club" column it is moved there; otherwise a new column of empty strings is
# inserted at that position.
club_column = total_df.pop("Club") if "Club" in total_df.columns else ""
total_df.insert(1, "Club", club_column)

# Optional steps, currently disabled:
# remove the club column
# total_df.drop(columns=['Club'], inplace=True)

# drop all rows that have NaN values in "Sh"
# total_df.dropna(subset=['Sh'], inplace=True)

In [10]:
# Clean the Nation values so that only the trailing whitespace-separated token
# is kept (presumably the uppercase country code — verify against the raw data).
# Taking the last token instead of the token at index 1 gives the same result
# for two-token values, and leaves an already-cleaned single-token value
# unchanged, so re-running this cell no longer turns the column into NaN.
total_df['Nation'] = total_df['Nation'].str.split().str[-1]

In [15]:
# List every column label that repeats an earlier label in the frame.
is_repeated_label = total_df.columns.duplicated()
print(total_df.columns[is_repeated_label])

Index(['PassLive', 'PassDead', 'TO', 'Sh', 'Fld', 'Def'], dtype='object')


In [216]:
# Drop rows where the value in the "Sh" column equals zero or is NaN, i.e. keep
# only rows with a present, non-zero "Sh".
# The earlier mask `~s.isna() == False` parsed as `(~s.isna()) == False`, which
# is just `s.isna()`; combined with `s != 0` it kept only the NaN rows — the
# opposite of the intent stated above.
# NOTE(review): if the frame carries duplicate "Sh" labels, total_df["Sh"] is a
# DataFrame rather than a Series — resolve the duplicates before filtering.
sh_values = total_df["Sh"]
total_df = total_df[sh_values.notna() & (sh_values != 0)]

In [12]:
# Write the frame to a CSV file in the current working directory, without the row index.
total_df.to_csv("t5_leagues_players_gca.csv", index=False)