# Aggregating all player data for in-depth analyses and comparisons

In [1]:
# libraries
import pandas as pd
import numpy as np
import time
import re

#### Getting the standard stats for all players playing for clubs from the Top5 leagues in 2018/19

In [2]:
# Build the list of club URLs (one per line) for the top-5 league clubs of the 18-19 season.
# A context manager closes the file handle deterministically; blank or
# whitespace-only lines are skipped so they never reach pd.read_html as empty URLs.
with open('urls/top5_league_clubs.txt') as url_file:
    url_list = [line.strip() for line in url_file if line.strip()]
# Parse every HTML table found on the first club page
table = pd.read_html(url_list[0])

In [3]:
# Preview the first five rows of every table parsed from the page,
# labelled with the table's position in the list.
separator = "----------------------"
for table_index, parsed_table in enumerate(table):
    print("TABLE: " + str(table_index))
    print(parsed_table.head(5))
    for _ in range(3):
        print(separator)

TABLE: 0
  Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0 Unnamed: 3_level_0  \
              Player             Nation                Pos                Age   
0            Ederson             br BRA                 GK                 24   
1    Aymeric Laporte             es ESP                 DF                 24   
2        Kyle Walker            eng ENG                 DF                 28   
3    Raheem Sterling            eng ENG                 FW                 23   
4     Bernardo Silva             pt POR              MF,FW                 23   

  Unnamed: 4_level_0 Playing Time               Performance        ...  \
                  MP       Starts     Min   90s         Gls   Ast  ...   
0                 55           55  4980.0  55.3         0.0   2.0  ...   
1                 51           49  4351.0  48.3         5.0   3.0  ...   
2                 52           47  4333.0  48.1         2.0   1.0  ...   
3                 51           45  4121.0  45.8      

In [None]:
from urllib.parse import urlparse
# Scrape one player table per club URL and combine them into `total_df`.

# Seconds to wait after every request, whether it succeeded or failed.
delay_between_requests = 5  # Adjust this value as needed

# Position of the wanted table in the list returned by pd.read_html.
# The position is not stable across club pages (the combined frame produced by
# a purely positional lookup mixes differently-shaped tables), so it is only a
# first guess that is validated against REQUIRED_COLUMNS below.
PREFERRED_TABLE_INDEX = 19

# Lowest-level column labels the wanted table must contain.
REQUIRED_COLUMNS = {"Player", "Sh", "SoT"}

# The club name is the URL segment between "all_comps/" and "-Stats-All-Competitions".
CLUB_NAME_PATTERN = re.compile(r"all_comps/(.*?)-Stats-All-Competitions")


def _has_required_columns(candidate):
    """Return True if the table's lowest header level contains every required label."""
    return REQUIRED_COLUMNS.issubset(candidate.columns.get_level_values(-1))


def _select_player_table(tables):
    """Pick the table to keep from the tables parsed off one club page.

    Returns a ``(table, used_fallback)`` tuple. The table at
    PREFERRED_TABLE_INDEX is used when it has the required columns; otherwise
    the first table that has them is used and ``used_fallback`` is True.

    Raises:
        ValueError: if no table on the page has the required columns.
    """
    if len(tables) > PREFERRED_TABLE_INDEX and _has_required_columns(tables[PREFERRED_TABLE_INDEX]):
        return tables[PREFERRED_TABLE_INDEX], False
    for candidate in tables:
        if _has_required_columns(candidate):
            # NOTE(review): confirm the first match is the all-competitions table.
            return candidate, True
    raise ValueError(f"No table with columns {sorted(REQUIRED_COLUMNS)} found on page.")


def _club_name_from_url(url):
    """Extract the club name from the URL, replacing hyphens with spaces.

    Raises:
        ValueError: if the URL does not follow the expected pattern.
    """
    club_name_match = CLUB_NAME_PATTERN.search(url)
    if not club_name_match:
        raise ValueError("Club name not found in URL pattern.")
    return club_name_match.group(1).replace('-', ' ')


# One dataframe per successfully scraped club
dfs = []
# URLs that could not be processed, kept so they can be inspected or retried
failed_urls = []

for url in url_list:
    try:
        club_name = _club_name_from_url(url)

        # Read HTML tables from the URL
        tables = pd.read_html(url)
        df, used_fallback = _select_player_table(tables)
        if used_fallback:
            print(f"Note: table {PREFERRED_TABLE_INDEX} did not match for {club_name}; "
                  "used the first table with the required columns instead.")

        # Drop the last two rows (NOTE(review): presumably the squad/opponent
        # total rows - confirm). The explicit copy avoids a
        # SettingWithCopyWarning when the Club column is added to the slice.
        df = df.iloc[:-2].copy()

        # Create a new column with the club name
        df['Club'] = club_name

        # Append the dataframe to the list
        dfs.append(df)
        # One status line per club instead of dumping every table's head()
        print(f"Scraped {len(df)} player rows for {club_name}")
    except Exception as e:
        # Best effort: report the failure and continue with the next club
        failed_urls.append(url)
        print(f"Error reading data from {url}: {str(e)}")
    finally:
        # Always pause before the next request, including after a failure,
        # so a rejected request is not immediately followed by another one.
        time.sleep(delay_between_requests)

if not dfs:
    raise ValueError("No club tables could be scraped; see the errors printed above.")

# Concatenate all dataframes into one
total_df = pd.concat(dfs, ignore_index=True)

In [122]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2225 non-null   object 
 1   Club    2225 non-null   object 
 2   Nation  2224 non-null   object 
 3   Pos     2225 non-null   object 
 4   Age     2225 non-null   float64
 5   90s     2225 non-null   float64
 6   Gls     511 non-null    float64
 7   Sh      402 non-null    float64
 8   SoT     402 non-null    float64
 9   SoT%    302 non-null    float64
 10  Sh/90   402 non-null    float64
 11  SoT/90  402 non-null    float64
 12  G/Sh    302 non-null    float64
 13  G/SoT   227 non-null    float64
 14  Dist    152 non-null    float64
dtypes: float64(11), object(4)
memory usage: 260.9+ KB


In [17]:
# Keep an independent copy of the combined frame so it can be restored later
total_df_saved = total_df.copy()

In [155]:
# Restore the working frame from the saved copy (discards changes made to total_df since the snapshot)
total_df = total_df_saved.copy()

In [156]:
# (number of rows, number of columns) of the working frame
total_df.shape


(2225, 72)

In [140]:
# Summarize the frame; this runs before the rename below, so the summary
# still lists the column under its empty-string name.
total_df.info()
# Give the column whose name is the empty string the name 'Club' (modifies total_df in place)
total_df.rename(columns={'': 'Club'}, inplace=True)
# access the SoT/90 column  (NOTE(review): no code follows this comment in the cell)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  2225 non-null   object 
 1           2225 non-null   object 
 2   Nation  2224 non-null   object 
 3   Pos     2225 non-null   object 
 4   Age     2225 non-null   float64
 5   90s     2225 non-null   float64
 6   Gls     511 non-null    float64
 7   Sh      402 non-null    float64
 8   SoT     402 non-null    float64
 9   SoT%    302 non-null    float64
 10  Sh/90   402 non-null    float64
 11  SoT/90  402 non-null    float64
 12  G/Sh    302 non-null    float64
 13  G/SoT   227 non-null    float64
 14  Dist    152 non-null    float64
dtypes: float64(11), object(4)
memory usage: 260.9+ KB


In [132]:
# Keep only the first 15 columns with a single slice.
# The previous loop (`for i in range(14, 71)`) removed one trailing column per
# iteration, 57 in total, which reduces the 72-column frame to these same 15
# columns. Unlike that loop, this slice is safe to re-run: running it again
# does not remove any further columns.
N_LEADING_COLUMNS_TO_KEEP = 15
total_df = total_df.iloc[:, :N_LEADING_COLUMNS_TO_KEEP]

In [141]:
# Display the first ten rows of the working frame
total_df.head(10)

Unnamed: 0,Player,Club,Nation,Pos,Age,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist
0,Ederson,Manchester City,BRA,GK,24.0,55.3,0.0,0.0,0.0,,0.0,0.0,,,
1,Aymeric Laporte,Manchester City,ESP,DF,24.0,48.3,5.0,36.0,9.0,25.0,0.74,0.19,0.14,0.56,15.0
2,Kyle Walker,Manchester City,ENG,DF,28.0,48.1,2.0,30.0,6.0,20.0,0.62,0.12,0.07,0.33,27.8
3,Raheem Sterling,Manchester City,ENG,FW,23.0,45.8,25.0,113.0,55.0,48.7,2.47,1.2,0.22,0.45,14.2
4,Bernardo Silva,Manchester City,POR,"MF,FW",23.0,44.8,13.0,85.0,35.0,41.2,1.9,0.78,0.15,0.37,15.9
5,David Silva,Manchester City,ESP,MF,32.0,41.2,10.0,77.0,25.0,32.5,1.87,0.61,0.13,0.4,13.7
6,İlkay Gündoğan,Manchester City,GER,MF,27.0,38.0,6.0,67.0,13.0,19.4,1.76,0.34,0.09,0.46,20.1
7,Fernandinho,Manchester City,BRA,"MF,DF",33.0,37.4,1.0,40.0,8.0,20.0,1.07,0.21,0.03,0.13,22.9
8,Sergio Agüero,Manchester City,ARG,FW,30.0,37.2,32.0,166.0,67.0,40.4,4.47,1.8,0.17,0.42,15.4
9,John Stones,Manchester City,ENG,DF,24.0,31.7,0.0,10.0,3.0,30.0,0.32,0.09,0.0,0.0,10.1


In [133]:
# Replace the multi-level column header with its level-1 labels only.
# A column whose level-1 label is empty ends up with an empty-string name.
total_df.columns = total_df.columns.get_level_values(1)

In [131]:
# Rename the empty-string column to 'Club' (in place).
# NOTE(review): `df_clean_total` is not defined in any cell visible here; another
# cell applies the same rename to `total_df` - confirm which frame is intended.
df_clean_total.rename(columns={'': 'Club'}, inplace=True)

In [157]:
# Display the first 810 rows of the working frame (the notebook truncates the rendering)
total_df.head(810)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Standard,Standard,Standard,Standard,Standard,...,SCA Types,GCA,GCA,GCA Types,GCA Types,GCA Types,GCA Types,GCA Types,GCA Types,Unnamed: 21_level_0
Unnamed: 0_level_1,Player,Nation,Pos,Age,90s,Gls,Sh,SoT,SoT%,Sh/90,...,Def,GCA,GCA90,PassLive,PassDead,TO,Sh,Fld,Def,Matches
0,Ederson,br BRA,GK,24.0,55.3,0.0,0.0,0.0,,0.00,...,,,,,,,,,,
1,Aymeric Laporte,es ESP,DF,24.0,48.3,5.0,36.0,9.0,25.0,0.74,...,,,,,,,,,,
2,Kyle Walker,eng ENG,DF,28.0,48.1,2.0,30.0,6.0,20.0,0.62,...,,,,,,,,,,
3,Raheem Sterling,eng ENG,FW,23.0,45.8,25.0,113.0,55.0,48.7,2.47,...,,,,,,,,,,
4,Bernardo Silva,pt POR,"MF,FW",23.0,44.8,13.0,85.0,35.0,41.2,1.90,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,Simon Rhein,de GER,MF,20.0,6.8,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
806,Törles Knöll,de GER,"FW,MF",20.0,5.1,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
807,Kevin Goden,de GER,"DF,MF",19.0,4.0,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches
808,Timmy Tillman,de GER,"FW,MF",19.0,2.3,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [130]:
# Make "Club" the second column (position 1): an existing "Club" column is
# moved there, otherwise a new column filled with empty strings is created.
club_values = total_df.pop("Club") if 'Club' in total_df.columns else ""
total_df.insert(1, "Club", club_values)

# remove the club column
# total_df.drop(columns=['Club'], inplace=True)

# drop all rows that have NaN values in "Sh"
# total_df.dropna(subset=['Sh'], inplace=True)

In [135]:
# Clean Nation values such as "br BRA" so only the trailing uppercase code is kept.
# Taking the LAST whitespace-separated token (instead of the token at index 1)
# gives the same result for two-token values, and leaves a value that is already
# a bare code unchanged. Re-running the cell therefore no longer turns cleaned
# values into NaN. Missing values stay missing.
total_df['Nation'] = total_df['Nation'].str.split().str[-1]

In [154]:
# Show all rows whose "Club" value is Girona
is_girona = total_df['Club'] == 'Girona'
total_df.loc[is_girona]

Unnamed: 0,Player,Club,Nation,Pos,Age,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist
1241,Bernardo Espinosa,Girona,COL,DF,29.0,35.0,,,,,,,,,
1242,Juanpe,Girona,ESP,DF,27.0,33.2,,,,,,,,,
1243,Portu,Girona,ESP,"FW,MF",26.0,32.8,,,,,,,,,
1244,Pere Pons,Girona,ESP,MF,25.0,31.9,,,,,,,,,
1245,Cristhian Stuani,Girona,URU,FW,31.0,31.9,,,,,,,,,
1246,Yassine Bounou,Girona,MAR,GK,27.0,31.8,,,,,,,,,
1247,Borja García,Girona,ESP,"MF,FW",27.0,30.5,,,,,,,,,
1248,Álex Granell,Girona,ESP,"MF,DF",29.0,30.5,,,,,,,,,
1249,Pedro Porro,Girona,ESP,DF,18.0,26.6,,,,,,,,,
1250,Pedro Alcalá,Girona,ESP,DF,29.0,24.3,,,,,,,,,


In [126]:
# Drop rows where the "Sh" column equals zero or is NaN.
# The previous condition `~total_df["Sh"].isna() == False` was evaluated as
# `(~isna) == False`, i.e. "Sh is NaN", so it kept exactly the rows that were
# meant to be dropped. `notna()` states the intended condition directly.
total_df = total_df[(total_df["Sh"] != 0) & total_df["Sh"].notna()]

In [158]:
# Write the working frame to a CSV file in the current directory, without the row index
total_df.to_csv("t5_leagues_players_shooting.csv", index=False)