# Aggregating all player data for in-depth analyses and comparisons

In [4]:
# libraries
import pandas as pd
import numpy as np
import time
import re

#### Getting the standard stats for all players playing for clubs from the Top5 leagues in 2018/19

In [5]:
# creating list of urls for each top5 league club from the 18-19 season
url_list = open('../urls/matchday_reports_pl_updated.txt').read().splitlines()
print(url_list[-1])
print(len(url_list))

https://fbref.com/en/matches/f54cc2e1/Manchester-City-Bournemouth-December-1-2018-Premier-League
380


**Most important part: loading every table of a set of links with a specific id into a df (by also converting the table)**

In [8]:
from urllib.parse import urlparse
# Initialize an empty list to store dataframes
dfs = []

# Set the delay between requests (in seconds)
delay_between_requests = 2  # Adjust this value as needed

for url in url_list:
    try:
        # Read HTML tables from the URL
        tables = pd.read_html(url, attrs={"id": "shots_all"}) # only Liverpool: "id": "shots_822bd0ba"
        df = tables[0]
        
        # Extract the date from the URL using regex
        date_match = re.search(r'(\w+-\d{1,2}-\d{4})', url)
        if date_match:
            date = pd.to_datetime(date_match.group(1), format="%B-%d-%Y").strftime("%Y-%m-%d")
        else:
            raise ValueError("Date not found in URL")
        
        # Add the Date column to the DataFrame
        df['Date'] = date
        
        # append df to the list
        dfs.append(df)
        
        print(url)
        # Introduce a delay before making the next request
        time.sleep(delay_between_requests)
    except Exception as e:
        print(f"Error reading data from {url}: {str(e)}")

# Concatenate all dataframes into one
total_df = pd.concat(dfs, ignore_index=True)

https://fbref.com/en/matches/fc2c1788/Liverpool-West-Ham-United-August-12-2018-Premier-League
https://fbref.com/en/matches/24e4536f/Crystal-Palace-Liverpool-August-20-2018-Premier-League
https://fbref.com/en/matches/db5e61cc/Liverpool-Brighton-and-Hove-Albion-August-25-2018-Premier-League
https://fbref.com/en/matches/265bb7a5/Leicester-City-Liverpool-September-1-2018-Premier-League
https://fbref.com/en/matches/83ca29de/Tottenham-Hotspur-Liverpool-September-15-2018-Premier-League
https://fbref.com/en/matches/65050401/Liverpool-Southampton-September-22-2018-Premier-League
https://fbref.com/en/matches/83743ea9/Chelsea-Liverpool-September-29-2018-Premier-League
https://fbref.com/en/matches/73e7196e/Liverpool-Manchester-City-October-7-2018-Premier-League
https://fbref.com/en/matches/356d83b4/Huddersfield-Town-Liverpool-October-20-2018-Premier-League
https://fbref.com/en/matches/8df7e453/Liverpool-Cardiff-City-October-27-2018-Premier-League
https://fbref.com/en/matches/09767606/Arsenal-Liver

  tables = pd.read_html(url, attrs={"id": "shots_all"}) # only Liverpool: "id": "shots_822bd0ba"
  soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)


Error reading data from #https://fbref.com/en/matches/fbfb16e5/Burnley-Crystal-Palace-March-2-2019-Premier-League: No tables found
https://fbref.com/en/matches/64a4127a/Crystal-Palace-Brighton-and-Hove-Albion-March-9-2019-Premier-League
https://fbref.com/en/matches/9de9e8e3/Crystal-Palace-Huddersfield-Town-March-30-2019-Premier-League
https://fbref.com/en/matches/521ebfbb/Cardiff-City-Crystal-Palace-May-4-2019-Premier-League
https://fbref.com/en/matches/23ad3530/Crystal-Palace-Bournemouth-May-12-2019-Premier-League
https://fbref.com/en/matches/55af3b90/Fulham-Burnley-August-26-2018-Premier-League
https://fbref.com/en/matches/51055c21/Brighton-and-Hove-Albion-Fulham-September-1-2018-Premier-League
https://fbref.com/en/matches/cf44aa9f/Fulham-Watford-September-22-2018-Premier-League
https://fbref.com/en/matches/2e19378a/Manchester-City-Fulham-September-15-2018-Premier-League
https://fbref.com/en/matches/fe1817e3/Cardiff-City-Fulham-October-20-2018-Premier-League
https://fbref.com/en/matc

In [9]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9985 entries, 0 to 9984
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   (Unnamed: 0_level_0, Minute)     9606 non-null   object 
 1   (Unnamed: 1_level_0, Player)     9606 non-null   object 
 2   (Unnamed: 2_level_0, Squad)      9606 non-null   object 
 3   (Unnamed: 3_level_0, xG)         9605 non-null   float64
 4   (Unnamed: 4_level_0, PSxG)       3233 non-null   float64
 5   (Unnamed: 5_level_0, Outcome)    9606 non-null   object 
 6   (Unnamed: 6_level_0, Distance)   9606 non-null   float64
 7   (Unnamed: 7_level_0, Body Part)  9606 non-null   object 
 8   (Unnamed: 8_level_0, Notes)      1487 non-null   object 
 9   (SCA 1, Player)                  9143 non-null   object 
 10  (SCA 1, Event)                   9166 non-null   object 
 11  (SCA 2, Player)                  7672 non-null   object 
 12  (SCA 2, Event)      

In [31]:
total_df.tail()

Unnamed: 0,Date,Minute,Player,Squad,xG,PSxG,Outcome,Distance,Body Part,Notes
9980,2018-12-01,68.0,Fernandinho,Manchester City,0.07,,Off Target,13.0,Left Foot,
9981,2018-12-01,76.0,Steve Cook,Bournemouth,0.02,,Off Target,29.0,Left Foot,
9982,2018-12-01,79.0,İlkay Gündoğan,Manchester City,0.35,0.31,Goal,6.0,Right Foot,
9983,2018-12-01,85.0,Leroy Sané,Manchester City,0.08,0.04,Saved,8.0,Left Foot,
9984,2018-12-01,86.0,Leroy Sané,Manchester City,0.03,,Blocked,19.0,Left Foot,


In [13]:
# move the last column to index 0
date_column_name = total_df.columns[-1]
date_column = total_df.pop(date_column_name)
total_df.insert(0, date_column_name, date_column)

In [18]:
total_df.columns

Index(['', 'Minute', 'Player', 'Squad', 'xG', 'PSxG', 'Outcome', 'Distance',
       'Body Part', 'Notes', 'Player', 'Event', 'Player', 'Event'],
      dtype='object')

In [None]:
#safety first - always make a copy of dataframes that took you over 20 minutes to scrape :D (im speaking of experience here)
df_clean_total = total_df.copy()

In [None]:
#removing the great multi-index feature of poorly built stat websites
total_df.columns = total_df.columns.get_level_values(1)

In [26]:
# drop last 4 cols
total_df = total_df.iloc[:, :-4]

In [28]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9985 entries, 0 to 9984
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       9985 non-null   object 
 1   Minute     9606 non-null   object 
 2   Player     9606 non-null   object 
 3   Squad      9606 non-null   object 
 4   xG         9605 non-null   float64
 5   PSxG       3233 non-null   float64
 6   Outcome    9606 non-null   object 
 7   Distance   9606 non-null   float64
 8   Body Part  9606 non-null   object 
 9   Notes      1487 non-null   object 
dtypes: float64(3), object(7)
memory usage: 780.2+ KB


In [22]:
total_df = total_df.rename(columns={"": 'Date'})

In [29]:
total_df.iloc[:, 2].value_counts()

Player
Mohamed Salah          134
Aleksandar Mitrović    130
Sergio Agüero          116
Raúl Jiménez           109
Harry Kane              98
                      ... 
Neeskens Kebano          1
Jesse Lingard (pen)      1
Ben Gibson               1
Tim Ream                 1
Jermain Defoe            1
Name: count, Length: 466, dtype: int64

In [30]:
total_df.iloc[:, 1].unique()

array(['17', '18', '19', '24', '25', '27', '30', '36', '37', '45+2', nan,
       '49', '53', '65', '68', '70', '73', '78', '80', '88', '90', '90+1',
       '3', '23', '35', '44', '45', '54', '60', '61', '66', '77', '81',
       '84', '85', '87', '90+3', '5', '6', '9', '15', '31', '34', '46',
       '48', '58', '62', '69', '72', '86', '89', '90+2', 4.0, 10.0, 16.0,
       21.0, 23.0, 26.0, 32.0, 33.0, 39.0, 43.0, 45.0, 50.0, 52.0, 53.0,
       54.0, 56.0, 63.0, 82.0, 86.0, 88.0, '22', '29', '39', '43', '50',
       '63', '67', '75', '4', '8', '10', '13', '21', '28', '45+1', '45+3',
       '11', '32', '47', '59', '64', '90+5', 12.0, 38.0, 58.0, 61.0, 62.0,
       65.0, 69.0, 75.0, 79.0, '26', '38', '42', '55', '82', '41', '79',
       '90+4', 9.0, 13.0, 14.0, 18.0, 24.0, 25.0, 31.0, 74.0, 83.0, 84.0,
       20.0, 22.0, 28.0, 29.0, 30.0, 34.0, 35.0, 41.0, 59.0, 68.0, 81.0,
       85.0, 90.0, '76', '12', '57', '90+6', '52', 19.0, 48.0, 57.0, 71.0,
       77.0, 80.0, '2', '7', '14', '33', '

In [204]:
df_clean_total.iloc[:, 0].unique()

array([17., 18., 19., 24., 25., 36., 37., 45., nan, 53., 65., 68., 70.,
       73., 78., 80., 88., 90.,  3., 23., 27., 35., 44., 49., 60., 61.,
       77., 81., 84., 87.,  5.,  9., 15., 31., 34., 46., 58., 69., 72.,
       86.,  4., 10., 16., 21., 43., 54., 82.,  6., 22., 39., 63.,  7.,
       26., 30., 32., 89., 85., 11., 59., 38., 12., 50., 62., 79., 47.,
       64., 20., 48., 51., 56., 76., 13., 41., 42., 66., 74., 83.,  8.,
       33., 71., 28., 67., 29., 57., 52., 14.,  2., 75., 55.,  1.])

In [36]:
def convert_to_int(value):
    if pd.isna(value):
        return value
    if isinstance(value, str):
        return int(value.strip()[:2])
    elif isinstance(value, float):
        return int(value)
    else:
        return None

# Apply the function to the Minutes column
df_clean_total['Minute'] = df_clean_total['Minute'].apply(convert_to_int)

In [37]:
df_clean_total["Minute"] = df_clean_total["Minute"].astype("Int64")

In [38]:
df_clean_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1260 non-null   object 
 1   Minute     1209 non-null   Int64  
 2   Player     1209 non-null   object 
 3   Squad      1209 non-null   object 
 4   xG         1208 non-null   float64
 5   PSxG       426 non-null    float64
 6   Outcome    1209 non-null   object 
 7   Distance   1209 non-null   float64
 8   Body Part  1209 non-null   object 
 9   Notes      183 non-null    object 
dtypes: Int64(1), float64(3), object(6)
memory usage: 99.8+ KB


In [39]:
df_clean_total.head()

Unnamed: 0,Date,Minute,Player,Squad,xG,PSxG,Outcome,Distance,Body Part,Notes
0,2018-08-12,17,Sadio Mané,Liverpool,0.04,,Blocked,16.0,Right Foot,
1,2018-08-12,18,Roberto Firmino,Liverpool,0.59,,Off Target,5.0,Right Foot,Volley
2,2018-08-12,19,Mohamed Salah,Liverpool,0.74,0.95,Goal,4.0,Right Foot,
3,2018-08-12,24,Trent Alexander-Arnold,Liverpool,0.06,0.57,Saved,31.0,Right Foot,Free kick
4,2018-08-12,25,Georginio Wijnaldum,Liverpool,0.09,,Off Target,12.0,Left Foot,


In [40]:
# values for 90 and 45 include shots attempted in extra time (found no better solution)
df_clean_total["Minute"].value_counts()

Minute
90    76
45    29
82    25
85    21
80    20
      ..
19     7
5      4
46     4
40     4
1      3
Name: count, Length: 90, dtype: Int64

In [47]:
df_clean_total.dropna(subset=['Minute'], inplace=True)

In [48]:
df_clean_total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1209 entries, 0 to 1259
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       1209 non-null   object 
 1   Minute     1209 non-null   Int64  
 2   Player     1209 non-null   object 
 3   Squad      1209 non-null   object 
 4   xG         1208 non-null   float64
 5   PSxG       426 non-null    float64
 6   Outcome    1209 non-null   object 
 7   Distance   1209 non-null   float64
 8   Body Part  1209 non-null   object 
 9   Notes      183 non-null    object 
dtypes: Int64(1), float64(3), object(6)
memory usage: 105.1+ KB


In [32]:
total_df['Player'] = total_df['Player'].str.replace(r'\s*\(pen\)', '', regex=True)

In [33]:
total_df.to_csv("../urls/ALL_SHOTS.csv", index=False)

In [34]:
import pandas as pd

# CSV-Datei einlesen
input_file = '../ALL_SHOTS_CLEANED.csv'
df = pd.read_csv(input_file)

# Konvertiere die Date-Spalte in das Datetime-Format
df['Date'] = pd.to_datetime(df['Date'])

# Leere Spalte für MatchDay hinzufügen
df['MatchDay'] = 0

# Für jeden Verein einen chronologischen MatchDay zuweisen
for squad in df['Squad'].unique():
    squad_df = df[df['Squad'] == squad].sort_values('Date')
    match_days = range(1, len(squad_df['Date'].unique()) + 1)
    date_to_matchday = {date: match_day for date, match_day in zip(squad_df['Date'].unique(), match_days)}
    df.loc[df['Squad'] == squad, 'MatchDay'] = df[df['Squad'] == squad]['Date'].map(date_to_matchday)

# Ergebnis speichern
output_file = '../ALL_SHOTS_WITH_MATCHDAYS.csv'
df.to_csv(output_file, index=False)

print(f"Erweiterte CSV-Datei wurde gespeichert unter: {output_file}")


Erweiterte CSV-Datei wurde gespeichert unter: ../ALL_SHOTS_WITH_MATCHDAYS.csv


In [54]:
df_clean_total["Outcome"].unique()

array(['Blocked', 'Off Target', 'Goal', 'Saved', 'Woodwork',
       'Saved off Target'], dtype=object)