# Analysis: Bakayoko replacement

Analysis: Bakayoko replacement \
Name: Yasin Tunçbilek \
Date: 9 February 2024

## Packages

Library to send HTTP requests

In [3]:
import requests

Library to parse HTML

In [4]:
from bs4 import BeautifulSoup

Library to deal with various types of I/O. Needed because read_html gives an error.

In [5]:
from io import StringIO

Pandas library for data analysis and manipulation

In [6]:
import pandas as pd

## Player stats

### URLs to stats

URL of a competition where stats reside in

In [7]:
competition_url = "https://fbref.com/en/comps/23/2023-2024/stats/2023-2024-Eredivisie-Stats"

Download HTML of competition page

In [10]:
data = requests.get(competition_url)

Initialise library with downloaded HTML

In [14]:
soup = BeautifulSoup(data.text)

Select menu where URLs to various stats reside in

In [20]:
menu_stats = soup.select('li.full.hasmore')[0]

Find all tags where URLs to various stats reside in

In [22]:
urls_stats = menu_stats.find_all('a')

Get href property of each link

In [24]:
urls_stats = [u.get("href") for u in urls_stats]

Make full URLs

In [26]:
urls_stats = [f"https://fbref.com{l}" for l in urls_stats]

Print list of URLS where various stats reside in

In [27]:
urls_stats

['https://fbref.com/en/comps/23/2023-2024/stats/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/keepers/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/keepersadv/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/shooting/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/passing/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/passing_types/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/gca/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/defense/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/possession/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/playingtime/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2023-2024/misc/2023-2024-Eredivisie-Stats']

### Standard stats

URL of standard stats

In [29]:
standard_url = urls_stats[0]

Download HTML of standard stats page. Player table is commented out and therefore not retrievable. Therefore a replace.

In [39]:
players_standard_data = requests.get(standard_url).text.replace('<!--','').replace('-->','')

Wrap in a 'StringIO' object to read from a literal string

In [40]:
players_standard_data = StringIO(players_standard_data)

Create dataframe of player standard stats

In [41]:
players_standard_stats = pd.read_html(players_standard_data, attrs = {'id':'stats_standard'})[0]

Drop the first level of the dataframe

In [42]:
players_standard_stats.columns = players_standard_stats.columns.droplevel()

Drop rows where column names occur because of the 'folded' table

In [43]:
players_standard_stats = players_standard_stats.loc[players_standard_stats['Player'] != 'Player']

Keep only relevant columns

In [44]:
players_standard_stats = players_standard_stats[["Player", "Nation", "Pos", "Squad", "Age", "Min", "90s"]]

Print first rows of dataframe

In [45]:
players_standard_stats.head()

Unnamed: 0,Player,Nation,Pos,Squad,Age,Min,90s
0,Patrick van Aanholt,nl NED,DF,PSV Eindhoven,32,1114,12.4
1,Paxten Aaronson,us USA,MF,Vitesse,19,1253,13.9
2,Jayden Addai,nl NED,FW,AZ Alkmaar,17,297,3.3
3,Bobby Adekanye,nl NED,FW,Go Ahead Eag,24,1804,20.0
4,Shawn Adewoye,be BEL,DF,RKC Waalwijk,23,2338,26.0


### Shooting stats

URL of shooting stats

In [46]:
shooting_url = urls_stats[3]

Download HTML of shooting page. Table is commented out. Therefore a replace.

In [47]:
players_shooting_data = requests.get(shooting_url).text.replace('<!--','').replace('-->','')

## Competitions

In [5]:
# List of competitions with advanced data from Opta for the analysis
list_competitions = ["https://fbref.com/en/comps/9/Premier-League-Stats",
                    "https://fbref.com/en/comps/12/La-Liga-Stats",
                    "https://fbref.com/en/comps/11/Serie-A-Stats",
                    "https://fbref.com/en/comps/20/Bundesliga-Stats",
                    "https://fbref.com/en/comps/13/Ligue-1-Stats",
                    "https://fbref.com/en/comps/10/Championship-Stats",
                    "https://fbref.com/en/comps/23/Eredivisie-Stats",
                    "https://fbref.com/en/comps/22/Major-League-Soccer-Stats",
                    "https://fbref.com/en/comps/32/Primeira-Liga-Stats",
                    "https://fbref.com/en/comps/24/Serie-A-Stats",
                    "https://fbref.com/en/comps/31/Liga-MX-Stats",
                    "https://fbref.com/en/comps/37/Belgian-Pro-League-Stats",
                    "https://fbref.com/en/comps/17/Segunda-Division-Stats",
                    "https://fbref.com/en/comps/60/Ligue-2-Stats",
                    "https://fbref.com/en/comps/33/2-Bundesliga-Stats",
                    "https://fbref.com/en/comps/18/Serie-B-Stats",
                    "https://fbref.com/en/comps/21/Primera-Division-Stats"]

In [6]:
# URL of a competition where stats reside in
competition_url = list_competitions[0]

### Shooting stats

In [23]:
# URL of shooting stats
shooting_url = urls_stats[3]

In [24]:
# Download HTML of shooting page. Table is commented out. Therefore a replace.
data_players = requests.get(shooting_url).text.replace('<!--','').replace('-->','')

In [25]:
# Wrap in a 'StringIO' object to read from a literal string
data_players = StringIO(data_players)

In [27]:
# Create dataframe of player shooting stats
player_shooting_stats = pd.read_html(data_players, attrs = {'id':'stats_shooting'})[0]

In [28]:
# Drop the first level of the dataframe
player_shooting_stats.columns = player_shooting_stats.columns.droplevel()

In [29]:
# Drop rows where column names occur because of the 'folded' table
player_shooting_stats = player_shooting_stats.loc[player_shooting_stats['Player'] != 'Player']

In [33]:
# Keep only relevant columns
player_shooting_stats = player_shooting_stats[["Player", "Squad", "Age", "Sh", "npxG"]]

In [35]:
player_shooting_stats.head()

Unnamed: 0,Player,Squad,Age,Sh,npxG
0,Max Aarons,Bournemouth,24-066,1,0.0
1,Bénie Adama Traore,Sheffield Utd,21-101,1,0.3
2,Tosin Adarabioyo,Fulham,26-168,10,0.6
3,Elijah Adebayo,Luton Town,26-063,28,5.6
4,Simon Adingra,Brighton,22-069,31,3.0


### Team stats

In [357]:
# Download HTML of standard stats page.
data_teams = requests.get(standard_url)

In [358]:
# Create dataframe of team standard stats of first table on the page. Note: could not solve StringIO error-message here.
team_standard_stats = pd.read_html(data_teams.text)[0]

  team_standard_stats = pd.read_html(data_teams.text)[0]


In [360]:
# Drop the first level of the dataframe
team_standard_stats.columns = team_standard_stats.columns.droplevel()

In [361]:
# Keep only relevant columns
team_standard_stats = team_standard_stats[["Squad", "Age", "MP", "Starts", "Min", "90s"]]

In [362]:
# Specify columns to add a prefix to
cols = team_standard_stats.loc[:, team_standard_stats.columns != "Squad"]

In [363]:
# Add "team_" in front of team variables to distinguish between player stats
team_standard_stats = team_standard_stats.rename(columns = {c: 'team_' + c for c in team_standard_stats.columns if c in cols})

In [377]:
team_standard_stats.head()

Unnamed: 0,Squad,team_Age,team_MP,team_Starts,team_Min,team_90s
0,Arsenal,25.5,26,286,2340,26.0
1,Aston Villa,27.6,27,297,2430,27.0
2,Bournemouth,26.3,25,275,2250,25.0
3,Brentford,27.4,27,297,2430,27.0
4,Brighton,26.7,27,297,2430,27.0


In [365]:
# Merge both dataframes into one dataframe
merged_standard_stats = player_standard_stats.merge(team_standard_stats
                            [["Squad", "team_Age", "team_MP", "team_Starts", "team_Min", "team_90s"]], 
                                                    on = 'Squad', how = 'left')

In [376]:
merged_standard_stats.head()

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,team_Age,team_MP,team_Starts,team_Min,team_90s
0,Max Aarons,eng ENG,DF,Bournemouth,24-059,2000,14,12,1085,12.1,26.3,25,275,2250,25.0
1,Bénie Adama Traore,ci CIV,"FW,MF",Sheffield Utd,21-094,2002,8,3,387,4.3,26.6,26,286,2340,26.0
2,Tosin Adarabioyo,eng ENG,DF,Fulham,26-161,1997,13,11,993,11.0,29.3,27,297,2430,27.0
3,Elijah Adebayo,eng ENG,FW,Luton Town,26-056,1998,23,13,1162,12.9,27.4,26,286,2340,26.0
4,Simon Adingra,ci CIV,FW,Brighton,22-062,2002,20,15,1367,15.2,26.7,27,297,2430,27.0


## Shooting stats

### Player stats

In [379]:
# URL of shooting stats
shooting_url = urls_stats[3]

In [380]:
# Download HTML of shooting page. Table is commented out. Therefore a replace.
data_players = requests.get(shooting_url).text.replace('<!--','').replace('-->','')

In [369]:
# Wrap in a 'StringIO' object to read from a literal string
data_players = StringIO(data_players)

In [381]:
# Create dataframe of player shooting stats
player_shooting_stats = pd.read_html(data, attrs = {'id':'stats_shooting'})[0]

In [382]:
# Drop the first level of the dataframe
player_shooting_stats.columns = player_shooting_stats.columns.droplevel()

In [383]:
# Drop rows where column names occur because of the 'folded' table
player_shooting_stats = player_shooting_stats.loc[player_shooting_stats['Player'] != 'Player']

In [None]:
# Keep only relevant columns
player_shooting_stats = player_shooting_stats[["Player", "Nation", "Pos", "Squad", "Age", "Born", "MP", "Starts",
                                              "Min", "90s"]]

In [384]:
# Show first five rows of dataframe
player_shooting_stats.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,90s,Gls,Sh,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Matches
0,1,Max Aarons,eng ENG,DF,Bournemouth,24-059,2000,12.1,0,1,...,23.9,0,0,0,0.0,0.0,0.02,0.0,0.0,Matches
1,2,Bénie Adama Traore,ci CIV,"FW,MF",Sheffield Utd,21-094,2002,4.3,0,1,...,15.3,0,0,0,0.3,0.3,0.27,-0.3,-0.3,Matches
2,3,Tosin Adarabioyo,eng ENG,DF,Fulham,26-161,1997,11.0,1,9,...,16.2,0,0,0,0.3,0.3,0.04,0.7,0.7,Matches
3,4,Elijah Adebayo,eng ENG,FW,Luton Town,26-056,1998,12.9,9,28,...,9.5,0,0,0,5.6,5.6,0.2,3.4,3.4,Matches
4,5,Simon Adingra,ci CIV,FW,Brighton,22-062,2002,15.2,6,31,...,15.8,0,0,0,3.0,3.0,0.1,3.0,3.0,Matches


### Team stats

In [385]:
# Download HTML of shooting page
data_teams = requests.get(shooting_url)

In [387]:
# Create dataframe of team shooting stats of first table on the page. Note: could not solve StringIO error-message here.
team_shooting_stats = pd.read_html(data_teams.text, match = "Squad Shooting")[0]

  team_shooting_stats = pd.read_html(data_teams.text, match = "Squad Shooting")[0]


In [388]:
# Drop the first level of the dataframe
team_shooting_stats.columns = team_shooting_stats.columns.droplevel()

In [None]:
# Keep only relevant columns
team_shooting_stats = team_shooting_stats[["Squad", "Age", "MP", "Starts", "Min", "90s"]]

In [389]:
# Show first five rows of dataframe
team_shooting_stats.head()

Unnamed: 0,Squad,# Pl,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
0,Arsenal,25,26.0,60,431,139,32.3,16.58,5.35,0.12,0.37,16.0,3,8,8,53.1,47.0,0.11,6.9,5.0
1,Aston Villa,28,27.0,56,394,146,37.1,14.59,5.41,0.13,0.36,16.2,13,4,4,50.1,47.0,0.12,5.9,5.0
2,Bournemouth,27,25.0,33,333,109,32.7,13.32,4.36,0.1,0.29,16.7,10,1,1,35.8,35.0,0.11,-2.8,-3.0
3,Brentford,28,27.0,38,340,111,32.6,12.59,4.11,0.1,0.32,15.1,12,3,3,43.0,40.8,0.12,-5.0,-5.8
4,Brighton,27,27.0,46,404,157,38.9,14.96,5.81,0.1,0.26,16.5,10,5,5,44.9,41.2,0.1,1.1,-0.2


In [190]:
# Specify columns to add a prefix to
cols = team_shooting_stats.loc[:, team_shooting_stats.columns != "Squad"]

In [191]:
# Add "team_" in front of team variables to distinguish between player stats
team_shooting_stats = team_shooting_stats.rename(columns = {c: 'team_' + c for c in team_shooting_stats.columns if c in cols})

In [None]:
# Create a loop 
for competition in list_competitions:
    