# Analysis: Bakayoko replacement

In [None]:
# Analysis: Bakayoko replacement
# Name: Yasin Tunçbilek
# Date: 9 February 2024
# Test VSC-Github: 9 August 2024

## Packages

In [1]:
# Library to send HTTP requests
import requests

In [2]:
# Library to parse HTML
from bs4 import BeautifulSoup

In [3]:
# Library to deal with various types of I/O. Needed because read_html gives an error.
from io import StringIO

In [4]:
# Pandas library for data analysis and manipulation
import pandas as pd

  from pandas.core import (


## Competitions

In [5]:
# List of competitions with advanced data from Opta for the analysis
list_competitions = ["https://fbref.com/en/comps/9/Premier-League-Stats",
                    "https://fbref.com/en/comps/12/La-Liga-Stats",
                    "https://fbref.com/en/comps/11/Serie-A-Stats",
                    "https://fbref.com/en/comps/20/Bundesliga-Stats",
                    "https://fbref.com/en/comps/13/Ligue-1-Stats",
                    "https://fbref.com/en/comps/10/Championship-Stats",
                    "https://fbref.com/en/comps/23/Eredivisie-Stats",
                    "https://fbref.com/en/comps/22/Major-League-Soccer-Stats",
                    "https://fbref.com/en/comps/32/Primeira-Liga-Stats",
                    "https://fbref.com/en/comps/24/Serie-A-Stats",
                    "https://fbref.com/en/comps/31/Liga-MX-Stats",
                    "https://fbref.com/en/comps/37/Belgian-Pro-League-Stats",
                    "https://fbref.com/en/comps/17/Segunda-Division-Stats",
                    "https://fbref.com/en/comps/60/Ligue-2-Stats",
                    "https://fbref.com/en/comps/33/2-Bundesliga-Stats",
                    "https://fbref.com/en/comps/18/Serie-B-Stats",
                    "https://fbref.com/en/comps/21/Primera-Division-Stats"]

In [6]:
# URL of a competition where stats reside in
competition_url = list_competitions[0]

In [7]:
# Download HTML of Eredivisie page
data = requests.get(competition_url)

In [8]:
# Initialise library with downloaded HTML
soup = BeautifulSoup(data.text)

In [9]:
# Select menu where URLs to various stats reside in
menu_stats = soup.select('li.full.hasmore')[0]

In [10]:
# Print HTML
menu_stats

<li class="full hasmore" data-fade-selector="#inpage_nav"><span>Squad &amp; Player Stats</span>
<div>
<p class="listhead">Squad &amp; Player Stats</p>
<ul class="">
<li><a href="/en/comps/9/stats/Premier-League-Stats">Standard Stats</a></li>
<li><a href="/en/comps/9/keepers/Premier-League-Stats">Goalkeeping</a></li>
<li><a href="/en/comps/9/keepersadv/Premier-League-Stats">Advanced Goalkeeping</a></li>
<li><a href="/en/comps/9/shooting/Premier-League-Stats">Shooting</a></li>
<li><a href="/en/comps/9/passing/Premier-League-Stats">Passing</a></li>
<li><a href="/en/comps/9/passing_types/Premier-League-Stats">Pass Types</a></li>
<li><a href="/en/comps/9/gca/Premier-League-Stats">Goal and Shot Creation</a></li>
<li><a href="/en/comps/9/defense/Premier-League-Stats">Defensive Actions</a></li>
<li><a href="/en/comps/9/possession/Premier-League-Stats">Possession</a></li>
<li><a href="/en/comps/9/playingtime/Premier-League-Stats">Playing Time</a></li>
<li><a href="/en/comps/9/misc/Premier-Leagu

In [11]:
# Find all tags where URLs to various stats reside in
urls_stats = menu_stats.find_all('a')

In [12]:
# Get href property of each link
urls_stats = [u.get("href") for u in urls_stats]

In [13]:
# Make full URLs
urls_stats = [f"https://fbref.com{l}" for l in urls_stats]

In [14]:
# Print list of URLS where various stats reside in
urls_stats

['https://fbref.com/en/comps/9/stats/Premier-League-Stats',
 'https://fbref.com/en/comps/9/keepers/Premier-League-Stats',
 'https://fbref.com/en/comps/9/keepersadv/Premier-League-Stats',
 'https://fbref.com/en/comps/9/shooting/Premier-League-Stats',
 'https://fbref.com/en/comps/9/passing/Premier-League-Stats',
 'https://fbref.com/en/comps/9/passing_types/Premier-League-Stats',
 'https://fbref.com/en/comps/9/gca/Premier-League-Stats',
 'https://fbref.com/en/comps/9/defense/Premier-League-Stats',
 'https://fbref.com/en/comps/9/possession/Premier-League-Stats',
 'https://fbref.com/en/comps/9/playingtime/Premier-League-Stats',
 'https://fbref.com/en/comps/9/misc/Premier-League-Stats']

## Player stats

### Standard stats 

In [15]:
# URL of standard stats
standard_url = urls_stats[0]

In [16]:
# Download HTML of standard stats page. Player table is commented out and therefore not retrievable. Therefore a replace.
data_players = requests.get(standard_url).text.replace('<!--','').replace('-->','')

In [17]:
# Wrap in a 'StringIO' object to read from a literal string
data_players = StringIO(data_players)

In [18]:
# Create dataframe of player standard stats
player_standard_stats = pd.read_html(data_players, attrs = {'id':'stats_standard'})[0]

In [19]:
# Drop the first level of the dataframe
player_standard_stats.columns = player_standard_stats.columns.droplevel()

In [20]:
# Drop rows where column names occur because of the 'folded' table
player_standard_stats = player_standard_stats.loc[player_standard_stats['Player'] != 'Player']

In [21]:
# Keep only relevant columns
player_standard_stats = player_standard_stats[["Player", "Nation", "Pos", "Squad", "Age", "Min", "90s"]]

In [22]:
# Print first rows of dataframe
player_standard_stats.head()

Unnamed: 0,Player,Nation,Pos,Squad,Age,Min,90s
0,Max Aarons,eng ENG,DF,Bournemouth,24-066,1085,12.1
1,Bénie Adama Traore,ci CIV,"FW,MF",Sheffield Utd,21-101,387,4.3
2,Tosin Adarabioyo,eng ENG,DF,Fulham,26-168,1083,12.0
3,Elijah Adebayo,eng ENG,FW,Luton Town,26-063,1162,12.9
4,Simon Adingra,ci CIV,FW,Brighton,22-069,1367,15.2


### Shooting stats

In [23]:
# URL of shooting stats
shooting_url = urls_stats[3]

In [24]:
# Download HTML of shooting page. Table is commented out. Therefore a replace.
data_players = requests.get(shooting_url).text.replace('<!--','').replace('-->','')

In [25]:
# Wrap in a 'StringIO' object to read from a literal string
data_players = StringIO(data_players)

In [27]:
# Create dataframe of player shooting stats
player_shooting_stats = pd.read_html(data_players, attrs = {'id':'stats_shooting'})[0]

In [28]:
# Drop the first level of the dataframe
player_shooting_stats.columns = player_shooting_stats.columns.droplevel()

In [29]:
# Drop rows where column names occur because of the 'folded' table
player_shooting_stats = player_shooting_stats.loc[player_shooting_stats['Player'] != 'Player']

In [33]:
# Keep only relevant columns
player_shooting_stats = player_shooting_stats[["Player", "Squad", "Age", "Sh", "npxG"]]

In [35]:
player_shooting_stats.head()

Unnamed: 0,Player,Squad,Age,Sh,npxG
0,Max Aarons,Bournemouth,24-066,1,0.0
1,Bénie Adama Traore,Sheffield Utd,21-101,1,0.3
2,Tosin Adarabioyo,Fulham,26-168,10,0.6
3,Elijah Adebayo,Luton Town,26-063,28,5.6
4,Simon Adingra,Brighton,22-069,31,3.0


### Team stats

In [357]:
# Download HTML of standard stats page.
data_teams = requests.get(standard_url)

In [358]:
# Create dataframe of team standard stats of first table on the page. Note: could not solve StringIO error-message here.
team_standard_stats = pd.read_html(data_teams.text)[0]

  team_standard_stats = pd.read_html(data_teams.text)[0]


In [360]:
# Drop the first level of the dataframe
team_standard_stats.columns = team_standard_stats.columns.droplevel()

In [361]:
# Keep only relevant columns
team_standard_stats = team_standard_stats[["Squad", "Age", "MP", "Starts", "Min", "90s"]]

In [362]:
# Specify columns to add a prefix to
cols = team_standard_stats.loc[:, team_standard_stats.columns != "Squad"]

In [363]:
# Add "team_" in front of team variables to distinguish between player stats
team_standard_stats = team_standard_stats.rename(columns = {c: 'team_' + c for c in team_standard_stats.columns if c in cols})

In [377]:
team_standard_stats.head()

Unnamed: 0,Squad,team_Age,team_MP,team_Starts,team_Min,team_90s
0,Arsenal,25.5,26,286,2340,26.0
1,Aston Villa,27.6,27,297,2430,27.0
2,Bournemouth,26.3,25,275,2250,25.0
3,Brentford,27.4,27,297,2430,27.0
4,Brighton,26.7,27,297,2430,27.0


In [365]:
# Merge both dataframes into one dataframe
merged_standard_stats = player_standard_stats.merge(team_standard_stats
                            [["Squad", "team_Age", "team_MP", "team_Starts", "team_Min", "team_90s"]], 
                                                    on = 'Squad', how = 'left')

In [376]:
merged_standard_stats.head()

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,90s,team_Age,team_MP,team_Starts,team_Min,team_90s
0,Max Aarons,eng ENG,DF,Bournemouth,24-059,2000,14,12,1085,12.1,26.3,25,275,2250,25.0
1,Bénie Adama Traore,ci CIV,"FW,MF",Sheffield Utd,21-094,2002,8,3,387,4.3,26.6,26,286,2340,26.0
2,Tosin Adarabioyo,eng ENG,DF,Fulham,26-161,1997,13,11,993,11.0,29.3,27,297,2430,27.0
3,Elijah Adebayo,eng ENG,FW,Luton Town,26-056,1998,23,13,1162,12.9,27.4,26,286,2340,26.0
4,Simon Adingra,ci CIV,FW,Brighton,22-062,2002,20,15,1367,15.2,26.7,27,297,2430,27.0


## Shooting stats

### Player stats

In [379]:
# URL of shooting stats
shooting_url = urls_stats[3]

In [380]:
# Download HTML of shooting page. Table is commented out. Therefore a replace.
data_players = requests.get(shooting_url).text.replace('<!--','').replace('-->','')

In [369]:
# Wrap in a 'StringIO' object to read from a literal string
data_players = StringIO(data_players)

In [381]:
# Create dataframe of player shooting stats
player_shooting_stats = pd.read_html(data, attrs = {'id':'stats_shooting'})[0]

In [382]:
# Drop the first level of the dataframe
player_shooting_stats.columns = player_shooting_stats.columns.droplevel()

In [383]:
# Drop rows where column names occur because of the 'folded' table
player_shooting_stats = player_shooting_stats.loc[player_shooting_stats['Player'] != 'Player']

In [None]:
# Keep only relevant columns
player_shooting_stats = player_shooting_stats[["Player", "Nation", "Pos", "Squad", "Age", "Born", "MP", "Starts",
                                              "Min", "90s"]]

In [384]:
# Show first five rows of dataframe
player_shooting_stats.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,90s,Gls,Sh,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Matches
0,1,Max Aarons,eng ENG,DF,Bournemouth,24-059,2000,12.1,0,1,...,23.9,0,0,0,0.0,0.0,0.02,0.0,0.0,Matches
1,2,Bénie Adama Traore,ci CIV,"FW,MF",Sheffield Utd,21-094,2002,4.3,0,1,...,15.3,0,0,0,0.3,0.3,0.27,-0.3,-0.3,Matches
2,3,Tosin Adarabioyo,eng ENG,DF,Fulham,26-161,1997,11.0,1,9,...,16.2,0,0,0,0.3,0.3,0.04,0.7,0.7,Matches
3,4,Elijah Adebayo,eng ENG,FW,Luton Town,26-056,1998,12.9,9,28,...,9.5,0,0,0,5.6,5.6,0.2,3.4,3.4,Matches
4,5,Simon Adingra,ci CIV,FW,Brighton,22-062,2002,15.2,6,31,...,15.8,0,0,0,3.0,3.0,0.1,3.0,3.0,Matches


### Team stats

In [385]:
# Download HTML of shooting page
data_teams = requests.get(shooting_url)

In [387]:
# Create dataframe of team shooting stats of first table on the page. Note: could not solve StringIO error-message here.
team_shooting_stats = pd.read_html(data_teams.text, match = "Squad Shooting")[0]

  team_shooting_stats = pd.read_html(data_teams.text, match = "Squad Shooting")[0]


In [388]:
# Drop the first level of the dataframe
team_shooting_stats.columns = team_shooting_stats.columns.droplevel()

In [None]:
# Keep only relevant columns
team_shooting_stats = team_shooting_stats[["Squad", "Age", "MP", "Starts", "Min", "90s"]]

In [389]:
# Show first five rows of dataframe
team_shooting_stats.head()

Unnamed: 0,Squad,# Pl,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
0,Arsenal,25,26.0,60,431,139,32.3,16.58,5.35,0.12,0.37,16.0,3,8,8,53.1,47.0,0.11,6.9,5.0
1,Aston Villa,28,27.0,56,394,146,37.1,14.59,5.41,0.13,0.36,16.2,13,4,4,50.1,47.0,0.12,5.9,5.0
2,Bournemouth,27,25.0,33,333,109,32.7,13.32,4.36,0.1,0.29,16.7,10,1,1,35.8,35.0,0.11,-2.8,-3.0
3,Brentford,28,27.0,38,340,111,32.6,12.59,4.11,0.1,0.32,15.1,12,3,3,43.0,40.8,0.12,-5.0,-5.8
4,Brighton,27,27.0,46,404,157,38.9,14.96,5.81,0.1,0.26,16.5,10,5,5,44.9,41.2,0.1,1.1,-0.2


In [190]:
# Specify columns to add a prefix to
cols = team_shooting_stats.loc[:, team_shooting_stats.columns != "Squad"]

In [191]:
# Add "team_" in front of team variables to distinguish between player stats
team_shooting_stats = team_shooting_stats.rename(columns = {c: 'team_' + c for c in team_shooting_stats.columns if c in cols})

In [None]:
# Create a loop 
for competition in list_competitions:
    