### Downloading and Exploring the EPL Stats Page

In [1]:
import requests

In [2]:
url = "https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats"

In [3]:
domain = "https://fbref.com"

In [4]:
page = requests.get(url)

In [5]:
page.status_code # if the code is 200 then OK

200

In [6]:
page.headers

{'Date': 'Wed, 23 Oct 2024 06:47:35 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'sr-miss-memd-key': '/en/comps/9/2021-2022/2021-2022-Premier-League-Stats', 'strict-transport-security': 'max-age=2592000; includeSubDomains', 'vary': 'Accept-Encoding', 'Last-Modified': 'Wed, 23 Oct 2024 06:14:24 GMT', 'CF-Cache-Status': 'HIT', 'Expires': 'Wed, 23 Oct 2024 10:47:35 GMT', 'Cache-Control': 'public, max-age=14400', 'Set-Cookie': '__cf_bm=DSC00TwOQ6MuklQ3yl81O3ls09wZcGqAwBOuy2MWdGw-1729666055-1.0.1.1-r2tce768QRIusNq2rGN9Yr6YOhrCqGHSzGyRPjnKc_grU18FHanDqA9MMiR1_G2MrcpRmgmCggNY6hnexbMmjA; path=/; expires=Wed, 23-Oct-24 07:17:35 GMT; domain=.fbref.com; HttpOnly; Secure; SameSite=None', 'Server': 'cloudflare', 'CF-RAY': '8d6fc6cdce1ec30a-VIE', 'Content-Encoding': 'gzip'}

### Parsing HTML Links

In [7]:
from bs4 import BeautifulSoup

In [8]:
soup = BeautifulSoup(page.content, "html.parser")

In [9]:
teams_table = soup.find("table", id="results2021-202291_overall")

In [10]:
links = teams_table.find_all("a")

In [11]:
"""
Example links:

Link: /en/squads/19538871/2021-2022/Manchester-United-Stats, Text: Manchester Utd    - if contains word 'squads' then it refer to club
Link: /en/players/dea698d9/Cristiano-Ronaldo, Text: Cristiano Ronaldo
"""

club_paths = []

for link in links:
    path = link.get("href")
    
    if ("squads") in path:
        club_paths.append(path)

In [12]:
club_paths

['/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats',
 '/en/squads/822bd0ba/2021-2022/Liverpool-Stats',
 '/en/squads/cff3d9bb/2021-2022/Chelsea-Stats',
 '/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats',
 '/en/squads/18bb7c10/2021-2022/Arsenal-Stats',
 '/en/squads/19538871/2021-2022/Manchester-United-Stats',
 '/en/squads/7c21e445/2021-2022/West-Ham-United-Stats',
 '/en/squads/a2d435b3/2021-2022/Leicester-City-Stats',
 '/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats',
 '/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats',
 '/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats',
 '/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats',
 '/en/squads/cd051869/2021-2022/Brentford-Stats',
 '/en/squads/8602292d/2021-2022/Aston-Villa-Stats',
 '/en/squads/33c895d4/2021-2022/Southampton-Stats',
 '/en/squads/d3fd31cc/2021-2022/Everton-Stats',
 '/en/squads/5bfb9659/2021-2022/Leeds-United-Stats',
 '/en/squads/943e8050/2021-2022/Burnley-Stats',
 '/en/squads/2abfe087/2021-

In [13]:
club_urls = [domain + path for path in club_paths] # creating a list of full URLs (domain url + paths of each club)

In [14]:
club_urls

['https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats',
 'https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats',
 'https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats',
 'https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats',
 'https://fbref.com/en/squads/19538871/2021-2022/Manchester-United-Stats',
 'https://fbref.com/en/squads/7c21e445/2021-2022/West-Ham-United-Stats',
 'https://fbref.com/en/squads/a2d435b3/2021-2022/Leicester-City-Stats',
 'https://fbref.com/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats',
 'https://fbref.com/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats',
 'https://fbref.com/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/cd051869/2021-2022/Brentford-Stats',
 'https://fbref.com/en/squads/8602292d/2021-2022/Aston-Vill

### Extracting Match Stats

In [15]:
page = requests.get(club_urls[5]) # Get the stats of Manchester United ( My favourite club :) )

In [16]:
page.status_code

200

In [17]:
soup = BeautifulSoup(page.content, 'html.parser')

In [18]:
mu_stats = soup.find(id="matchlogs_for") # ID of table that contains the "Scores & Fixtures" is "match_for_logs"

In [19]:
# Extracting stats from table called "Scores & Fixtures" 

def extract_stats(table):
    
    columns = [] # It will be the column names of the DataFrame.

    for name in table.find_all("tr")[0]:
        if name.string != " ": # It stores a space between elements. I will just ommit them.
            columns.append(name.string)

    match_stats = []

    for match in table.find_all("tr")[1:]: # Ommiting the header (first "tr" tag)
        stats = []
    
        for stat in match:
            if stat.find("span"): # The time data is inner a "span" tag within "td" tag
                stats.append(stat.find("span").text)
            else:
                stats.append(stat.string) # else I added the value os "td" tag to list

        match_stats.append(stats)

    return columns, match_stats

In [20]:
import pandas as pd

In [21]:
columns, data = extract_stats(mu_stats)

In [22]:
mu_table = pd.DataFrame(data=data, columns=columns)

In [23]:
mu_table.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes
0,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5,1,Leeds United,1.5,0.5,49,72732,Harry Maguire,4-2-3-1,4-1-4-1,Paul Tierney,Match Report,
1,2021-08-22,14:00,Premier League,Matchweek 2,Sun,Away,D,1,1,Southampton,1.8,0.7,63,32000,Harry Maguire,4-2-3-1,4-4-2,Craig Pawson,Match Report,
2,2021-08-29,16:30,Premier League,Matchweek 3,Sun,Away,W,1,0,Wolves,0.6,2.1,56,30621,Harry Maguire,4-2-3-1,3-4-3,Mike Dean,Match Report,
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,4,1,Newcastle Utd,2.5,0.4,63,72732,Harry Maguire,4-2-3-1,5-4-1,Anthony Taylor,Match Report,
4,2021-09-14,18:45,Champions Lg,Group stage,Tue,Away,L,1,2,ch,0.5,1.4,46,31120,Harry Maguire,4-2-3-1,4-2-3-1,François Letexier,Match Report,


### Getting Match Shooting Stats

I will gathering the another stats (such as the number of shots, the number of shots on target, the number of free kicks, and the number of penalty kicks) for Manchester United which are in another table in a Tab called Shoots.

In [24]:
links_on_page = soup.find_all("a")

In [25]:
import re

shoots_link = domain + soup.find("a", href=re.compile("/matchlogs/all_comps/shooting/")).get("href")

In [26]:
shoots_link

'https://fbref.com/en/squads/19538871/2021-2022/matchlogs/all_comps/shooting/Manchester-United-Match-Logs-All-Competitions'

In [27]:
page = requests.get(shoots_link)
soup = BeautifulSoup(page.content, "html.parser")

shoots_stat_table = soup.find("table", id="matchlogs_for")

In [28]:
def extract_shooting_stats(table):
    
    shooting_stats = [stat for stat in table.find_all("tr")]

    columns = []

    for column_name in shooting_stats[1].find_all("th"):
        columns.append(column_name.string)

    data = []

    for row in shooting_stats[2:]: # omitting header
        match_data = [] # storing data from each row and after added to the data list that contains all of them
    
        for col in row:
            if col.find("span"): # the time data is inner a "span" tag within "td" tag
                match_data.append(col.find("span").text)
            else:
                match_data.append(col.string)

        data.append(match_data)

    return columns, data

In [29]:
columns, data = extract_shooting_stats(shoots_stat_table)

In [30]:
shoots_table = pd.DataFrame(data=data, columns=columns)

In [31]:
shoots_table.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5,1,Leeds United,...,18.2,0,0,0,1.5,1.5,0.09,3.5,3.5,Match Report
1,2021-08-22,14:00,Premier League,Matchweek 2,Sun,Away,D,1,1,Southampton,...,15.1,1,0,0,1.8,1.8,0.14,-0.8,-0.8,Match Report
2,2021-08-29,16:30,Premier League,Matchweek 3,Sun,Away,W,1,0,Wolves,...,18.8,1,0,0,0.6,0.6,0.06,0.4,0.4,Match Report
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,4,1,Newcastle Utd,...,20.5,0,0,0,2.5,2.5,0.12,1.5,1.5,Match Report
4,2021-09-14,18:45,Champions Lg,Group stage,Tue,Away,L,1,2,ch,...,10.8,0,0,0,0.5,0.5,0.26,0.5,0.5,Match Report


### Cleaning and Merging Scraped Data¶

In [32]:
mu_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Date           49 non-null     object
 1   Time           49 non-null     object
 2   Comp           49 non-null     object
 3   Round          49 non-null     object
 4   Day            49 non-null     object
 5   Venue          49 non-null     object
 6   Result         49 non-null     object
 7   GF             48 non-null     object
 8   GA             48 non-null     object
 9   Opponent       49 non-null     object
 10  xG             46 non-null     object
 11  xGA            46 non-null     object
 12  Poss           49 non-null     object
 13  Attendance     49 non-null     object
 14  Captain        49 non-null     object
 15  Formation      49 non-null     object
 16  Opp Formation  49 non-null     object
 17  Referee        49 non-null     object
 18  Match Report   49 non-null     o

In [33]:
mu_table.shape

(49, 20)

In [34]:
shoots_table.shape

(50, 26)

In [35]:
shoots_table.tail()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
45,2022-04-28,19:45,Premier League,Matchweek 37,Thu,Home,D,1.0,1.0,Chelsea,...,15.9,0,0,0,0.5,0.5,0.09,0.5,0.5,Match Report
46,2022-05-02,20:00,Premier League,Matchweek 35,Mon,Home,W,3.0,0.0,Brentford,...,20.3,1,1,1,2.0,1.2,0.15,1.0,0.8,Match Report
47,2022-05-07,17:30,Premier League,Matchweek 36,Sat,Away,L,0.0,4.0,Brighton,...,19.4,1,0,0,0.9,0.9,0.06,-0.9,-0.9,Match Report
48,2022-05-22,16:00,Premier League,Matchweek 38,Sun,Away,L,0.0,1.0,Crystal Palace,...,19.8,1,0,0,0.7,0.7,0.08,-0.7,-0.7,Match Report
49,,,,,,,--,,,,...,17.7,24,3,6,,,0.11,70.0,67.0,


In [36]:
shoots_table = shoots_table[:49] # The last row was overall scores in table. I just omitted them

In [37]:
shoots_table = shoots_table.copy().drop(["xG", "Match Report"], axis=1) # These columns aren't necessary and they aren't at begin to omitt easily via slice so I drop them before concating. Creating copy due to warning message

In [38]:
scraped_stats = pd.concat([mu_table, shoots_table.loc[:, "Gls":]], axis=1) # The latter table contains same columns in their begin so I omitted them

In [39]:
scraped_stats.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,G/Sh,G/SoT,Dist,FK,PK,PKatt,npxG,npxG/Sh,G-xG,np:G-xG
0,2021-08-14,12:30,Premier League,Matchweek 1,Sat,Home,W,5,1,Leeds United,...,0.31,0.63,18.2,0,0,0,1.5,0.09,3.5,3.5
1,2021-08-22,14:00,Premier League,Matchweek 2,Sun,Away,D,1,1,Southampton,...,0.07,0.33,15.1,1,0,0,1.8,0.14,-0.8,-0.8
2,2021-08-29,16:30,Premier League,Matchweek 3,Sun,Away,W,1,0,Wolves,...,0.1,0.33,18.8,1,0,0,0.6,0.06,0.4,0.4
3,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,4,1,Newcastle Utd,...,0.19,0.67,20.5,0,0,0,2.5,0.12,1.5,1.5
4,2021-09-14,18:45,Champions Lg,Group stage,Tue,Away,L,1,2,ch,...,0.5,0.5,10.8,0,0,0,0.5,0.26,0.5,0.5


### Scraping Data for Multiple Seasons and Teams

In [40]:
import time
import pandas
import re

In [41]:
# generating the links of seasons from 2021-22 to 2023-24 descending

season_links = [f"https://fbref.com/en/comps/9/20{i:02d}-20{i+1:02d}/20{i:02d}-20{i+1:02d}-Premier-League-Stats" for i in range(23, 21, -1)]

In [42]:
dataframes = [] # each DataFrames in list is a match log for one team in one season

In [45]:
##### Scraping over multiple seasons

for url in season_links:
    
    print("Starting a new season ")
    
    page = requests.get(url)
    time.sleep(3)
    
    soup = BeautifulSoup(page.content, "html.parser")

    results_table = soup.find("table")

    club_urls = []

    for url in results_table.find_all("a"):
        path = url.get("href") 
    
        if "squads" in path: # this contains the url of club stats
            club_urls.append(domain + path)

    for url in club_urls:

        # Getting Club stats and create dataframe
    
        page = requests.get(url)
        if page.status_code == 200:
            print(f"{url} is loaded succesful and getting start to scrapping")
            time.sleep(3)

        soup = BeautifulSoup(page.content, 'html.parser')

        club_stats = soup.find(id="matchlogs_for") # ID of table that contains the "Scores & Fixtures" is "match_for_logs"

        columns, data = extract_stats(club_stats)

        club_df = pd.DataFrame(data=data, columns=columns)
        club_df = club_df.copy().drop(["Match Report", "Notes"], axis=1) # Those isn't necessary columns and make hard in creating of table due to plain text

        # Extracting the season and team information from url and sav to variables (added to final dataframe in later)

        pattern = re.search(r"/(?P<content>.*?)-Stats", url) # Example: 'https://fbref.com/en/squads/19538871/2021-2022/Manchester-United-Stats'
        information = pattern.group("content").split("/")
    
        season = information[-2] # After slicing, the season there is before the last "/". Before tha last element in list
        team = information[-1].replace("-", " ") # The last element in the slicing list

        # Getting shots stats refer to the actual club

        links_on_page = soup.find_all("a")

        shoots_link = domain + soup.find("a", href=re.compile("/matchlogs/all_comps/shooting/")).get("href")

        page = requests.get(shoots_link)
        soup = BeautifulSoup(page.content, "html.parser")
        print(" - Shoot stats for actual club is loaded. Data process is beginning")
        time.sleep(3)

        shoots_stat_table = soup.find("table", id="matchlogs_for")

        columns, data = extract_shooting_stats(shoots_stat_table)

        shoots_df = pd.DataFrame(data=data, columns=columns)
        shoots_df = shoots_df[:49] # last, summary row is ommited
        shoots_df = shoots_df.copy().drop(["xG", "Match Report"], axis=1)

        combined_stats = pd.concat([club_df, shoots_df.loc[:, "Gls":]], axis=1) # The latter table contains same columns in their begin so I omitted them

        # Added the two addition information which was extracted earlier
        combined_stats["Season"] = season
        combined_stats["Team"] = team

        dataframes.append(combined_stats)
        print("Dataframe has been added.")
        time.sleep(5)

Starting a new season 
https://fbref.com/en/squads/b8fd03ef/2023-2024/Manchester-City-Stats is loaded succesful and getting start to scrapping
 - Shoot stats for actual club is loaded. Data process is beginning
Dataframe has been added.
https://fbref.com/en/squads/18bb7c10/2023-2024/Arsenal-Stats is loaded succesful and getting start to scrapping
 - Shoot stats for actual club is loaded. Data process is beginning
Dataframe has been added.
https://fbref.com/en/squads/822bd0ba/2023-2024/Liverpool-Stats is loaded succesful and getting start to scrapping
 - Shoot stats for actual club is loaded. Data process is beginning
Dataframe has been added.
https://fbref.com/en/squads/8602292d/2023-2024/Aston-Villa-Stats is loaded succesful and getting start to scrapping
 - Shoot stats for actual club is loaded. Data process is beginning
Dataframe has been added.
https://fbref.com/en/squads/361ca564/2023-2024/Tottenham-Hotspur-Stats is loaded succesful and getting start to scrapping
 - Shoot stats fo

In [46]:
dataframes[0].head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,npxG,npxG/Sh,G-xG,np:G-xG,Season,Team
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,,,Arsenal,...,,,0,0,,,,,2023-2024,Manchester City
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,...,13.9,0.0,0,0,1.9,0.12,1.1,1.1,2023-2024,Manchester City
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,,,es,...,,,0,0,,,,,2023-2024,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1.0,0.0,Newcastle Utd,...,17.9,0.0,0,0,1.0,0.07,0.0,0.0,2023-2024,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,...,17.3,2.0,0,1,2.8,0.1,-1.5,-0.8,2023-2024,Manchester City


In [47]:
epl_stats = pd.concat(dataframes, axis=0, ignore_index=True)

In [48]:
epl_stats = epl_stats[epl_stats["Comp"] == "Premier League"]

In [49]:
epl_stats.to_csv("epl.stats.csv", index=False)