# **Python Web Scraper for Historical Data on Premier League teams from 2024 to 2022**

Using BeautifulSoup and pandas, data from previous seasons (such as wins, goals and xG) is scraped and then written to a CSV file for data analysis.

In [None]:
import requests
!pip install backoff lxml
import backoff

In [19]:
# Entry point for the scrape: the Premier League stats page on fbref, which
# holds the standings table with links to every squad page.
standings_url = (
    "https://fbref.com"
    "/en/comps/9/Premier-League-Stats"
)

In [20]:
# Download the standings page. The timeout keeps the cell from hanging
# indefinitely if the server never answers.
data = requests.get(standings_url, timeout=30)

In [21]:
from bs4 import BeautifulSoup
import time
from io import StringIO

In [22]:
# Parse the standings page and collect the relative squad links found in the
# first stats table. `links` is always defined (possibly empty), so a failed
# request or a changed page layout is reported here instead of crashing.
links = []
if data.status_code == 200:
    soup = BeautifulSoup(data.text, "html.parser")

    # Check if the table exists before accessing it
    standings_tables = soup.select('table.stats_table')
    if standings_tables:
        standings_table = standings_tables[0]
        hrefs = [a.get("href") for a in standings_table.find_all('a')]
        # Anchors without an href yield None; skip them before the substring test.
        links = [href for href in hrefs if href and '/squads/' in href]
    else:
        print("Table not found. The website structure might have changed.") # for future, in case if the website changes
else:
    print(f"Request failed with status code: {data.status_code}")

In [23]:
# Turn the relative squad links into absolute fbref URLs.
team_urls = [
    f"https://fbref.com{squad_path}"
    for squad_path in links
]

In [24]:
# Fetch the first team's page as a sample to develop the parsing steps on.
# The timeout keeps the cell from hanging if the server never answers.
data = requests.get(team_urls[0], timeout=30)

In [25]:
import pandas as pd
# Parse the team page and keep the first table matching "Scores & Fixtures".
# The HTML text is wrapped in StringIO so it is passed as a file-like object.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [26]:
# Collect every link on the team page that points at the all-competitions
# shooting log. The parser is named explicitly so the result does not depend
# on which optional parsers happen to be installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
# Anchors without an href yield None, hence the truthiness check.
links = [href for href in links if href and 'all_comps/shooting/' in href]

In [27]:
# Fetch the shooting log behind the first matching link. The timeout keeps the
# cell from hanging if the server never answers.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)

In [28]:
# Parse the shooting page and keep the first table matching "Shooting".
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

In [29]:
# Drop the top level of the column header so columns can be selected by plain
# name ("Date", "Sh", ...). Guarded so that re-running the cell on an already
# flattened header is a no-op instead of an error.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [30]:
# Attach the selected shooting columns to the fixtures, matching rows on "Date".
# No `how` is passed, so pandas' default inner join applies: only dates present
# in both tables are kept.
team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")

In [31]:
# Preview the first rows of the merged table to check the merge result.
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,4-2-3-1,Tim Robinson,Match Report,,18,5,14.8,0.0,0,0
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,4-4-2,Stuart Attwell,Match Report,,19,8,13.6,1.0,0,0
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,4-2-3-1,Anthony Taylor,Match Report,,11,3,13.4,0.0,0,0
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,4-2-3-1,Michael Oliver,Match Report,,14,5,14.9,0.0,0,0
4,2024-09-17,21:00,Champions Lg,League phase,Tue,Away,W,3.0,1.0,it Milan,...,4-2-3-1,Espen Eskås,Match Report,,23,11,15.7,1.0,0,0


In [32]:
# Seasons to scrape, newest first.
years = [2024, 2023, 2022]
# Filled by the scraping loop with one DataFrame per team and season.
all_matches = []

In [33]:
# Reset the starting point before the multi-season loop, which overwrites
# `standings_url` with the previous season's page on every iteration.
standings_url = (
    "https://fbref.com"
    "/en/comps/9/Premier-League-Stats"
)

In [34]:
def _is_rate_limited(response):
    """Return True when the server answered HTTP 429 (Too Many Requests)."""
    return response.status_code == 429


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=8)
@backoff.on_predicate(backoff.expo, _is_rate_limited, max_tries=8)
def get_with_backoff(url):
    """Fetch *url* politely, retrying with exponential backoff.

    Retries happen in two situations:

    * a ``requests`` exception is raised (connection error, timeout, ...);
    * the server answers with HTTP 429. That is an ordinary response, not an
      exception, so exception-based backoff alone never retried it.

    Args:
        url: Absolute URL to download.

    Returns:
        The ``requests.Response`` of the last attempt. If the server is still
        rate limiting after the final try, that 429 response is returned, so
        callers must keep checking ``status_code``.

    Raises:
        requests.exceptions.RequestException: If the request keeps failing
            with an exception after the final try.
    """
    # Fixed pause before every attempt to keep the request rate down.
    # NOTE(review): the recorded run still hit 429s with this delay; it may
    # need to be longer for this site - confirm against its rate limits.
    time.sleep(2)
    # The timeout turns a stalled connection into a retryable exception.
    return requests.get(url, timeout=30)


# Walk the seasons from newest to oldest. Each standings page links to the
# previous season's page, so `standings_url` is advanced once per iteration.
# For every team of a season, the fixtures table and the shooting table are
# downloaded, merged on "Date", reduced to Premier League matches and appended
# to `all_matches`.
for year in years:
    data = get_with_backoff(standings_url)
    if data.status_code != 200:
        # Without this page there is neither a team list nor a link to the
        # previous season; carrying on would re-fetch the same URL and file
        # its data under the wrong season.
        print(f"Request for the {year} standings failed with status code: {data.status_code}. Stopping...")
        break

    season_soup = BeautifulSoup(data.text, features="lxml")

    # Resolve the previous-season link up front so that it is available no
    # matter how the rest of this iteration goes.
    prev_links = season_soup.select("a.prev")
    prev_href = prev_links[0].get("href") if prev_links else None

    standings_tables = season_soup.select('table.stats_table')
    if standings_tables:
        standings_table = standings_tables[0]
        hrefs = [a.get("href") for a in standings_table.find_all('a')]
        # Anchors without an href yield None; skip them before the substring test.
        team_urls = [f"https://fbref.com{href}" for href in hrefs if href and '/squads/' in href]
    else:
        print(f"Standings table not found for year {year}. Skipping...")
        # Optionally, you can try a different selector or inspect the HTML
        # to find the correct selector for the standings table.
        team_urls = []

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        data = get_with_backoff(team_url)

        # Checking if request was successful
        if data.status_code != 200:
            print(f"Request failed for {team_name} in {year} with status code: {data.status_code}. Skipping...")
            continue

        try:
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        except ValueError:
            print(f"No Scores & Fixtures table found for {team_name} in {year}. Skipping...")
            continue

        # Same parser as for the standings page, so parsing does not depend on
        # which parser the library would otherwise pick.
        team_soup = BeautifulSoup(data.text, features="lxml")
        hrefs = [a.get("href") for a in team_soup.find_all('a')]
        shooting_links = [href for href in hrefs if href and 'all_comps/shooting/' in href]

        if not shooting_links:
            print(f"No shooting link found for {team_name} in {year}. Skipping...")
            continue

        data = get_with_backoff(f"https://fbref.com{shooting_links[0]}")

        # Checking if request was successful
        if data.status_code != 200:
            print(f"Request failed for {team_name} in {year} with status code: {data.status_code}. Skipping...")
            continue

        try:
            shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        except ValueError:
            print(f"No Shooting table found for {team_name} in {year}. Skipping...")
            continue

        # Drop the top header level so columns can be selected by plain name;
        # skipped when the header is already flat.
        if shooting.columns.nlevels > 1:
            shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except (KeyError, ValueError):
            # Selecting a column that the shooting table lacks raises KeyError,
            # which the ValueError handler alone would not catch.
            print(f"Merge failed for {team_name} in {year}. Skipping...")
            continue

        # Copy the filtered rows so the column assignments below act on an
        # independent frame rather than on a slice of the merge result.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

        time.sleep(1)

    if not prev_href:
        print(f"No previous-season link found on the {year} standings page. Stopping...")
        break
    standings_url = f"https://fbref.com{prev_href}"

Request failed for Chelsea in 2024 with status code: 429. Skipping...
Request failed for Bournemouth in 2024 with status code: 429. Skipping...
Request failed for Aston Villa in 2024 with status code: 429. Skipping...
Request failed for Brighton and Hove Albion in 2024 with status code: 429. Skipping...
Request failed for Fulham in 2024 with status code: 429. Skipping...
Request failed for Brentford in 2024 with status code: 429. Skipping...
Request failed for Manchester United in 2024 with status code: 429. Skipping...
Request failed for Crystal Palace in 2024 with status code: 429. Skipping...
Request failed for West Ham United in 2024 with status code: 429. Skipping...
Request failed for Tottenham Hotspur in 2024 with status code: 429. Skipping...
Request failed for Everton in 2024 with status code: 429. Skipping...
Request failed for Leicester City in 2024 with status code: 429. Skipping...
Request failed for Wolverhampton Wanderers in 2024 with status code: 429. Skipping...
Reques

In [35]:
# Number of per-team DataFrames collected by the scraping loop.
len(all_matches)

5

In [36]:
# Stack the per-team DataFrames into a single table. Each frame keeps its own
# row labels, so labels repeat across teams. pd.concat raises ValueError if
# `all_matches` is empty.
match_df = pd.concat(all_matches)

In [37]:
# Normalise all column names to lower case.
match_df.columns = list(map(str.lower, match_df.columns))

In [38]:
# Spot-check one row, selected by position (0-based) rather than by label.
match_df.iloc[50]

Unnamed: 0,6
date,2024-09-28
time,15:00
comp,Premier League
round,Matchweek 6
day,Sat
venue,Home
result,L
gf,0
ga,1
opponent,Fulham


In [41]:
# Write the combined table to matches.csv in the working directory. No `index`
# argument is passed, so pandas' default applies and the row labels are written
# as the first column.
match_df.to_csv("matches.csv")