In [61]:
import requests

stands_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Download the Premier League standings page.
# - timeout: requests waits indefinitely by default, which can hang the kernel.
# - raise_for_status: fail here on a 4xx/5xx response (e.g. a rate-limit or
#   block page) instead of with a confusing IndexError when the HTML is parsed.
data = requests.get(stands_url, timeout=30)
data.raise_for_status()

In [None]:
from bs4 import BeautifulSoup

# Parse the standings page held in `data`. The parser is named explicitly so
# the result does not depend on which optional parsers are installed.
# NOTE: `data` is rebound to the team page at the bottom of this cell, so the
# standings request must be re-run before this cell is executed a second time.
soup = BeautifulSoup(data.text, "html.parser")

# Filter the PL standings table for team links.
standings_tables = soup.select('table.stats_table')
if not standings_tables:
    raise ValueError(
        "No 'table.stats_table' element found; the standings page was probably "
        "not downloaded correctly (blocked or rate-limited request?)."
    )
standings_table = standings_tables[0]

# Anchors without an href yield None, so test for truthiness before searching.
links = [anchor.get('href') for anchor in standings_table.find_all('a')]
links = [href for href in links if href and '/squads' in href]

team_urls = [f"https://fbref.com{href}" for href in links]
if not team_urls:
    raise ValueError("No '/squads' links found in the standings table.")

# Download the page of the first team listed in the standings table.
team_url = team_urls[0]
data = requests.get(team_url, timeout=30)
data.raise_for_status()

In [None]:
import pandas as pd
from io import StringIO

""" Filter Team data for Matches Table """
# read_html returns a list with every table whose text matches the pattern.
# The whole list is kept in `matches` because it is indexed with matches[0]
# both here and in the merge cell further down.
team_page_buffer = StringIO(data.text)
matches = pd.read_html(team_page_buffer, match="Scores & Fixtures")

# Preview the first five rows of the first matching table.
matches[0].head(n=5)

In [None]:
# Parse the team page held in `data`; the parser is named explicitly so the
# result does not depend on which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")

# Filter the team page links for the all-competitions shooting match log.
links = [anchor.get('href') for anchor in soup.find_all('a')]
links = [href for href in links if href and 'all_comps/shooting/' in href]
if not links:
    # Mirrors the check done in the multi-season loop: fail with a clear
    # message instead of an IndexError on links[0].
    raise ValueError("No shooting match-log link found on the team page.")

# NOTE: `data` is rebound to the shooting page here, so the team page must be
# downloaded again before this cell is executed a second time.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

# The shooting table has a two-level column header; drop the top level so the
# stat names ('Sh', 'SoT', ...) become plain column labels.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
shooting.columns = shooting.columns.droplevel()

# Merge the matches and shooting tables on the match date (inner join, so only
# dates present in both tables are kept).
shooting_columns = ['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']
team_data = matches[0].merge(shooting[shooting_columns], on='Date')
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2025-08-10,15:00,FA Community Shield,FA Community Shield,Sun,Neutral,D,2 (2),2 (3),Crystal Palace,...,3-4-3,Chris Kavanagh,Match Report,Crystal Palace won on penalty kicks following ...,12,5,,,0,0
1,2025-08-15,20:00,Premier League,Matchweek 1,Fri,Home,W,4,2,Bournemouth,...,4-1-4-1,Anthony Taylor,Match Report,,19,10,16.6,0.0,0,0
2,2025-08-25,20:00,Premier League,Matchweek 2,Mon,Away,W,3,2,Newcastle Utd,...,4-3-3,Simon Hooper,Match Report,,5,4,19.3,0.0,0,0
3,2025-08-31,16:30,Premier League,Matchweek 3,Sun,Home,W,1,0,Arsenal,...,4-3-3,Chris Kavanagh,Match Report,,9,3,22.4,1.0,0,0
4,2025-09-14,14:00,Premier League,Matchweek 4,Sun,Away,W,1,0,Burnley,...,5-4-1,Michael Oliver,Match Report,,26,3,18.5,1.0,1,1


In [None]:
import time

# ---- Configuration ---------------------------------------------------------
# Seasons to scrape, newest first. Each value is treated as the calendar year
# in which the season ENDS (2022 -> the 2021-2022 season).
# NOTE(review): confirm this is the intended meaning of `year`.
years = list(range(2022, 2019, -1))
all_matches = []  # One DataFrame per (team, season)

# Season-specific standings page. Building the URL from `year` guarantees that
# the "Season" label matches the page that was scraped. The previous approach
# started from the current-season page and followed "previous season" links,
# which labelled whatever season is current as years[0].
# NOTE(review): URL pattern assumed from FBref's season pages — confirm.
SEASON_URL_TEMPLATE = (
    "https://fbref.com/en/comps/9/{start}-{end}/{start}-{end}-Premier-League-Stats"
)

# Pause before EVERY request (not once per team) to stay under the site's rate
# limit. NOTE(review): chosen for a limit of roughly 10 requests per minute —
# confirm against the site's current bot-traffic policy.
REQUEST_DELAY_SECONDS = 7
REQUEST_TIMEOUT_SECONDS = 30

SHOOTING_COLUMNS = ['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']


def _fetch(url):
    """Wait politely, download `url`, and raise on an HTTP error status.

    Returns the `requests.Response`. Raises `requests.HTTPError` for 4xx/5xx
    responses so a block/rate-limit page is never parsed as if it were data.
    """
    time.sleep(REQUEST_DELAY_SECONDS)
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response


# ---- Scrape ----------------------------------------------------------------
for year in years:
    standings_url = SEASON_URL_TEMPLATE.format(start=year - 1, end=year)
    data = _fetch(standings_url)
    soup = BeautifulSoup(data.text, "html.parser")

    standings_tables = soup.select('table.stats_table')
    if not standings_tables:
        print(f"No standings table found for season {year}")
        continue
    standings_table = standings_tables[0]  # Indv team links -> match data for each team

    # Anchors without an href yield None, so test for truthiness first.
    links = [anchor.get('href') for anchor in standings_table.find_all('a')]
    links = [href for href in links if href and '/squads' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    for team_url in team_urls:
        # ".../Manchester-City-Stats" -> "Manchester City"
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        data = _fetch(team_url)
        try:
            # read_html raises ValueError when no table matches the pattern.
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        except ValueError:
            print(f"No fixtures table found for {team_name} ({year})")
            continue

        soup = BeautifulSoup(data.text, "html.parser")
        shooting_links = [anchor.get('href') for anchor in soup.find_all('a')]
        shooting_links = [
            href for href in shooting_links if href and '/all_comps/shooting/' in href
        ]

        if not shooting_links:  # Check if links exist
            print(f"No shooting data found for {team_name}")
            continue

        shooting_data = _fetch(f"https://fbref.com{shooting_links[0]}")
        try:
            shooting = pd.read_html(StringIO(shooting_data.text), match="Shooting")[0]
        except ValueError:
            print(f"No shooting table found for {team_name} ({year})")
            continue
        # Drop the top level of the two-level header so stat names are plain labels.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on='Date')
        except (KeyError, ValueError) as exc:
            # KeyError: an expected shooting column is missing.
            # ValueError: the 'Date' columns cannot be merged.
            print(f"Skipping {team_name} ({year}): could not merge shooting data ({exc})")
            continue

        # Keep Premier League matches only. `.assign` returns a new frame, so
        # no columns are set on a filtered view (avoids SettingWithCopyWarning).
        team_data = team_data[team_data["Comp"] == "Premier League"].assign(
            Season=year,
            Team=team_name,
        )

        all_matches.append(team_data)

# ---- Combine and save ------------------------------------------------------
# Create final dataframe after all years and teams are processed
if not all_matches:
    raise ValueError(
        "No match data was collected; check the messages above "
        "(blocked requests or changed page layout?)."
    )
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
match_df.to_csv("match_data.csv")

# Display the dataframe
print(f"Total matches collected: {len(match_df)}")
print(f"Columns: {list(match_df.columns)}")
print("\nFirst 5 rows:")
match_df.head()