In [17]:
import time
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [14]:
# Download the 2023-2024 Premier League standings page.
standings_url = "https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"

# Without a timeout, requests waits forever on a stalled connection and the
# kernel appears frozen; 30 seconds is a generous upper bound for one page.
data = requests.get(standings_url, timeout=30)

In [None]:
# Extract the site-relative squad links from the standings page fetched above.
print("Status Code:", data.status_code)

# Start from an empty list so the following cells always find `links`,
# even when the request failed or the page contained no stats table.
links = []

if data.status_code == 200:
    soup = BeautifulSoup(data.text, 'html.parser')
    standings = soup.select('table.stats_table')
    if standings:
        # Only the first element matching `table.stats_table` is searched.
        hrefs = [a.get("href") for a in standings[0].find_all('a')]
        # Anchors without an href yield None; drop them before the substring
        # test, which would otherwise raise TypeError.
        links = [h for h in hrefs if h and '/squads/' in h]
        print("Found links:", links)
    else:
        print("No tables found.")
else:
    print("Failed to retrieve data. Status Code:", data.status_code)

# Pause before the next request to the site.
time.sleep(5)

In [4]:
# Prefix each site-relative squad path with the fbref host to get absolute URLs.
team_urls = [
    f"https://fbref.com{squad_path}"
    for squad_path in links
]
team_urls

['https://fbref.com/en/squads/b8fd03ef/2023-2024/Manchester-City-Stats',
 'https://fbref.com/en/squads/18bb7c10/2023-2024/Arsenal-Stats',
 'https://fbref.com/en/squads/822bd0ba/2023-2024/Liverpool-Stats',
 'https://fbref.com/en/squads/8602292d/2023-2024/Aston-Villa-Stats',
 'https://fbref.com/en/squads/361ca564/2023-2024/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/cff3d9bb/2023-2024/Chelsea-Stats',
 'https://fbref.com/en/squads/b2b47a98/2023-2024/Newcastle-United-Stats',
 'https://fbref.com/en/squads/19538871/2023-2024/Manchester-United-Stats',
 'https://fbref.com/en/squads/7c21e445/2023-2024/West-Ham-United-Stats',
 'https://fbref.com/en/squads/47c64c55/2023-2024/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/d07537b9/2023-2024/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/4ba7cbea/2023-2024/Bournemouth-Stats',
 'https://fbref.com/en/squads/fd962109/2023-2024/Fulham-Stats',
 'https://fbref.com/en/squads/8cec06e1/2023-2024/Wolverhampton-Wanderers-Stat

In [None]:
# Download the first team's page and read its "Scores & Fixtures" table.
team_url = team_urls[0]
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
data = requests.get(team_url, timeout=30)

# read_html returns a list of every table whose text matches `match`.
# Passing the HTML as a literal string is deprecated in recent pandas,
# so the text is wrapped in a file-like StringIO instead.
matches = pd.read_html(StringIO(data.text), match = "Scores & Fixtures")
matches[0].head()

In [6]:
# Collect every link on the team page that points to the all-competitions
# shooting match logs.
# The parser is named explicitly (as in the standings cell) so the result
# does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(data.text, 'html.parser')

hrefs = [a.get("href") for a in soup.find_all('a')]
# Anchors without an href yield None and are skipped by the `h and` guard.
links = [h for h in hrefs if h and 'all_comps/shooting/' in h]
links

['/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions',
 '/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']

In [None]:
# Fetch the page behind the first shooting link and read its "Shooting" table.
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
# Literal-HTML input to read_html is deprecated in recent pandas; wrap the
# text in StringIO so it is treated as a file-like object.
shooting = pd.read_html(StringIO(data.text), match = "Shooting")[0]
# droplevel() removes the outermost header level (the columns are expected to
# be a MultiIndex) so that columns can be selected by plain names such as "Sh".
shooting.columns = shooting.columns.droplevel()
shooting.head()


In [8]:
# Attach the selected shooting columns to the fixtures table. Rows are paired
# on "Date"; with the default inner join only dates present in both survive.
team_data = pd.merge(
    matches[0],
    shooting.loc[:, ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,...,4-3-3,Stuart Attwell,Match Report,,8,4,,,0,0
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,5-4-1,Craig Pawson,Match Report,,17,8,13.9,0.0,0,0
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,...,4-2-3-1,François Letexier,Match Report,,23,7,,,0,0
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,4-3-3,Robert Jones,Match Report,,14,4,17.9,0.0,0,0
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,3-5-2,Jarred Gillett,Match Report,,29,9,17.3,2.0,0,1


In [9]:
# Season labels to scrape, newest first. The first label is paired with the
# 2023-2024 standings page below; each later one with the page the scraping
# loop reaches through the "previous season" link.
years = [2023, 2022]

# Accumulates one DataFrame per team and season.
all_matches = list()
# Standings page of the newest season; the scraping loop starts here.
standings_url = "https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"

In [None]:
# Scrape match logs season by season, newest first. For every team linked from
# the standings table, merge its "Scores & Fixtures" table with its shooting
# stats, keep only Premier League rows, and collect the result in all_matches.

# Rebuild the list and walk the seasons through a local URL so that re-running
# this cell neither duplicates rows nor starts from an already-advanced season.
all_matches = []
season_url = standings_url

for year in years:
    data = requests.get(season_url, timeout=30)
    # Fail with a clear HTTP error instead of an IndexError on the table lookup.
    data.raise_for_status()
    season_soup = BeautifulSoup(data.text, 'html.parser')
    standings = season_soup.select('table.stats_table')[0]

    # Anchors without an href yield None; drop them before the substring test.
    hrefs = [a.get("href") for a in standings.find_all('a')]
    team_urls = [f"https://fbref.com{h}" for h in hrefs if h and '/squads/' in h]

    # The "previous season" link on this page is the next iteration's start.
    previous_season = season_soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Throttle at the top of the iteration so that teams skipped via
        # `continue` are rate-limited as well.
        time.sleep(1)

        team_page = requests.get(team_url, timeout=30)
        try:
            # read_html raises ValueError when no table matches `match`.
            matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]
        except ValueError:
            print(f"Skipping {team_name} ({year}): no 'Scores & Fixtures' table found")
            continue

        team_soup = BeautifulSoup(team_page.text, 'html.parser')
        team_hrefs = [a.get("href") for a in team_soup.find_all('a')]
        shooting_links = [h for h in team_hrefs if h and 'all_comps/shooting/' in h]
        if not shooting_links:
            # Indexing an empty list would abort the whole scrape.
            print(f"Skipping {team_name} ({year}): no shooting link found")
            continue
        shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}", timeout=30)

        try:
            shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        except ValueError:
            print(f"Skipping {team_name} ({year}): no 'Shooting' table found")
            continue
        # Remove the outermost header level so columns have plain names.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except (KeyError, ValueError):
            # Selecting a column that is absent raises KeyError, which a
            # ValueError-only handler would let escape and stop the scrape.
            print(f"Skipping {team_name} ({year}): shooting columns missing or merge failed")
            continue

        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below are not chained assignments on a slice.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

In [None]:
# Stack the per-team frames into one table and normalise the column names.
# concat keeps each frame's own row labels because ignore_index is not set.
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
# The original target "matches_dsv" was a typo for a CSV file name.
# to_csv also writes the row labels as the first, unnamed column.
match_df.to_csv("matches.csv")