In [5]:
import requests

In [7]:
from bs4 import BeautifulSoup

In [18]:
import pandas as pd

In [22]:
from io import StringIO

In [15]:
def fetch_url(url, retries=3, timeout=30, wait=5):
    """GET ``url`` with a browser-like User-Agent, retrying on HTTP 429 and network errors.

    Parameters
    ----------
    url : str
        Address to request.
    retries : int
        Maximum number of GET attempts.
    timeout : float
        Seconds passed to ``requests.get`` as ``timeout`` so a stalled
        connection cannot hang the notebook.
    wait : float
        Seconds to sleep before the next attempt after a 429 response or a
        network error.

    Returns
    -------
    requests.Response or None
        The response when the status code is 200. ``None`` when any other
        non-429 status is received, or when every attempt was rate-limited
        or failed at the network level.
    """
    # Local import: this function is defined before the notebook's
    # `import time` cell, so it must not rely on that cell having run.
    import time

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts / connection resets are treated as retryable rather
            # than letting one bad request abort the whole crawl.
            print(f"Request failed for {url}: {exc}")
            if not is_last_attempt:
                time.sleep(wait)
            continue

        if response.status_code == 200:
            return response
        if response.status_code != 429:
            print(f"Error: Received unexpected status code {response.status_code} for {url}")
            return None

        # 429: back off and try again, but do not sleep after the final
        # attempt since nothing follows it.
        if is_last_attempt:
            print(f"Rate limit reached, giving up on {url} after {retries} attempts.")
        else:
            print(f"Rate limit reached, retrying in {wait} seconds...")
            time.sleep(wait)
    return None

In [13]:
# Entry point for the scrape: the fbref Premier League stats page.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [14]:
# Download the standings page. The timeout stops a stalled connection from
# hanging the kernel, and raise_for_status() surfaces an HTTP error (e.g. 429)
# here instead of as a confusing parse failure in a later cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [19]:
# Parse the standings page and take the first stats table on it.
soup = BeautifulSoup(data.text, 'html.parser')
standings_table = soup.select('table.stats_table')[0]
# Collect the squad-page hrefs from that table. Anchors without an href
# yield None from .get(), which would make the substring test raise
# TypeError, so they are filtered out first.
links = [
    href
    for href in (a.get("href") for a in standings_table.find_all('a'))
    if href and '/squads/' in href
]

In [16]:
# The collected hrefs are site-relative; prefix the host to get absolute URLs.
team_urls = [f"https://fbref.com{l}" for l in links]

In [17]:
# Download the first squad page as a worked example. The timeout prevents an
# indefinite hang, and raise_for_status() fails here on an HTTP error rather
# than later when the expected table cannot be found.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [20]:
# Wrap the HTML text in a file-like object so it can be handed to pd.read_html.
html_buffer = StringIO(data.text)

In [21]:
# read_html returns a list of tables whose text matches "Scores & Fixtures";
# keep the first one.
matches = pd.read_html(html_buffer, match="Scores & Fixtures")[0]

In [22]:
# Parse the squad page with an explicit parser (the same one used for the
# standings page) so the result does not depend on which optional parsers
# happen to be installed.
soup = BeautifulSoup(data.text, 'html.parser')
# Keep only hrefs pointing at the all-competitions shooting page; anchors
# without an href yield None and are skipped.
links = [
    href
    for href in (a.get("href") for a in soup.find_all('a'))
    if href and 'all_comps/shooting/' in href
]

In [26]:
# Download the shooting page behind the first matching link. The timeout
# prevents an indefinite hang, and raise_for_status() reports an HTTP error
# here instead of as a missing-table error in a later cell.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [27]:
# Wrap the shooting page's HTML in a file-like object for pd.read_html.
html_buffer = StringIO(data.text)

In [28]:
# Keep the first table on the page whose text matches "Shooting".
shooting = pd.read_html(html_buffer, match="Shooting")[0]

In [29]:
# Preview the first rows; note the two-level column header (group, stat name).
shooting.head()

Unnamed: 0_level_0,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,...,,,0,0,,,,,,Match Report
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,...,19.1,0.0,0,0,0.8,0.8,0.07,1.2,1.2,Match Report
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,...,17.8,1.0,1,1,3.3,2.6,0.2,0.7,0.4,Match Report
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,West Ham,...,15.0,1.0,0,0,3.0,3.0,0.13,0.0,0.0,Match Report
4,,,,,,,--,,,,...,16.7,2.0,1,1,,,0.13,10.0,9.0,


In [30]:
# Flatten the two-level header by dropping the outer (group) level so columns
# can be selected by stat name alone. The guard makes this cell safe to
# re-run: droplevel() raises ValueError once only one level is left.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [31]:
# Attach the selected shooting stats to the fixture list, pairing rows by
# match date (default inner join).
team_data = pd.merge(
    matches,
    shooting.loc[:, ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

In [32]:
# Sanity-check the merge: fixture columns followed by the shooting columns.
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,...,4-2-3-1,Jarred Gillett,Match Report,Manchester City won on penalty kicks following...,9,1,,,0,0
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,...,4-2-3-1,,Match Report,,11,5,19.1,0.0,0,0
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,...,5-4-1,,Match Report,,13,4,17.8,1.0,1,1
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,West Ham,...,4-2-3-1,Michael Oliver,Match Report,,23,8,15.0,1.0,0,0


In [3]:
# Season labels to scrape, newest first.
years = [2022, 2021]
# Accumulator for the per-team DataFrames produced by the scrape loop.
all_matches = list()

In [1]:
# (Re)set the crawl's starting page. The scrape loop overwrites standings_url
# with the previous-season link, so this cell must be re-run before the loop
# is run again.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [31]:
import time

# Shooting-table columns merged onto each team's fixture list.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _scrape_team(team_url, year):
    """Download one squad page plus its shooting page and merge the two tables.

    Parameters
    ----------
    team_url : str
        Absolute URL of the squad page.
    year : int
        Season label written into the ``Season`` column.

    Returns
    -------
    pandas.DataFrame or None
        The team's rows whose ``Comp`` is "Premier League", with ``Season``
        and ``Team`` columns added. ``None`` when a request fails, a table or
        the shooting link is missing, or the merge fails (the reason is
        printed, except for request failures which ``fetch_url`` reports).
    """
    # Team name comes from the last URL segment, e.g. ".../Some-Team-Stats".
    team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

    team_page = fetch_url(team_url)
    if not team_page:
        return None

    try:
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]
    except ValueError:
        print(f"No 'Scores & Fixtures' table found for {team_name}.")
        return None

    # Locate the all-competitions shooting page linked from the squad page;
    # anchors without an href yield None and are skipped.
    team_soup = BeautifulSoup(team_page.text, 'html.parser')
    shooting_links = [
        href
        for href in (a.get("href") for a in team_soup.find_all('a'))
        if href and 'all_comps/shooting/' in href
    ]
    if not shooting_links:
        print(f"No 'Shooting' link found for {team_name}.")
        return None

    shooting_page = fetch_url(f"https://fbref.com{shooting_links[0]}")
    if not shooting_page:
        return None

    try:
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
    except ValueError:
        print(f"No 'Shooting' table found for {team_name}.")
        return None

    # Drop the outer header level only when there is one; droplevel() raises
    # on a single-level header.
    if shooting.columns.nlevels > 1:
        shooting.columns = shooting.columns.droplevel()

    try:
        merged = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
    except (KeyError, ValueError):
        # KeyError: an expected column is missing; ValueError: merge failure.
        print(f"Data merging failed for {team_name}.")
        return None

    # Filter for Premier League games only. assign() returns a new frame, so
    # the label columns are not written into a slice of `merged`.
    league_only = merged[merged["Comp"] == "Premier League"]
    return league_only.assign(Season=year, Team=team_name)


# NOTE(review): `year` is paired with pages purely by iteration order, i.e. it
# assumes the page at the initial standings_url is the years[0] season —
# confirm before trusting the Season column.
for year in years:
    data = fetch_url(standings_url)
    if not data:
        # standings_url only advances via the page's previous-season link.
        # Carrying on would re-scrape this same page under the next year's
        # label, so stop instead.
        print(f"Could not fetch standings page for {year}; stopping.")
        break

    soup = BeautifulSoup(data.text, 'html.parser')

    # Resolve the previous season's URL up front so a page without a
    # standings table does not stall the crawl on the same season.
    prev_links = soup.select("a.prev")
    prev_href = prev_links[0].get("href") if prev_links else None

    standings_tables = soup.select('table.stats_table')
    if standings_tables:
        squad_hrefs = [
            href
            for href in (a.get("href") for a in standings_tables[0].find_all('a'))
            if href and '/squads/' in href
        ]
        team_urls = [f"https://fbref.com{href}" for href in squad_hrefs]

        for team_url in team_urls:
            team_matches = _scrape_team(team_url, year)
            if team_matches is not None:
                all_matches.append(team_matches)

            # Pause to respect the website's rate limits. This runs after
            # every team, including ones that were skipped.
            time.sleep(1)
    else:
        print("No standings table found.")

    if not prev_href:
        print("No previous-season link found; stopping.")
        break
    standings_url = f"https://fbref.com{prev_href}"

Rate limit reached, retrying in 5 seconds...
Rate limit reached, retrying in 5 seconds...
Rate limit reached, retrying in 5 seconds...
Rate limit reached, retrying in 5 seconds...
Rate limit reached, retrying in 5 seconds...
Rate limit reached, retrying in 5 seconds...


In [24]:
# Stack every per-team frame into one table with a fresh 0..n-1 index.
# NOTE(review): final_data is not referenced again in the visible cells;
# match_df below is built from the same list — confirm whether both are needed.
final_data = pd.concat(all_matches, ignore_index=True)

In [27]:
# Number of per-team frames the scrape loop collected.
len(all_matches)

9

In [28]:
# Concatenate the per-team frames, keeping each frame's own row labels, so
# index values repeat across teams.
match_df = pd.concat(all_matches)

In [29]:
# Normalise the column names to lower case.
match_df.columns = list(map(str.lower, match_df.columns))

In [30]:
# Display the combined frame.
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13,1,18.7,1.0,1,1,2022,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19,7,17.5,0.0,0,0,2022,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21,10,16.2,1.0,0,0,2022,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18,5,14.1,0.0,0,0,2022,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17,9,14.8,0.0,0,0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2023-04-29,15:00,Premier League,Matchweek 34,Sat,Home,W,2,1,Nott'ham Forest,...,Match Report,,14,8,16.8,1.0,0,0,2022,Brentford
37,2023-05-06,17:30,Premier League,Matchweek 35,Sat,Away,L,0,1,Liverpool,...,Match Report,,5,1,19.5,1.0,0,0,2022,Brentford
38,2023-05-14,14:00,Premier League,Matchweek 36,Sun,Home,W,2,0,West Ham,...,Match Report,,24,9,12.8,0.0,0,0,2022,Brentford
39,2023-05-20,12:30,Premier League,Matchweek 37,Sat,Away,W,3,1,Tottenham,...,Match Report,,11,4,18.1,0.0,0,0,2022,Brentford


In [32]:
# Persist the combined matches. With to_csv's defaults the DataFrame index is
# written as the first (unnamed) CSV column.
match_df.to_csv("matches.csv")