# 1. Web Scraping Soccer Matches from the English Premier League (EPL)

In [82]:
import time
from io import StringIO

import requests
from bs4 import BeautifulSoup

In [83]:
# fbref Premier League stats page; the standings table is scraped from it below.
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [84]:
# Download the standings page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() surfaces HTTP errors (for
# example a 429 rate-limit response) here rather than as a confusing parsing
# failure in a later cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [102]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(data.text, 'html.parser')
# The first table carrying the "stats_table" class is used as the standings table.
standings_table = soup.select('table.stats_table')[0]

# Getting href links for all squads in the table
links = [anchor.get('href') for anchor in standings_table.find_all('a')]
# Anchors without an href yield None; drop them before the substring test,
# which would otherwise raise TypeError.
links = [href for href in links if href and '/squads/' in href]
links

['/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/d20821dd/LASK-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/a2d435b3/Leicester-City-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/e14f61a5/Union-SG-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/3f8c4b5f/Toulouse-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/e297cd13/Luton-Town-Stats',
 '/en/squads/3f8c4b5f/Toulouse-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/d20821dd/LASK-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/1df6b87e/Sheffield-United-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads

In [None]:
# Prefix every site-relative squad path with the fbref origin to get absolute URLs.
teams_urls = list(map("https://fbref.com{}".format, links))
teams_urls

### 1.1 Extracting Match Stats using Pandas and Requests

In [None]:
import pandas as pd

In [None]:
# Use the first squad page as a worked example.
team_url = teams_urls[0]
data1 = requests.get(team_url, timeout=30)
# Fail here on an HTTP error instead of inside read_html with "No tables found".
data1.raise_for_status()

# read_html returns a list with every table whose text matches `match`.
# Passing raw HTML as a str is deprecated in recent pandas releases, so the
# markup is wrapped in a file-like StringIO.
matches = pd.read_html(StringIO(data1.text), match='Scores & Fixtures')
matches[0]

### 1.2 Extracting Match Shooting Stats with Requests and Pandas

In [None]:
# Parse the squad page with an explicitly named parser so the result does not
# depend on which optional parsers are installed.
soup1 = BeautifulSoup(data1.text, 'html.parser')
# NOTE: this rebinds `links`, which earlier held the squad links.
links = [anchor.get('href') for anchor in soup1.select('a')]
# Keep only the links that point at the all-competitions shooting page.
links = [href for href in links if href and 'all_comps/shooting/' in href]

In [None]:
# Fetch the shooting page behind the first matching link.
data2 = requests.get(f"https://fbref.com{links[0]}", timeout=30)
# Fail here on an HTTP error instead of inside read_html with "No tables found".
data2.raise_for_status()
# Wrap the markup in StringIO: passing raw HTML as a str to read_html is
# deprecated in recent pandas releases.
shooting = pd.read_html(StringIO(data2.text), match='Shooting')[0]
shooting.head()

### 1.3 Cleaning and Merging Scraped Data with Pandas

In [None]:
# Cleaning the headers: drop the first level of the column index.
shooting.columns = shooting.columns.droplevel(0)
shooting.head(n=5)

In [None]:
# Join the fixtures table with the selected shooting columns on the match date.
team_data = pd.merge(
    matches[0],
    shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']],
    on='Date',
)
team_data.shape

In [103]:
# Season labels used when scraping multiple teams over multiple years.
years = [*range(2020, 2025)]
years

[2020, 2021, 2022, 2023, 2024]

In [104]:
# Scrape Premier League match and shooting data for every squad in every
# season listed in `years`, collecting one DataFrame per squad-season in
# `all_matches`.
all_matches = []
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

BASE_URL = "https://fbref.com"
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
REQUEST_DELAY = 5  # seconds to pause before each squad-level request


def _fetch(url, max_attempts=5):
    """Return the response for *url*, waiting and retrying on HTTP 429.

    Raises requests.HTTPError for any other error status and RuntimeError
    when the server is still rate limiting after *max_attempts* tries.
    """
    for _ in range(max_attempts):
        response = requests.get(url, timeout=30)
        if response.status_code != 429:
            response.raise_for_status()
            return response
        try:
            # Default to 60 seconds if the header is not present.
            retry_after = int(response.headers.get("Retry-After", 60))
        except ValueError:
            # Retry-After may also be an HTTP date; fall back to the default.
            retry_after = 60
        print(f"Rate limit hit. Retrying after {retry_after} seconds")
        time.sleep(retry_after)
    raise RuntimeError(f"Still rate limited after {max_attempts} attempts: {url}")


# The walk starts at the current season's page and follows the "previous
# season" link each iteration, so the labels must be consumed newest-first.
for year in sorted(years, reverse=True):
    data = _fetch(standings_url)
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [anchor.get("href") for anchor in standings_table.find_all('a')]
    links = [href for href in links if href and '/squads/' in href]
    # The table can link the same squad more than once; dict.fromkeys drops
    # the repeats (keeping order) so no squad is scraped twice.
    team_urls = [f"{BASE_URL}{href}" for href in dict.fromkeys(links)]

    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"{BASE_URL}{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Pause before every request so the skip paths below are throttled too.
        time.sleep(REQUEST_DELAY)
        data = _fetch(team_url)
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

        soup = BeautifulSoup(data.text, "html.parser")
        links = [anchor.get("href") for anchor in soup.find_all('a')]
        links = [href for href in links if href and 'all_comps/shooting/' in href]
        if not links:
            print(f"No shooting link found for {team_name} ({year}); skipping")
            continue

        time.sleep(REQUEST_DELAY)
        data = _fetch(f"{BASE_URL}{links[0]}")
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()

        try:
            team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError):
            # A missing shooting column raises KeyError; skip that squad.
            continue

        # .copy() so the column assignments below write to an independent
        # frame rather than a slice of the merge result.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()
        if team_data.empty:
            # Squads with no Premier League rows contribute nothing.
            continue

        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        
        


In [106]:
# Stack the per-squad frames, lower-case every column name, and save to CSV.
match_df = pd.concat(all_matches)
match_df.columns = list(map(str.lower, match_df.columns))
match_df.to_csv('matches.csv')