# Scrapping Premier League
In this initial project, we're going to use web scraping to get the necessary data on the EPL match results and then load them into pandas as a cleaned table ready for further machine learning.

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
import numpy as np
from itertools import compress
import time

In [2]:
url = 'https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats'

In [3]:
from urllib.parse import urlparse
parsed = urlparse(url)
scheme = parsed.scheme
netloc = parsed.netloc

## Extract all the links of the team statistics

In [4]:
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')
table = bs.select('table.stats_table')[0].find_all('a', href= re.compile('squads'))
links = []
for link in table:
    if 'href' in link.attrs:
        link = scheme + '://' + netloc + link.get('href')
        links.append(link)
        print(link)

IndexError: list index out of range

In [12]:
req = requests.get(url)
req.headers

{'Date': 'Thu, 11 Jan 2024 16:17:51 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Retry-After': '1799', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Set-Cookie': '__cf_bm=Fm6pjEykTTZAzeqgOuLPwXYFDR5ZDynd4ctO6A.WW7I-1704989871-1-AU1A0AuNcPr93CfOoop2fmW5Q7RGGZhPs7DiQvE2ry2EhstKwrhQIHy2NIucE+Mb7cMIF/82TQQrIWZsf6Ib+W8=; path=/; expires=Thu, 11-Jan-24 16:47:51 GMT; domain=.fbref.com; HttpOnly; Secure; SameSite=None', 'Vary': 'Accept-Encoding', 'Server': 'cloudflare', 'CF-RAY': '843e78e64ceb3a79-FRA'}

In [None]:
#  the table containing scores and fixtures for all the matches of Manchester City
mancurl = links[0]
req = requests.get(mancurl)
bs = BeautifulSoup(req.text, 'html.parser')
matches = pd.read_html(req.text, match="Scores & Fixtures")[0]
matches.head()


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,,,57,,Fernandinho,4-3-3,Paul Tierney,Match Report,
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.8,1.0,65,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.6,0.1,67,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,4.4,0.2,80,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.8,0.6,61,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,


In [None]:
#  the shooting stats for the Manchester City - the number of shots, the number of shots on target, the number of free kicks, and the number of penalty kick
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
internallinksmancity = getInternalLinks(bs, mancurl)
shooting_links = []
for internallink in internallinksmancity:
    if 'shooting' in internallink and 'All' in internallink:
        shooting_links.append(internallink)
        print(internallink)

https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions


In [None]:
req = requests.get(shooting_links[0])
shootings = pd.read_html(req.text, match = re.compile(r"Shooting"))[0][:-1]
shootings.columns = shootings.columns.droplevel(0)
print(f'Matches shape:{matches.shape}')
print(f'Shootings shape:{shootings.shape}')

Matches shape:(58, 19)
Shootings shape:(58, 26)


In [None]:
# Concatenate to get the final table for Man City
target_columns = ['Date', 'Gls', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']
new_names_target_columns = ['Date', 'Goals', 'Shots', 'Shots_on_target', 'Distance', 'Freekicks', 'Penalties', 'Attempted_penalties']
rename_map = dict(zip(target_columns, new_names_target_columns))
shootings = shootings.loc[:, [col for col in target_columns if col in shootings.columns]]



try:
    mcmatch_shootings = matches.merge(shootings, on ='Date')
except Exception as e:
    print(f'Error: {e}')
mcmatch_shootings.rename(columns= rename_map, inplace= True)
mcmatch_shootings.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Referee,Match Report,Notes,Goals,Shots,Shots_on_target,Distance,Freekicks,Penalties,Attempted_penalties
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,Paul Tierney,Match Report,,0,12,3,,,0,0
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Anthony Taylor,Match Report,,0,18,4,17.3,1.0,0,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Graham Scott,Match Report,,4,16,4,18.5,1.0,0,0
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Martin Atkinson,Match Report,,5,25,10,14.8,0.0,0,0
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Paul Tierney,Match Report,,1,25,8,14.3,0.0,0,0


## Extraction on scale
This section tries to scrap data on a larger scale. It aims to allow the user to enter two years and retrieve the tables of the years in between.

In [34]:
#  the shooting stats - the number of shots, the number of shots on target, the number of free kicks, and the number of penalty kick
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

In [45]:
def safereq(url):
    try:
        time.sleep(3)
        req = requests.get(url)
        if req.status_code == 429:
            print(f'Need to wait {10 + int(req.headers.get("Retry-After"))}')
            time.sleep(10 + int(req.headers.get('Retry-After')))
            req = requests.get(url)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    return req


def safeget(url):
    req = safereq(url)
    req.encoding = 'utf-8'
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs

def geturlparts(url):
    parsed = urlparse(url)
    scheme = parsed.scheme
    netloc = parsed.netloc
    bs = safeget(url)
    return scheme,netloc,bs

In [25]:
# Enter a year number and get a link from where to start
def link_selector(latest_season):
    url = 'https://fbref.com/en/comps/9/history/Premier-League-Seasons'
    bs = safeget(url)
    year_table = bs.select('table.stats_table')
    selected_year_tables = year_table[0].find_all('a', string = re.compile(r'19\d\d|200\d|201\d|202[123]'))
    selected_links = [link_tag.get('href') for link_tag in selected_year_tables]
    selected_links = [scheme + '://' + netloc + link if link.startswith('/') else link for link in selected_links]
    starting_url = ''
    for link in selected_links:
        if '-' + str(latest_season) in link:
            starting_url = link # 2020 would yield 2019-2020
    return starting_url

In [18]:
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
# Returns the list of links of the teams of this season/year
def team_stats_of_a_season_links_extractor(url):
    scheme, netloc, bs = geturlparts(url)
    table = bs.select('table.stats_table')[0].find_all('a', href=re.compile('squads'))
    links = [f"{scheme}://{netloc}{link.get('href')}" for link in table if 'href' in link.attrs]
    return links




In [19]:
def prevyear_url_extractor(url):
    scheme, netloc, bs = geturlparts(url)
    desired_tag = bs.find('a', class_ = 'button2 prev')
    prevlink = scheme + '://' + netloc + desired_tag.get('href')
    return prevlink

In [20]:
# Step 5.1 For each website above, extract the name
def name_extractor(url):
    team_name_pattern = re.compile(r'\d{4}-\d{4}/([A-Za-z-]+)-Stats')
    season_pattern = re.compile(r'(\d{4}-\d{4})/(?:[A-Za-z-]+)-Stats')
    season_in_case = re.findall(season_pattern, url)[0]
    team_name = re.findall(team_name_pattern, url)[0].replace('-', ' ')
    return season_in_case, team_name

In [21]:
# Step 5.2 For each website above, extract the Scores & Fixtures table
def scores_fixtures_extractor(url):
    try:
        time.sleep(2)
        req = safereq(url)
    except requests.exceptions.RequestException as e:
        return []
    
    try:
        tables = pd.read_html(req.text, match="Scores & Fixtures")
        if tables:
            matches = tables[0]

            # Deal with matches not played
            # matches = matches.dropna(axis = 0, subset = ['Referee'])

            print(f'Matches shape:{matches.shape} from {url}')
        else:
            print('No S&F table')
    except Exception as e:
        print(f'Error: {e}')
        return []
    return matches


In [23]:
# Step 5.3 For each website above, first get the Shootings page, where the Shootings table is obtained
def shootings_extractor(url):
    bs = safeget(url)
    internallinks = getInternalLinks(bs, url)
    shooting_links = []
    for internallink in internallinks:
        if 'shooting' in internallink and 'All' in internallink:
            shooting_links.append(internallink)

    try:
        time.sleep(2)
        req = safereq(shooting_links[0]) # The first one is for all competitions
    except requests.exceptions.RequestException as e:
        return []
    

    try:
        tables = pd.read_html(req.text, match = re.compile(r"Shooting"))
    except Exception as e:
        print(f'Error: {e}')
        return []

    if tables:
        shootings = tables[0][:-1] # exclude total row
        shootings.columns = shootings.columns.droplevel(0)
        print(f'Shootings shape:{shootings.shape} from {url}')
    return shootings

In [24]:
# 5.4 It takes a url, gets the matches and shootings and merges them
def match_shooting_merger(url):
    matches = scores_fixtures_extractor(url)
    shootings = shootings_extractor(url)
    season_in_case, team_name = name_extractor(url)
    
    if matches is not [] and shootings is not []:

        target_columns = ['Date', 'Gls', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']
        new_names_target_columns = ['Date', 'Goals', 'Shots', 'Shots_on_target', 'Distance', 'Freekicks', 'Penalties', 'Attempted_penalties']
        rename_map = dict(zip(target_columns, new_names_target_columns))
        shootings = shootings.loc[:, [col for col in target_columns if col in shootings.columns]]



        try:
            match_shootings = matches.merge(shootings, on ='Date')
        except Exception as e:
            print(f'Error: {e}')
        match_shootings.rename(columns= rename_map, inplace= True)


        # make it Premier League only
        match_shootings['ispml'] = match_shootings['Comp'].str.contains('[Pp]remier\s+[Ll]eague', na=False, regex=True) # Fuzzy match
        pml = match_shootings.copy()[match_shootings['ispml'] == True]
        pml['Season'] = season_in_case
        pml['Team'] = team_name
        pml.drop('ispml', axis = 1, inplace= True)
        return pml


    else:
        print(f'Either table missing for {team_name} in {season_in_case}')
        return []


In [43]:
# Scraping the tables through the list of URLs!
def stats_scraper(start_season, end_season):
    seasons = list(range(end_season, start_season -1, -1))  # This will create a list from 2022 to 2010 in reverse order.
    start_url = link_selector(end_season)
    team_stats_tables = []
    for season in seasons:
        print(f'\nCurrent season {season-1} - {season}\n')
        # Extract the absolute URLs for the teams from the table
        scheme, netloc, bs = geturlparts(start_url)
        table = bs.select('table.stats_table')[0].find_all('a', href=re.compile('squads'))
        team_stats_of_this_season_links = [f"{scheme}://{netloc}{link.get('href')}" for link in table if 'href' in link.attrs]

        # Extract the stat tables of this season
        for link in team_stats_of_this_season_links:
            try:
                pml = match_shooting_merger(link)
                team_stats_tables.append(pml)
            except Exception as e:
                if requests.get(link).status_code == 429:
                    time.sleep(int(requests.get(link).headers.get('Retry-After')))
                continue
            time.sleep(2)

        # Extract the absolute URL for the previous season
        desired_tag = bs.find('a', class_ = 'button2 prev')
        prevlink = scheme + '://' + netloc + desired_tag.get('href')
        #  re-assign the starting URL to it for the subsequent iterations through the years
        start_url = prevlink
        
    entire_table = pd.concat(team_stats_tables)
    entire_table.to_csv(f'Premier League Tables {start_season} - {end_season}.csv', index= False)
    print(f'Successful execution - table no. {len(team_stats_tables)}')
    return team_stats_tables

In [46]:
team_stats_tables = stats_scraper(2021, 2022)

Current season 2021 - 2022

Matches shape:(58, 19) from https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
Shootings shape:(58, 26) from https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
Matches shape:(63, 19) from https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats
Shootings shape:(63, 26) from https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats
Matches shape:(61, 19) from https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
Shootings shape:(61, 26) from https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
Matches shape:(54, 19) from https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
Shootings shape:(53, 26) from https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
Matches shape:(45, 19) from https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats
Shootings shape:(45, 26) from https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats
Matches shape:(49, 19) from http