# Scrapping Premier League
In this initial project, we're going to use web scraping to get the necessary data on the EPL match results and then load them into pandas as a cleaned table ready for further machine learning.

In [145]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
import numpy as np
from itertools import compress

In [65]:
url = 'https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats'

In [66]:
from urllib.parse import urlparse
parsed = urlparse(url)
scheme = parsed.scheme
netloc = parsed.netloc

In [67]:
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')
table = bs.select('table.stats_table')[0].find_all('a', href= re.compile('squads'))
links = []
for link in table:
    if 'href' in link.attrs:
        link = scheme + '://' + netloc + link.get('href')
        links.append(link)
        print(link)

https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats
https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats
https://fbref.com/en/squads/19538871/2021-2022/Manchester-United-Stats
https://fbref.com/en/squads/7c21e445/2021-2022/West-Ham-United-Stats
https://fbref.com/en/squads/a2d435b3/2021-2022/Leicester-City-Stats
https://fbref.com/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats
https://fbref.com/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats
https://fbref.com/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats
https://fbref.com/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats
https://fbref.com/en/squads/cd051869/2021-2022/Brentford-Stats
https://fbref.com/en/squads/8602292d/2021-2022/Aston-Villa-Stats
https://fbref.com/en/squads/33c895d4/2021-2022

In [68]:
links = []
table_tbody_rows = bs.find('table', {'id': "results2021-202291_overall"}).find('tbody').find_all('tr')
for row in table_tbody_rows:
    link_tag = row.find('td', {'data-stat': 'team'}).a
    if 'href' in link_tag.attrs:
        link = link_tag.get('href')
        links.append(scheme + '://' + netloc + link)
        print(link)

/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
/en/squads/822bd0ba/2021-2022/Liverpool-Stats
/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
/en/squads/18bb7c10/2021-2022/Arsenal-Stats
/en/squads/19538871/2021-2022/Manchester-United-Stats
/en/squads/7c21e445/2021-2022/West-Ham-United-Stats
/en/squads/a2d435b3/2021-2022/Leicester-City-Stats
/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats
/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats
/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats
/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats
/en/squads/cd051869/2021-2022/Brentford-Stats
/en/squads/8602292d/2021-2022/Aston-Villa-Stats
/en/squads/33c895d4/2021-2022/Southampton-Stats
/en/squads/d3fd31cc/2021-2022/Everton-Stats
/en/squads/5bfb9659/2021-2022/Leeds-United-Stats
/en/squads/943e8050/2021-2022/Burnley-Stats
/en/squads/2abfe087/2021-2022/Watford-Stats
/en/squads/1c781004/2021-2022/Norwich-City-Stats


In [127]:
#  the table containing scores and fixtures for all the matches of Manchester City
mancurl = links[0]
req = requests.get(mancurl)
bs = BeautifulSoup(req.text, 'html.parser')
matches = pd.read_html(req.text, match="Scores & Fixtures")[0]
matches.head()


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,,,57,,Fernandinho,4-3-3,Paul Tierney,Match Report,
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.8,1.0,65,58262.0,Fernandinho,4-3-3,Anthony Taylor,Match Report,
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.6,0.1,67,51437.0,İlkay Gündoğan,4-3-3,Graham Scott,Match Report,
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,4.4,0.2,80,52276.0,İlkay Gündoğan,4-3-3,Martin Atkinson,Match Report,
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.8,0.6,61,32087.0,İlkay Gündoğan,4-3-3,Paul Tierney,Match Report,


In [70]:
#  the shooting stats for the Manchester City - the number of shots, the number of shots on target, the number of free kicks, and the number of penalty kick
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

In [130]:
internallinksmancity = getInternalLinks(bs, mancurl)
shooting_links = []
for internallink in internallinksmancity:
    if 'shooting' in internallink and 'All' in internallink:
        shooting_links.append(internallink)
        print(internallink)

https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions


In [72]:
req = requests.get(shooting_links[0])
shootings = pd.read_html(req.text, match = re.compile(r"Shooting"))[0][:-1]
shootings.columns = shootings.columns.droplevel(0)
print(f'Matches shape:{matches.shape}')
print(f'Shootings shape:{shootings.shape}')

Matches shape:(58, 19)
Shootings shape:(58, 26)


In [73]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          58 non-null     object 
 1   Time          58 non-null     object 
 2   Comp          58 non-null     object 
 3   Round         58 non-null     object 
 4   Day           58 non-null     object 
 5   Venue         58 non-null     object 
 6   Result        58 non-null     object 
 7   GF            58 non-null     object 
 8   GA            58 non-null     object 
 9   Opponent      58 non-null     object 
 10  xG            50 non-null     float64
 11  xGA           50 non-null     float64
 12  Poss          58 non-null     int64  
 13  Attendance    56 non-null     float64
 14  Captain       58 non-null     object 
 15  Formation     58 non-null     object 
 16  Referee       58 non-null     object 
 17  Match Report  58 non-null     object 
 18  Notes         7 non-null      ob

In [74]:
shootings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          58 non-null     object 
 1   Time          58 non-null     object 
 2   Comp          58 non-null     object 
 3   Round         58 non-null     object 
 4   Day           58 non-null     object 
 5   Venue         58 non-null     object 
 6   Result        58 non-null     object 
 7   GF            58 non-null     object 
 8   GA            58 non-null     object 
 9   Opponent      58 non-null     object 
 10  Gls           58 non-null     int64  
 11  Sh            58 non-null     int64  
 12  SoT           58 non-null     int64  
 13  SoT%          58 non-null     float64
 14  G/Sh          58 non-null     float64
 15  G/SoT         58 non-null     float64
 16  Dist          50 non-null     float64
 17  FK            50 non-null     float64
 18  PK            58 non-null     in

In [75]:
# Concatenate to get the final table
target_columns = ['Gls', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']
new_names_target_columns = ['Goals', 'Shots', 'Shots_on_target', 'Distance', 'Freekicks', 'Penalties','Attempted_penalties']
rename_map = dict(zip(target_columns, new_names_target_columns))

In [76]:

sub_shootings = shootings.loc[:, target_columns]
sub_shootings.rename(columns= rename_map)
mancity_match_shootings = pd.concat([matches, sub_shootings], axis=1)
mancity_match_shootings.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Referee,Match Report,Notes,Gls,Sh,SoT,Dist,FK,PK,PKatt
0,2021-08-07,17:15,Community Shield,FA Community Shield,Sat,Neutral,L,0,1,Leicester City,...,Paul Tierney,Match Report,,0,12,3,,,0,0
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Anthony Taylor,Match Report,,0,18,4,17.3,1.0,0,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Graham Scott,Match Report,,4,16,4,18.5,1.0,0,0
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Martin Atkinson,Match Report,,5,25,10,14.8,0.0,0,0
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Paul Tierney,Match Report,,1,25,8,14.3,0.0,0,0


In [131]:
url = 'https://fbref.com/en/comps/9/history/Premier-League-Seasons'
req = requests.get(url)
bs = BeautifulSoup(req.text, 'html.parser')
year_table = bs.select('table.stats_table')
selected_year_tables = year_table[0].find_all('a', string = re.compile(r'200\d|201\d|2020'))
selected_links = [link_tag.get('href') for link_tag in selected_year_tables]
selected_links = [scheme + '://' + netloc + link if link.startswith('/') else link for link in selected_links]


In [168]:

def link_selector(latest_season):
    url = 'https://fbref.com/en/comps/9/history/Premier-League-Seasons'
    req = requests.get(url)
    bs = BeautifulSoup(req.text, 'html.parser')
    year_table = bs.select('table.stats_table')
    selected_year_tables = year_table[0].find_all('a', string = re.compile(r'19\d\d|200\d|201\d|202[123]'))
    selected_links = [link_tag.get('href') for link_tag in selected_year_tables]
    selected_links = [scheme + '://' + netloc + link if link.startswith('/') else link for link in selected_links]

    mask = [str(latest_season) in link for link in selected_links]
    starting_url = list(compress(selected_links, mask))
    return starting_url[-1]

In [167]:
starting_year = link_selector(2020)
starting_year

'https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats'

In [79]:
def safeget(url):
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs


In [80]:
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re

def team_stats_of_a_season_links_extractor(url):
    parsed = urlparse(url)
    scheme = parsed.scheme
    netloc = parsed.netloc
    bs = safeget(url)
    table = bs.select('table.stats_table')[0].find_all('a', href=re.compile('squads'))
    links = [f"{scheme}://{netloc}{link.get('href')}" for link in table if 'href' in link.attrs]
    
    for link in links:
        print(link)
    
    return links


In [81]:
def scores_extractor(url): # input a url of a team of a year/season, such as an item in links from above
    req = requests.get(url)
    bs = BeautifulSoup(req.text, 'html.parser')
    matches = pd.read_html(req.text, match="Scores & Fixtures")[0]
    matches.head()

In [82]:
def prevyear_url_extractor(url):
    parsed = urlparse(url)
    scheme = parsed.scheme
    netloc = parsed.netloc
    bs = safeget(url)
    desired_tag = bs.find('a', class_ = 'button2 prev')
    prevlink = scheme + '://' + netloc + desired_tag.get('href')
    return prevlink

In [83]:
prevyear_url_extractor('https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats')

'https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats'

In [84]:
# Step 1: Import time
import time

# Step 2: Set a list of years
start_season = 2019
end_season = 2022
seasons = list(range(end_season, start_season, -1))  # This will create a list from 2022 to 2010 in reverse order.

# Step 3: Initiate an empty list
dataframes = []

In [85]:
# Step 4: define the starting url of year 2022
start_url = 'https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats'
# Step 5
for season in seasons:
    print(f'Current season {season-1} - {season} \n')
    # Extract the absolute URLs for the teams from the table
    team_stats_of_this_season_links = team_stats_of_a_season_links_extractor(start_url)
    # Extract the absolute URL for the previous season
    prevlink = prevyear_url_extractor(start_url)
    #  re-assign the starting URL to it for the subsequent iterations through the years
    start_url = prevlink
    time.sleep(1)
# The links need to be stored somewhere

Current season 2021 - 2022 

https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats
https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats
https://fbref.com/en/squads/19538871/2021-2022/Manchester-United-Stats
https://fbref.com/en/squads/7c21e445/2021-2022/West-Ham-United-Stats
https://fbref.com/en/squads/a2d435b3/2021-2022/Leicester-City-Stats
https://fbref.com/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats
https://fbref.com/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats
https://fbref.com/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats
https://fbref.com/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats
https://fbref.com/en/squads/cd051869/2021-2022/Brentford-Stats
https://fbref.com/en/squads/8602292d/2021-2022/Aston-Villa-Stats
https://fbref.com

In [86]:
# Step 5.1 For each website above, extract the name
def name_extractor(url):
    team_name_pattern = re.compile(r'\d{4}-\d{4}/([A-Za-z-]+)-Stats')
    season_pattern = re.compile(r'(\d{4}-\d{4})/(?:[A-Za-z-]+)-Stats')
    season_in_case = re.findall(season_pattern, url)[0]
    team_name = re.findall(team_name_pattern, url)[0].replace('-', ' ')
    return season_in_case, team_name

In [105]:
# Step 5.2 For each website above, extract the Scores & Fixtures table
def scores_fixtures_extractor(url):
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    
    try:
        tables = pd.read_html(req.text, match="Scores & Fixtures")
        if tables:
            matches = tables[0]
            print(f'Matches shape:{matches.shape}')
        else:
            print('No S&F table')
    except Exception as e:
        print(f'Error: {e}')
        return []
    return matches


In [103]:
# Step 5.3 For each website above...
def shootings_extractor(url):
    bs = safeget(url)
    internallinks = getInternalLinks(bs, url)
    shooting_links = []
    for internallink in internallinks:
        if 'shooting' in internallink and 'All' in internallink:
            shooting_links.append(internallink)

    try:
        req = requests.get(shooting_links[0]) # The first one is for all competitions
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []
    

    try:
        tables = pd.read_html(req.text, match = re.compile(r"Shooting"))
    except Exception as e:
        print(f'Error: {e}')
        return []

    if tables:
        shootings = tables[0][:-1] # exclude total row
        shootings.columns = shootings.columns.droplevel(0)
        print(f'Shootings shape:{shootings.shape}')
    return shootings

In [124]:
# 5.4 It takes a url, gets the matches and shootings
def match_shooting_merger(url):
    matches = scores_fixtures_extractor(url)
    shootings = shootings_extractor(url)
    season_in_case, team_name = name_extractor(url)
    
    if matches is not [] and shootings is not []:
        if matches.shape[0] == shootings.shape[0]:
            target_columns = ['Gls', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']
            new_names_target_columns = ['Goals', 'Shots', 'Shots_on_target', 'Distance', 'Freekicks', 'Penalties', 'Attempted_penalties']
            rename_map = dict(zip(target_columns, new_names_target_columns))

            for target_column in target_columns:
                if target_column in shootings.columns:
                    matches[target_column] = shootings[target_column]
                else:
                    matches[target_column] = np.nan

            match_shootings = matches.rename(columns=rename_map, inplace=False)

            # make it Premier League only
            match_shootings['ispml'] = match_shootings['Comp'].str.contains('[Pp]remier\s+[Ll]eague', na=False, regex=True) # Fuzzy match
            pml = match_shootings[match_shootings['ispml'] == True]
            new_names_target_columns.insert(0, 'Date')
            pml = pml[new_names_target_columns] # Selecting only date and the rest of the stats
            pml['Season'] = season_in_case
            pml['Team'] = team_name
            return pml

        else:
            print(f'Table length unequal for {team_name} in {season_in_case}')
            return []

    else:
        print(f'Either table missing for {team_name} in {season_in_case}')
        return []


In [169]:
pm = match_shooting_merger(team_stats_of_this_season_links[4])
pm

Matches shape:(48, 19)
Shootings shape:(48, 26)


Unnamed: 0,Date,Goals,Shots,Shots_on_target,Distance,Freekicks,Penalties,Attempted_penalties,Season,Team
0,2019-08-11,0,16,1,23.9,0.0,0,0,2019-2020,Leicester City
1,2019-08-18,1,12,3,20.8,0.0,0,0,2019-2020,Leicester City
2,2019-08-24,2,10,2,19.1,2.0,0,0,2019-2020,Leicester City
4,2019-08-31,3,15,6,20.5,0.0,0,0,2019-2020,Leicester City
5,2019-09-14,0,9,3,24.7,1.0,0,0,2019-2020,Leicester City
6,2019-09-21,2,16,7,18.0,0.0,0,0,2019-2020,Leicester City
8,2019-09-29,4,13,5,15.4,0.0,0,0,2019-2020,Leicester City
9,2019-10-05,1,2,1,19.5,0.0,0,0,2019-2020,Leicester City
10,2019-10-19,2,19,3,19.0,1.0,0,0,2019-2020,Leicester City
11,2019-10-25,9,24,14,17.5,1.0,1,1,2019-2020,Leicester City


In [170]:
# Scraping the tables through the list of URLs!
start_season = 2020
end_season = 2022
seasons = list(range(end_season, start_season, -1))  # This will create a list from 2022 to 2010 in reverse order.
start_url = link_selector(end_season)
team_stats_tables = []
for season in seasons:
    print(f'Current season {season-1} - {season} \n')
    # Extract the absolute URLs for the teams from the table
    team_stats_of_this_season_links = team_stats_of_a_season_links_extractor(start_url)
    # Extract the stat tables of this season
    for link in team_stats_of_this_season_links:
        pml = match_shooting_merger(link)
        team_stats_tables.append(pml)
        time.sleep(1)

    # Extract the absolute URL for the previous season
    prevlink = prevyear_url_extractor(start_url)
    #  re-assign the starting URL to it for the subsequent iterations through the years
    start_url = prevlink
    time.sleep(1)
print(f'Successful execution - table no. {len(team_stats_tables)}')

Current season 2021 - 2022 

https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats
https://fbref.com/en/squads/822bd0ba/2021-2022/Liverpool-Stats
https://fbref.com/en/squads/cff3d9bb/2021-2022/Chelsea-Stats
https://fbref.com/en/squads/361ca564/2021-2022/Tottenham-Hotspur-Stats
https://fbref.com/en/squads/18bb7c10/2021-2022/Arsenal-Stats
https://fbref.com/en/squads/19538871/2021-2022/Manchester-United-Stats
https://fbref.com/en/squads/7c21e445/2021-2022/West-Ham-United-Stats
https://fbref.com/en/squads/a2d435b3/2021-2022/Leicester-City-Stats
https://fbref.com/en/squads/d07537b9/2021-2022/Brighton-and-Hove-Albion-Stats
https://fbref.com/en/squads/8cec06e1/2021-2022/Wolverhampton-Wanderers-Stats
https://fbref.com/en/squads/b2b47a98/2021-2022/Newcastle-United-Stats
https://fbref.com/en/squads/47c64c55/2021-2022/Crystal-Palace-Stats
https://fbref.com/en/squads/cd051869/2021-2022/Brentford-Stats
https://fbref.com/en/squads/8602292d/2021-2022/Aston-Villa-Stats
https://fbref.com

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Step 5.1 For each website above, extract the name
url = 'https://fbref.com/en/squads/b8fd03ef/2021-2022/Manchester-City-Stats'
bs = safeget(url)
script_tag = bs.find_all('script',{'type': "application/ld+json"})[0]
if script_tag:
    json_data = json.loads(script_tag.text)
    name = json_data['name']

In [None]:
url = 'https://fbref.com/en/squads/cff3d9bb/2020-2021/Chelsea-Stats'
team_name_pattern = re.compile(r'\d{4}-\d{4}/([A-Za-z-]+)-Stats')
urlname = re.findall(team_name_pattern, url)[0].replace('-', ' ')
urlname

'Chelsea'