In [1]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

In [2]:
all_matches = []

In [3]:
year_urls= {
    '2022-2023': 'https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats',
    '2021-2022':'https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats',
    '2020-2021': 'https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats',
    '2019-2020':'https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats',
    '2018-2019':'https://fbref.com/en/comps/9/2018-2019/2018-2019-Premier-League-Stats',
    '2017-2018':'https://fbref.com/en/comps/9/2017-2018/2017-2018-Premier-League-Stats'
    }

In [4]:
for year, standing_url in year_urls.items():
    print(f"Year: {year}, URL: {standing_url}")

Year: 2022-2023, URL: https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats
Year: 2021-2022, URL: https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats
Year: 2020-2021, URL: https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats
Year: 2019-2020, URL: https://fbref.com/en/comps/9/2019-2020/2019-2020-Premier-League-Stats
Year: 2018-2019, URL: https://fbref.com/en/comps/9/2018-2019/2018-2019-Premier-League-Stats
Year: 2017-2018, URL: https://fbref.com/en/comps/9/2017-2018/2017-2018-Premier-League-Stats


In [5]:
print('Scraping for all years now...')
year_count = 0

for year, standing_url in year_urls.items():
    print('scraping data from', year)
    year_count += 1
    try:
        data = requests.get(standing_url)
        soup = BeautifulSoup(data.text)
        standings_table = soup.select('table.stats_table')[0]
        links = [l.get('href') for l in standings_table.find_all('a')]
        links = [l for l in links if '/squads/' in l]
        team_urls = [f'https://fbref.com{l}' for l in links]

        for team_url in team_urls:
            team_name = team_url.split('/')[-1].replace('-Stats','').replace('-',' ')

            team_url_data = requests.get(team_url)
            matches = pd.read_html(team_url_data.text, match="Scores & Fixtures")

            new_soup= BeautifulSoup(team_url_data.text)
            a_links = [l.get('href') for l in new_soup.find_all('a')]
            a_links_2 = [l for l in a_links if l and 'all_comps/shooting/' in l]


            shooting_stats_html = requests.get(f'https://fbref.com{a_links_2[0]}')
            shooting_stats = pd.read_html(shooting_stats_html.text, match ='Shooting')[0]
            shooting_stats.columns = shooting_stats.columns.droplevel()
            try:
                team_data = matches[0].merge(shooting_stats[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
            except ValueError:
                continue

            team_data = team_data[team_data["Comp"] == "Premier League"]
            team_data["Season"] = year
            team_data["Team"] = team_name
            all_matches.append(team_data)
            time.sleep(1) # so that we don't get blocked from scraping the site
    except ValueError:
        print('ERROR while scraping data from year:', year)
        print (ValueError)


print('SCRAPING DONE!')
print(f'Scraped epl data for {year_count} years')


Scraping for all years now...
scraping data from 2022-2023
scraping data from 2021-2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Season"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Team"] = team_name


scraping data from 2020-2021
scraping data from 2019-2020
scraping data from 2018-2019
scraping data from 2017-2018


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Season"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data["Team"] = team_name


SCRAPING DONE!
Scraped epl data for 6 years


In [6]:
match_df = pd.concat(all_matches)
match_df.columns = [c.lower() for c in match_df.columns]
print(match_df)

          date   time            comp         round  day venue result gf ga  \
1   2022-08-07  16:30  Premier League   Matchweek 1  Sun  Away      W  2  0   
2   2022-08-13  15:00  Premier League   Matchweek 2  Sat  Home      W  4  0   
3   2022-08-21  16:30  Premier League   Matchweek 3  Sun  Away      D  3  3   
4   2022-08-27  15:00  Premier League   Matchweek 4  Sat  Home      W  4  2   
5   2022-08-31  19:30  Premier League   Matchweek 5  Wed  Home      W  6  0   
..         ...    ...             ...           ...  ...   ...    ... .. ..   
38  2018-04-15  16:00  Premier League  Matchweek 34  Sun  Away      W  1  0   
39  2018-04-21  12:30  Premier League  Matchweek 35  Sat  Home      D  2  2   
40  2018-04-28  15:00  Premier League  Matchweek 36  Sat  Away      W  1  0   
41  2018-05-05  15:00  Premier League  Matchweek 37  Sat  Home      W  1  0   
42  2018-05-13  15:00  Premier League  Matchweek 38  Sun  Away      L  0  2   

           opponent  ...  match report  notes    sh

In [7]:
match_df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

In [8]:
rows, columns = match_df.shape

In [9]:
rows

4560

In [10]:
columns

27

In [11]:
# saving to csv
match_df.to_csv('matches_6_years.csv')