In [1]:
from io import StringIO

import requests

In [2]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
data = requests.get(standings_url)

In [4]:
data

<Response [200]>

## Scraping our first page with requests

In [5]:
from bs4 import BeautifulSoup

In [6]:
# Parse the standings page. The parser is named explicitly so the result does not
# depend on which optional parser libraries happen to be installed (and so
# BeautifulSoup does not warn about having to guess one).
soup = BeautifulSoup(data.text, "html.parser")

In [7]:
standings_table = soup.select('table.stats_table')[0]

## Parsing our first page with BeautifulSoup

In [8]:
links = standings_table.find_all('a') 

In [9]:
# Swap each anchor tag for the value of its href attribute.
links = [anchor.get("href") for anchor in links]

In [10]:
# Keep only the links that point at squad pages. An anchor without an href yields
# None from .get("href"), so check truthiness first: `'/squads/' in None` would
# raise a TypeError.
links = [href for href in links if href and '/squads/' in href]

In [11]:
links

['/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/a2d435b3/Leicester-City-Stats',
 '/en/squads/5bfb9659/Leeds-United-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/33c895d4/Southampton-Stats']

In [12]:
# Turn the site-relative squad paths into absolute fbref.com URLs.
team_urls = [f"https://fbref.com{squad_path}" for squad_path in links]

In [13]:
# Keep only the first team's URL so the per-team scrape can be prototyped on one page.
# NOTE(review): this rebinds `team_urls` from a list of URLs to a single string, so
# running this cell a second time would index into the string instead of the list.
team_urls = team_urls[0]

In [14]:
team_urls

'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats'

In [15]:
data = requests.get(team_urls)

## Extract match stats using pandas and requests

In [16]:
import pandas as pd
# read_html returns every table whose text matches `match`; the first hit is the
# team's "Scores & Fixtures" table. The HTML is wrapped in StringIO because newer
# pandas releases deprecate passing the markup as a literal string.
matches = pd.read_html(StringIO(data.text), match = "Scores & Fixtures")[0]

In [17]:
matches.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes'],
      dtype='object')

## Get match shooting stats with requests and pandas

In [18]:
# Parse the team page. The parser is named explicitly so the result does not depend
# on which optional parser libraries are installed (and so BeautifulSoup does not
# warn about having to guess one).
soup = BeautifulSoup(data.text, "html.parser")

In [19]:
links = soup.find_all('a')

In [20]:
# Reduce every anchor on the team page to its href value.
links = [anchor.get("href") for anchor in links]

In [21]:
# Keep the hrefs that point at the all-competitions shooting match log; anchors
# with no href (None) are dropped by the truthiness check.
links = [href for href in links if href and 'all_comps/shooting/' in href]

In [22]:
links

['/en/squads/18bb7c10/2022-2023/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2022-2023/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2022-2023/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2022-2023/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions']

In [23]:
data = requests.get(f"https://fbref.com{links[0]}")

In [24]:
data

<Response [200]>

In [25]:
# First table on the match-log page whose text matches "Shooting". The HTML is
# wrapped in StringIO because newer pandas releases deprecate passing the markup
# as a literal string.
shooting = pd.read_html(StringIO(data.text), match = "Shooting")[0]

In [26]:
shooting.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,...,14.6,1.0,0,0,1.0,1.0,0.1,0.0,0.0,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,...,13.0,0.0,0,0,2.7,2.7,0.16,1.3,1.3,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,...,14.8,0.0,0,0,1.3,1.3,0.1,1.7,1.7,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,...,15.5,1.0,0,0,2.6,2.6,0.12,-0.6,-0.6,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,...,16.3,1.0,0,0,2.4,2.4,0.12,-0.4,-0.4,Match Report


In [27]:
# The shooting table arrives with a two-row header; drop the top (group) level so
# columns can be addressed by plain names such as "Sh" or "SoT". The guard makes
# the cell safe to re-run: droplevel() raises once the header is already flat.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()

In [28]:
shooting.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Gls', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK',
       'PK', 'PKatt', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
       'Match Report'],
      dtype='object')

## Cleaning and merging scraped data with pandas

In [29]:
# Attach the selected shooting columns to the fixtures table, matching rows on the
# match date. merge defaults to an inner join, so only dates present in both
# tables survive.
shooting_subset = shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]]
team_data = pd.merge(matches, shooting_subset, on = "Date")

In [30]:
team_data.shape

(26, 25)

In [31]:
shooting.shape

(27, 26)

In [32]:
matches.shape

(47, 19)

## Scraping data for multiple seasons and teams with for loops

In [33]:
# Seasons to scrape, newest first (2023, then 2022).
years = [*range(2023, 2021, -1)]
# Accumulates one DataFrame per team and season.
all_matches = list()

In [34]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"


In [35]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# which is not necessarily the one a shell `pip` resolves to.
%pip install html5lib

You should consider upgrading via the '/Users/mac/.pyenv/versions/3.10.3/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [36]:
import time

# Seconds to pause after every team so fbref.com is not flooded with requests.
REQUEST_DELAY_SECONDS = 10
# Shooting-log columns that get attached to every team's fixtures table.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# Walk the seasons newest-first. For each season: read the standings page, collect
# every squad URL, remember the "previous season" link for the next iteration, then
# scrape fixtures + shooting logs for each team and append the merged Premier
# League rows to `all_matches`.
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text, "html.parser")
    standings_tables = soup.select('table.stats_table')
    if len(standings_tables) == 0:
        # Without this page the previous-season link is unknown, and retrying the
        # same URL under the next `year` would label its rows with the wrong season.
        print(f"No standings table at {standings_url} (HTTP {data.status_code}); stopping")
        break
    standings_table = standings_tables[0]

    links = [a.get("href") for a in standings_table.find_all('a')]
    # Anchors without an href yield None; skip them instead of raising TypeError.
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Resolve the previous season now, before `soup` is reused for team pages.
    prev_links = soup.select("a.prev")
    has_previous_season = len(prev_links) > 0
    if has_previous_season:
        previous_season = prev_links[0].get("href")
        standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        print(team_url)

        # try/finally guarantees the pause also happens on every `continue` path,
        # so a run of failing pages cannot turn into a burst of rapid requests.
        try:
            data = requests.get(team_url)
            try:
                matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
            except ValueError:
                print(f"  no 'Scores & Fixtures' table (HTTP {data.status_code}); skipping")
                continue

            soup = BeautifulSoup(data.text, "html.parser")
            links = [a.get("href") for a in soup.find_all('a')]
            links = [href for href in links if href and 'all_comps/shooting/' in href]
            if not links:
                print("  no shooting match-log link on the team page; skipping")
                continue

            data = requests.get(f"https://fbref.com{links[0]}")
            try:
                shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
            except ValueError:
                print(f"  no 'Shooting' table at {links[0]} (HTTP {data.status_code}); skipping")
                continue

            # Flatten the two-row header so columns are addressable by plain name.
            shooting.columns = shooting.columns.droplevel()
            try:
                team_data = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
            except ValueError:
                print("  could not merge fixtures with shooting stats; skipping")
                continue

            # .copy() detaches the filtered rows so the column assignments below
            # write to an independent frame rather than a slice of the merge result.
            team_data = team_data[team_data["Comp"] == "Premier League"].copy()
            team_data["Season"] = year
            team_data["Team"] = team_name
            all_matches.append(team_data)
        finally:
            time.sleep(REQUEST_DELAY_SECONDS)

    if not has_previous_season:
        # No older season to move to; another pass would re-scrape this same page.
        print("No previous-season link found; stopping")
        break

https://fbref.com/en/squads/18bb7c10/Arsenal-Stats
https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats
https://fbref.com/en/squads/19538871/Manchester-United-Stats
https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats
https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats
https://fbref.com/en/squads/fd962109/Fulham-Stats
https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats
https://fbref.com/en/squads/cd051869/Brentford-Stats
https://fbref.com/en/squads/822bd0ba/Liverpool-Stats
https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats
https://fbref.com/en/squads/8602292d/Aston-Villa-Stats
https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats
https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats
https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats
https://fbref.com/en/squads/5bfb9659/Leeds-United-Stats
https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats
https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats
https://fbref.com/en/sq

In [37]:
len(all_matches)

40

In [38]:
match_df = pd.concat(all_matches)

In [39]:
# Normalise every column label to lower case.
match_df.columns = match_df.columns.map(str.lower)

In [40]:
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,Match Report,,10.0,2.0,14.6,1.0,0.0,0.0,2023,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,Match Report,,19.0,7.0,13.0,0.0,0.0,0.0,2023,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,Match Report,,14.0,6.0,14.8,0.0,0.0,0.0,2023,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,Match Report,,22.0,8.0,15.5,1.0,0.0,0.0,2023,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,Match Report,,22.0,8.0,16.3,1.0,0.0,0.0,2023,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,Aston Villa,...,Match Report,,9.0,3.0,21.6,0.0,0.0,0.0,2022,Norwich City
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,West Ham,...,Match Report,,8.0,2.0,22.2,1.0,0.0,0.0,2022,Norwich City
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,Leicester City,...,Match Report,,9.0,5.0,17.0,0.0,0.0,0.0,2022,Norwich City
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,Wolves,...,Match Report,,11.0,2.0,14.4,0.0,0.0,0.0,2022,Norwich City


In [41]:
match_df.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

In [42]:
match_df['team'].unique()

array(['Arsenal', 'Manchester City', 'Manchester United',
       'Newcastle United', 'Tottenham Hotspur', 'Fulham',
       'Brighton and Hove Albion', 'Brentford', 'Liverpool', 'Chelsea',
       'Aston Villa', 'Crystal Palace', 'Nottingham Forest',
       'Leicester City', 'Leeds United', 'Wolverhampton Wanderers',
       'Bournemouth', 'West Ham United', 'Everton', 'Southampton',
       'Burnley', 'Watford', 'Norwich City'], dtype=object)

In [44]:
match_df.to_csv("match_pl_20230121.csv")