In [72]:
from io import StringIO

import requests

In [73]:
# Season overview page for La Liga 2022-23; the scrape starts from this URL.
stats_url = "https://fbref.com/en/comps/12/2022-2023/2022-2023-La-Liga-Stats"

In [74]:
# Download the season overview page.
# - timeout: keeps the cell from hanging indefinitely if the server never answers.
# - raise_for_status(): turns an HTTP error reply (e.g. being blocked or rate limited) into an
#   immediate, descriptive failure instead of a confusing parsing error in a later cell.
data = requests.get(stats_url, timeout=30)
data.raise_for_status()

In [75]:
# Start the scrape: parse the downloaded page into a navigable tree.
from bs4 import BeautifulSoup

# Name the parser explicitly so the result does not depend on which optional parser libraries
# happen to be installed (and so bs4 does not emit its "no parser was explicitly specified" warning).
soup = BeautifulSoup(data.text, "html.parser")

In [76]:
# Get team links:
standings = soup.select('table.stats_table')[0]  # select only the standings table
anchors = standings.find_all('a')  # every <a> tag inside the standings table
links = [a.get("href") for a in anchors]  # href of each anchor (None when the tag has no href)
# Keep only squad pages. The truthiness check skips anchors without an href, which would
# otherwise raise "TypeError: argument of type 'NoneType' is not iterable" on the `in` test.
links = [href for href in links if href and '/squads/' in href]

In [77]:
# Turn each site-relative squad path into an absolute URL by prefixing the host.
squad_urls = ["https://fbref.com" + relative_path for relative_path in links]

In [78]:
# First team: download the squad page of the first team in the standings.
# The timeout prevents an indefinite hang; raise_for_status() fails immediately on an HTTP
# error reply rather than letting an error page reach the table parser in the next cell.
data = requests.get(squad_urls[0], timeout=30)
data.raise_for_status()

In [79]:
# Games data frame:
import pandas as pd

# Scan the page for the "Scores & Fixtures" table and take the first match.
# The HTML is wrapped in StringIO because passing a literal HTML string to read_html is
# deprecated in recent pandas releases; a file-like object works on old and new versions alike.
games_data = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [80]:
# Shooting data frame: find the link to the team's shooting stats page.
# The parser is named explicitly so parsing does not depend on which optional parser
# libraries are installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]  # href of every link on the page (None if absent)
links = [href for href in links if href and 'all_comps/shooting/' in href]  # keep only shooting stats links

In [81]:
# Download the shooting stats page via its absolute URL (index 0: the link appears more than once on the page).
# The timeout prevents an indefinite hang; raise_for_status() fails immediately on an HTTP error reply.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [82]:
# Parse the shooting stats table (first table matching "Shooting").
# StringIO wrapper: passing a literal HTML string to read_html is deprecated in recent pandas.
shooting_data = pd.read_html(StringIO(data.text), match="Shooting")[0]

In [83]:
# Clean data: drop the outer level of the two-level column header.
# Guarded so that re-running this cell once the header is already flat is a no-op;
# calling droplevel() on a single-level index raises ValueError.
if shooting_data.columns.nlevels > 1:
    shooting_data.columns = shooting_data.columns.droplevel()

In [84]:
# Join the match log with the selected shooting columns, pairing rows by their "Date" value.
# Only dates present in both tables survive (default inner join).
team_data = pd.merge(
    games_data,
    shooting_data.loc[:, ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

In [85]:
# Sanity-check the merge by showing the first five rows.
team_data.head(n=5)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2022-08-13,21:00,La Liga,Matchweek 1,Sat,Home,D,0,0,Rayo Vallecano,...,4-3-3,Alejandro Hernández,Match Report,,21,5,17.0,1.0,0,0
1,2022-08-21,22:00,La Liga,Matchweek 2,Sun,Away,W,4,1,Real Sociedad,...,3-2-4-1,José Luis Munuera,Match Report,,15,7,14.6,0.0,0,0
2,2022-08-28,19:30,La Liga,Matchweek 3,Sun,Home,W,4,0,Valladolid,...,4-3-3,Ricardo de Burgos,Match Report,,24,9,14.4,1.0,0,0
3,2022-09-03,21:00,La Liga,Matchweek 4,Sat,Away,W,3,0,Sevilla,...,4-3-3,Antonio Matéu Lahoz,Match Report,,18,5,16.0,2.0,0,0
4,2022-09-07,21:00,Champions Lg,Group stage,Wed,Home,W,5,1,cz Viktoria Plzeň,...,4-3-3,Lawrence Visser,Match Report,,20,10,16.2,0.0,0,0


In [86]:
# Season labels to scrape, newest first: 2023 (2022-23) and 2022 (2021-22).
years = [*range(2023, 2021, -1)]
# Accumulator for the per-team match-log data frames.
all_games = list()

In [87]:
# Starting URL for the multi-season loop below, which reassigns stats_url to the
# previous season's page as it walks backwards through the seasons.
stats_url = "https://fbref.com/en/comps/12/2022-2023/2022-2023-La-Liga-Stats"

In [88]:
import time

# Columns copied from the shooting table onto each team's match log ("Date" is the join key).
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch(url, delay=5, timeout=30):
    """Download ``url``, fail loudly on HTTP errors, then pause before returning.

    Parameters:
        url: absolute URL to request.
        delay: seconds to sleep after the request. Sleeping after *every* request (rather than
            once per team) spaces out the two requests made per team and also throttles teams
            that end up being skipped.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` for ``url``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an error page is
            reported here instead of surfacing later as a table-parsing failure.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    time.sleep(delay)
    return response


# Scale for all teams for multiple years (seasons).
# NOTE: this cell appends to all_games and advances stats_url; re-run the two cells that
# initialise them before re-running this one, otherwise rows are duplicated.
for year in years:
    season_page = _fetch(stats_url)
    # Explicit parser: the result must not depend on which optional parser libraries are installed.
    season_soup = BeautifulSoup(season_page.text, "html.parser")

    # Squad links come from the first stats table on the page (the standings).
    standings = season_soup.select('table.stats_table')[0]
    hrefs = [a.get("href") for a in standings.find_all('a')]
    # `href and ...` skips anchors without an href (None), which would make the `in` test raise TypeError.
    squad_urls = [f"https://fbref.com{href}" for href in hrefs if href and '/squads/' in href]

    # Link to the previous season; select_one returns None (instead of raising) when it is absent.
    prev_link = season_soup.select_one("a.prev")

    # Scrape the match log of each team in this season.
    for team_url in squad_urls:
        # ".../Real-Madrid-Stats" -> "Real Madrid"
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        # Scores & fixtures table. StringIO: literal HTML strings are deprecated in read_html.
        team_page = _fetch(team_url)
        games_data = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

        # Locate the shooting stats page linked from the squad page.
        team_soup = BeautifulSoup(team_page.text, "html.parser")
        shooting_hrefs = [a.get("href") for a in team_soup.find_all('a')]
        shooting_hrefs = [href for href in shooting_hrefs if href and 'all_comps/shooting/' in href]
        if not shooting_hrefs:
            print(f"Skipping {team_name} ({year}): no shooting stats link found")
            continue
        shooting_page = _fetch(f"https://fbref.com{shooting_hrefs[0]}")

        # Skip (but report) teams whose shooting stats are missing or unusable:
        # ValueError covers "no matching table" and merge failures, KeyError covers absent columns.
        try:
            shooting_data = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
            if shooting_data.columns.nlevels > 1:
                # drop the outer level of the two-level header
                shooting_data.columns = shooting_data.columns.droplevel()
            team_data = games_data.merge(shooting_data[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError) as err:
            print(f"Skipping {team_name} ({year}): shooting stats unavailable ({err!r})")
            continue

        # Keep only La Liga fixtures and tag the rows. assign() builds a new frame, which avoids
        # the SettingWithCopyWarning raised by setting columns on a filtered slice.
        team_data = team_data[team_data["Comp"] == "La Liga"].assign(Season=year, Team=team_name)
        all_games.append(team_data)

    # Move on to the previous season for the next pass; stop if the page offers none,
    # since re-reading the same page would label its rows with the wrong season.
    if prev_link is None:
        print(f"No previous-season link found on {stats_url}; stopping after {year}")
        break
    stats_url = f"https://fbref.com{prev_link.get('href')}"

In [89]:
# Stack every per-team frame vertically into one table; each frame keeps its own row labels.
main_data = pd.concat(all_games, axis=0, ignore_index=False)

In [90]:
# Normalise the column titles to lowercase so later lookups need not remember the original casing.
main_data.columns = list(map(str.lower, main_data.columns))

In [91]:
# Display the combined frame; pandas elides the middle rows and columns of a large frame in the rendered output.
main_data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-13,21:00,La Liga,Matchweek 1,Sat,Home,D,0,0,Rayo Vallecano,...,Match Report,,21.0,5.0,17.0,1.0,0,0,2023,Barcelona
1,2022-08-21,22:00,La Liga,Matchweek 2,Sun,Away,W,4,1,Real Sociedad,...,Match Report,,15.0,7.0,14.6,0.0,0,0,2023,Barcelona
2,2022-08-28,19:30,La Liga,Matchweek 3,Sun,Home,W,4,0,Valladolid,...,Match Report,,24.0,9.0,14.4,1.0,0,0,2023,Barcelona
3,2022-09-03,21:00,La Liga,Matchweek 4,Sat,Away,W,3,0,Sevilla,...,Match Report,,18.0,5.0,16.0,2.0,0,0,2023,Barcelona
5,2022-09-10,18:30,La Liga,Matchweek 5,Sat,Away,W,4,0,Cádiz,...,Match Report,,16.0,8.0,14.9,0.0,0,0,2023,Barcelona
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2022-04-30,14:00,La Liga,Matchweek 34,Sat,Home,W,2,1,Villarreal,...,Match Report,,11.0,4.0,14.8,0.0,0,0,2022,Alaves
36,2022-05-07,18:30,La Liga,Matchweek 35,Sat,Away,L,0,4,Celta Vigo,...,Match Report,,3.0,1.0,19.5,0.0,0,0,2022,Alaves
37,2022-05-11,19:00,La Liga,Matchweek 36,Wed,Home,W,2,1,Espanyol,...,Match Report,,27.0,9.0,16.9,2.0,0,0,2022,Alaves
38,2022-05-15,19:30,La Liga,Matchweek 37,Sun,Away,L,1,3,Levante,...,Match Report,,6.0,2.0,20.2,1.0,0,0,2022,Alaves


In [95]:
# Persist the combined match log to disk; the row index is written as the first column.
main_data.to_csv("games_stats.csv", index=True)