In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import os
from scrape_utils import get_fbref_big5, get_tm_team_links, get_tm_team_squad, format_tm_market_value

# Scrape fbref big-5 league data

fbref has a separate page for each type of player statistics for the big-5 leagues. Scrape them in a loop and save each one as a parquet file. <br>
First, the current season:

In [None]:
# current season urls: one fbref big-5 player page per table type
urls_current = ['https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/passing_types/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/playingtime/players/Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats']
# output file stems, listed in the same order as urls_current
out_current = ['stats-20', 'shooting-20', 'passing-20', 'passing_types-20',
               'defence-20', 'gca-20', 'possession-20', 'playingtime-20', 'misc-20']
# fail before any request is made if the two lists have drifted apart
assert len(urls_current) == len(out_current), 'urls_current and out_current must have the same length'
# create the output folder if needed; to_parquet does not create missing directories
fbref_dir = os.path.join('data', 'raw', 'fbref')
os.makedirs(fbref_dir, exist_ok=True)
# scrape and save the files
for url, out_name in zip(urls_current, out_current):
    df = get_fbref_big5(url)
    df.to_parquet(os.path.join(fbref_dir, f'fbref_{out_name}.parquet'))
    # wait a minute after every page (presumably to stay within fbref's rate limits);
    # kept after the last page too, so the next scraping cell is also spaced out on Run All
    time.sleep(60)

Next, the previous seasons (2019-2020 and 2018-2019):

In [None]:
# previous season urls: the 2019-2020 pages are listed explicitly, one per table type
urls_history = ['https://fbref.com/en/comps/Big5/2019-2020/stats/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/shooting/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/passing/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/passing_types/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/defense/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/gca/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/possession/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/playingtime/players/2019-2020-Big-5-European-Leagues-Stats',
                'https://fbref.com/en/comps/Big5/2019-2020/misc/players/2019-2020-Big-5-European-Leagues-Stats']
# the 2018-2019 pages follow the same pattern, so derive them from the 2019-2020 urls
urls_history.extend([url.replace('2019-2020', '2018-2019') for url in urls_history])
# output file stems, in the same order as urls_history ('-19' first, then the derived '-18' names)
out_history = ['stats-19', 'shooting-19', 'passing-19', 'passing_types-19',
               'defence-19', 'gca-19', 'possession-19', 'playingtime-19', 'misc-19']
out_history.extend([o.replace('19', '18') for o in out_history])
# fail before any request is made if the two lists have drifted apart
assert len(urls_history) == len(out_history), 'urls_history and out_history must have the same length'
# create the output folder if needed; to_parquet does not create missing directories
fbref_dir = os.path.join('data', 'raw', 'fbref')
os.makedirs(fbref_dir, exist_ok=True)
# scrape and save the files
for url, out_name in zip(urls_history, out_history):
    df = get_fbref_big5(url)
    df.to_parquet(os.path.join(fbref_dir, f'fbref_{out_name}.parquet'))
    # wait a minute after every page (presumably to stay within fbref's rate limits)
    time.sleep(60)

# Scrape transfermarkt data for the big-5 leagues

Get the links for each football team in the big-5 leagues for the season selected by `saison_id` in the URLs below (currently 2021)

In [None]:
# transfermarkt overview page of each big-5 league (season chosen via saison_id)
league_urls = [
    'https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=2021',
    'https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1/plus/?saison_id=2021',
    'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2021',
    'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=2021',
    'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1/plus/?saison_id=2021',
]

# gather the team links league by league, pausing a random 2-5 seconds after each request
team_links = []
for url in league_urls:
    team_links += get_tm_team_links(url)
    pause_seconds = np.random.uniform(low=2, high=5)
    time.sleep(pause_seconds)

print('Number of teams:', len(team_links))

Replace the compact (`startseite`) links with the detailed version (`kader`), and append `/plus/1` to remove some of the extra tables

In [None]:
# switch every team link from the compact page (startseite) to the detailed squad page (kader)
team_links = [link.replace('startseite', 'kader') for link in team_links]
# only the season already present in the links is scraped; no pages for other seasons are added
# append /plus/1 only when it is missing, so re-running this cell cannot produce
# '.../plus/1/plus/1' (the next cell reads the year from the url by position)
detail_suffix = '/plus/1'
team_links = [link if link.endswith(detail_suffix) else f'{link}{detail_suffix}'
              for link in team_links]

Get a dataframe with all the player data for the big-5 leagues

In [None]:
# assumes every link ends in '<4-digit year>/plus/1', so the year is the four
# characters directly in front of that suffix (i.e. url[-11:-7])
suffix_len = len('/plus/1')
year_len = 4

team_dfs = []
for url in team_links:
    squad = get_tm_team_squad(url)
    # tag each squad with the season year taken from its url
    squad['year'] = url[-(suffix_len + year_len):-suffix_len]
    team_dfs.append(squad)
    # random 2-5 second pause after each team page
    time.sleep(np.random.uniform(low=2, high=5))

# stack all squads into one frame (the original per-team index is kept here)
df_all_players = pd.concat(team_dfs)

Format the player dataframe and save it as a parquet file

In [None]:
# concat kept each team's own index, so replace it with one continuous index
df_all_players = df_all_players.reset_index(drop=True)
# shirt numbers that are not numeric become NaN
df_all_players['number'] = pd.to_numeric(df_all_players['number'], errors='coerce')
df_all_players = format_tm_market_value(df_all_players)
# format date columns (unparseable values become NaT)
# dob_age: only the text in front of '(' is parsed as the date of birth
df_all_players['dob'] = pd.to_datetime(df_all_players['dob_age'].str.split(pat='(').str[0], errors='coerce')
df_all_players['joined'] = pd.to_datetime(df_all_players['joined'], errors='coerce')
df_all_players['contract_expires'] = pd.to_datetime(df_all_players['contract_expires'], errors='coerce')
df_all_players = df_all_players.drop(columns=['dob_age'])
# height: strip the 'm' and turn the decimal comma into a point; regex=False keeps the
# replacements literal regardless of the pandas version's default
height_text = (df_all_players['height']
               .str.replace('m', '', regex=False)
               .str.replace(',', '.', regex=False))
# coerce like the other columns, so one missing or malformed height does not abort the cell
df_all_players['height'] = pd.to_numeric(height_text, errors='coerce')
# create the output folder if needed; to_parquet does not create missing directories
tm_dir = os.path.join('data', 'raw', 'transfermarkt')
os.makedirs(tm_dir, exist_ok=True)
df_all_players.to_parquet(os.path.join(tm_dir, 'players_transfermarkt.parquet'))