In [1]:
import time
from io import StringIO
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Functions for getting the data

In [2]:
# Request headers sent with every scraping call: a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
}

def get_links(url, headers, timeout=30):
    """Get the absolute link of each team page referenced on a league page.

    Parameters
    ----------
    url : str
        Address of the league page to download.
    headers : dict
        HTTP headers sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        Unique team-page URLs, in the order they first appear in the table.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    IndexError
        If the page contains fewer than four ``<table>`` elements.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error instead of an opaque IndexError below.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # The links are read from the fourth <table> of the page; this is positional,
    # so it presumably breaks if the site layout changes.
    table = soup.find_all('table')[3]
    anchors = table.find_all('a', class_='vereinprofil_tooltip')
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set would give an arbitrary order).
    hrefs = dict.fromkeys(a['href'] for a in anchors)
    return [f'https://www.transfermarkt.com{href}' for href in hrefs]

def get_team_df(url, headers, timeout=30):
    """Get a dataframe of the players in a team and their monetary values.

    Parameters
    ----------
    url : str
        Address of the team page; the first segment of its path is used as
        the team name.
    headers : dict
        HTTP headers sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The second table parsed from the page, de-duplicated on the columns
        listed in ``cols``, without the ``'Nat.'`` column and with an added
        ``'team_name'`` column.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    IndexError
        If fewer than two tables are parsed from the page.
    KeyError
        If the parsed table lacks one of the expected columns.
    """
    # Take the first path segment rather than slicing at a fixed character
    # offset, so the name does not depend on the exact scheme/host length.
    team_name = urlparse(url).path.strip('/').split('/')[0]
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # read_html expects a file-like object; passing literal HTML text is deprecated.
    dataframes = pd.read_html(StringIO(str(soup)))
    df = dataframes[1].copy()
    cols = ['#', 'Unnamed: 2', 'Date of birth / Age', 'Market value']
    # Presumably each player is spread over several parsed rows: forward-fill the
    # key columns, then keep only the last row of every filled group.
    df[cols] = df[cols].ffill()
    df = df.drop_duplicates(cols, keep='last')
    df = df.drop(columns='Nat.')
    df['team_name'] = team_name
    return df

Get the links for each football team in the top-5 leagues for the 2019/20 season

In [3]:
# League overview pages requested with saison_id=2019, one per league.
league_urls = [
    'https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=2019',
    'https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1/plus/?saison_id=2019',
    'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2019',
    'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=2019',
    'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1/plus/?saison_id=2019',
]

# Gather the team links of every league into one flat list.
team_links = []
for league_url in league_urls:
    team_links += get_links(league_url, headers)
    # Wait a random 2-5 seconds before the next request.
    time.sleep(np.random.uniform(2, 5))

print('Number of teams:', len(team_links))

Number of teams: 98


Get a dataframe with all the player data for the top-5 leagues

In [4]:
# Download the table of every team, waiting a random 2-5 seconds after each request.
team_dfs = []
for team_url in team_links:
    team_dfs.append(get_team_df(team_url, headers))
    time.sleep(np.random.uniform(2, 5))

# Stack the per-team frames into a single frame.
df_all_players = pd.concat(team_dfs)

Format the dataframe

In [5]:
# Renumber the rows 0..n-1; concatenation kept each team's own index.
df_all_players = df_all_players.reset_index(drop=True)
# Positional rename: this requires the frame to have exactly six columns, in this order.
df_all_players.columns = ['number', 'position', 'player', 'dob_age', 'market_value_euros', 'team_name']
df_all_players['number'] = pd.to_numeric(df_all_players['number'], errors='coerce')

# market value to numeric
# regex=False everywhere: the patterns are plain text, and 'Th.' must not have
# its dot interpreted as a regex wildcard (the default differs across pandas versions).
value_text = df_all_players['market_value_euros'].str.replace('€', '', regex=False)
# Remember the unit suffix of each row before stripping it.
mask_million = value_text.str[-1] == 'm'
mask_thousand = value_text.str[-3:] == 'Th.'
value_text = value_text.str.replace('Th.', '', regex=False).str.replace('m', '', regex=False)
# Anything that is not a number after stripping becomes NaN.
df_all_players['market_value_euros'] = pd.to_numeric(value_text, errors='coerce')
df_all_players.loc[mask_million, 'market_value_euros'] *= 10**6
df_all_players.loc[mask_thousand, 'market_value_euros'] *= 10**3

# split date of birth / age column
# Text before '(' is the date of birth, text after it (minus ')') is the age.
# The .str accessors return NaN for missing values or a missing '(' instead of
# raising, so one incomplete row does not abort the whole cell.
dob_age_parts = df_all_players['dob_age'].str.split('(')
df_all_players['dob'] = pd.to_datetime(dob_age_parts.str[0])
df_all_players['age'] = pd.to_numeric(
    dob_age_parts.str[1].str.replace(')', '', regex=False), errors='coerce'
)
df_all_players = df_all_players.drop(columns='dob_age')

Save the dataframe

In [6]:
# Write df_all_players to a Parquet file; the relative path resolves against
# the current working directory.
df_all_players.to_parquet('transfermarket_big5_1920.parquet')