<a href="https://colab.research.google.com/github/andrewRowlinson/data-science/blob/master/football/scrape_fbref.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

def get_soup(url):
    headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/39.0.2171.95 Safari/537.36')}
    r = requests.get(url, headers=headers)
    r.encoding = 'unicode-escape'
    return BeautifulSoup(r.content, 'html.parser')

def get_data_from_table(table, data_type, skip_rows):
    """Helper method to get the data from a table. """
    # https://stackoverflow.com/questions/42285417/how-to-preserve-links-when-scraping-a-table-with-beautiful-soup-and-pandas
    if data_type == 'title':
        data = [[td.a.get('title') if td.find('a') else ''.join(td.stripped_strings) for td in row.find_all('td')]
                for row in table.find_all('tr')]
    if data_type == 'link':
        data = [[td.a['href'] if td.find('a') else ''.join(td.stripped_strings) for td in row.find_all('td')]
                for row in table.find_all('tr')]
    else:
        data = [[td.a.string if td.find('a') else ''.join(td.stripped_strings) for td in row.find_all('td')]
                for row in table.find_all('tr')]   
    
    data = [d for d in data if len(d)!=0][0::skip_rows]
    
    return data

def get_fbref_big5(url):
    soup = get_soup(url)
    df = pd.read_html(str(soup))[0]
    
    # column names - collapse the multiindex
    col1 = list(df.columns.get_level_values(0))
    col1 = ['' if c[:7]=='Unnamed' else c.replace(' ', '_').lower() for c in col1]
    col2 = list(df.columns.get_level_values(1))
    col2 = [c.replace(' ', '_').lower() for c in col2]
    cols = [f'{c}_{col2[i]}' if c != '' else col2[i] for i, c in enumerate(col1)]
    df.columns = cols
    
    # remove lines that are the header row repeated
    df = df[df.rk != 'Rk'].copy()
    
    # add the url for the player profile and match logs
    # https://stackoverflow.com/questions/42285417/how-to-preserve-links-when-scraping-a-table-with-beautiful-soup-and-pandas
    parsed_table = soup.find_all('table')[0]
    data = [[td.a['href'] if td.find('a') else ''.join(td.stripped_strings) for td in row.find_all('td')]
        for row in parsed_table.find_all('tr')]
    data = [d for d in data if len(d)!=0]
    match_log = [d[-1] for d in data]
    player_profile = [d[0] for d in data]
    df['match_link'] = match_log
    df['player_link'] = player_profile
    
    # remove players who haven't played a minute from the playing time table
    if 'playing_time_mp' in df.columns:
        df = df[df.playing_time_mp != '0'].copy()
        df.reset_index(drop=True, inplace=True)
        df['rk'] = df.index + 1
        
    # drop the matches column
    df.drop('matches', axis='columns', inplace=True)

    # columns to numeric columns
    df[df.columns[6:-2]] = df[df.columns[6:-2]].apply(pd.to_numeric, errors='coerce', axis='columns')
    return df

In [2]:
url = 'https://fbref.com/en/comps/Big5/keepersadv/players/Big-5-European-Leagues-Stats'

In [3]:
df_gk = get_fbref_big5(url)

In [4]:
df_gk.head()

Unnamed: 0,rk,player,nation,pos,squad,comp,age,born,90s,goals_ga,...,goal_kicks_launch%,goal_kicks_avglen,crosses_opp,crosses_stp,crosses_stp%,sweeper_#opa,sweeper_#opa/90,sweeper_avgdist,match_link,player_link
0,1,Julen Agirrezabala,es ESP,GK,Athletic Club,es La Liga,,2000.0,4.0,5.0,...,38.5,38.3,35.0,2.0,5.7,5.0,1.25,17.3,/en/players/a2c1a8d3/matchlogs/2021-2022/keepe...,/en/players/a2c1a8d3/Julen-Agirrezabala
1,2,Doğan Alemdar,tr TUR,GK,Rennes,fr Ligue 1,,2002.0,5.0,4.0,...,86.2,57.7,46.0,1.0,2.2,7.0,1.4,14.3,/en/players/9e17ccff/matchlogs/2021-2022/keepe...,/en/players/9e17ccff/Dogan-Alemdar
2,3,Alisson,br BRA,GK,Liverpool,eng Premier League,,1992.0,26.0,18.0,...,43.5,39.1,166.0,17.0,10.2,38.0,1.46,17.6,/en/players/7a2e46a8/matchlogs/2021-2022/keepe...,/en/players/7a2e46a8/Alisson
3,4,Alphonse Areola,fr FRA,GK,West Ham,eng Premier League,,1993.0,1.0,1.0,...,71.4,53.2,13.0,1.0,7.7,0.0,0.0,7.0,/en/players/2f965a72/matchlogs/2021-2022/keepe...,/en/players/2f965a72/Alphonse-Areola
4,5,Kepa Arrizabalaga,es ESP,GK,Chelsea,eng Premier League,,1994.0,4.0,2.0,...,28.6,30.2,31.0,1.0,3.2,5.0,1.25,16.2,/en/players/28d596a0/matchlogs/2021-2022/keepe...,/en/players/28d596a0/Kepa-Arrizabalaga
