In [1]:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import requests
from bs4 import Tag, NavigableString, BeautifulSoup
import re
import time
import pandas as pd
import sys

# Store all players' links

In [2]:
def _fetch_soup(url):
    """Download `url` and return the page parsed as a BeautifulSoup tree."""
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pin one explicitly once the results are confirmed to be identical.
    # The context manager closes the HTTP response once parsing is done.
    with urlopen(url) as response:
        return BeautifulSoup(response)


def extract_players_urls(main_url, competitions_url, wanted_seasons):
    """
    Extracts all big 5 European player urls from the main competition
    url if they have played in the wanted seasons.

    The crawl goes competitions page -> league pages -> season pages ->
    team pages, collecting the player links found on every team page.

    Arguments:
        main_url: string to main fbref url (prefix added to every
                  relative link found in the pages)
        competitions_url: string to competitions main webpage
        wanted_seasons: list with strings of wanted seasons in format
                        '2020-2021'

    Returns:
        all_players_urls: list with player urls, without duplicates and
                          in order of first appearance.
    """
    bs_competitions = _fetch_soup(competitions_url)

    # Find big 5 leagues urls.
    # NOTE(review): the third 'sortable' table is assumed to be the big 5
    # table; this depends on the current page layout.
    big5_table = bs_competitions.find_all('table', {'class': 'sortable'})[2]
    big5_urls = [
        main_url + row.a['href']
        for row in big5_table.tbody
        # Skip whitespace/text nodes and rows that carry no link
        if isinstance(row, Tag) and row.a is not None
    ]

    # Find the wanted seasons' urls of the big5
    league_seasons_urls = []
    for league_url in big5_urls[:5]:  # avoid big 5 european leagues combined
        bs_league = _fetch_soup(league_url)
        for season in bs_league.find('tbody').find_all('th'):
            if season.text in wanted_seasons and season.a is not None:
                league_seasons_urls.append(main_url + season.a['href'])

    # Find all the teams per season
    season_team_urls = []
    for season_url in league_seasons_urls:
        bs_season = _fetch_soup(season_url)
        # attrs must be a dict: the original set literal only matched by accident
        teams_table = bs_season.find('table', {'class': 'stats_table'}).find_all('tr')
        for team in teams_table:
            team_cell = team.find('td')
            # Header rows have no <td>; some cells may have no link
            if team_cell is not None and team_cell.a is not None:
                season_team_urls.append(main_url + team_cell.a['href'])

    # Find all players' urls
    all_players_urls = []
    seen_urls = set()  # constant-time duplicate check; the list keeps the order
    for season_team_url in season_team_urls:
        bs_team_season = _fetch_soup(season_team_url)
        players_table = bs_team_season.find('table', {'class': 'stats_table'}).tbody.find_all('tr')
        for player_row in players_table:
            header_cell = player_row.th
            if header_cell is None or header_cell.a is None:
                continue  # spacer/header row without a player link
            player_url = main_url + header_cell.a['href']
            if player_url not in seen_urls:
                seen_urls.add(player_url)
                all_players_urls.append(player_url)

    return all_players_urls

----------

# Players' data

### Brief data

In [3]:
def extract_players_brief(bs_player):
    """
    Extracts biographical and miscellaneous information of the players.

    Argument:
        bs_player: BeautifulSoup object of the player's html.

    Returns: tuple with the extracted player information, in this order:
        (image, name, complete_name, position, footed, height, weight,
        birthday, country)

        - image is always None (image extraction is currently disabled).
        - complete_name falls back to name when the element after the name
          header is missing or is already the "Position:" line.
        - birthday is the last space-separated token of the birthDate span.
        - country is the first available value among birthplace country,
          citizenship, youth national team and national team.
        Any field that cannot be found in the page is None.
    """
    def stripped_text(tag):
        # Text of an optional element, without surrounding whitespace
        return None if tag is None else tag.text.strip()

    def anchor_text(line):
        # Text of the first link of a brief line, if the line has one
        return None if line.a is None else line.a.text

    # Player's brief
    player_brief = bs_player.find('div', {'id': 'info'}).div

    # Image extraction is disabled for now; the slot is kept in the tuple
    image = None

    # Extract name from brief
    name = player_brief.span.string

    # Complete name: the element two siblings after the name header (the
    # sibling in between is skipped), unless that element is already the
    # "Position:" line.
    complete_name = name
    name_header = player_brief.find('h1', {'itemprop': 'name'})
    after_header = getattr(getattr(name_header, 'next_sibling', None),
                           'next_sibling', None)
    if after_header is not None and "Position:" not in after_header.text:
        complete_name = after_header.text

    height = stripped_text(player_brief.find('span', {'itemprop': 'height'}))
    weight = stripped_text(player_brief.find('span', {'itemprop': 'weight'}))

    birth_date = stripped_text(player_brief.find('span', {'itemprop': 'birthDate'}))
    birthday = None if birth_date is None else birth_date.split(" ")[-1]

    position, footed = None, None
    born_country, citizenship = None, None
    national, youth_national = None, None

    # Other fields: every <p> below the name
    for line in player_brief.find_all('p'):
        text = line.text

        # field position and footed
        if "Position: " in text:
            after_position = text.split("Position: ")[1]
            if "Footed" in text:
                position_match = re.search('[A-Z]+(-[A-Z]*)*', after_position)
                position = position_match.group(0) if position_match else None
                footed_parts = text.split("Footed: ")
                footed_match = (re.search('[A-Za-z]+', footed_parts[1])
                                if len(footed_parts) > 1 else None)
                footed = footed_match.group(0) if footed_match else None
            else:
                position = after_position.strip()
                footed = None

        # Country data comes from different origins because depending on the player
        # there will be or not info in the birthplace, citizenship and national team fields
        # Birthplace country (only looked up when the line also carries a year)
        if "Born:" in text:
            if re.search('[0-9]+\n', text) is not None and "in" in text:
                country_match = re.search(', [A-Za-z]+\n', text)
                if country_match is not None:
                    born_country = country_match.group(0).split(" ")[1].strip()

        # Citizenship
        if "Citizenship" in text:
            citizenship = anchor_text(line)

        # "Youth National Team" contains "National Team", so the two checks
        # must be exclusive or a youth line would overwrite the senior team.
        if "Youth National Team" in text:
            youth_national = anchor_text(line)
        elif "National Team" in text:
            national = anchor_text(line)

    # Final country field value: first source that produced something
    country = next(
        (candidate
         for candidate in (born_country, citizenship, youth_national, national)
         if candidate is not None),
        None,
    )

    return (image, name, complete_name, position, footed, height, weight, birthday, country)

In [4]:
# Testing multiple players
# `url` is kept as an alternative sample page; only `url2` is fetched below.
url ="https://fbref.com/en/players/31c69ef1/Ruben-Dias"
url2  ="https://fbref.com/en/players/6d8f8441/scout/365_euro/Coke-Scouting-Report"

# Download and parse the page (no parser is named, so bs4 chooses one itself)
html = urlopen(url2)
bs_player = BeautifulSoup(html)

# Last expression of the cell: the extracted brief tuple is shown as its output
extract_players_brief(bs_player)

(None,
 'Coke',
 'Jorge Andújar Moreno',
 'DF',
 'Right',
 '182cm',
 '78kg',
 '1987',
 'Spain')

### Standard Stats

In [5]:
# data-stat attributes read as plain cell text, in output column order, for the
# columns that follow season, age, squad, country and comp.
_PLAIN_STAT_KEYS = (
    'lg_finish',
    # Playing time
    'games', 'games_starts', 'minutes', 'minutes_90s',
    # Performance
    'goals', 'assists', 'goals_pens', 'pens_made', 'pens_att',
    'cards_yellow', 'cards_red',
)


def extract_standard_stats_table(bs_player, wanted_seasons):
    """
    Extracts the league stats of the players. Only stats in wanted seasons

    Argument:
        bs_player: BeautifulSoup object of the player's html.
        wanted_seasons: list with strings of wanted seasons in format
                        '2020-2021'

    Returns: dataframe with one row per wanted season (17 columns named
             after the table header), or None when the page has no
             'div_stats_standard_dom_lg' element. A cell missing from a
             row is stored as None.
    """
    def cell_text(row, stat):
        # Text of the <td> with the given data-stat, None when it is absent
        cell = row.find('td', {'data-stat': stat})
        return None if cell is None else cell.text

    # Filter standard stats for all competitions
    standard_stats = bs_player.find(id='div_stats_standard_dom_lg')
    if standard_stats is None:
        return None

    # Column names come from the second header row (the first one is skipped);
    # only the first 17 labels are kept, matching the 17 values collected below.
    head_table = standard_stats.thead.find_all('tr')
    column_names = head_table[1].text.strip().split(" ")[:17]

    # Extract standard_stats data (only playing time and performance metrics)
    all_data = []
    for row in standard_stats.tbody.find_all('tr'):
        if row.th is None:
            continue  # row without a season cell
        season = row.th.text
        if season not in wanted_seasons:
            continue

        # Country cell: the second space-separated token is kept when present,
        # otherwise the stripped text is used as is.
        country = cell_text(row, 'country')
        if country is not None:
            country_parts = country.split(" ")
            country = country_parts[1] if len(country_parts) > 1 else country.strip()

        # Competition name: the link text when there is a link, else the cell text
        comp_cell = row.find('td', {'data-stat': 'comp_level'})
        if comp_cell is None:
            comp = None
        elif comp_cell.a is not None:
            comp = comp_cell.a.text
        else:
            comp = comp_cell.text.strip()

        record = [season, cell_text(row, 'age'), cell_text(row, 'squad'), country, comp]
        record.extend(cell_text(row, stat) for stat in _PLAIN_STAT_KEYS)
        all_data.append(record)

    return pd.DataFrame(all_data, columns=column_names)

### Check if player is in wanted seasons

We only extract the brief and stats data of players who took part in a league during at least one of the wanted seasons.

In [6]:
def check_player_if_in_seasons(bs_player, wanted_seasons):
    """
    Tells whether the player's league table has a row for any of the
    seasons listed in wanted_seasons.

    Arguments:
        bs_player: BeautifulSoup object of the player's html.
        wanted_seasons: list with strings of wanted seasons in format
                        '2020-2021'

    Returns:
        Boolean: True if at least one row of the
        'div_stats_standard_dom_lg' table has a season present in
        wanted_seasons; False otherwise, including when the page has no
        such table.
    """
    league_stats = bs_player.find(id='div_stats_standard_dom_lg')

    # No domestic league table on the page: nothing can match
    if league_stats is None:
        return False

    # The season label lives in the header cell of each body row
    season_rows = league_stats.tbody.find_all('tr')
    return any(row.th.text in wanted_seasons for row in season_rows)

-----

# All data

In [7]:
def extract_all_data(wanted_seasons, player_urls_file):
    """
    Create final dataframe with all players league stats for the wanted
    seasons.

    Every url of the file is downloaded in turn. Scraping stops at the
    first download/parsing failure (or keyboard interrupt) and whatever
    was collected up to that point is returned. Every 100 processed urls
    the data gathered so far is saved to 'players_stats.csv'.

    Arguments:
        wanted_seasons: list with strings of wanted seasons in format
                        '2020-2021'
        player_urls_file: file with all players urls, one per line
                          (blank lines are ignored).

    Returns:
        df: dataframe with all players' information, or None when no
            player with wanted seasons was found
        down_urls: list of player urls already downloaded
        not_wanted_urls: list of player urls that are not in wanted
                        seasons leagues
    """
    # Get players urls
    with open(player_urls_file, 'r') as file:
        players_urls = [row.strip() for row in file if row.strip()]

    num_urls = len(players_urls)  # For printing
    down_urls = []  # Player urls already downloaded
    not_wanted_urls = []  # Player urls that are not in wanted seasons
    player_frames = []  # One stats dataframe per wanted player

    def collected():
        # Frames are concatenated on demand: DataFrame.append no longer exists
        # in pandas 2 and growing a frame row block by row block is quadratic.
        return pd.concat(player_frames) if player_frames else None

    # Extract data from players
    for num, url in enumerate(players_urls, start=1):

        # Print scraping state
        print("\rPlayer number {}/{}.".format(num, num_urls), end="")
        sys.stdout.flush()

        # html to beautifulsoup
        try:
            # NOTE(review): no parser is named, so bs4 picks the best one
            # installed; pin one once results are confirmed identical.
            with urlopen(url) as response:
                bs_player = BeautifulSoup(response)
            down_urls.append(url)
        except (HTTPError, URLError) as e:
            print(e)
            return collected(), down_urls, not_wanted_urls
        except (Exception, KeyboardInterrupt) as e:
            # Still best-effort (partial results are returned), but the reason
            # for stopping is now reported instead of being silently dropped.
            print("\nStopped at {}: {!r}".format(url, e))
            return collected(), down_urls, not_wanted_urls

        # check if player is in wanted league seasons
        if check_player_if_in_seasons(bs_player, wanted_seasons):

            # brief_data (the image slot is not stored)
            (_image, name, complete_name, position, footed,
             height, weight, birth_year, country) = extract_players_brief(bs_player)

            # standard stats
            player_df = extract_standard_stats_table(bs_player, wanted_seasons)

            # Each column is inserted at position 0, so they are listed in
            # reverse of their final order (name ends up first).
            brief_columns = (
                ('player_country', country),
                ('birth_year', birth_year),
                ('weight', weight),
                ('height', height),
                ('footed', footed),
                ('position', position),
                ('c_name', complete_name),
                ('name', name),
            )
            for column, value in brief_columns:
                player_df.insert(0, column, value)

            player_frames.append(player_df)
        else:
            not_wanted_urls.append(url)

        # Periodic checkpoint, skipped while there is nothing to save yet
        if num % 100 == 0 and player_frames:
            collected().to_csv('players_stats.csv')

    return collected(), down_urls, not_wanted_urls

In [8]:
main_url = "https://fbref.com"
competitions_url = "https://fbref.com/en/comps/"

wanted_seasons = ['2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015', 
                  '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', 
                  '2020-2021']

# File used as a cache of all players' links (one url per line)
player_urls_file = 'links_all_players.txt'

# Get players urls from the cache. On the very first run the file does not
# exist yet, which is treated the same as an empty cache instead of crashing.
try:
    with open(player_urls_file, 'r') as file:
        all_players_urls = [row.strip() for row in file if row.strip()]
except FileNotFoundError:
    all_players_urls = []

if len(all_players_urls) == 0:
    # Extract and store players urls on a file
    all_players_urls = extract_players_urls(main_url, competitions_url, wanted_seasons)

    with open(player_urls_file, 'w') as file:
        file.writelines(player_link + "\n" for player_link in all_players_urls)

In [9]:
# Scrape every url stored in the links file: returns the stats dataframe, the
# urls that were downloaded and the urls of players outside the wanted seasons.
df, down_urls, not_wanted_urls = extract_all_data(wanted_seasons, player_urls_file)

Player number 3288/3289.