In [2]:
import pandas as pd

# **1. Definición de variables necesarias para la extracción.**

In [3]:
# Scraping configuration shared by the scraping functions below

## Long league name (as it appears in the FBref URL) -> FBref competition ID
dict_league_names = {
    'Premier-League': '9',
    'Ligue-1': '13',
    'Bundesliga': '20',
    'Serie-A': '11',
    'La-Liga': '12',
    'Major-League-Soccer': '22',
    'Big-5-European-Leagues': 'Big5',
}

## Long names of every supported league, in the same order as the mapping above
lst_league_names_long = list(dict_league_names)

## Seasons to scrape: '2017-2018' through '2021-2022'
lst_seasons = [f'{start_year}-{start_year + 1}' for start_year in range(2017, 2022)]

# **2. Función para extracción de estadísticas de los jugadores**

In [4]:
# Define function for scraping FBref player data for the given leagues and seasons
def get_fbref_player_stats(lst_league_names, lst_seasons):
    """Scrape FBref player statistics for every (season, league) combination.

    For each season and league, nine stat tables are read with
    ``pd.read_html`` from the Sports Reference widget endpoint. The Standard,
    Shooting, Passing, Pass Types, Goal and Shot Creation, Defensive Actions
    and Possession tables are concatenated side by side (a column whose name
    already appeared is dropped, keeping the first occurrence). The Playing
    Time and Miscellaneous tables are then left-joined on
    Player/Nation/Pos/Squad/Comp, contributing only the columns that are not
    already present.

    Parameters
    ----------
    lst_league_names : list of str
        Long league names; each must be a key of the module-level
        ``dict_league_names``.
    lst_seasons : list of str
        Season labels as they appear in the FBref URL, e.g. ``'2017-2018'``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated rows of all requested seasons and leagues stacked
        under a fresh 0..n-1 index, with 'Age' cut to its first two
        characters and the extra columns 'League Name', 'League ID' and
        'Season'. An empty DataFrame is returned when nothing was requested.

    Raises
    ------
    KeyError
        If a league name is not a key of ``dict_league_names``. This is
        checked before any request is made.
    """
    url_template = (
        'https://widgets.sports-reference.com/wg.fcgi?css=1&site=fb'
        '&url=%2Fen%2Fcomps%2F{league_id}%2F{season}%2F{section}%2Fplayers%2F'
        '{season}-{league_name}&div={div_id}'
    )

    # (progress label, URL section, widget div id, concatenated side by side?)
    # listed in scraping order; tables flagged False are left-joined afterwards.
    table_specs = [
        ('Standard', 'stats', 'div_stats_standard', True),
        ('Shooting', 'shooting', 'div_stats_shooting', True),
        ('Passing', 'passing', 'div_stats_passing', True),
        ('Pass Types', 'passing_types', 'div_stats_passing_types', True),
        ('Goals and Shot Creation', 'gca', 'div_stats_gca', True),
        ('Defensive Actions', 'defense', 'div_stats_defense', True),
        ('Possession', 'possession', 'div_stats_possession', True),
        ('Playing Time', 'playingtime', 'div_stats_playing_time', False),
        ('Miscellaneous', 'misc', 'div_stats_misc', False),
    ]

    ## Columns identifying a player row across the different stat tables
    conditions_join = ['Player', 'Nation', 'Pos', 'Squad', 'Comp']

    ## Resolve every league ID up front so an unknown league fails before any request
    league_names_long = list(lst_league_names)
    league_ids = [dict_league_names[name] for name in league_names_long]

    ## One finished DataFrame per (season, league); stacked once at the end
    frames = []

    for season in lst_seasons:

        ### Print message
        print(f'Scraping started for the {season} season...')

        ### Loop through leagues
        for league_name_long, league_name_short in zip(league_names_long, league_ids):

            print(f'Scraping started for player stats data for {league_name_long} league for the {season} season...')

            ##### Download every table, split by how it is combined later
            side_by_side, to_join = [], []
            for label, section, div_id, is_side_by_side in table_specs:
                print(f'Scraping {label} stats...')
                url = url_template.format(league_id=league_name_short,
                                          season=season,
                                          section=section,
                                          league_name=league_name_long,
                                          div_id=div_id)
                df_table = pd.read_html(url, header=1)[0]
                (side_by_side if is_side_by_side else to_join).append(df_table)

            ##### Concatenate side by side, then drop repeated columns and rows
            df_all = pd.concat(side_by_side, axis=1)
            df_all = df_all.loc[:, ~df_all.columns.duplicated()]
            df_all = df_all.drop_duplicates()

            ##### Left join the remaining tables
            for df_extra in to_join:
                # Bring over only the columns df_all does not have yet. This keeps
                # the left-hand values for shared columns without relying on
                # '_x'/'_y' suffix matching, which could hit genuine column names.
                new_columns = [col for col in df_extra.columns if col not in df_all.columns]
                df_right = df_extra[conditions_join + new_columns].drop_duplicates()
                df_all = df_all.merge(df_right, on=conditions_join, how='left').drop_duplicates()

            ##### FORMAT DataFrame
            # Age keeps only its first two characters (fixes current-season values
            # that carry extra text).
            # NOTE(review): a missing age becomes the string 'na' ('nan'[:2]) -
            # confirm that downstream code expects this.
            df_all = df_all.assign(**{
                'Age': df_all['Age'].astype(str).str[:2],
                'League Name': league_name_long,
                'League ID': league_name_short,
                'Season': season,
            })

            frames.append(df_all.drop_duplicates())

    ## Nothing requested -> empty result instead of an UnboundLocalError
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# **3. Extracción de jugadores**
Se hace una extracción por temporada para evitar el time out del servidor.


In [5]:
# Leagues selected to scrape.
# Individual leagues that can be listed instead of the combined 'Big 5' table:
# 'Premier-League', 'Ligue-1', 'Bundesliga', 'Serie-A', 'La-Liga', 'Major-League-Soccer'
lst_league_names = [
    'Big-5-European-Leagues',
]

In [None]:
# Player stats, 2017-2018 only: one season per cell to avoid the server timing out
playerdata2017_2018 = get_fbref_player_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2017-2018'],
)

Scraping started for the 2017-2018 season...
Scraping started for player stats data for Big-5-European-Leagues league for the 2017-2018 season...
Scraping Standard stats...
Scraping Shooting stats...
Scraping Passing stats...


In [None]:
# Player stats, 2018-2019 only: one season per cell to avoid the server timing out
playerdata2018_2019 = get_fbref_player_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2018-2019'],
)

In [None]:
# Player stats, 2019-2020 only: one season per cell to avoid the server timing out
playerdata2019_2020 = get_fbref_player_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2019-2020'],
)

In [None]:

# Player stats, 2020-2021 only: one season per cell to avoid the server timing out
playerdata2020_2021 = get_fbref_player_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2020-2021'],
)

In [None]:
# Player stats, 2021-2022 only: one season per cell to avoid the server timing out
playerdata2021_2022 = get_fbref_player_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2021-2022'],
)

# **4. Función para extracción de estadísticas de los porteros**

In [None]:
# Define function for scraping FBref goalkeeper data for the given leagues and seasons
def get_fbref_goalkeeper_stats(lst_league_names, lst_seasons):
    """Scrape FBref goalkeeper statistics for every (season, league) combination.

    For each season and league, five stat tables are read with
    ``pd.read_html`` from the Sports Reference widget endpoint. The Goalkeeper
    and Advanced Goalkeeper tables are concatenated side by side (a column
    whose name already appeared is dropped, keeping the first occurrence).
    The Standard, Playing Time and Miscellaneous tables are then left-joined
    on Player/Nation/Pos/Squad/Comp, contributing only the columns that are
    not already present.

    Parameters
    ----------
    lst_league_names : list of str
        Long league names; each must be a key of the module-level
        ``dict_league_names``.
    lst_seasons : list of str
        Season labels as they appear in the FBref URL, e.g. ``'2017-2018'``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated rows of all requested seasons and leagues stacked
        under a fresh 0..n-1 index, with 'Age' cut to its first two
        characters and the extra columns 'League Name', 'League ID' and
        'Season'. An empty DataFrame is returned when nothing was requested.

    Raises
    ------
    KeyError
        If a league name is not a key of ``dict_league_names``. This is
        checked before any request is made.
    """
    url_template = (
        'https://widgets.sports-reference.com/wg.fcgi?css=1&site=fb'
        '&url=%2Fen%2Fcomps%2F{league_id}%2F{season}%2F{section}%2Fplayers%2F'
        '{season}-{league_name}&div={div_id}'
    )

    # (progress label, URL section, widget div id, concatenated side by side?)
    # listed in scraping order; tables flagged False are left-joined afterwards.
    table_specs = [
        ('Standard', 'stats', 'div_stats_standard', False),
        ('Goalkeeper', 'keepers', 'div_stats_keeper', True),
        ('Advanced Goalkeeper', 'keepersadv', 'div_stats_keeper_adv', True),
        ('Playing Time', 'playingtime', 'div_stats_playing_time', False),
        ('Miscellaneous', 'misc', 'div_stats_misc', False),
    ]

    ## Columns identifying a player row across the different stat tables
    conditions_join = ['Player', 'Nation', 'Pos', 'Squad', 'Comp']

    ## Resolve every league ID up front so an unknown league fails before any request
    league_names_long = list(lst_league_names)
    league_ids = [dict_league_names[name] for name in league_names_long]

    ## One finished DataFrame per (season, league); stacked once at the end
    frames = []

    for season in lst_seasons:

        ### Print message
        print(f'Scraping started for the {season} season...')

        ### Loop through leagues
        for league_name_long, league_name_short in zip(league_names_long, league_ids):

            print(f'Scraping started for goalkeeper stats data for {league_name_long} league for the {season} season...')

            ##### Download every table, split by how it is combined later
            side_by_side, to_join = [], []
            for label, section, div_id, is_side_by_side in table_specs:
                print(f'Scraping {label} stats...')
                url = url_template.format(league_id=league_name_short,
                                          season=season,
                                          section=section,
                                          league_name=league_name_long,
                                          div_id=div_id)
                df_table = pd.read_html(url, header=1)[0]
                (side_by_side if is_side_by_side else to_join).append(df_table)

            ##### Concatenate side by side, then drop repeated columns and rows
            df_all = pd.concat(side_by_side, axis=1)
            df_all = df_all.loc[:, ~df_all.columns.duplicated()]
            df_all = df_all.drop_duplicates()

            ##### Left join the remaining tables (Standard, Playing Time, Misc)
            for df_extra in to_join:
                # Bring over only the columns df_all does not have yet. This keeps
                # the left-hand values for shared columns without relying on
                # '_x'/'_y' suffix matching, which could hit genuine column names.
                new_columns = [col for col in df_extra.columns if col not in df_all.columns]
                df_right = df_extra[conditions_join + new_columns].drop_duplicates()
                df_all = df_all.merge(df_right, on=conditions_join, how='left').drop_duplicates()

            ##### FORMAT DataFrame
            # Age keeps only its first two characters (fixes current-season values
            # that carry extra text).
            # NOTE(review): a missing age becomes the string 'na' ('nan'[:2]) -
            # confirm that downstream code expects this.
            df_all = df_all.assign(**{
                'Age': df_all['Age'].astype(str).str[:2],
                'League Name': league_name_long,
                'League ID': league_name_short,
                'Season': season,
            })

            frames.append(df_all.drop_duplicates())

    ## Nothing requested -> empty result instead of an UnboundLocalError
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# **5. Extracción de porteros**
Se hace una extracción por temporada para evitar el time out del servidor.

In [None]:
# Goalkeeper stats, 2017-2018 only: one season per cell to avoid the server timing out
goalkeeperdata2017_2018 = get_fbref_goalkeeper_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2017-2018'],
)

In [None]:
# Goalkeeper stats, 2018-2019 only: one season per cell to avoid the server timing out
goalkeeperdata2018_2019 = get_fbref_goalkeeper_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2018-2019'],
)

In [None]:
# Goalkeeper stats, 2019-2020 only: one season per cell to avoid the server timing out
goalkeeperdata2019_2020 = get_fbref_goalkeeper_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2019-2020'],
)

In [None]:
# Goalkeeper stats, 2020-2021 only: one season per cell to avoid the server timing out
goalkeeperdata2020_2021 = get_fbref_goalkeeper_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2020-2021'],
)

In [None]:
# Goalkeeper stats, 2021-2022 only: one season per cell to avoid the server timing out
goalkeeperdata2021_2022 = get_fbref_goalkeeper_stats(
    lst_league_names=lst_league_names,
    lst_seasons=['2021-2022'],
)