### Importing required libraries

In [21]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
from math import pi
import pandas_profiling as pp
import random
import json
from pandas.io.json import json_normalize
import requests
import re
from bs4 import BeautifulSoup
import matplotlib as mpl
import mplsoccer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import glob

### Setting directories

In [22]:
# Project folders, all relative to the parent of the notebook's working directory.
base_dir = ".."
data_dir = os.path.join(base_dir, "data")
data_dir_fbref = os.path.join(data_dir, "fbref")
img_dir = os.path.join(base_dir, "image")
fig_dir = os.path.join(img_dir, "fig")

# Create every folder up front; exist_ok=True makes re-running this cell a no-op.
for _required_dir in (data_dir, data_dir_fbref, img_dir, fig_dir):
    os.makedirs(_required_dir, exist_ok=True)

### Creating league dictionaries and relevant lists

In [23]:
# League name -> competition code. Both values are interpolated into the FBref
# URLs built by the scraping functions (the code after 'comps/', the name at the end).
dict_league_names = dict([
    ('Premier-League', '9'),
    ('Ligue-1', '13'),
    ('Bundesliga', '20'),
    ('Serie-A', '11'),
    ('La-Liga', '12'),
    ('Major-League-Soccer', '22'),
    ('Big-5-European-Leagues', 'Big5'),
])

In [24]:
# Leagues for which a folder tree is created
league_names = [
    'Premier-League',
    'Ligue-1',
    'Bundesliga',
    'Serie-A',
    'La-Liga',
    'Major-League-Soccer',
    'Big-5-European-Leagues',
]

# Seasons in '<start year>-<end year>' form: 2017-2018 up to and including 2021-2022
seasons = [f'{start_year}-{start_year + 1}' for start_year in range(2017, 2022)]

# Top-level sub-folders of the FBref data directory
folders = ['raw', 'engineered', 'reference']

# Second-level sub-folders, one per kind of data
data_type = ['goalkeeper', 'outfield', 'team']

### Creating the directories

In [25]:
# Make the data directory structure:
#     <data_dir_fbref>/<folder>/<data type>/<league>/<season>
# os.makedirs creates every missing intermediate level in one call and, with
# exist_ok=True, accepts directories that already exist, so the cell can be
# re-run safely without separate exists()/mkdir() checks at each level.
for folder in folders:
    for data_types in data_type:
        for league in league_names:
            for season in seasons:
                path = os.path.join(data_dir_fbref, folder, data_types, league, season)
                os.makedirs(path, exist_ok=True)
                    

### Function to get outfield player data from FBref

In [26]:
# Define function for scraping a defined season and competition of FBref player data
def get_fbref_player_stats(lst_league_names, lst_seasons):

    """
    Scrape outfield player stats from FBref, cache them as CSV files and return them combined.

    For every (season, league) pair whose CSV file does not exist yet under
    ``<data_dir_fbref>/raw/outfield/<league>/<season>/``, nine FBref player tables are
    downloaded with ``pd.read_html``, combined into one DataFrame and saved.
    Pairs that already have a CSV file are not scraped again.

    Afterwards *every* per-league CSV found under ``raw/outfield`` (not only the ones
    requested in this call) is concatenated; the repeated header rows and the 'Rk'
    column are removed, the result is sorted by league, season and player, written to
    ``fbref_outfield_player_stats_combined_latest.csv`` and returned.

    Parameters
    ----------
    lst_league_names : list of str
        League names; each one must be a key of ``dict_league_names``.
    lst_seasons : list of str
        Seasons in the form used in the FBref URL, e.g. '2021-2022'.

    Returns
    -------
    pandas.DataFrame
        The combined player stats of all cached CSV files.

    Raises
    ------
    KeyError
        If a league name is not a key of ``dict_league_names``.
    """

    ## FBref tables to scrape, in scraping order
    ## key: (label used in the progress message, page segment of the URL, id of the div holding the table)
    fbref_tables = {
        'std_stats': ('Standard', 'stats', 'div_stats_standard'),
        'shooting': ('Shooting', 'shooting', 'div_stats_shooting'),
        'passing': ('Passing', 'passing', 'div_stats_passing'),
        'passing_types': ('Pass Types', 'passing_types', 'div_stats_passing_types'),
        'gca': ('Goals and Shot Creation', 'gca', 'div_stats_gca'),
        'defense': ('Defensive Actions', 'defense', 'div_stats_defense'),
        'possession': ('Possession', 'possession', 'div_stats_possession'),
        'playing_time': ('Playing Time', 'playingtime', 'div_stats_playing_time'),
        'misc': ('Miscellaneous', 'misc', 'div_stats_misc'),
    }

    ## Tables concatenated side-by-side; rows are paired by position
    ## (assumes every one of these tables lists the same rows in the same order - TODO confirm)
    tables_side_by_side = ['std_stats', 'shooting', 'passing', 'passing_types', 'gca', 'defense', 'possession']

    ## Tables left-joined afterwards, in this order
    tables_left_joined = ['playing_time', 'misc']

    ## Define join conditions
    conditions_join = ['Player', 'Nation', 'Pos', 'Squad', 'Comp']

    ## Folder holding the per-league CSV files and the combined CSV file
    dir_outfield = os.path.join(data_dir_fbref, 'raw', 'outfield')


    ## Scrape information for each season
    for season in lst_seasons:

        ### Print message
        print(f'Scraping started for the {season} season...')

        ### Loop through leagues
        for league_name_long in lst_league_names:

            #### Determine league short name from the league names dictionary
            league_name_short = dict_league_names[league_name_long]

            #### Location of the CSV file for this league and season
            path_csv = os.path.join(
                dir_outfield, league_name_long, season,
                f'fbref_outfield_player_stats_{league_name_long}_{season}_latest.csv'
            )

            #### Skip the scrape when the data is already saved
            if os.path.exists(path_csv):
                print(f'Player stats data for the {league_name_long} league for the {season} season already saved as a CSV file.')
                continue

            #### Scraping
            print(f'Scraping started for player stats data for {league_name_long} league for the {season} season...')

            dfs = {}
            for table, (label, page, div_id) in fbref_tables.items():
                print(f'Scraping {label} stats...')
                url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=fb&url=%2Fen%2Fcomps%2F{league_name_short}%2F{season}%2F{page}%2Fplayers%2F{season}-{league_name_long}&div={div_id}'
                ##### Decode explicitly as UTF-8: without it accented player names can come back
                ##### mis-decoded (e.g. 'AarÃ³n' instead of 'Aarón').
                ##### NOTE(review): confirm the widget response is always UTF-8.
                dfs[table] = pd.read_html(url, header=1, encoding='utf-8')[0]

            #### Concatenate DataFrames side-by-side
            df_all = pd.concat([dfs[table] for table in tables_side_by_side], axis=1)

            #### Drop duplicate columns - a column name shared by several tables is kept from the first table only
            df_all = df_all.loc[:, ~df_all.columns.duplicated()]

            #### Drop duplicate rows
            df_all = df_all.drop_duplicates()

            #### Left join the remaining tables
            for table in tables_left_joined:
                df_extra = dfs[table]

                ##### Bring in only the columns df_all does not have yet. The result matches merging
                ##### everything and discarding the right-hand duplicates, without depending on
                ##### '_x'/'_y' suffix matching that could hit unrelated column names.
                new_columns = [col for col in df_extra.columns if col not in df_all.columns]
                df_all = pd.merge(df_all, df_extra[conditions_join + new_columns], on=conditions_join, how='left')

                ##### Drop duplicate rows (repeated join keys on the right multiply the matching rows)
                df_all = df_all.drop_duplicates()

            #### Engineer DataFrame

            ##### Take first two characters of age - fixes current season issue with extra values.
            ##### Missing ages stay missing instead of becoming the text 'na' (first two characters of 'nan').
            age = df_all['Age']

            ##### Create columns for league name, league code and season
            df_all = df_all.assign(**{
                'Age': age.astype(str).str[:2].where(age.notna()),
                'League Name': league_name_long,
                'League ID': league_name_short,
                'Season': season,
            })

            ##### Drop duplicates
            df_all = df_all.drop_duplicates()

            #### Save DataFrame (creating the folder first, in case it was not set up beforehand)
            os.makedirs(os.path.dirname(path_csv), exist_ok=True)
            df_all.to_csv(path_csv, index=None, header=True, encoding='utf-8')

            #### Print statement for league and season
            print(f'All player stats data for the {league_name_long} league for {season} season scraped and saved.')


    ## Unify individual CSV files as a single DataFrame

    ### List every per-league CSV file (sorted, so the read order is reproducible)
    all_files = sorted(glob.glob(os.path.join(dir_outfield, '*', '*', 'fbref_outfield_player_stats_*_*_latest.csv')))

    ### Read each file into its own DataFrame
    lst_player_stats_all = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

    ### Concatenate the files into a single DataFrame
    df_fbref_player_stats_all = pd.concat(lst_player_stats_all, axis=0, ignore_index=True)

    ### Drop the repeated header rows (they contain 'Rk' in the 'Rk' column).
    ### Cast to str first so the test also works when 'Rk' was read as a numeric column.
    is_header_row = df_fbref_player_stats_all['Rk'].astype(str).str.contains('Rk')
    df_fbref_player_stats_all = df_fbref_player_stats_all[~is_header_row]

    ### Drop 'Rk' column
    df_fbref_player_stats_all = df_fbref_player_stats_all.drop(['Rk'], axis=1)

    ### Sort DataFrame
    df_fbref_player_stats_all = df_fbref_player_stats_all.sort_values(['League Name', 'Season', 'Player'], ascending=[True, True, True])


    ## Export DataFrame
    df_fbref_player_stats_all.to_csv(os.path.join(dir_outfield, 'fbref_outfield_player_stats_combined_latest.csv'), index=None, header=True, encoding='utf-8')


    ## Distinct number of players
    total_players = df_fbref_player_stats_all['Player'].nunique()


    ## Print statement
    print(f'Player stats DataFrame contains {total_players} players.')


    ## Return the combined player stats DataFrame
    return(df_fbref_player_stats_all)

### Function to get goalkeeper data from FBref

In [27]:
# Define function for scraping a defined season and competition of FBref goalkeeper data
def get_fbref_goalkeeper_stats(lst_league_names, lst_seasons):

    """
    Scrape goalkeeper stats from FBref, cache them as CSV files and return them combined.

    For every (season, league) pair whose CSV file does not exist yet under
    ``<data_dir_fbref>/raw/goalkeeper/<league>/<season>/``, five FBref tables are
    downloaded with ``pd.read_html``. The two goalkeeper tables are placed side-by-side,
    the standard, playing time and miscellaneous tables are left-joined onto them, and
    the result is saved. Pairs that already have a CSV file are not scraped again.

    Afterwards *every* per-league CSV found under ``raw/goalkeeper`` (not only the ones
    requested in this call) is concatenated; the repeated header rows and the 'Rk'
    column are removed, the result is sorted by league, season and player, written to
    ``fbref_goalkeeper_stats_combined_latest.csv`` and returned.

    Parameters
    ----------
    lst_league_names : list of str
        League names; each one must be a key of ``dict_league_names``.
    lst_seasons : list of str
        Seasons in the form used in the FBref URL, e.g. '2021-2022'.

    Returns
    -------
    pandas.DataFrame
        The combined goalkeeper stats of all cached CSV files.

    Raises
    ------
    KeyError
        If a league name is not a key of ``dict_league_names``.
    """

    ## FBref tables to scrape, in scraping order
    ## key: (label used in the progress message, page segment of the URL, id of the div holding the table)
    fbref_tables = {
        'std_stats': ('Standard', 'stats', 'div_stats_standard'),
        'keepers': ('Goalkeeper', 'keepers', 'div_stats_keeper'),
        'keepers_adv': ('Advanced Goalkeeper', 'keepersadv', 'div_stats_keeper_adv'),
        'playing_time': ('Playing Time', 'playingtime', 'div_stats_playing_time'),
        'misc': ('Miscellaneous', 'misc', 'div_stats_misc'),
    }

    ## Tables concatenated side-by-side; rows are paired by position
    ## (assumes both goalkeeper tables list the same rows in the same order - TODO confirm)
    tables_side_by_side = ['keepers', 'keepers_adv']

    ## Tables left-joined afterwards, in this order
    tables_left_joined = ['std_stats', 'playing_time', 'misc']

    ## Define join conditions
    conditions_join = ['Player', 'Nation', 'Pos', 'Squad', 'Comp']

    ## Folder holding the per-league CSV files and the combined CSV file
    dir_goalkeeper = os.path.join(data_dir_fbref, 'raw', 'goalkeeper')


    ## Scrape information for each season
    for season in lst_seasons:

        ### Print message
        print(f'Scraping started for the {season} season...')

        ### Loop through leagues
        for league_name_long in lst_league_names:

            #### Determine league short name from the league names dictionary
            league_name_short = dict_league_names[league_name_long]

            #### Location of the CSV file for this league and season
            path_csv = os.path.join(
                dir_goalkeeper, league_name_long, season,
                f'fbref_goalkeeper_stats_{league_name_long}_{season}_latest.csv'
            )

            #### Skip the scrape when the data is already saved
            if os.path.exists(path_csv):
                print(f'Goalkeeper stats data for the {league_name_long} league for the {season} season already saved as a CSV file.')
                continue

            #### Scraping
            print(f'Scraping started for goalkeeper stats data for {league_name_long} league for the {season} season...')

            dfs = {}
            for table, (label, page, div_id) in fbref_tables.items():
                print(f'Scraping {label} stats...')
                url = f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=fb&url=%2Fen%2Fcomps%2F{league_name_short}%2F{season}%2F{page}%2Fplayers%2F{season}-{league_name_long}&div={div_id}'
                ##### Decode explicitly as UTF-8: without it accented player names can come back
                ##### mis-decoded (e.g. 'AarÃ³n' instead of 'Aarón').
                ##### NOTE(review): confirm the widget response is always UTF-8.
                dfs[table] = pd.read_html(url, header=1, encoding='utf-8')[0]

            #### Concatenate DataFrames side-by-side
            df_all = pd.concat([dfs[table] for table in tables_side_by_side], axis=1)

            #### Drop duplicate columns - a column name shared by both tables is kept from the first table only
            df_all = df_all.loc[:, ~df_all.columns.duplicated()]

            #### Drop duplicate rows
            df_all = df_all.drop_duplicates()

            #### Left join the remaining tables
            for table in tables_left_joined:
                df_extra = dfs[table]

                ##### Bring in only the columns df_all does not have yet. The result matches merging
                ##### everything and discarding the right-hand duplicates, without depending on
                ##### '_x'/'_y' suffix matching that could hit unrelated column names.
                new_columns = [col for col in df_extra.columns if col not in df_all.columns]
                df_all = pd.merge(df_all, df_extra[conditions_join + new_columns], on=conditions_join, how='left')

                ##### Drop duplicate rows (repeated join keys on the right multiply the matching rows)
                df_all = df_all.drop_duplicates()

            #### Engineer DataFrame

            ##### Take first two characters of age - fixes current season issue with extra values.
            ##### Missing ages stay missing instead of becoming the text 'na' (first two characters of 'nan').
            age = df_all['Age']

            ##### Create columns for league name, league code and season
            df_all = df_all.assign(**{
                'Age': age.astype(str).str[:2].where(age.notna()),
                'League Name': league_name_long,
                'League ID': league_name_short,
                'Season': season,
            })

            ##### Drop duplicates
            df_all = df_all.drop_duplicates()

            #### Save DataFrame (creating the folder first, in case it was not set up beforehand)
            os.makedirs(os.path.dirname(path_csv), exist_ok=True)
            df_all.to_csv(path_csv, index=None, header=True, encoding='utf-8')

            #### Print statement for league and season
            print(f'All Goalkeeper stats data for the {league_name_long} league for {season} season scraped and saved.')


    ## Unify individual CSV files as a single DataFrame

    ### List every per-league CSV file (sorted, so the read order is reproducible)
    all_files = sorted(glob.glob(os.path.join(dir_goalkeeper, '*', '*', 'fbref_goalkeeper_stats_*_*_latest.csv')))

    ### Read each file into its own DataFrame
    lst_goalkeeper_stats_all = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

    ### Concatenate the files into a single DataFrame
    df_fbref_goalkeeper_stats_all = pd.concat(lst_goalkeeper_stats_all, axis=0, ignore_index=True)

    ### Drop the repeated header rows (they contain 'Rk' in the 'Rk' column).
    ### Cast to str first so the test also works when 'Rk' was read as a numeric column.
    is_header_row = df_fbref_goalkeeper_stats_all['Rk'].astype(str).str.contains('Rk')
    df_fbref_goalkeeper_stats_all = df_fbref_goalkeeper_stats_all[~is_header_row]

    ### Drop 'Rk' column
    df_fbref_goalkeeper_stats_all = df_fbref_goalkeeper_stats_all.drop(['Rk'], axis=1)

    ### Sort DataFrame
    df_fbref_goalkeeper_stats_all = df_fbref_goalkeeper_stats_all.sort_values(['League Name', 'Season', 'Player'], ascending=[True, True, True])


    ## Export DataFrame
    df_fbref_goalkeeper_stats_all.to_csv(os.path.join(dir_goalkeeper, 'fbref_goalkeeper_stats_combined_latest.csv'), index=None, header=True, encoding='utf-8')


    ## Distinct number of goalkeepers
    total_players = df_fbref_goalkeeper_stats_all['Player'].nunique()


    ## Print statement
    print(f'Goalkeeper stats DataFrame contains {total_players} players.')


    ## Return the combined goalkeeper stats DataFrame
    return(df_fbref_goalkeeper_stats_all)

In [28]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

### Testing

In [29]:
# Trial run for a single competition and season.
# Note: this rebinds the notebook-level `league_names` and `seasons` lists.
league_names = [
    'Big-5-European-Leagues',
    # 'Premier-League', 'Ligue-1', 'Bundesliga', 'Serie-A', 'La-Liga', 'Major-League-Soccer',
]
seasons = ['2021-2022']

df_gk = get_fbref_goalkeeper_stats(lst_league_names=league_names, lst_seasons=seasons)

Scraping started for the 2021-2022 season...
Goalkeeper stats data for the Big-5-European-Leagues league for the 2021-2022 season already saved as a CSV file.
Goalkeeper stats DataFrame contains 188 players.


In [None]:
# Profile the goalkeeper DataFrame with pandas-profiling (imported as pp)
pp.ProfileReport(df_gk)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
# Preview the first rows of the goalkeeper DataFrame
df_gk.head()

Unnamed: 0,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,Min,90s,GA,GA90,SoTA,Saves,Save%,W,D,L,CS,CS%,PKatt,PKA,PKsv,PKm,Save%.1,Matches,FK,CK,OG,PSxG,PSxG/SoT,PSxG+/-,/90,Cmp,Att,Cmp%,Att.1,Thr,Launch%,AvgLen,Att.2,Launch%.1,AvgLen.1,Opp,Stp,Stp%,#OPA,#OPA/90,AvgDist,Gls,Ast,G-PK,PK,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1,Mn/MP,Min%,Mn/Start,Compl,Subs,Mn/Sub,unSub,PPM,onG,onGA,+/-,+/-90,On-Off,onxG,onxGA,xG+/-,xG+/-90,On-Off.1,2CrdY,Fls,Fld,Off,Crs,Int,TklW,PKwon,PKcon,Recov,Won,Lost,Won%,League Name,League ID,Season
143,Aaron Ramsdale,eng ENG,GK,Arsenal,eng Premier League,23,1998,21,21,1890,21.0,18,0.86,76,57,80.3,14,3,4,11,52.4,3,3,0,0,0.0,Matches,0,0,0,17.6,0.2,-0.4,-0.02,128,355,36.1,607,104,42.5,38.1,128,75.8,58.9,168,16,9.5,15,0.71,15.0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90,87.5,90,21,0,,2,2.14,38,18,20,0.95,3.95,35.6,23.2,12.4,0.59,2.66,0,0,4,0,0,0,0,0,0,109,0,0,,Big-5-European-Leagues,Big5,2021-2022
50,AarÃ³n Escandell,es ESP,GK,Granada,es La Liga,26,1995,4,3,284,3.2,6,1.9,14,10,71.4,0,2,1,1,33.3,2,2,0,0,0.0,Matches,0,0,0,4.2,0.19,-1.8,-0.58,39,89,43.8,88,13,68.2,48.1,37,78.4,54.3,26,2,7.7,7,2.22,14.5,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71,12.1,90,3,1,14.0,21,0.5,1,6,-5,-1.58,-1.23,2.1,5.6,-3.5,-1.11,-0.58,0,0,1,0,0,0,0,0,0,24,0,0,,Big-5-European-Leagues,Big5,2021-2022
55,Aitor FernÃ¡ndez,es ESP,GK,Levante,es La Liga,30,1991,13,13,1166,13.0,29,2.24,79,51,64.6,1,5,7,2,15.4,2,1,1,0,50.0,Matches,0,8,0,24.9,0.29,-4.1,-0.32,73,174,42.0,290,62,46.2,40.2,91,44.0,36.4,111,5,4.5,11,0.85,16.3,0,0,0,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90,49.8,90,12,0,,12,0.62,12,29,-17,-1.31,-0.85,15.2,18.8,-3.6,-0.28,-0.35,0,0,4,0,0,0,1,0,0,51,0,0,,Big-5-European-Leagues,Big5,2021-2022
86,Alban Lafont,fr FRA,GK,Nantes,fr Ligue 1,23,1999,26,26,2340,26.0,29,1.12,102,74,72.5,11,6,9,7,26.9,3,1,2,0,66.7,Matches,0,3,1,29.8,0.27,1.8,0.07,132,431,30.6,560,85,49.6,44.0,196,78.1,57.3,187,12,6.4,17,0.65,14.6,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90,100.0,90,26,0,,0,1.5,34,29,5,0.19,,26.9,35.3,-8.4,-0.32,,0,0,2,0,0,0,0,0,0,109,0,0,,Big-5-European-Leagues,Big5,2021-2022
36,Alessio Cragno,it ITA,GK,Cagliari,it Serie A,27,1994,24,24,2160,24.0,43,1.79,114,73,65.8,5,8,11,0,0.0,5,4,1,0,20.0,Matches,4,8,1,40.9,0.33,-1.1,-0.05,220,513,42.9,638,77,57.4,43.3,205,71.7,50.6,216,18,8.3,12,0.5,14.7,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90,88.9,90,24,0,,0,0.96,26,43,-17,-0.71,0.29,27.8,35.7,-7.9,-0.33,0.97,0,1,6,0,0,0,1,0,1,106,0,0,,Big-5-European-Leagues,Big5,2021-2022


In [32]:
# (number of rows, number of columns) of the goalkeeper DataFrame
df_gk.shape

(190, 104)

In [33]:
# Column labels of the goalkeeper DataFrame
df_gk.columns

Index(['Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', 'MP',
       'Starts', 'Min',
       ...
       'TklW', 'PKwon', 'PKcon', 'Recov', 'Won', 'Lost', 'Won%', 'League Name',
       'League ID', 'Season'],
      dtype='object', length=104)

In [34]:
# Summary of the goalkeeper DataFrame: index range, column count, dtypes and memory usage
df_gk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190 entries, 143 to 170
Columns: 104 entries, Player to Season
dtypes: object(104)
memory usage: 155.9+ KB
