In [1]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from io import StringIO
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options

In [2]:
pd.set_option('display.max_columns', None)

## Big 5 Leagues

### Standard Stats

In [3]:
# Accessing table
b5_stand_df = pd.read_html('https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_standard"})[0]

# Flattening multiIndex columns
b5_stand_df.columns = ['_'.join(col) for col in b5_stand_df.columns.values]

# Dropping useless columns
b5_stand_df = b5_stand_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 37_level_0_Matches'], axis=1)

# Renaming player ID columns
b5_stand_df = b5_stand_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_stand_df = b5_stand_df.rename(columns={'Playing Time_MP':'MP',
                                        'Playing Time_Starts':'Starts', 
                                        'Playing Time_Min':'Min', 
                                        'Playing Time_90s':'No_90s',
                                        'Performance_Gls':'Gls', 
                                        'Performance_Ast':'Ast',
                                        'Performance_G+A':'G+A',
                                        'Performance_G-PK':'G-PK', 
                                        'Performance_PK':'PK', 
                                        'Performance_PKatt':'PKatt',
                                        'Performance_CrdY':'CrdY',
                                        'Performance_CrdR':'CrdR',
                                        'Expected_xG':'xG', 
                                        'Expected_npxG':'npxG',
                                        'Expected_xAG':'xAG', 
                                        'Expected_npxG+xAG':'npxG+xAG',
                                        'Progression_PrgC':'Prg_Carr',
                                        'Progression_PrgP':'Prg_Pass', 
                                        'Progression_PrgR':'Prg_Pass_Rec', 
                                        'Per 90 Minutes_Gls':'Gls_90',
                                        'Per 90 Minutes_Ast':'Ast_90', 
                                        'Per 90 Minutes_G+A':'G+A_90', 
                                        'Per 90 Minutes_G-PK':'G-PK_90',
                                        'Per 90 Minutes_G+A-PK':'G+A-PK_90', 
                                        'Per 90 Minutes_xG':'xG_90', 
                                        'Per 90 Minutes_xAG':'xAG_90',
                                        'Per 90 Minutes_xG+xAG':'xG+xAG_90', 
                                        'Per 90 Minutes_npxG':'npxG_90',
                                        'Per 90 Minutes_npxG+xAG':'npxG+xAG_90'})




In [None]:
b5_stand_df[b5_stand_df.isnull().any(axis=1)]

In [4]:
# Dropping rows with NANs
b5_stand_df = b5_stand_df.dropna(axis=0, how='any')


# Removing column header rows
b5_stand_df = b5_stand_df[b5_stand_df["Player"] != "Player"]


# Applying datatypes to variables
# stand_cols_float = ['No_90s','Gls_90','Ast_90','G+A_90','G-PK_90','G+A_90','G+A-PK_90','xG','npxG','xAG','npxG+xAG','xG_90','xAG_90','xG+xAG_90','npxG_90','npxG+xAG_90']

# stand_cols_int = ['Age', 'MP', 'Starts', 'Min', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt','CrdY','CrdR','Prg_Carr','Prg_Pass','Prg_Pass_Rec']

# stand_cols_str = ['Player', 'Nation', 'Position', 'Squad', 'Comp', 'Born']

convert_dict = {'No_90s':float,
                'Gls_90':float,
                'Ast_90':float,
                'G+A_90':float,
                'G-PK_90':float,
                'G+A_90':float,
                'G+A-PK_90':float,
                'xG':float,
                'npxG':float,
                'xAG':float,
                'npxG+xAG':float,
                'xG_90':float,
                'xAG_90':float,
                'xG+xAG_90':float,
                'npxG_90':float,
                'npxG+xAG_90':float,
                
                'MP':int, 
                'Starts':int, 
                'Min':int, 
                'Gls':int, 
                'Ast':int, 
                'G+A':int, 
                'G-PK':int, 
                'PK':int, 
                'PKatt':int,
                'CrdY':int,
                'CrdR':int,
                'Prg_Carr':int,
                'Prg_Pass':int,
                'Prg_Pass_Rec':int
    }

b5_stand_df = b5_stand_df.astype(convert_dict)
# b5_stand_df[[stand_cols_float, stand_cols_int]] = b5_stand_df[[stand_cols_float, stand_cols_int]].apply(pd.to_numeric)

# b5_stand_df[stand_cols_float] = b5_stand_df[stand_cols_float].astype(float)
# b5_stand_df[stand_cols_int] = b5_stand_df[stand_cols_int].astype('int')
# b5_stand_df[stand_cols_str] = b5_stand_df[stand_cols_str].astype('str')
# # player_stand_df.info()  

In [None]:
b5_stand_df.dtypes

In [None]:
# b5_stand_df = b5_stand_df.dropna(axis=0, how='any')

In [None]:
b5_stand_df.info()

### Shooting

In [5]:
# Accessing table
b5_shoot_df = pd.read_html('https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_shooting"})[0]

# Flattening multiIndex columns
b5_shoot_df.columns = ['_'.join(col) for col in b5_shoot_df.columns.values]

# Dropping useless columns
b5_shoot_df = b5_shoot_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 26_level_0_Matches','Standard_Gls','Standard_PK',
                              'Standard_PKatt', 'Expected_xG', 'Expected_npxG'], axis=1)

# Renaming player ID columns
b5_shoot_df = b5_shoot_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_shoot_df = b5_shoot_df.rename(columns={'Standard_Sh':'Shots', 
                                        'Standard_SoT':'SoT',
                                        'Standard_SoT%':'SoT_pct', 
                                        'Standard_Sh/90':'Shots_90', 
                                        'Standard_SoT/90':'SoT_90', 
                                        'Standard_G/Sh':'Gls_per_Sh',
                                        'Standard_G/SoT':'Gls_per_SoT', 
                                        'Standard_Dist':'Avg_Sh_Dist', 
                                        'Standard_FK':'Sh_FK', 
                                        'Expected_npxG/Sh':'npxG_per_Sh',
                                        'Expected_G-xG':'G-xG', 
                                        'Expected_np:G-xG':'npG-npxG'})

In [None]:
# b5_shoot_df[b5_shoot_df.isnull().any(axis=1)]
b5_shoot_df.columns

In [6]:
# Dropping rows with NANs
b5_shoot_df = b5_shoot_df.dropna(axis=0, how='any')


# Removing column header rows
b5_shoot_df = b5_shoot_df[b5_shoot_df["Player"] != "Player"]

convert_dict = {
    'Shots':int,
    'SoT':int,
    'SoT_pct':float,
    'Shots_90':float,
    'SoT_90':float,
    'Gls_per_Sh':float,
    'Gls_per_SoT':float,
    'Avg_Sh_Dist':float,
    'Sh_FK':int,
    'npxG_per_Sh':float,
    'G-xG':float,
    'npG-npxG':float   
}

b5_shoot_df = b5_shoot_df.astype(convert_dict)

### Possession

In [7]:
# Accessing table
b5_poss_df = pd.read_html('https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_possession"})[0]

# Flattening multiIndex columns
b5_poss_df.columns = ['_'.join(col) for col in b5_poss_df.columns.values]

# Dropping useless columns
b5_poss_df = b5_poss_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 31_level_0_Matches'], axis=1)

# Renaming player ID columns
b5_poss_df = b5_poss_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_poss_df = b5_poss_df.rename(columns={'Touches_Touches':'Touches','Carries_Carries':'Carries'})

In [None]:
b5_poss_df.columns

In [8]:
# Dropping rows with NANs
b5_poss_df = b5_poss_df.dropna(axis=0, how='any')


# Removing column header rows
b5_poss_df = b5_poss_df[b5_poss_df["Player"] != "Player"]

convert_dict = {
                'Touches':int, 
                'Touches_Def Pen':int, 
                'Touches_Def 3rd':int, 
                'Touches_Mid 3rd':int,
                'Touches_Att 3rd':int, 
                'Touches_Att Pen':int, 
                'Touches_Live':int, 
                'Take-Ons_Att':int,
                'Take-Ons_Succ':int, 
                'Take-Ons_Succ%':float, 
                'Take-Ons_Tkld':int, 
                'Take-Ons_Tkld%':float,
                'Carries':int, 
                'Carries_TotDist':int, 
                'Carries_PrgDist':int, 
                'Carries_PrgC':int,
                'Carries_1/3':int, 
                'Carries_CPA':int, 
                'Carries_Mis':int, 
                'Carries_Dis':int,
                'Receiving_Rec':int, 
                'Receiving_PrgR':int  
}

b5_poss_df = b5_poss_df.astype(convert_dict)

### Passing

In [9]:
# Accessing table
b5_pass_df = pd.read_html('https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_passing"})[0]

# Flattening multiIndex columns
b5_pass_df.columns = ['_'.join(col) for col in b5_pass_df.columns.values]

# Dropping useless columns
b5_pass_df = b5_pass_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 32_level_0_Matches',
                            'Unnamed: 23_level_0_Ast','Unnamed: 24_level_0_xAG','Unnamed: 31_level_0_PrgP'], axis=1)

# Renaming player ID columns
b5_pass_df = b5_pass_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_pass_df = b5_pass_df.rename(columns={'Expected_xA':'xA',
                                      'Expected_A-xAG':'A-xAG',
                                      'Unnamed: 27_level_0_KP':'KP',
                                      'Unnamed: 28_level_0_1/3':'Pass_Fin_3rd',
                                      'Unnamed: 29_level_0_PPA':'Pass_Pen_Area',
                                      'Unnamed: 30_level_0_CrsPA':'Cross_Pen_Area'})

In [10]:
# Dropping rows with NANs
b5_pass_df = b5_pass_df.dropna(axis=0, how='any')


# Removing column header rows
b5_pass_df = b5_pass_df[b5_pass_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Total_Cmp':int, 
                'Total_Att':int, 
                'Total_Cmp%':float, 
                'Total_TotDist':int,
                'Total_PrgDist':int, 
                'Short_Cmp':int, 
                'Short_Att':int, 
                'Short_Cmp%':float, 
                'Medium_Cmp':int,
                'Medium_Att':int, 
                'Medium_Cmp%':float, 
                'Long_Cmp':int, 
                'Long_Att':int, 
                'Long_Cmp%':float, 
                'xA':float,
                'A-xAG':float, 
                'KP':int, 
                'Pass_Fin_3rd':int, 
                'Pass_Pen_Area':int, 
                'Cross_Pen_Area':int  
}

### Passing Types

In [11]:
# Accessing table
b5_ptype_df = pd.read_html('https://fbref.com/en/comps/Big5/passing_types/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_passing_types"})[0]

# Flattening multiIndex columns
b5_ptype_df.columns = ['_'.join(col) for col in b5_ptype_df.columns.values]

# Dropping useless columns
b5_ptype_df = b5_ptype_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 24_level_0_Matches',
                            'Outcomes_Cmp','Unnamed: 9_level_0_Att'], axis=1)

# Renaming player ID columns
b5_ptype_df = b5_ptype_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_ptype_df = b5_ptype_df.rename(columns={'Outcomes_Blocks':'Pass_Blocked',
                                      'Outcomes_Off':'Pass_Offsides'
                                       })

In [12]:
# Dropping rows with NANs
b5_ptype_df = b5_ptype_df.dropna(axis=0, how='any')


# Removing column header rows
b5_ptype_df = b5_ptype_df[b5_ptype_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Pass Types_Live':int, 
                'Pass Types_Dead':int, 
                'Pass Types_FK':int, 
                'Pass Types_TB':int,
                'Pass Types_Sw':int, 
                'Pass Types_Crs':int, 
                'Pass Types_TI':int, 
                'Pass Types_CK':int,
                'Corner Kicks_In':int, 
                'Corner Kicks_Out':int, 
                'Corner Kicks_Str':int,
                'Pass_Offsides':int, 
                'Pass_Blocked':int  
}
b5_ptype_df = b5_ptype_df.astype(convert_dict)

### Goal and Shot Creation (GCA)

In [13]:
# Accessing table
b5_gca_df = pd.read_html('https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_gca"})[0]

# Flattening multiIndex columns
b5_gca_df.columns = ['_'.join(col) for col in b5_gca_df.columns.values]

# Dropping useless columns
b5_gca_df = b5_gca_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 25_level_0_Matches'], axis=1)

# Renaming player ID columns
b5_gca_df = b5_gca_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_gca_df = b5_gca_df.rename(columns={'SCA_SCA':'SCA',
                                    'SCA_SCA90':'SCA_90',
                                    'SCA_Types_PassLive':'SCA_PassLive',
                                    'SCA_Types_PassDead':'SCA_PassDead',
                                    'SCA_Types_TO':'SCA_TO',
                                    'SCA_Types_Sh':'SCA_Shot',
                                    'SCA_Types_Fld':'SCA_Fouls_Drawn',
                                    'SCA_Types_Def':'SCA_Def_Action',
                                    'GCA_GCA':'GCA',
                                    'GCA_GCA90':'GCA_90',
                                    'GCA_Types_PassLive':'GCA_PassLive',
                                    'GCA_Types_PassDead':'GCA_PassDead',
                                    'GCA_Types_TO':'GCA_TO',
                                    'GCA_Types_Sh':'GCA_Shot',
                                    'GCA_Types_Fld':'GCA_Fouls_Drawn',
                                    'GCA_Types_Def':'GCA_Def_Action'
                                   })

In [14]:
# Dropping rows with NANs
b5_gca_df = b5_gca_df.dropna(axis=0, how='any')


# Removing column header rows
b5_gca_df = b5_gca_df[b5_gca_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'SCA':int,
                'SCA_90':float, 
                'SCA Types_PassLive':int, 
                'SCA Types_PassDead':int, 
                'SCA Types_TO':int,
                'SCA Types_Sh':int, 
                'SCA Types_Fld':int, 
                'SCA Types_Def':int, 
                'GCA':int, 
                'GCA_90':float,
                'GCA Types_PassLive':int, 
                'GCA Types_PassDead':int, 
                'GCA Types_TO':int,
                'GCA Types_Sh':int, 
                'GCA Types_Fld':int, 
                'GCA Types_Def':int  
}
b5_gca_df = b5_gca_df.astype(convert_dict)

### Defensive Actions

In [15]:
# Accessing table
b5_def_df = pd.read_html('https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_defense"})[0]

# Flattening multiIndex columns
b5_def_df.columns = ['_'.join(col) for col in b5_def_df.columns.values]

# Dropping useless columns
b5_def_df = b5_def_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 25_level_0_Matches', 'Challenges_Lost'], axis=1)

# Renaming player ID columns
b5_def_df = b5_def_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_def_df = b5_def_df.rename(columns={'Tackles_Tkl':'Tkls',
                                    'Tackles_TklW':'Tkls_Won',
                                    'Tackles_Def_3rd':'Tkl_Def_3rd',
                                    'Tackles_Mid_3rd':'Tkl_Mid_3rd',
                                    'Tackles_Att_3rd':'Tkl_Att_3rd',
                                    'Challenges_Tkl':'Drib_Tkl',
                                    'Challenges_Att':'Drib_Tkl_Att',
                                    'Challenges_Tkl%':'Drib_Tkl%',
                                    'Blocks_Blocks':'Def_Blocks',
                                    'Blocks_Sh':'Def_Shot_Blocks',
                                    'Blocks_Pass':'Def_Pass_Blocks',
                                    'Unnamed: 21_level_0_Int':'Int',
                                    'Unnamed: 22_level_0_Tkl+Int':'Tkl+Int',
                                    'Unnamed: 23_level_0_Clr':'Clearances',
                                    'Unnamed: 24_level_0_Err':'Errors'
                                   })

In [16]:
# Dropping rows with NANs
b5_def_df = b5_def_df.dropna(axis=0, how='any')


# Removing column header rows
b5_def_df = b5_def_df[b5_def_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Tkls':int,
                'Tkls_Won':int, 
                'Tackles_Def 3rd':int, 
                'Tackles_Mid 3rd':int, 
                'Tackles_Att 3rd':int,
                'Drib_Tkl':int, 
                'Drib_Tkl_Att':int, 
                'Drib_Tkl%':float, 
                'Def_Blocks':int,
                'Def_Shot_Blocks':int, 
                'Def_Pass_Blocks':int, 
                'Int':int, 
                'Tkl+Int':int, 
                'Clearances':int,
                'Errors':int 
}
b5_def_df = b5_def_df.astype(convert_dict)

### Miscellaneous Stats

In [17]:
# Accessing table
b5_misc_df = pd.read_html('https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats', 
                  attrs={"id":"stats_misc"})[0]

# Flattening multiIndex columns
b5_misc_df.columns = ['_'.join(col) for col in b5_misc_df.columns.values]

# Dropping useless columns
b5_misc_df = b5_misc_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 8_level_0_90s','Unnamed: 25_level_0_Matches','Aerial Duels_Lost',
                         'Performance_CrdY','Performance_CrdR','Performance_Crs','Performance_Int','Performance_TklW',
                         'Performance_Fld'], axis=1)

# Renaming player ID columns
b5_misc_df = b5_misc_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Comp':'Comp',
                                        'Unnamed: 6_level_0_Age':'Age',
                                        'Unnamed: 7_level_0_Born':'Born'})

# Renaming stat group columns
b5_misc_df = b5_misc_df.rename(columns={'Performance_2CrdY':'CrdY2',
                                    'Performance_Fls':'Fls_Comm',
                                    'Performance_Off':'Offsides',
                                    'Performance_PKwon':'PK_Won',
                                    'Performance_PKcon':'PK_Conv',
                                    'Performance_OG':'Own_Goal',
                                    'Performance_Recov':'Ball_Recoveries'
                                   })

In [18]:
# Dropping rows with NANs
b5_misc_df = b5_misc_df.dropna(axis=0, how='any')


# Removing column header rows
b5_misc_df = b5_misc_df[b5_misc_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'CrdY2':int,
                'Fls_Comm':int, 
                'Offsides':int, 
                'PK_Won':int, 
                'PK_Conv':int, 
                'Own_Goal':int,
                'Ball_Recoveries':int, 
                'Aerial Duels_Won':int, 
                'Aerial Duels_Won%':float 
}

b5_misc_df = b5_misc_df.astype(convert_dict)

### Merging all stat group dataframes

In [19]:
b5_conditions_join = ['Player','Nation','Position','Squad','Age','Born','Comp']

b5_outfield_df = pd.merge(b5_stand_df, b5_shoot_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')

b5_outfield_df = pd.merge(b5_outfield_df, b5_poss_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')

b5_outfield_df = pd.merge(b5_outfield_df, b5_pass_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')

b5_outfield_df = pd.merge(b5_outfield_df, b5_ptype_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')

b5_outfield_df = pd.merge(b5_outfield_df, b5_gca_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')

b5_outfield_df = pd.merge(b5_outfield_df, b5_def_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')

b5_outfield_df = pd.merge(b5_outfield_df, b5_misc_df, left_on=b5_conditions_join, right_on=b5_conditions_join,
                          how='left')


In [20]:
b5_outfield_df.head(25)

Unnamed: 0,Player,Nation,Position,Squad,Comp,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%
0,Max Aarons,eng ENG,DF,Bournemouth,eng Premier League,23,2000,20,13,1237,13.7,0,1,1,0,0,0,1,0,0.0,0.0,0.8,0.9,22,43,26,0.0,0.07,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,,,,,,,,,,,,,711.0,43.0,252.0,303.0,165.0,11.0,711.0,34.0,14.0,41.2,12.0,35.3,364.0,2174.0,1121.0,22.0,12.0,7.0,13.0,8.0,371.0,26.0,450.0,581.0,77.5,7402.0,2789.0,220.0,248.0,88.7,188.0,235.0,80.0,34.0,63.0,54.0,0.9,0.2,7.0,25.0,13.0,2.0,453,127,11,2,3,13,116,0,0,0,0,1,23,23,1.68,16,4,0,0,3,0,2,0.15,2,0,0,0,0,0,29.0,19.0,20.0,7.0,2.0,20.0,34.0,58.8,9.0,5.0,4.0,8.0,37.0,27.0,0.0,0.0,12.0,2.0,0.0,1.0,0.0,75.0,5.0,31.3
1,Brenden Aaronson,us USA,"MF,FW",Union Berlin,de Bundesliga,22,2000,30,14,1267,14.1,2,2,4,2,0,0,3,1,2.0,2.0,1.9,3.8,37,56,91,0.14,0.14,0.28,0.14,0.28,0.14,0.13,0.27,0.14,0.27,18.0,7.0,38.9,1.28,0.5,0.11,0.29,18.4,0.0,0.11,0.0,0.0,675.0,11.0,108.0,301.0,293.0,47.0,675.0,77.0,34.0,44.2,41.0,53.2,406.0,2721.0,1387.0,37.0,29.0,9.0,41.0,38.0,457.0,91.0,365.0,472.0,77.3,4890.0,1506.0,206.0,240.0,85.8,105.0,130.0,80.8,19.0,32.0,59.4,2.0,0.1,22.0,30.0,14.0,3.0,439,29,3,5,1,22,12,6,2,3,0,4,21,53,3.76,41,1,8,3,0,0,8,0.57,6,0,2,0,0,0,32.0,18.0,13.0,13.0,6.0,16.0,32.0,50.0,26.0,1.0,25.0,2.0,34.0,4.0,0.0,1.0,15.0,5.0,0.0,0.0,0.0,88.0,13.0,44.8
2,Paxten Aaronson,us USA,MF,Eint Frankfurt,de Bundesliga,19,2003,7,1,101,1.1,0,1,1,0,0,0,0,0,0.1,0.1,0.1,0.2,2,5,7,0.0,0.89,0.89,0.0,0.89,0.11,0.07,0.19,0.11,0.19,2.0,2.0,100.0,1.78,1.78,0.0,0.0,15.1,0.0,0.06,-0.1,-0.1,72.0,0.0,6.0,39.0,28.0,5.0,72.0,7.0,2.0,28.6,4.0,57.1,43.0,193.0,50.0,2.0,0.0,1.0,5.0,4.0,46.0,7.0,41.0,50.0,82.0,576.0,71.0,20.0,25.0,80.0,20.0,22.0,90.9,0.0,2.0,0.0,0.1,0.9,1.0,4.0,2.0,0.0,48,2,1,0,0,0,1,0,0,0,0,0,0,1,0.89,1,0,0,0,0,0,1,0.89,1,0,0,0,0,0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,100.0,2.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,3.0,100.0
3,Keyliane Abdallah,fr FRA,FW,Marseille,fr Ligue 1,17,2006,1,0,4,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Yunis Abdelhamid,ma MAR,DF,Reims,fr Ligue 1,35,1987,31,31,2781,30.9,4,0,4,3,1,1,5,0,3.4,2.6,0.3,2.9,36,137,9,0.13,0.0,0.13,0.1,0.1,0.11,0.01,0.12,0.09,0.09,21.0,7.0,33.3,0.68,0.23,0.14,0.43,15.0,0.0,0.13,0.6,0.4,2185.0,293.0,976.0,1119.0,114.0,35.0,2184.0,15.0,8.0,53.3,7.0,46.7,1506.0,8663.0,4921.0,36.0,19.0,0.0,23.0,4.0,1403.0,9.0,1552.0,1836.0,84.5,29618.0,9672.0,487.0,548.0,88.9,893.0,976.0,91.5,141.0,252.0,56.0,0.6,-0.3,8.0,129.0,3.0,0.0,1650,178,75,2,13,3,29,0,0,0,0,8,20,24,0.78,19,1,0,3,1,0,1,0.03,0,0,0,1,0,0,64.0,35.0,36.0,23.0,5.0,26.0,45.0,57.8,51.0,32.0,19.0,39.0,103.0,109.0,2.0,0.0,26.0,0.0,0.0,0.0,1.0,149.0,61.0,62.2
5,Salis Abdul Samed,gh GHA,MF,Lens,fr Ligue 1,23,2000,27,17,1519,16.9,0,0,0,0,0,0,2,0,0.8,0.8,0.5,1.3,9,78,20,0.0,0.0,0.0,0.0,0.0,0.05,0.03,0.08,0.05,0.08,,,,,,,,,,,,,1022.0,21.0,168.0,647.0,218.0,7.0,1022.0,19.0,7.0,36.8,11.0,57.9,823.0,4361.0,1683.0,9.0,33.0,0.0,23.0,11.0,780.0,20.0,796.0,895.0,88.9,12470.0,3008.0,393.0,433.0,90.8,330.0,360.0,91.7,41.0,54.0,75.9,1.1,-0.5,6.0,87.0,5.0,2.0,875,18,17,0,1,3,1,0,0,0,0,2,17,27,1.6,27,0,0,0,0,0,3,0.18,3,0,0,0,0,0,21.0,14.0,8.0,10.0,3.0,8.0,18.0,44.4,12.0,1.0,11.0,12.0,33.0,18.0,0.0,0.0,34.0,0.0,0.0,3.0,0.0,89.0,2.0,22.2
6,Nabil Aberdin,fr FRA,DF,Getafe,es La Liga,20,2002,2,2,180,2.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60.0,79.0,75.9,1159.0,426.0,22.0,26.0,84.6,29.0,34.0,85.3,7.0,17.0,41.2,0.0,0.0,0.0,4.0,0.0,0.0,75,4,4,0,1,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,4.0,40.0
7,Laurent Abergel,fr FRA,MF,Lorient,fr Ligue 1,30,1993,33,32,2860,31.8,2,1,3,2,0,0,4,0,1.1,1.1,2.2,3.3,38,194,51,0.06,0.03,0.09,0.06,0.09,0.04,0.07,0.1,0.04,0.1,26.0,11.0,42.3,0.82,0.35,0.08,0.18,26.5,0.0,0.04,0.9,0.9,2164.0,105.0,553.0,1288.0,349.0,10.0,2164.0,65.0,44.0,67.7,18.0,27.7,1649.0,8707.0,3892.0,38.0,49.0,3.0,32.0,30.0,1502.0,51.0,1551.0,1836.0,84.5,27382.0,8613.0,629.0,707.0,89.0,711.0,802.0,88.7,150.0,233.0,64.4,1.9,-1.2,30.0,188.0,23.0,3.0,1743,88,68,6,13,34,14,0,0,0,0,5,25,81,2.55,67,3,2,1,2,6,5,0.16,3,0,1,0,0,1,85.0,52.0,43.0,34.0,8.0,38.0,96.0,39.6,29.0,6.0,23.0,61.0,146.0,61.0,3.0,0.0,36.0,1.0,0.0,0.0,0.0,226.0,15.0,51.7
8,Matthis Abline,fr FRA,FW,Nantes,fr Ligue 1,20,2003,22,12,1044,11.6,5,0,5,5,0,0,1,0,3.8,3.8,2.0,5.8,28,20,87,0.43,0.0,0.43,0.43,0.43,0.33,0.17,0.5,0.33,0.5,42.0,19.0,45.2,3.62,1.64,0.12,0.26,17.4,0.0,0.09,1.2,1.2,386.0,9.0,36.0,153.0,209.0,65.0,386.0,33.0,19.0,57.6,14.0,42.4,294.0,1979.0,906.0,28.0,24.0,14.0,50.0,14.0,287.0,87.0,162.0,231.0,70.1,2410.0,448.0,91.0,128.0,71.1,52.0,68.0,76.5,9.0,11.0,81.8,0.9,-2.0,17.0,6.0,7.0,2.0,218,12,0,2,1,7,1,0,0,0,0,1,11,38,3.28,30,0,5,2,1,0,2,0.17,0,0,1,0,1,0,9.0,4.0,4.0,3.0,2.0,3.0,8.0,37.5,3.0,0.0,3.0,3.0,12.0,5.0,0.0,0.0,10.0,6.0,1.0,0.0,0.0,39.0,15.0,35.7
9,Abner,br BRA,DF,Betis,es La Liga,23,2000,23,15,1400,15.6,0,1,1,0,0,0,3,0,0.1,0.1,1.0,1.1,14,33,58,0.0,0.06,0.06,0.0,0.06,0.01,0.06,0.07,0.01,0.07,,,,,,,,,,,,,966.0,72.0,344.0,405.0,226.0,28.0,966.0,13.0,7.0,53.8,5.0,38.5,447.0,1802.0,949.0,14.0,10.0,5.0,18.0,11.0,502.0,58.0,615.0,797.0,77.2,9211.0,4552.0,351.0,394.0,89.1,203.0,265.0,76.6,37.0,79.0,46.8,1.0,0.0,8.0,20.0,7.0,2.0,628,165,7,0,3,26,158,0,0,0,0,4,23,14,0.9,13,0,0,1,0,0,0,0.0,0,0,0,0,0,0,25.0,19.0,15.0,9.0,1.0,17.0,34.0,50.0,23.0,5.0,18.0,15.0,40.0,62.0,0.0,0.0,16.0,2.0,0.0,1.0,0.0,79.0,14.0,58.3


In [21]:
b5_outfield_df.loc[b5_outfield_df['Comp'] =='eng Premier League', 'League'] = 'Premier League'
b5_outfield_df.loc[b5_outfield_df['Comp'] =='es La Liga', 'League'] = 'La Liga'
b5_outfield_df.loc[b5_outfield_df['Comp'] =='de Bundesliga', 'League'] = 'Bundesliga'
b5_outfield_df.loc[b5_outfield_df['Comp'] =='fr Ligue 1', 'League'] = 'Ligue 1'
b5_outfield_df.loc[b5_outfield_df['Comp'] =='it Serie A', 'League'] = 'Serie A'


In [24]:
b5_outfield_df.head()

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,1237,13.7,0,1,1,0,0,0,1,0,0.0,0.0,0.8,0.9,22,43,26,0.0,0.07,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,,,,,,,,,,,,,711.0,43.0,252.0,303.0,165.0,11.0,711.0,34.0,14.0,41.2,12.0,35.3,364.0,2174.0,1121.0,22.0,12.0,7.0,13.0,8.0,371.0,26.0,450.0,581.0,77.5,7402.0,2789.0,220.0,248.0,88.7,188.0,235.0,80.0,34.0,63.0,54.0,0.9,0.2,7.0,25.0,13.0,2.0,453,127,11,2,3,13,116,0,0,0,0,1,23,23,1.68,16,4,0,0,3,0,2,0.15,2,0,0,0,0,0,29.0,19.0,20.0,7.0,2.0,20.0,34.0,58.8,9.0,5.0,4.0,8.0,37.0,27.0,0.0,0.0,12.0,2.0,0.0,1.0,0.0,75.0,5.0,31.3,Premier League
1,Brenden Aaronson,us USA,"MF,FW",Union Berlin,22,2000,30,14,1267,14.1,2,2,4,2,0,0,3,1,2.0,2.0,1.9,3.8,37,56,91,0.14,0.14,0.28,0.14,0.28,0.14,0.13,0.27,0.14,0.27,18.0,7.0,38.9,1.28,0.5,0.11,0.29,18.4,0.0,0.11,0.0,0.0,675.0,11.0,108.0,301.0,293.0,47.0,675.0,77.0,34.0,44.2,41.0,53.2,406.0,2721.0,1387.0,37.0,29.0,9.0,41.0,38.0,457.0,91.0,365.0,472.0,77.3,4890.0,1506.0,206.0,240.0,85.8,105.0,130.0,80.8,19.0,32.0,59.4,2.0,0.1,22.0,30.0,14.0,3.0,439,29,3,5,1,22,12,6,2,3,0,4,21,53,3.76,41,1,8,3,0,0,8,0.57,6,0,2,0,0,0,32.0,18.0,13.0,13.0,6.0,16.0,32.0,50.0,26.0,1.0,25.0,2.0,34.0,4.0,0.0,1.0,15.0,5.0,0.0,0.0,0.0,88.0,13.0,44.8,Bundesliga
2,Paxten Aaronson,us USA,MF,Eint Frankfurt,19,2003,7,1,101,1.1,0,1,1,0,0,0,0,0,0.1,0.1,0.1,0.2,2,5,7,0.0,0.89,0.89,0.0,0.89,0.11,0.07,0.19,0.11,0.19,2.0,2.0,100.0,1.78,1.78,0.0,0.0,15.1,0.0,0.06,-0.1,-0.1,72.0,0.0,6.0,39.0,28.0,5.0,72.0,7.0,2.0,28.6,4.0,57.1,43.0,193.0,50.0,2.0,0.0,1.0,5.0,4.0,46.0,7.0,41.0,50.0,82.0,576.0,71.0,20.0,25.0,80.0,20.0,22.0,90.9,0.0,2.0,0.0,0.1,0.9,1.0,4.0,2.0,0.0,48,2,1,0,0,0,1,0,0,0,0,0,0,1,0.89,1,0,0,0,0,0,1,0.89,1,0,0,0,0,0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,100.0,2.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,3.0,100.0,Bundesliga
3,Keyliane Abdallah,fr FRA,FW,Marseille,17,2006,1,0,4,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ligue 1
4,Yunis Abdelhamid,ma MAR,DF,Reims,35,1987,31,31,2781,30.9,4,0,4,3,1,1,5,0,3.4,2.6,0.3,2.9,36,137,9,0.13,0.0,0.13,0.1,0.1,0.11,0.01,0.12,0.09,0.09,21.0,7.0,33.3,0.68,0.23,0.14,0.43,15.0,0.0,0.13,0.6,0.4,2185.0,293.0,976.0,1119.0,114.0,35.0,2184.0,15.0,8.0,53.3,7.0,46.7,1506.0,8663.0,4921.0,36.0,19.0,0.0,23.0,4.0,1403.0,9.0,1552.0,1836.0,84.5,29618.0,9672.0,487.0,548.0,88.9,893.0,976.0,91.5,141.0,252.0,56.0,0.6,-0.3,8.0,129.0,3.0,0.0,1650,178,75,2,13,3,29,0,0,0,0,8,20,24,0.78,19,1,0,3,1,0,1,0.03,0,0,0,1,0,0,64.0,35.0,36.0,23.0,5.0,26.0,45.0,57.8,51.0,32.0,19.0,39.0,103.0,109.0,2.0,0.0,26.0,0.0,0.0,0.0,1.0,149.0,61.0,62.2,Ligue 1


In [23]:
b5_outfield_df = b5_outfield_df.drop('Comp', axis=1)

## EFL Championship

### Standard Stats

In [25]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/stats/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_standard' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_standard')

        championship_stand_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_stand_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [26]:

# Flattening multiIndex columns
championship_stand_df.columns = ['_'.join(col) for col in championship_stand_df.columns.values]

# Dropping useless columns
championship_stand_df = championship_stand_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 36_level_0_Matches'], axis=1)

# Renaming player ID columns
championship_stand_df = championship_stand_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_stand_df = championship_stand_df.rename(columns={'Playing Time_MP':'MP',
                                        'Playing Time_Starts':'Starts', 
                                        'Playing Time_Min':'Min', 
                                        'Playing Time_90s':'No_90s',
                                        'Performance_Gls':'Gls', 
                                        'Performance_Ast':'Ast',
                                        'Performance_G+A':'G+A',
                                        'Performance_G-PK':'G-PK', 
                                        'Performance_PK':'PK', 
                                        'Performance_PKatt':'PKatt',
                                        'Performance_CrdY':'CrdY',
                                        'Performance_CrdR':'CrdR',
                                        'Expected_xG':'xG', 
                                        'Expected_npxG':'npxG',
                                        'Expected_xAG':'xAG', 
                                        'Expected_npxG+xAG':'npxG+xAG',
                                        'Progression_PrgC':'Prg_Carr',
                                        'Progression_PrgP':'Prg_Pass', 
                                        'Progression_PrgR':'Prg_Pass_Rec', 
                                        'Per 90 Minutes_Gls':'Gls_90',
                                        'Per 90 Minutes_Ast':'Ast_90', 
                                        'Per 90 Minutes_G+A':'G+A_90', 
                                        'Per 90 Minutes_G-PK':'G-PK_90',
                                        'Per 90 Minutes_G+A-PK':'G+A-PK_90', 
                                        'Per 90 Minutes_xG':'xG_90', 
                                        'Per 90 Minutes_xAG':'xAG_90',
                                        'Per 90 Minutes_xG+xAG':'xG+xAG_90', 
                                        'Per 90 Minutes_npxG':'npxG_90',
                                        'Per 90 Minutes_npxG+xAG':'npxG+xAG_90'})


In [None]:
#Examining the Players that have null values

championship_stand_df[championship_stand_df.isnull().any(axis=1)]

In [27]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_stand_df.loc[(championship_stand_df['Player']== 'Bobby Thomas') & (championship_stand_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_stand_df.loc[championship_stand_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_stand_df = pd.concat([championship_stand_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [28]:
# Dropping rows with NANs
championship_stand_df = championship_stand_df.dropna(axis=0, how='any')


# Removing column header rows
championship_stand_df = championship_stand_df[championship_stand_df["Player"] != "Player"]


# Applying datatypes to variables
convert_dict = {'No_90s':float,
                'Gls_90':float,
                'Ast_90':float,
                'G+A_90':float,
                'G-PK_90':float,
                'G+A_90':float,
                'G+A-PK_90':float,
                'xG':float,
                'npxG':float,
                'xAG':float,
                'npxG+xAG':float,
                'xG_90':float,
                'xAG_90':float,
                'xG+xAG_90':float,
                'npxG_90':float,
                'npxG+xAG_90':float,
                
                'MP':int, 
                'Starts':int, 
                'Min':int, 
                'Gls':int, 
                'Ast':int, 
                'G+A':int, 
                'G-PK':int, 
                'PK':int, 
                'PKatt':int,
                'CrdY':int,
                'CrdR':int,
                'Prg_Carr':int,
                'Prg_Pass':int,
                'Prg_Pass_Rec':int
    }

championship_stand_df = championship_stand_df.astype(convert_dict)

### Shooting

In [29]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/shooting/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_shooting' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_shooting')

        championship_shoot_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_shoot_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [30]:
# Flattening multiIndex columns
championship_shoot_df.columns = ['_'.join(col) for col in championship_shoot_df.columns.values]

# Dropping useless columns
championship_shoot_df = championship_shoot_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 25_level_0_Matches','Standard_Gls','Standard_PK',
                              'Standard_PKatt', 'Expected_xG', 'Expected_npxG'], axis=1)

# Renaming player ID columns
championship_shoot_df = championship_shoot_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_shoot_df = championship_shoot_df.rename(columns={'Standard_Sh':'Shots', 
                                        'Standard_SoT':'SoT',
                                        'Standard_SoT%':'SoT_pct', 
                                        'Standard_Sh/90':'Shots_90', 
                                        'Standard_SoT/90':'SoT_90', 
                                        'Standard_G/Sh':'Gls_per_Sh',
                                        'Standard_G/SoT':'Gls_per_SoT', 
                                        'Standard_Dist':'Avg_Sh_Dist', 
                                        'Standard_FK':'Sh_FK', 
                                        'Expected_npxG/Sh':'npxG_per_Sh',
                                        'Expected_G-xG':'G-xG', 
                                        'Expected_np:G-xG':'npG-npxG'})

In [31]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_shoot_df.loc[(championship_shoot_df['Player']== 'Bobby Thomas') & (championship_shoot_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_shoot_df.loc[championship_shoot_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_shoot_df = pd.concat([championship_shoot_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [32]:
# Dropping rows with NANs
championship_shoot_df = championship_shoot_df.dropna(axis=0, how='any')


# Removing column header rows
championship_shoot_df = championship_shoot_df[championship_shoot_df["Player"] != "Player"]

convert_dict = {
    'Shots':int,
    'SoT':int,
    'SoT_pct':float,
    'Shots_90':float,
    'SoT_90':float,
    'Gls_per_Sh':float,
    'Gls_per_SoT':float,
    'Avg_Sh_Dist':float,
    'Sh_FK':int,
    'npxG_per_Sh':float,
    'G-xG':float,
    'npG-npxG':float
    }

championship_shoot_df = championship_shoot_df.astype(convert_dict)

### Possession

In [33]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/possession/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_possession' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_possession')

        championship_poss_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_poss_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [34]:
# Flattening multiIndex columns
championship_poss_df.columns = ['_'.join(col) for col in championship_poss_df.columns.values]

# Dropping useless columns
championship_poss_df = championship_poss_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 30_level_0_Matches'], axis=1)

# Renaming player ID columns
championship_poss_df = championship_poss_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_poss_df = championship_poss_df.rename(columns={'Touches_Touches':'Touches','Carries_Carries':'Carries'})

In [35]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_poss_df.loc[(championship_poss_df['Player']== 'Bobby Thomas') & (championship_poss_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_poss_df.loc[championship_poss_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_poss_df = pd.concat([championship_poss_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [36]:
# Dropping rows with NANs
championship_poss_df = championship_poss_df.dropna(axis=0, how='any')


# Removing column header rows
championship_poss_df = championship_poss_df[championship_poss_df["Player"] != "Player"]

convert_dict = {
                'Touches':int, 
                'Touches_Def Pen':int, 
                'Touches_Def 3rd':int, 
                'Touches_Mid 3rd':int,
                'Touches_Att 3rd':int, 
                'Touches_Att Pen':int, 
                'Touches_Live':int, 
                'Take-Ons_Att':int,
                'Take-Ons_Succ':int, 
                'Take-Ons_Succ%':float, 
                'Take-Ons_Tkld':int, 
                'Take-Ons_Tkld%':float,
                'Carries':int, 
                'Carries_TotDist':int, 
                'Carries_PrgDist':int, 
                'Carries_PrgC':int,
                'Carries_1/3':int, 
                'Carries_CPA':int, 
                'Carries_Mis':int, 
                'Carries_Dis':int,
                'Receiving_Rec':int, 
                'Receiving_PrgR':int
    }

championship_poss_df = championship_poss_df.astype(convert_dict)

### Passing

In [37]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/passing/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing')

        championship_pass_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_pass_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [38]:
# Flattening multiIndex columns
championship_pass_df.columns = ['_'.join(col) for col in championship_pass_df.columns.values]

# Dropping useless columns
championship_pass_df = championship_pass_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 22_level_0_Ast',
                                                  'Unnamed: 31_level_0_Matches','Unnamed: 23_level_0_xAG','Unnamed: 30_level_0_PrgP'], axis=1)

# Renaming player ID columns
championship_pass_df = championship_pass_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_pass_df = championship_pass_df.rename(columns={'Expected_xA':'xA',
                                      'Expected_A-xAG':'A-xAG',
                                      'Unnamed: 26_level_0_KP':'KP',
                                      'Unnamed: 27_level_0_1/3':'Pass_Fin_3rd',
                                      'Unnamed: 28_level_0_PPA':'Pass_Pen_Area',
                                      'Unnamed: 29_level_0_CrsPA':'Cross_Pen_Area'})

In [39]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_pass_df.loc[(championship_pass_df['Player']== 'Bobby Thomas') & (championship_pass_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_pass_df.loc[championship_pass_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_pass_df = pd.concat([championship_pass_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [40]:
# Dropping rows with NANs
championship_pass_df = championship_pass_df.dropna(axis=0, how='any')


# Removing column header rows
championship_pass_df = championship_pass_df[championship_pass_df["Player"] != "Player"]

convert_dict = {
                'Total_Cmp':int, 
                'Total_Att':int, 
                'Total_Cmp%':float, 
                'Total_TotDist':int,
                'Total_PrgDist':int, 
                'Short_Cmp':int, 
                'Short_Att':int, 
                'Short_Cmp%':float, 
                'Medium_Cmp':int,
                'Medium_Att':int, 
                'Medium_Cmp%':float, 
                'Long_Cmp':int, 
                'Long_Att':int, 
                'Long_Cmp%':float, 
                'xA':float,
                'A-xAG':float, 
                'KP':int, 
                'Pass_Fin_3rd':int, 
                'Pass_Pen_Area':int, 
                'Cross_Pen_Area':int
    }

championship_pass_df = championship_pass_df.astype(convert_dict)

### Passing Types

In [41]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/passing_types/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing_types' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing_types')

        championship_ptype_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_ptype_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [42]:
# Flattening multiIndex columns
championship_ptype_df.columns = ['_'.join(col) for col in championship_ptype_df.columns.values]

# Dropping useless columns
championship_ptype_df = championship_ptype_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 23_level_0_Matches','Outcomes_Cmp',
                                                    'Unnamed: 8_level_0_Att'], axis=1)

# Renaming player ID columns
championship_ptype_df = championship_ptype_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_ptype_df = championship_ptype_df.rename(columns={'Outcomes_Blocks':'Pass_Blocked',
                                      'Outcomes_Off':'Pass_Offsides'})

In [43]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_ptype_df.loc[(championship_ptype_df['Player']== 'Bobby Thomas') & (championship_ptype_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_ptype_df.loc[championship_ptype_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_ptype_df = pd.concat([championship_ptype_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [44]:
# Dropping rows with NANs
championship_ptype_df = championship_ptype_df.dropna(axis=0, how='any')


# Removing column header rows
championship_ptype_df = championship_ptype_df[championship_ptype_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Pass Types_Live':int, 
                'Pass Types_Dead':int, 
                'Pass Types_FK':int, 
                'Pass Types_TB':int,
                'Pass Types_Sw':int, 
                'Pass Types_Crs':int, 
                'Pass Types_TI':int, 
                'Pass Types_CK':int,
                'Corner Kicks_In':int, 
                'Corner Kicks_Out':int, 
                'Corner Kicks_Str':int,
                'Pass_Offsides':int, 
                'Pass_Blocked':int  
}
championship_ptype_df = championship_ptype_df.astype(convert_dict)

### Goal and Shot Creation (GCA)

In [45]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/gca/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_gca' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_gca')

        championship_gca_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_gca_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [46]:
# Flattening multiIndex columns
championship_gca_df.columns = ['_'.join(col) for col in championship_gca_df.columns.values]

# Dropping useless columns
championship_gca_df = championship_gca_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches'], axis=1)

# Renaming player ID columns
championship_gca_df = championship_gca_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_gca_df = championship_gca_df.rename(columns={'SCA_SCA':'SCA',
                                    'SCA_SCA90':'SCA_90',
                                    'SCA_Types_PassLive':'SCA_PassLive',
                                    'SCA_Types_PassDead':'SCA_PassDead',
                                    'SCA_Types_TO':'SCA_TO',
                                    'SCA_Types_Sh':'SCA_Shot',
                                    'SCA_Types_Fld':'SCA_Fouls_Drawn',
                                    'SCA_Types_Def':'SCA_Def_Action',
                                    'GCA_GCA':'GCA',
                                    'GCA_GCA90':'GCA_90',
                                    'GCA_Types_PassLive':'GCA_PassLive',
                                    'GCA_Types_PassDead':'GCA_PassDead',
                                    'GCA_Types_TO':'GCA_TO',
                                    'GCA_Types_Sh':'GCA_Shot',
                                    'GCA_Types_Fld':'GCA_Fouls_Drawn',
                                    'GCA_Types_Def':'GCA_Def_Action'})

In [47]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_gca_df.loc[(championship_gca_df['Player']== 'Bobby Thomas') & (championship_gca_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_gca_df.loc[championship_gca_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_gca_df = pd.concat([championship_gca_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [48]:
# Dropping rows with NANs
championship_gca_df = championship_gca_df.dropna(axis=0, how='any')


# Removing column header rows
championship_gca_df = championship_gca_df[championship_gca_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'SCA':int,
                'SCA_90':float, 
                'SCA Types_PassLive':int, 
                'SCA Types_PassDead':int, 
                'SCA Types_TO':int,
                'SCA Types_Sh':int, 
                'SCA Types_Fld':int, 
                'SCA Types_Def':int, 
                'GCA':int, 
                'GCA_90':float,
                'GCA Types_PassLive':int, 
                'GCA Types_PassDead':int, 
                'GCA Types_TO':int,
                'GCA Types_Sh':int, 
                'GCA Types_Fld':int, 
                'GCA Types_Def':int  
}
championship_gca_df = championship_gca_df.astype(convert_dict)

### Defensive Actions

In [49]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/defense/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_defense' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_defense')

        championship_def_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_def_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [50]:
# Flattening multiIndex columns
championship_def_df.columns = ['_'.join(col) for col in championship_def_df.columns.values]

# Dropping useless columns
championship_def_df = championship_def_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches', 'Challenges_Lost'], axis=1)

# Renaming player ID columns
championship_def_df = championship_def_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_def_df = championship_def_df.rename(columns={'Tackles_Tkl':'Tkls',
                                    'Tackles_TklW':'Tkls_Won',
                                    'Tackles_Def_3rd':'Tkl_Def_3rd',
                                    'Tackles_Mid_3rd':'Tkl_Mid_3rd',
                                    'Tackles_Att_3rd':'Tkl_Att_3rd',
                                    'Challenges_Tkl':'Drib_Tkl',
                                    'Challenges_Att':'Drib_Tkl_Att',
                                    'Challenges_Tkl%':'Drib_Tkl%',
                                    'Blocks_Blocks':'Def_Blocks',
                                    'Blocks_Sh':'Def_Shot_Blocks',
                                    'Blocks_Pass':'Def_Pass_Blocks',
                                    'Unnamed: 20_level_0_Int':'Int',
                                    'Unnamed: 21_level_0_Tkl+Int':'Tkl+Int',
                                    'Unnamed: 22_level_0_Clr':'Clearances',
                                    'Unnamed: 23_level_0_Err':'Errors'})

In [51]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_def_df.loc[(championship_def_df['Player']== 'Bobby Thomas') & (championship_def_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_def_df.loc[championship_def_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_def_df = pd.concat([championship_def_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [52]:
# Dropping rows with NANs
championship_def_df = championship_def_df.dropna(axis=0, how='any')


# Removing column header rows
championship_def_df = championship_def_df[championship_def_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Tkls':int,
                'Tkls_Won':int, 
                'Tackles_Def 3rd':int, 
                'Tackles_Mid 3rd':int, 
                'Tackles_Att 3rd':int,
                'Drib_Tkl':int, 
                'Drib_Tkl_Att':int, 
                'Drib_Tkl%':float, 
                'Def_Blocks':int,
                'Def_Shot_Blocks':int, 
                'Def_Pass_Blocks':int, 
                'Int':int, 
                'Tkl+Int':int, 
                'Clearances':int,
                'Errors':int 
}
championship_def_df = championship_def_df.astype(convert_dict)

### Miscellaneous Stats

In [53]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/10/misc/Championship-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_misc' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_misc')

        championship_misc_df = pd.read_html(StringIO(str(div)))[0]
        print(championship_misc_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1      Azeem Abdulai            sct SCO   
1                    2           Ken Aboh            eng ENG   
2                    3          Che Adams            sct SCO   
3                    4         Ebou Adams             gm GAM   
4                    5      Albert Adomah             gh GHA   
..                 ...                ...                ...   
759                731       Charlie Wyke            eng ENG   
760                732        Jerry Yates            eng ENG   
761                733     Ephraim Yeboah             gh GHA   
762                734       Okay Yokuşlu             tr TUR   
763                735      Anass Zaroury             ma MAR   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0                   MF       Swansea City

In [54]:
# Flattening multiIndex columns
championship_misc_df.columns = ['_'.join(col) for col in championship_misc_df.columns.values]

# Dropping useless columns
championship_misc_df = championship_misc_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches','Aerial Duels_Lost',
                         'Performance_CrdY','Performance_CrdR','Performance_Crs','Performance_Int','Performance_TklW',
                         'Performance_Fld'], axis=1)

# Renaming player ID columns
championship_misc_df = championship_misc_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
championship_misc_df = championship_misc_df.rename(columns={'Performance_2CrdY':'CrdY2',
                                    'Performance_Fls':'Fls_Comm',
                                    'Performance_Off':'Offsides',
                                    'Performance_PKwon':'PK_Won',
                                    'Performance_PKcon':'PK_Conv',
                                    'Performance_OG':'Own_Goal',
                                    'Performance_Recov':'Ball_Recoveries'})

In [55]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Bobby Thomas - Coventry City
bobby_thomas = championship_misc_df.loc[(championship_misc_df['Player']== 'Bobby Thomas') & (championship_misc_df['Squad']=='Coventry City')]
bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)

# Peter Kioso - Rotherham United
peter_kioso = championship_misc_df.loc[championship_misc_df['Player']== 'Peter Kioso']
peter_kioso.Nation = peter_kioso.Nation.fillna('ie IRL')

nan_players = pd.concat([bobby_thomas, peter_kioso], ignore_index=True)

championship_misc_df = pd.concat([championship_misc_df, nan_players], ignore_index=True)


  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Age = bobby_thomas.Age.fillna(23).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bobby_thomas.Born = bobby_thomas.Born.fillna('2000').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peter_kioso.Nation =

In [56]:
# Dropping rows with NANs
championship_misc_df = championship_misc_df.dropna(axis=0, how='any')


# Removing column header rows
championship_misc_df = championship_misc_df[championship_misc_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'CrdY2':int,
                'Fls_Comm':int, 
                'Offsides':int, 
                'PK_Won':int, 
                'PK_Conv':int, 
                'Own_Goal':int,
                'Ball_Recoveries':int, 
                'Aerial Duels_Won':int, 
                'Aerial Duels_Won%':float 
}
championship_misc_df = championship_misc_df.astype(convert_dict)

### Merging Stat Group datasets

In [57]:
championship_conditions_join = ['Player','Nation','Position','Squad','Age','Born']

championship_outfield_df = pd.merge(championship_stand_df, championship_shoot_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')

championship_outfield_df = pd.merge(championship_outfield_df, championship_poss_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')

championship_outfield_df = pd.merge(championship_outfield_df, championship_pass_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')

championship_outfield_df = pd.merge(championship_outfield_df, championship_ptype_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')

championship_outfield_df = pd.merge(championship_outfield_df, championship_gca_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')

championship_outfield_df = pd.merge(championship_outfield_df, championship_def_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')

championship_outfield_df = pd.merge(championship_outfield_df, championship_misc_df, left_on=championship_conditions_join, right_on=championship_conditions_join,
                          how='left')


In [60]:
championship_outfield_df.head(25)

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Ken Aboh,eng ENG,MF,Norwich City,18,2004,1,0,10,0.1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,EFL Championship
1,Che Adams,sct SCO,FW,Southampton,27,1996,40,25,2289,25.4,16,4,20,16,0,0,5,0,13.6,13.6,3.6,17.2,21,88,150,0.63,0.16,0.79,0.63,0.79,0.53,0.14,0.68,0.53,0.68,80.0,31.0,38.8,3.15,1.22,0.2,0.52,16.4,0.0,0.17,2.4,2.4,921.0,18.0,70.0,470.0,393.0,126.0,921.0,34.0,19.0,55.9,12.0,35.3,575.0,2126.0,737.0,21.0,16.0,9.0,58.0,20.0,747.0,150.0,516.0,678.0,76.1,8988.0,1974.0,238.0,279.0,85.3,193.0,242.0,79.8,65.0,88.0,73.9,3.5,0.4,31.0,50.0,24.0,0.0,633,36,8,10,10,7,9,0,0,0,0,9,25,66,2.59,53,0,1,10,2,0,11,0.43,9,0,0,2,0,0,19.0,15.0,3.0,9.0,7.0,8.0,12.0,66.7,18.0,7.0,11.0,5.0,24.0,11.0,0.0,0.0,35.0,18.0,0.0,0.0,0.0,60.0,20.0,40.8,EFL Championship
2,Ebou Adams,gm GAM,"MF,DF",Cardiff City,27,1996,11,3,310,3.4,0,0,0,0,0,0,4,0,0.0,0.0,0.0,0.0,2,5,0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.01,,,,,,,,,,,,,215.0,20.0,92.0,115.0,8.0,3.0,215.0,1.0,0.0,0.0,1.0,100.0,112.0,409.0,118.0,2.0,1.0,2.0,7.0,3.0,112.0,0.0,113.0,155.0,72.9,2079.0,576.0,49.0,59.0,83.1,50.0,62.0,80.6,14.0,27.0,51.9,0.0,0.0,0.0,9.0,0.0,0.0,149,6,0,0,0,1,6,0,0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,16.0,12.0,11.0,4.0,1.0,9.0,11.0,81.8,3.0,0.0,3.0,5.0,21.0,18.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,23.0,15.0,93.8,EFL Championship
3,Albert Adomah,gh GHA,"DF,FW",QPR,35,1987,16,2,370,4.1,0,0,0,0,0,0,2,0,0.1,0.1,0.4,0.6,10,1,26,0.0,0.0,0.0,0.0,0.0,0.04,0.1,0.13,0.04,0.13,,,,,,,,,,,,,175.0,9.0,54.0,55.0,70.0,10.0,175.0,9.0,3.0,33.3,5.0,55.6,79.0,470.0,262.0,10.0,4.0,0.0,12.0,7.0,78.0,26.0,60.0,114.0,52.6,920.0,377.0,37.0,45.0,82.2,18.0,36.0,50.0,4.0,18.0,22.2,0.6,-0.4,5.0,4.0,2.0,2.0,96,13,2,0,1,15,11,0,0,0,0,5,7,6,1.46,5,0,0,1,0,0,0,0.0,0,0,0,0,0,0,20.0,9.0,10.0,7.0,3.0,6.0,15.0,40.0,5.0,0.0,5.0,3.0,23.0,8.0,0.0,0.0,7.0,1.0,0.0,0.0,1.0,23.0,3.0,25.0,EFL Championship
4,Emanuel Aiwum,at AUT,DF,Birmingham City,22,2000,24,22,1941,21.6,0,0,0,0,0,0,4,0,0.7,0.7,0.2,0.9,3,50,3,0.0,0.0,0.0,0.0,0.0,0.03,0.01,0.04,0.03,0.04,5.0,2.0,40.0,0.23,0.09,0.0,0.0,9.5,0.0,0.14,-0.7,-0.7,1159.0,163.0,637.0,487.0,39.0,12.0,1159.0,10.0,4.0,40.0,5.0,50.0,573.0,2605.0,1166.0,3.0,2.0,0.0,10.0,3.0,668.0,3.0,774.0,921.0,84.0,14518.0,4640.0,280.0,313.0,89.5,405.0,457.0,88.6,75.0,125.0,60.0,0.2,-0.2,3.0,36.0,2.0,1.0,855,64,24,0,4,3,38,0,0,0,0,2,7,14,0.65,11,0,0,2,0,1,1,0.05,1,0,0,0,0,0,45.0,27.0,26.0,17.0,2.0,28.0,34.0,82.4,44.0,27.0,17.0,20.0,65.0,85.0,2.0,0.0,14.0,0.0,0.0,0.0,0.0,83.0,33.0,49.3,EFL Championship
5,Semi Ajayi,ng NGA,DF,West Brom,29,1993,26,10,1210,13.4,2,1,3,2,0,0,1,0,0.9,0.9,0.2,1.1,21,44,2,0.15,0.07,0.22,0.15,0.22,0.07,0.01,0.08,0.07,0.08,6.0,2.0,33.3,0.45,0.15,0.33,1.0,8.4,0.0,0.16,1.1,1.1,893.0,82.0,396.0,427.0,75.0,15.0,893.0,15.0,9.0,60.0,5.0,33.3,502.0,2701.0,1635.0,21.0,16.0,0.0,13.0,8.0,623.0,2.0,634.0,785.0,80.8,13163.0,4639.0,195.0,220.0,88.6,339.0,396.0,85.6,98.0,150.0,65.3,0.2,0.8,5.0,54.0,2.0,1.0,746,36,21,1,18,9,11,0,0,0,0,3,7,10,0.74,10,0,0,0,0,0,0,0.0,0,0,0,0,0,0,19.0,12.0,11.0,6.0,2.0,9.0,15.0,60.0,9.0,5.0,4.0,10.0,29.0,30.0,1.0,0.0,9.0,1.0,0.0,0.0,0.0,60.0,35.0,55.6,EFL Championship
6,Yunus Akgün,tr TUR,"MF,FW",Leicester City,23,2000,23,9,890,9.9,1,2,3,1,0,0,1,0,1.2,1.2,1.6,2.8,24,27,80,0.1,0.2,0.3,0.1,0.3,0.12,0.16,0.28,0.12,0.28,16.0,3.0,18.8,1.62,0.3,0.06,0.33,16.3,0.0,0.08,-0.2,-0.2,513.0,0.0,49.0,226.0,243.0,38.0,513.0,17.0,8.0,47.1,7.0,41.2,337.0,1698.0,733.0,24.0,13.0,8.0,20.0,13.0,391.0,80.0,370.0,426.0,86.9,5499.0,910.0,202.0,226.0,89.4,143.0,157.0,91.1,14.0,25.0,56.0,1.6,0.4,15.0,21.0,7.0,1.0,410,15,4,2,1,17,1,10,5,4,0,1,6,32,3.24,23,3,2,1,2,1,4,0.4,4,0,0,0,0,0,16.0,10.0,5.0,7.0,4.0,4.0,11.0,36.4,10.0,0.0,10.0,6.0,22.0,3.0,0.0,0.0,12.0,1.0,0.0,0.0,0.0,49.0,4.0,25.0,EFL Championship
7,Marc Albrighton,eng ENG,FW,Leicester City,33,1989,12,0,129,1.4,0,0,0,0,0,0,0,0,0.0,0.0,0.8,0.8,6,8,20,0.0,0.0,0.0,0.0,0.0,0.03,0.53,0.56,0.03,0.56,,,,,,,,,,,,,123.0,2.0,15.0,43.0,67.0,5.0,123.0,4.0,1.0,25.0,3.0,75.0,77.0,317.0,132.0,6.0,2.0,0.0,5.0,2.0,84.0,20.0,64.0,103.0,62.1,1196.0,448.0,33.0,39.0,84.6,19.0,32.0,59.4,10.0,28.0,35.7,0.6,-0.8,7.0,8.0,6.0,3.0,89,14,1,0,0,27,4,9,1,6,0,0,2,10,7.03,6,3,0,0,0,1,0,0.0,0,0,0,0,0,0,5.0,4.0,1.0,3.0,1.0,3.0,5.0,60.0,2.0,0.0,2.0,2.0,7.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,100.0,EFL Championship
8,Carlos Alcaraz,ar ARG,"MF,FW",Southampton,20,2002,23,13,1102,12.2,3,1,4,3,0,1,4,0,4.4,3.6,1.1,4.6,28,78,74,0.25,0.08,0.33,0.25,0.33,0.36,0.09,0.44,0.29,0.38,40.0,16.0,40.0,3.27,1.31,0.08,0.19,19.5,3.0,0.09,-1.4,-0.6,808.0,19.0,74.0,445.0,301.0,37.0,807.0,47.0,23.0,48.9,23.0,48.9,513.0,2572.0,1219.0,28.0,28.0,6.0,35.0,17.0,628.0,74.0,508.0,616.0,82.5,7850.0,1771.0,276.0,314.0,87.9,171.0,202.0,84.7,42.0,61.0,68.9,1.7,-0.1,17.0,51.0,23.0,1.0,601,14,5,5,9,5,7,1,0,0,0,1,15,47,3.83,36,0,6,3,1,1,4,0.33,4,0,0,0,0,0,29.0,18.0,6.0,15.0,8.0,15.0,28.0,53.6,21.0,2.0,19.0,6.0,35.0,14.0,0.0,0.0,8.0,3.0,0.0,0.0,0.0,60.0,18.0,58.1,EFL Championship
9,Ajibola Alese,eng ENG,DF,Sunderland,22,2001,8,5,462,5.1,0,0,0,0,0,0,2,0,0.0,0.0,0.8,0.8,7,17,18,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.16,0.0,0.16,,,,,,,,,,,,,371.0,14.0,140.0,157.0,75.0,11.0,371.0,4.0,1.0,25.0,1.0,25.0,167.0,630.0,328.0,7.0,5.0,3.0,3.0,1.0,213.0,18.0,255.0,324.0,78.7,3806.0,1241.0,130.0,149.0,87.2,107.0,129.0,82.9,10.0,26.0,38.5,0.8,-0.8,7.0,9.0,1.0,0.0,260,64,6,0,0,4,57,0,0,0,0,0,7,11,2.14,8,0,0,1,0,2,0,0.0,0,0,0,0,0,0,10.0,9.0,6.0,3.0,1.0,4.0,7.0,57.1,7.0,1.0,6.0,6.0,16.0,13.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,18.0,9.0,56.3,EFL Championship


In [59]:
championship_outfield_df['League'] = 'EFL Championship'

# championship_outfield_df[championship_outfield_df['League']] == 'EFL Championship'

## Primeira Liga

### Standard Stats

In [61]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/stats/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_standard' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_standard')

        primeiraliga_stand_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_stand_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [62]:

# Flattening multiIndex columns
primeiraliga_stand_df.columns = ['_'.join(col) for col in primeiraliga_stand_df.columns.values]

# Dropping useless columns
primeiraliga_stand_df = primeiraliga_stand_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 36_level_0_Matches'], axis=1)

# Renaming player ID columns
primeiraliga_stand_df = primeiraliga_stand_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_stand_df = primeiraliga_stand_df.rename(columns={'Playing Time_MP':'MP',
                                        'Playing Time_Starts':'Starts', 
                                        'Playing Time_Min':'Min', 
                                        'Playing Time_90s':'No_90s',
                                        'Performance_Gls':'Gls', 
                                        'Performance_Ast':'Ast',
                                        'Performance_G+A':'G+A',
                                        'Performance_G-PK':'G-PK', 
                                        'Performance_PK':'PK', 
                                        'Performance_PKatt':'PKatt',
                                        'Performance_CrdY':'CrdY',
                                        'Performance_CrdR':'CrdR',
                                        'Expected_xG':'xG', 
                                        'Expected_npxG':'npxG',
                                        'Expected_xAG':'xAG', 
                                        'Expected_npxG+xAG':'npxG+xAG',
                                        'Progression_PrgC':'Prg_Carr',
                                        'Progression_PrgP':'Prg_Pass', 
                                        'Progression_PrgR':'Prg_Pass_Rec', 
                                        'Per 90 Minutes_Gls':'Gls_90',
                                        'Per 90 Minutes_Ast':'Ast_90', 
                                        'Per 90 Minutes_G+A':'G+A_90', 
                                        'Per 90 Minutes_G-PK':'G-PK_90',
                                        'Per 90 Minutes_G+A-PK':'G+A-PK_90', 
                                        'Per 90 Minutes_xG':'xG_90', 
                                        'Per 90 Minutes_xAG':'xAG_90',
                                        'Per 90 Minutes_xG+xAG':'xG+xAG_90', 
                                        'Per 90 Minutes_npxG':'npxG_90',
                                        'Per 90 Minutes_npxG+xAG':'npxG+xAG_90'})


In [None]:
#Examining the Players that have null values

primeiraliga_stand_df[primeiraliga_stand_df.isnull().any(axis=1)]

In [None]:
wagner_pina.info()

In [63]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Estoril
wagner_pina = primeiraliga_stand_df.loc[primeiraliga_stand_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

#nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_stand_df = pd.concat([primeiraliga_stand_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [None]:
primeiraliga_stand_df.tail()

In [64]:
# Dropping rows with NANs
primeiraliga_stand_df = primeiraliga_stand_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_stand_df = primeiraliga_stand_df[primeiraliga_stand_df["Player"] != "Player"]

convert_dict = {'No_90s':float,
                'Gls_90':float,
                'Ast_90':float,
                'G+A_90':float,
                'G-PK_90':float,
                'G+A_90':float,
                'G+A-PK_90':float,
                'xG':float,
                'npxG':float,
                'xAG':float,
                'npxG+xAG':float,
                'xG_90':float,
                'xAG_90':float,
                'xG+xAG_90':float,
                'npxG_90':float,
                'npxG+xAG_90':float,
                
                'MP':int, 
                'Starts':int, 
                'Min':int, 
                'Gls':int, 
                'Ast':int, 
                'G+A':int, 
                'G-PK':int, 
                'PK':int, 
                'PKatt':int,
                'CrdY':int,
                'CrdR':int,
                'Prg_Carr':int,
                'Prg_Pass':int,
                'Prg_Pass_Rec':int
    }

primeiraliga_stand_df = primeiraliga_stand_df.astype(convert_dict)

### Shooting

In [66]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/shooting/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_shooting' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_shooting')

        primeiraliga_shoot_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_shoot_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [67]:
# Flattening multiIndex columns
primeiraliga_shoot_df.columns = ['_'.join(col) for col in primeiraliga_shoot_df.columns.values]

# Dropping useless columns
primeiraliga_shoot_df = primeiraliga_shoot_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 25_level_0_Matches','Standard_Gls','Standard_PK',
                              'Standard_PKatt', 'Expected_xG', 'Expected_npxG'], axis=1)

# Renaming player ID columns
primeiraliga_shoot_df = primeiraliga_shoot_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_shoot_df = primeiraliga_shoot_df.rename(columns={'Standard_Sh':'Shots', 
                                        'Standard_SoT':'SoT',
                                        'Standard_SoT%':'SoT_pct', 
                                        'Standard_Sh/90':'Shots_90', 
                                        'Standard_SoT/90':'SoT_90', 
                                        'Standard_G/Sh':'Gls_per_Sh',
                                        'Standard_G/SoT':'Gls_per_SoT', 
                                        'Standard_Dist':'Avg_Sh_Dist', 
                                        'Standard_FK':'Sh_FK', 
                                        'Expected_npxG/Sh':'npxG_per_Sh',
                                        'Expected_G-xG':'G-xG', 
                                        'Expected_np:G-xG':'npG-npxG'})

In [68]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Peter Kioso - Rotherham United
wagner_pina = primeiraliga_shoot_df.loc[primeiraliga_shoot_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_shoot_df = pd.concat([primeiraliga_shoot_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [69]:
# Dropping rows with NANs
primeiraliga_shoot_df = primeiraliga_shoot_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_shoot_df = primeiraliga_shoot_df[primeiraliga_shoot_df["Player"] != "Player"]

convert_dict = {
    'Shots':int,
    'SoT':int,
    'SoT_pct':float,
    'Shots_90':float,
    'SoT_90':float,
    'Gls_per_Sh':float,
    'Gls_per_SoT':float,
    'Avg_Sh_Dist':float,
    'Sh_FK':int,
    'npxG_per_Sh':float,
    'G-xG':float,
    'npG-npxG':float
    }

primeiraliga_shoot_df = primeiraliga_shoot_df.astype(convert_dict)

### Possession

In [70]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/possession/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_possession' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_possession')

        primeiraliga_poss_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_poss_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [71]:
# Flattening multiIndex columns
primeiraliga_poss_df.columns = ['_'.join(col) for col in primeiraliga_poss_df.columns.values]

# Dropping useless columns
primeiraliga_poss_df = primeiraliga_poss_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 30_level_0_Matches'], axis=1)

# Renaming player ID columns
primeiraliga_poss_df = primeiraliga_poss_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_poss_df = primeiraliga_poss_df.rename(columns={'Touches_Touches':'Touches','Carries_Carries':'Carries'})

In [72]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Rotherham United
wagner_pina = primeiraliga_poss_df.loc[primeiraliga_poss_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_poss_df = pd.concat([primeiraliga_poss_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [73]:
# Dropping rows with NANs
primeiraliga_poss_df = primeiraliga_poss_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_poss_df = primeiraliga_poss_df[primeiraliga_poss_df["Player"] != "Player"]

convert_dict = {
                'Touches':int, 
                'Touches_Def Pen':int, 
                'Touches_Def 3rd':int, 
                'Touches_Mid 3rd':int,
                'Touches_Att 3rd':int, 
                'Touches_Att Pen':int, 
                'Touches_Live':int, 
                'Take-Ons_Att':int,
                'Take-Ons_Succ':int, 
                'Take-Ons_Succ%':float, 
                'Take-Ons_Tkld':int, 
                'Take-Ons_Tkld%':float,
                'Carries':int, 
                'Carries_TotDist':int, 
                'Carries_PrgDist':int, 
                'Carries_PrgC':int,
                'Carries_1/3':int, 
                'Carries_CPA':int, 
                'Carries_Mis':int, 
                'Carries_Dis':int,
                'Receiving_Rec':int, 
                'Receiving_PrgR':int
    }

primeiraliga_poss_df = primeiraliga_poss_df.astype(convert_dict)

In [None]:
primeiraliga_poss_df.tail()

### Passing

In [74]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/passing/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing')

        primeiraliga_pass_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_pass_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [75]:
# Flattening multiIndex columns
primeiraliga_pass_df.columns = ['_'.join(col) for col in primeiraliga_pass_df.columns.values]

# Dropping useless columns
primeiraliga_pass_df = primeiraliga_pass_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 22_level_0_Ast',
                                                  'Unnamed: 31_level_0_Matches','Unnamed: 23_level_0_xAG','Unnamed: 30_level_0_PrgP'], axis=1)

# Renaming player ID columns
primeiraliga_pass_df = primeiraliga_pass_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_pass_df = primeiraliga_pass_df.rename(columns={'Expected_xA':'xA',
                                      'Expected_A-xAG':'A-xAG',
                                      'Unnamed: 26_level_0_KP':'KP',
                                      'Unnamed: 27_level_0_1/3':'Pass_Fin_3rd',
                                      'Unnamed: 28_level_0_PPA':'Pass_Pen_Area',
                                      'Unnamed: 29_level_0_CrsPA':'Cross_Pen_Area'})

In [76]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Rotherham United
wagner_pina = primeiraliga_pass_df.loc[primeiraliga_pass_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_pass_df = pd.concat([primeiraliga_pass_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [77]:
# Dropping rows with NANs
primeiraliga_pass_df = primeiraliga_pass_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_pass_df = primeiraliga_pass_df[primeiraliga_pass_df["Player"] != "Player"]

convert_dict = {
                'Total_Cmp':int, 
                'Total_Att':int, 
                'Total_Cmp%':float, 
                'Total_TotDist':int,
                'Total_PrgDist':int, 
                'Short_Cmp':int, 
                'Short_Att':int, 
                'Short_Cmp%':float, 
                'Medium_Cmp':int,
                'Medium_Att':int, 
                'Medium_Cmp%':float, 
                'Long_Cmp':int, 
                'Long_Att':int, 
                'Long_Cmp%':float, 
                'xA':float,
                'A-xAG':float, 
                'KP':int, 
                'Pass_Fin_3rd':int, 
                'Pass_Pen_Area':int, 
                'Cross_Pen_Area':int
    }

primeiraliga_pass_df = primeiraliga_pass_df.astype(convert_dict)

### Passing Types

In [78]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/passing_types/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing_types' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing_types')

        primeiraliga_ptype_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_ptype_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [79]:
# Flattening multiIndex columns
primeiraliga_ptype_df.columns = ['_'.join(col) for col in primeiraliga_ptype_df.columns.values]

# Dropping useless columns
primeiraliga_ptype_df = primeiraliga_ptype_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 23_level_0_Matches','Outcomes_Cmp',
                                                    'Unnamed: 8_level_0_Att'], axis=1)

# Renaming player ID columns
primeiraliga_ptype_df = primeiraliga_ptype_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_ptype_df = primeiraliga_ptype_df.rename(columns={'Outcomes_Blocks':'Pass_Blocked',
                                      'Outcomes_Off':'Pass_Offsides'})

In [80]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Rotherham United
wagner_pina = primeiraliga_ptype_df.loc[primeiraliga_ptype_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_ptype_df = pd.concat([primeiraliga_ptype_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [81]:
# Dropping rows with NANs
primeiraliga_ptype_df = primeiraliga_ptype_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_ptype_df = primeiraliga_ptype_df[primeiraliga_ptype_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Pass Types_Live':int, 
                'Pass Types_Dead':int, 
                'Pass Types_FK':int, 
                'Pass Types_TB':int,
                'Pass Types_Sw':int, 
                'Pass Types_Crs':int, 
                'Pass Types_TI':int, 
                'Pass Types_CK':int,
                'Corner Kicks_In':int, 
                'Corner Kicks_Out':int, 
                'Corner Kicks_Str':int,
                'Pass_Offsides':int, 
                'Pass_Blocked':int  
}
primeiraliga_ptype_df = primeiraliga_ptype_df.astype(convert_dict)

### Goal and Shot Creation

In [82]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/gca/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_gca' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_gca')

        primeiraliga_gca_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_gca_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [83]:
# Flattening multiIndex columns
primeiraliga_gca_df.columns = ['_'.join(col) for col in primeiraliga_gca_df.columns.values]

# Dropping useless columns
primeiraliga_gca_df = primeiraliga_gca_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches'], axis=1)

# Renaming player ID columns
primeiraliga_gca_df = primeiraliga_gca_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_gca_df = primeiraliga_gca_df.rename(columns={'SCA_SCA':'SCA',
                                    'SCA_SCA90':'SCA_90',
                                    'SCA_Types_PassLive':'SCA_PassLive',
                                    'SCA_Types_PassDead':'SCA_PassDead',
                                    'SCA_Types_TO':'SCA_TO',
                                    'SCA_Types_Sh':'SCA_Shot',
                                    'SCA_Types_Fld':'SCA_Fouls_Drawn',
                                    'SCA_Types_Def':'SCA_Def_Action',
                                    'GCA_GCA':'GCA',
                                    'GCA_GCA90':'GCA_90',
                                    'GCA_Types_PassLive':'GCA_PassLive',
                                    'GCA_Types_PassDead':'GCA_PassDead',
                                    'GCA_Types_TO':'GCA_TO',
                                    'GCA_Types_Sh':'GCA_Shot',
                                    'GCA_Types_Fld':'GCA_Fouls_Drawn',
                                    'GCA_Types_Def':'GCA_Def_Action'})

In [84]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Rotherham United
wagner_pina = primeiraliga_gca_df.loc[primeiraliga_gca_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_gca_df = pd.concat([primeiraliga_gca_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [85]:
# Dropping rows with NANs
primeiraliga_gca_df = primeiraliga_gca_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_gca_df = primeiraliga_gca_df[primeiraliga_gca_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'SCA':int,
                'SCA_90':float, 
                'SCA Types_PassLive':int, 
                'SCA Types_PassDead':int, 
                'SCA Types_TO':int,
                'SCA Types_Sh':int, 
                'SCA Types_Fld':int, 
                'SCA Types_Def':int, 
                'GCA':int, 
                'GCA_90':float,
                'GCA Types_PassLive':int, 
                'GCA Types_PassDead':int, 
                'GCA Types_TO':int,
                'GCA Types_Sh':int, 
                'GCA Types_Fld':int, 
                'GCA Types_Def':int  
}
primeiraliga_gca_df = primeiraliga_gca_df.astype(convert_dict)

### Defensive Actions

In [86]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/defense/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_defense' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_defense')

        primeiraliga_def_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_def_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [87]:
# Flattening multiIndex columns
primeiraliga_def_df.columns = ['_'.join(col) for col in primeiraliga_def_df.columns.values]

# Dropping useless columns
primeiraliga_def_df = primeiraliga_def_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches', 'Challenges_Lost'], axis=1)

# Renaming player ID columns
primeiraliga_def_df = primeiraliga_def_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_def_df = primeiraliga_def_df.rename(columns={'Tackles_Tkl':'Tkls',
                                    'Tackles_TklW':'Tkls_Won',
                                    'Tackles_Def_3rd':'Tkl_Def_3rd',
                                    'Tackles_Mid_3rd':'Tkl_Mid_3rd',
                                    'Tackles_Att_3rd':'Tkl_Att_3rd',
                                    'Challenges_Tkl':'Drib_Tkl',
                                    'Challenges_Att':'Drib_Tkl_Att',
                                    'Challenges_Tkl%':'Drib_Tkl%',
                                    'Blocks_Blocks':'Def_Blocks',
                                    'Blocks_Sh':'Def_Shot_Blocks',
                                    'Blocks_Pass':'Def_Pass_Blocks',
                                    'Unnamed: 20_level_0_Int':'Int',
                                    'Unnamed: 21_level_0_Tkl+Int':'Tkl+Int',
                                    'Unnamed: 22_level_0_Clr':'Clearances',
                                    'Unnamed: 23_level_0_Err':'Errors'})

In [88]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Rotherham United
wagner_pina = primeiraliga_def_df.loc[primeiraliga_def_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_def_df = pd.concat([primeiraliga_def_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [89]:
primeiraliga_def_df = primeiraliga_def_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_def_df = primeiraliga_def_df[primeiraliga_def_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Tkls':int,
                'Tkls_Won':int, 
                'Tackles_Def 3rd':int, 
                'Tackles_Mid 3rd':int, 
                'Tackles_Att 3rd':int,
                'Drib_Tkl':int, 
                'Drib_Tkl_Att':int, 
                'Drib_Tkl%':float, 
                'Def_Blocks':int,
                'Def_Shot_Blocks':int, 
                'Def_Pass_Blocks':int, 
                'Int':int, 
                'Tkl+Int':int, 
                'Clearances':int,
                'Errors':int 
}
primeiraliga_def_df = primeiraliga_def_df.astype(convert_dict)

### Miscellaneous Stats

In [90]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/32/misc/Primeira-Liga-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_misc' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_misc')

        primeiraliga_misc_df = pd.read_html(StringIO(str(div)))[0]
        print(primeiraliga_misc_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1       Rodrigo Abascal             uy URU   
1                    2          Antonio Adán             es ESP   
2                    3       Dennis Adeniran            eng ENG   
3                    4  Martin Agirregabiria             es ESP   
4                    5         Salvador Agra             pt POR   
..                 ...                   ...                ...   
552                532       Rodrigo Zalazar             uy URU   
553                533     Vinícius Zanocelo             br BRA   
554                534                 Zinho             br BRA   
555                535           Ivan Zlobin             ru RUS   
556                536        Nermin Zolotić             ba BIH   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [91]:
# Flattening multiIndex columns
primeiraliga_misc_df.columns = ['_'.join(col) for col in primeiraliga_misc_df.columns.values]

# Dropping useless columns
primeiraliga_misc_df = primeiraliga_misc_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches','Aerial Duels_Lost',
                         'Performance_CrdY','Performance_CrdR','Performance_Crs','Performance_Int','Performance_TklW',
                         'Performance_Fld'], axis=1)

# Renaming player ID columns
primeiraliga_misc_df = primeiraliga_misc_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
primeiraliga_misc_df = primeiraliga_misc_df.rename(columns={'Performance_2CrdY':'CrdY2',
                                    'Performance_Fls':'Fls_Comm',
                                    'Performance_Off':'Offsides',
                                    'Performance_PKwon':'PK_Won',
                                    'Performance_PKcon':'PK_Conv',
                                    'Performance_OG':'Own_Goal',
                                    'Performance_Recov':'Ball_Recoveries'})

In [92]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Wagner Pina - Rotherham United
wagner_pina = primeiraliga_misc_df.loc[primeiraliga_misc_df['Player']== 'Wagner Pina']
wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')

# nan_players = pd.concat(wagner_pina, ignore_index=True)

primeiraliga_misc_df = pd.concat([primeiraliga_misc_df, wagner_pina], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wagner_pina.Nation = wagner_pina.Nation.fillna('pt POR')


In [93]:
primeiraliga_misc_df = primeiraliga_misc_df.dropna(axis=0, how='any')


# Removing column header rows
primeiraliga_misc_df = primeiraliga_misc_df[primeiraliga_misc_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'CrdY2':int,
                'Fls_Comm':int, 
                'Offsides':int, 
                'PK_Won':int, 
                'PK_Conv':int, 
                'Own_Goal':int,
                'Ball_Recoveries':int, 
                'Aerial Duels_Won':int, 
                'Aerial Duels_Won%':float 
}
primeiraliga_misc_df = primeiraliga_misc_df.astype(convert_dict)

### Merging Stat Group datasets

In [94]:
primeiraliga_conditions_join = ['Player','Nation','Position','Squad','Age','Born']

primeiraliga_outfield_df = pd.merge(primeiraliga_stand_df, primeiraliga_shoot_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

primeiraliga_outfield_df = pd.merge(primeiraliga_outfield_df, primeiraliga_poss_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

primeiraliga_outfield_df = pd.merge(primeiraliga_outfield_df, primeiraliga_pass_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

primeiraliga_outfield_df = pd.merge(primeiraliga_outfield_df, primeiraliga_ptype_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

primeiraliga_outfield_df = pd.merge(primeiraliga_outfield_df, primeiraliga_gca_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

primeiraliga_outfield_df = pd.merge(primeiraliga_outfield_df, primeiraliga_def_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

primeiraliga_outfield_df = pd.merge(primeiraliga_outfield_df, primeiraliga_misc_df, left_on=primeiraliga_conditions_join, right_on=primeiraliga_conditions_join,
                          how='left')

In [95]:
primeiraliga_outfield_df['League'] = 'Primeira Liga'

In [96]:
primeiraliga_outfield_df.head()

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Rodrigo Abascal,uy URU,DF,Boavista,29,1994,29,29,2479,27.5,2,0,2,2,0,0,8,0,1.4,1.4,0.6,2.0,17,83,19,0.07,0.0,0.07,0.07,0.07,0.05,0.02,0.07,0.05,0.07,12.0,4.0,33.3,0.44,0.15,0.17,0.5,22.3,4.0,0.12,0.6,0.6,1928.0,264.0,1103.0,718.0,118.0,19.0,1928.0,10.0,8.0,80.0,2.0,20.0,1060.0,4679.0,2625.0,17.0,14.0,1.0,18.0,1.0,1258.0,19.0,1285.0,1625.0,79.1,29048.0,12863.0,331.0,368.0,89.9,710.0,784.0,90.6,239.0,438.0,54.6,0.8,-0.6,9.0,90.0,7.0,6.0,1499,116,81,2,27,41,33,0,0,0,0,10,8,30,1.09,25,1,0,1,1,2,3,0.11,2,0,0,0,1,0,46.0,26.0,33.0,13.0,0.0,26.0,40.0,65.0,59.0,32.0,27.0,34.0,80.0,110.0,3.0,0.0,35.0,0.0,0.0,1.0,1.0,132.0,45.0,72.6,Primeira Liga
1,Antonio Adán,es ESP,GK,Sporting CP,36,1987,22,22,1980,22.0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,623.0,521.0,612.0,12.0,0.0,0.0,623.0,1.0,1.0,100.0,0.0,0.0,370.0,1933.0,1233.0,0.0,0.0,0.0,1.0,0.0,300.0,0.0,478.0,601.0,79.5,12116.0,8466.0,90.0,90.0,100.0,281.0,284.0,98.9,107.0,227.0,47.1,0.0,0.0,0.0,8.0,0.0,0.0,454,147,50,0,1,1,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0,18.0,1.0,100.0,Primeira Liga
2,Dennis Adeniran,eng ENG,MF,Portimonense,24,1999,1,0,6,0.1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,Primeira Liga
3,Martin Agirregabiria,es ESP,DF,Famalicão,27,1996,24,13,1207,13.4,0,0,0,0,0,0,4,0,0.1,0.1,0.6,0.7,18,52,66,0.0,0.0,0.0,0.0,0.0,0.01,0.05,0.05,0.01,0.05,4.0,1.0,25.0,0.3,0.07,0.0,0.0,27.1,0.0,0.02,-0.1,-0.1,810.0,45.0,284.0,342.0,192.0,6.0,810.0,14.0,8.0,57.1,5.0,35.7,391.0,1564.0,916.0,18.0,20.0,2.0,11.0,5.0,473.0,66.0,516.0,694.0,74.4,9276.0,3944.0,229.0,269.0,85.1,230.0,280.0,82.1,49.0,113.0,43.4,0.8,-0.6,9.0,41.0,18.0,10.0,550,143,17,0,8,55,117,9,9,0,0,1,18,25,1.87,21,3,1,0,0,0,2,0.15,2,0,0,0,0,0,21.0,18.0,15.0,4.0,2.0,17.0,24.0,70.8,12.0,2.0,10.0,11.0,32.0,31.0,0.0,0.0,17.0,2.0,0.0,0.0,0.0,55.0,11.0,47.8,Primeira Liga
4,Salvador Agra,pt POR,"FW,DF",Boavista,31,1991,31,31,2459,27.3,3,1,4,3,0,0,7,0,2.3,2.3,3.6,5.9,60,89,174,0.11,0.04,0.15,0.11,0.15,0.08,0.13,0.22,0.08,0.22,27.0,10.0,37.0,0.99,0.37,0.11,0.3,23.9,6.0,0.08,0.7,0.7,1332.0,27.0,253.0,570.0,516.0,28.0,1332.0,53.0,21.0,39.6,30.0,56.6,714.0,3853.0,1743.0,60.0,38.0,12.0,42.0,27.0,882.0,174.0,840.0,1129.0,74.4,14919.0,4682.0,367.0,419.0,87.6,367.0,446.0,82.3,91.0,203.0,44.8,4.8,-2.6,45.0,41.0,41.0,21.0,916,212,53,3,10,174,94,65,36,21,3,1,26,84,3.08,48,27,1,5,3,0,7,0.26,4,1,1,0,1,0,21.0,9.0,14.0,5.0,2.0,9.0,31.0,29.0,8.0,1.0,7.0,16.0,37.0,19.0,0.0,0.0,42.0,9.0,1.0,0.0,0.0,123.0,5.0,21.7,Primeira Liga


## Eredivisie

### Standard

In [97]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/stats/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_standard' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_standard')

        eredivisie_stand_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_stand_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [98]:

# Flattening multiIndex columns
eredivisie_stand_df.columns = ['_'.join(col) for col in eredivisie_stand_df.columns.values]

# Dropping useless columns
eredivisie_stand_df = eredivisie_stand_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 36_level_0_Matches'], axis=1)

# Renaming player ID columns
eredivisie_stand_df = eredivisie_stand_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_stand_df = eredivisie_stand_df.rename(columns={'Playing Time_MP':'MP',
                                        'Playing Time_Starts':'Starts', 
                                        'Playing Time_Min':'Min', 
                                        'Playing Time_90s':'No_90s',
                                        'Performance_Gls':'Gls', 
                                        'Performance_Ast':'Ast',
                                        'Performance_G+A':'G+A',
                                        'Performance_G-PK':'G-PK', 
                                        'Performance_PK':'PK', 
                                        'Performance_PKatt':'PKatt',
                                        'Performance_CrdY':'CrdY',
                                        'Performance_CrdR':'CrdR',
                                        'Expected_xG':'xG', 
                                        'Expected_npxG':'npxG',
                                        'Expected_xAG':'xAG', 
                                        'Expected_npxG+xAG':'npxG+xAG',
                                        'Progression_PrgC':'Prg_Carr',
                                        'Progression_PrgP':'Prg_Pass', 
                                        'Progression_PrgR':'Prg_Pass_Rec', 
                                        'Per 90 Minutes_Gls':'Gls_90',
                                        'Per 90 Minutes_Ast':'Ast_90', 
                                        'Per 90 Minutes_G+A':'G+A_90', 
                                        'Per 90 Minutes_G-PK':'G-PK_90',
                                        'Per 90 Minutes_G+A-PK':'G+A-PK_90', 
                                        'Per 90 Minutes_xG':'xG_90', 
                                        'Per 90 Minutes_xAG':'xAG_90',
                                        'Per 90 Minutes_xG+xAG':'xG+xAG_90', 
                                        'Per 90 Minutes_npxG':'npxG_90',
                                        'Per 90 Minutes_npxG+xAG':'npxG+xAG_90'})

In [None]:
#Examining the Players that have null values

eredivisie_stand_df[eredivisie_stand_df.isnull().any(axis=1)]

In [99]:

# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_stand_df.loc[eredivisie_stand_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_stand_df.loc[eredivisie_stand_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_stand_df = pd.concat([eredivisie_stand_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [100]:
# Dropping rows with NANs
eredivisie_stand_df = eredivisie_stand_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_stand_df = eredivisie_stand_df[eredivisie_stand_df["Player"] != "Player"]

convert_dict = {'No_90s':float,
                'Gls_90':float,
                'Ast_90':float,
                'G+A_90':float,
                'G-PK_90':float,
                'G+A_90':float,
                'G+A-PK_90':float,
                'xG':float,
                'npxG':float,
                'xAG':float,
                'npxG+xAG':float,
                'xG_90':float,
                'xAG_90':float,
                'xG+xAG_90':float,
                'npxG_90':float,
                'npxG+xAG_90':float,
                
                'MP':int, 
                'Starts':int, 
                'Min':int, 
                'Gls':int, 
                'Ast':int, 
                'G+A':int, 
                'G-PK':int, 
                'PK':int, 
                'PKatt':int,
                'CrdY':int,
                'CrdR':int,
                'Prg_Carr':int,
                'Prg_Pass':int,
                'Prg_Pass_Rec':int
    }

eredivisie_stand_df = eredivisie_stand_df.astype(convert_dict)

### Shooting

In [101]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/shooting/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_shooting' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_shooting')

        eredivisie_shoot_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_shoot_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [102]:
# Flattening multiIndex columns
eredivisie_shoot_df.columns = ['_'.join(col) for col in eredivisie_shoot_df.columns.values]

# Dropping useless columns
eredivisie_shoot_df = eredivisie_shoot_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 25_level_0_Matches','Standard_Gls','Standard_PK',
                              'Standard_PKatt', 'Expected_xG', 'Expected_npxG'], axis=1)

# Renaming player ID columns
eredivisie_shoot_df = eredivisie_shoot_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_shoot_df = eredivisie_shoot_df.rename(columns={'Standard_Sh':'Shots', 
                                        'Standard_SoT':'SoT',
                                        'Standard_SoT%':'SoT_pct', 
                                        'Standard_Sh/90':'Shots_90', 
                                        'Standard_SoT/90':'SoT_90', 
                                        'Standard_G/Sh':'Gls_per_Sh',
                                        'Standard_G/SoT':'Gls_per_SoT', 
                                        'Standard_Dist':'Avg_Sh_Dist', 
                                        'Standard_FK':'Sh_FK', 
                                        'Expected_npxG/Sh':'npxG_per_Sh',
                                        'Expected_G-xG':'G-xG', 
                                        'Expected_np:G-xG':'npG-npxG'})

In [103]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_shoot_df.loc[eredivisie_shoot_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_shoot_df.loc[eredivisie_shoot_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_shoot_df = pd.concat([eredivisie_shoot_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [None]:
eredivisie_shoot_df.tail()

In [104]:
# Dropping rows with NANs
eredivisie_shoot_df = eredivisie_shoot_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_shoot_df = eredivisie_shoot_df[eredivisie_shoot_df["Player"] != "Player"]

convert_dict = {
    'Shots':int,
    'SoT':int,
    'SoT_pct':float,
    'Shots_90':float,
    'SoT_90':float,
    'Gls_per_Sh':float,
    'Gls_per_SoT':float,
    'Avg_Sh_Dist':float,
    'Sh_FK':int,
    'npxG_per_Sh':float,
    'G-xG':float,
    'npG-npxG':float
    }

eredivisie_shoot_df = eredivisie_shoot_df.astype(convert_dict)

### Possession

In [105]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/possession/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_possession' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_possession')

        eredivisie_poss_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_poss_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [106]:
# Flattening multiIndex columns
eredivisie_poss_df.columns = ['_'.join(col) for col in eredivisie_poss_df.columns.values]

# Dropping useless columns
eredivisie_poss_df = eredivisie_poss_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 30_level_0_Matches'], axis=1)

# Renaming player ID columns
eredivisie_poss_df = eredivisie_poss_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_poss_df = eredivisie_poss_df.rename(columns={'Touches_Touches':'Touches','Carries_Carries':'Carries'})

In [107]:

# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_poss_df.loc[eredivisie_poss_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_poss_df.loc[eredivisie_poss_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_poss_df = pd.concat([eredivisie_poss_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [None]:
eredivisie_poss_df.tail()

In [108]:
# Dropping rows with NANs
eredivisie_poss_df = eredivisie_poss_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_poss_df = eredivisie_poss_df[eredivisie_poss_df["Player"] != "Player"]

convert_dict = {
                'Touches':int, 
                'Touches_Def Pen':int, 
                'Touches_Def 3rd':int, 
                'Touches_Mid 3rd':int,
                'Touches_Att 3rd':int, 
                'Touches_Att Pen':int, 
                'Touches_Live':int, 
                'Take-Ons_Att':int,
                'Take-Ons_Succ':int, 
                'Take-Ons_Succ%':float, 
                'Take-Ons_Tkld':int, 
                'Take-Ons_Tkld%':float,
                'Carries':int, 
                'Carries_TotDist':int, 
                'Carries_PrgDist':int, 
                'Carries_PrgC':int,
                'Carries_1/3':int, 
                'Carries_CPA':int, 
                'Carries_Mis':int, 
                'Carries_Dis':int,
                'Receiving_Rec':int, 
                'Receiving_PrgR':int
    }

eredivisie_poss_df = eredivisie_poss_df.astype(convert_dict)

### Passing

In [109]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/passing/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing')

        eredivisie_pass_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_pass_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [110]:
# Flattening multiIndex columns
eredivisie_pass_df.columns = ['_'.join(col) for col in eredivisie_pass_df.columns.values]

# Dropping useless columns
eredivisie_pass_df = eredivisie_pass_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 22_level_0_Ast',
                                            'Unnamed: 31_level_0_Matches','Unnamed: 23_level_0_xAG','Unnamed: 30_level_0_PrgP'], axis=1)

# Renaming player ID columns
eredivisie_pass_df = eredivisie_pass_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_pass_df = eredivisie_pass_df.rename(columns={'Expected_xA':'xA',
                                      'Expected_A-xAG':'A-xAG',
                                      'Unnamed: 26_level_0_KP':'KP',
                                      'Unnamed: 27_level_0_1/3':'Pass_Fin_3rd',
                                      'Unnamed: 28_level_0_PPA':'Pass_Pen_Area',
                                      'Unnamed: 29_level_0_CrsPA':'Cross_Pen_Area'})

In [111]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_pass_df.loc[eredivisie_pass_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_pass_df.loc[eredivisie_pass_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_pass_df = pd.concat([eredivisie_pass_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [None]:
eredivisie_pass_df.tail()

In [112]:
# Dropping rows with NANs
eredivisie_pass_df = eredivisie_pass_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_pass_df = eredivisie_pass_df[eredivisie_pass_df["Player"] != "Player"]

convert_dict = {
                'Total_Cmp':int, 
                'Total_Att':int, 
                'Total_Cmp%':float, 
                'Total_TotDist':int,
                'Total_PrgDist':int, 
                'Short_Cmp':int, 
                'Short_Att':int, 
                'Short_Cmp%':float, 
                'Medium_Cmp':int,
                'Medium_Att':int, 
                'Medium_Cmp%':float, 
                'Long_Cmp':int, 
                'Long_Att':int, 
                'Long_Cmp%':float, 
                'xA':float,
                'A-xAG':float, 
                'KP':int, 
                'Pass_Fin_3rd':int, 
                'Pass_Pen_Area':int, 
                'Cross_Pen_Area':int
    }

eredivisie_pass_df = eredivisie_pass_df.astype(convert_dict)

### Passing Types

In [113]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/passing_types/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing_types' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing_types')

        eredivisie_ptype_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_ptype_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [114]:
# Flattening multiIndex columns
eredivisie_ptype_df.columns = ['_'.join(col) for col in eredivisie_ptype_df.columns.values]

# Dropping useless columns
eredivisie_ptype_df = eredivisie_ptype_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 23_level_0_Matches','Outcomes_Cmp',
                                                    'Unnamed: 8_level_0_Att'], axis=1)

# Renaming player ID columns
eredivisie_ptype_df = eredivisie_ptype_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_ptype_df = eredivisie_ptype_df.rename(columns={'Outcomes_Blocks':'Pass_Blocked',
                                      'Outcomes_Off':'Pass_Offsides'})

In [115]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_ptype_df.loc[eredivisie_ptype_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_ptype_df.loc[eredivisie_ptype_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_ptype_df = pd.concat([eredivisie_ptype_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [116]:
# Dropping rows with NANs
eredivisie_ptype_df = eredivisie_ptype_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_ptype_df = eredivisie_ptype_df[eredivisie_ptype_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Pass Types_Live':int, 
                'Pass Types_Dead':int, 
                'Pass Types_FK':int, 
                'Pass Types_TB':int,
                'Pass Types_Sw':int, 
                'Pass Types_Crs':int, 
                'Pass Types_TI':int, 
                'Pass Types_CK':int,
                'Corner Kicks_In':int, 
                'Corner Kicks_Out':int, 
                'Corner Kicks_Str':int,
                'Pass_Offsides':int, 
                'Pass_Blocked':int  
}
eredivisie_ptype_df = eredivisie_ptype_df.astype(convert_dict)

### Goal and Shot Creation (GCA)

In [117]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/gca/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_gca' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_gca')

        eredivisie_gca_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_gca_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [118]:
# Flattening multiIndex columns
eredivisie_gca_df.columns = ['_'.join(col) for col in eredivisie_gca_df.columns.values]

# Dropping useless columns
eredivisie_gca_df = eredivisie_gca_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches'], axis=1)

# Renaming player ID columns
eredivisie_gca_df = eredivisie_gca_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_gca_df = eredivisie_gca_df.rename(columns={'SCA_SCA':'SCA',
                                    'SCA_SCA90':'SCA_90',
                                    'SCA_Types_PassLive':'SCA_PassLive',
                                    'SCA_Types_PassDead':'SCA_PassDead',
                                    'SCA_Types_TO':'SCA_TO',
                                    'SCA_Types_Sh':'SCA_Shot',
                                    'SCA_Types_Fld':'SCA_Fouls_Drawn',
                                    'SCA_Types_Def':'SCA_Def_Action',
                                    'GCA_GCA':'GCA',
                                    'GCA_GCA90':'GCA_90',
                                    'GCA_Types_PassLive':'GCA_PassLive',
                                    'GCA_Types_PassDead':'GCA_PassDead',
                                    'GCA_Types_TO':'GCA_TO',
                                    'GCA_Types_Sh':'GCA_Shot',
                                    'GCA_Types_Fld':'GCA_Fouls_Drawn',
                                    'GCA_Types_Def':'GCA_Def_Action'})

In [119]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_gca_df.loc[eredivisie_gca_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_gca_df.loc[eredivisie_gca_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_gca_df = pd.concat([eredivisie_gca_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [None]:
eredivisie_gca_df.tail()

In [120]:
eredivisie_gca_df = eredivisie_gca_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_gca_df = eredivisie_gca_df[eredivisie_gca_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'SCA':int,
                'SCA_90':float, 
                'SCA Types_PassLive':int, 
                'SCA Types_PassDead':int, 
                'SCA Types_TO':int,
                'SCA Types_Sh':int, 
                'SCA Types_Fld':int, 
                'SCA Types_Def':int, 
                'GCA':int, 
                'GCA_90':float,
                'GCA Types_PassLive':int, 
                'GCA Types_PassDead':int, 
                'GCA Types_TO':int,
                'GCA Types_Sh':int, 
                'GCA Types_Fld':int, 
                'GCA Types_Def':int  
}
eredivisie_gca_df = eredivisie_gca_df.astype(convert_dict)

### Defensive Actions

In [121]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/defense/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_defense' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_defense')

        eredivisie_def_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_def_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [122]:
# Flattening multiIndex columns
eredivisie_def_df.columns = ['_'.join(col) for col in eredivisie_def_df.columns.values]

# Dropping useless columns
eredivisie_def_df = eredivisie_def_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches', 'Challenges_Lost'], axis=1)

# Renaming player ID columns
eredivisie_def_df = eredivisie_def_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_def_df = eredivisie_def_df.rename(columns={'Tackles_Tkl':'Tkls',
                                    'Tackles_TklW':'Tkls_Won',
                                    'Tackles_Def_3rd':'Tkl_Def_3rd',
                                    'Tackles_Mid_3rd':'Tkl_Mid_3rd',
                                    'Tackles_Att_3rd':'Tkl_Att_3rd',
                                    'Challenges_Tkl':'Drib_Tkl',
                                    'Challenges_Att':'Drib_Tkl_Att',
                                    'Challenges_Tkl%':'Drib_Tkl%',
                                    'Blocks_Blocks':'Def_Blocks',
                                    'Blocks_Sh':'Def_Shot_Blocks',
                                    'Blocks_Pass':'Def_Pass_Blocks',
                                    'Unnamed: 20_level_0_Int':'Int',
                                    'Unnamed: 21_level_0_Tkl+Int':'Tkl+Int',
                                    'Unnamed: 22_level_0_Clr':'Clearances',
                                    'Unnamed: 23_level_0_Err':'Errors'})

In [123]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_def_df.loc[eredivisie_def_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_def_df.loc[eredivisie_def_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_def_df = pd.concat([eredivisie_def_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [None]:
eredivisie_def_df.tail()

In [124]:
eredivisie_def_df = eredivisie_def_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_def_df = eredivisie_def_df[eredivisie_def_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Tkls':int,
                'Tkls_Won':int, 
                'Tackles_Def 3rd':int, 
                'Tackles_Mid 3rd':int, 
                'Tackles_Att 3rd':int,
                'Drib_Tkl':int, 
                'Drib_Tkl_Att':int, 
                'Drib_Tkl%':float, 
                'Def_Blocks':int,
                'Def_Shot_Blocks':int, 
                'Def_Pass_Blocks':int, 
                'Int':int, 
                'Tkl+Int':int, 
                'Clearances':int,
                'Errors':int 
}
eredivisie_def_df = eredivisie_def_df.astype(convert_dict)

### Miscellaneous Stats

In [125]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/23/misc/Eredivisie-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_misc' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_misc')

        eredivisie_misc_df = pd.read_html(StringIO(str(div)))[0]
        print(eredivisie_misc_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0     Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                 Player             Nation   
0                    1    Patrick van Aanholt             nl NED   
1                    2        Paxten Aaronson             us USA   
2                    3           Jayden Addai             nl NED   
3                    4         Bobby Adekanye             nl NED   
4                    5          Shawn Adewoye             be BEL   
..                 ...                    ...                ...   
541                522      Lequincio Zeefuik             nl NED   
542                523         Ramiz Zerrouki             dz ALG   
543                524      Giovanni van Zwam             nl NED   
544                525  Willum Þór Willumsson             is ISL   
545                 Rk                 Player             Nation   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad           

In [126]:
# Flattening multiIndex columns
eredivisie_misc_df.columns = ['_'.join(col) for col in eredivisie_misc_df.columns.values]

# Dropping useless columns
eredivisie_misc_df = eredivisie_misc_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches','Aerial Duels_Lost',
                         'Performance_CrdY','Performance_CrdR','Performance_Crs','Performance_Int','Performance_TklW',
                         'Performance_Fld'], axis=1)

# Renaming player ID columns
eredivisie_misc_df = eredivisie_misc_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
eredivisie_misc_df = eredivisie_misc_df.rename(columns={'Performance_2CrdY':'CrdY2',
                                    'Performance_Fls':'Fls_Comm',
                                    'Performance_Off':'Offsides',
                                    'Performance_PKwon':'PK_Won',
                                    'Performance_PKcon':'PK_Conv',
                                    'Performance_OG':'Own_Goal',
                                    'Performance_Recov':'Ball_Recoveries'})

In [127]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Younes Taha El Idrissi - Twente
yteli = eredivisie_misc_df.loc[eredivisie_misc_df['Player']== 'Younes Taha El Idrissi']
yteli.Age = yteli.Age.fillna(21)
yteli.Born = yteli.Born.fillna('2002')

# Yann Kitala
yann_kitala = eredivisie_misc_df.loc[eredivisie_misc_df['Player']== 'Yann Kitala'] 
yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


nan_players = pd.concat([yann_kitala,yteli], ignore_index=True)

eredivisie_misc_df = pd.concat([eredivisie_misc_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Age = yteli.Age.fillna(21)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yteli.Born = yteli.Born.fillna('2002')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yann_kitala.Nation = yann_kitala.Nation.fillna('dc DRC')


In [128]:
eredivisie_misc_df = eredivisie_misc_df.dropna(axis=0, how='any')


# Removing column header rows
eredivisie_misc_df = eredivisie_misc_df[eredivisie_misc_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'CrdY2':int,
                'Fls_Comm':int, 
                'Offsides':int, 
                'PK_Won':int, 
                'PK_Conv':int, 
                'Own_Goal':int,
                'Ball_Recoveries':int, 
                'Aerial Duels_Won':int, 
                'Aerial Duels_Won%':float 
}
eredivisie_misc_df = eredivisie_misc_df.astype(convert_dict)

### Merging stat group dataframes

In [129]:
eredivisie_conditions_join = ['Player','Nation','Position','Squad','Age','Born']

eredivisie_outfield_df = pd.merge(eredivisie_stand_df, eredivisie_shoot_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

eredivisie_outfield_df = pd.merge(eredivisie_outfield_df, eredivisie_poss_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

eredivisie_outfield_df = pd.merge(eredivisie_outfield_df, eredivisie_pass_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

eredivisie_outfield_df = pd.merge(eredivisie_outfield_df, eredivisie_ptype_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

eredivisie_outfield_df = pd.merge(eredivisie_outfield_df, eredivisie_gca_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

eredivisie_outfield_df = pd.merge(eredivisie_outfield_df, eredivisie_def_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

eredivisie_outfield_df = pd.merge(eredivisie_outfield_df, eredivisie_misc_df, left_on=eredivisie_conditions_join, right_on=eredivisie_conditions_join,
                          how='left')

In [130]:
eredivisie_outfield_df['League'] = 'Eredivisie'

In [131]:
eredivisie_outfield_df.head(20)

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Patrick van Aanholt,nl NED,DF,PSV Eindhoven,32,1990,24,9,1114,12.4,1,1,2,1,0,0,0,0,2.1,2.1,2.3,4.5,40,65,98,0.08,0.08,0.16,0.08,0.16,0.17,0.19,0.36,0.17,0.36,26.0,9.0,34.6,2.1,0.73,0.04,0.11,19.5,1.0,0.08,-1.1,-1.1,927.0,17.0,208.0,467.0,257.0,50.0,927.0,19.0,6.0,31.6,12.0,63.2,494.0,2539.0,1472.0,40.0,25.0,9.0,14.0,7.0,615.0,98.0,689.0,808.0,85.3,11024.0,3341.0,346.0,385.0,89.9,273.0,320.0,85.3,50.0,65.0,76.9,3.1,-1.3,23.0,68.0,22.0,7.0,709,96,5,3,4,28,90,1,0,1,0,3,8,45,3.63,36,2,3,3,0,1,5,0.4,5,0,0,0,0,0,22.0,17.0,13.0,6.0,3.0,13.0,20.0,65.0,4.0,1.0,3.0,15.0,37.0,9.0,0.0,0.0,6.0,2.0,0.0,0.0,0.0,95.0,3.0,37.5,Eredivisie
1,Paxten Aaronson,us USA,MF,Vitesse,19,2003,14,14,1253,13.9,4,0,4,4,0,0,4,0,5.0,5.0,1.7,6.8,34,43,61,0.29,0.0,0.29,0.29,0.29,0.36,0.12,0.48,0.36,0.48,38.0,15.0,39.5,2.73,1.08,0.11,0.27,16.7,0.0,0.14,-1.0,-1.0,602.0,16.0,112.0,277.0,226.0,57.0,602.0,22.0,9.0,40.9,12.0,54.5,303.0,1878.0,889.0,34.0,24.0,11.0,22.0,25.0,351.0,61.0,339.0,425.0,79.8,5532.0,1298.0,167.0,188.0,88.8,135.0,155.0,87.1,26.0,44.0,59.1,2.0,-1.7,27.0,22.0,6.0,1.0,384,39,6,0,2,30,1,19,7,6,0,2,16,45,3.23,26,9,3,6,1,0,4,0.29,2,0,1,1,0,0,44.0,25.0,22.0,16.0,6.0,18.0,30.0,60.0,21.0,3.0,18.0,9.0,53.0,9.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,51.0,11.0,32.4,Eredivisie
2,Jayden Addai,nl NED,FW,AZ Alkmaar,17,2005,8,2,297,3.3,0,0,0,0,0,0,1,0,0.2,0.2,0.1,0.4,11,8,29,0.0,0.0,0.0,0.0,0.0,0.07,0.04,0.11,0.07,0.11,,,,,,,,,,,,,145.0,3.0,24.0,41.0,81.0,18.0,145.0,24.0,10.0,41.7,11.0,45.8,95.0,517.0,231.0,11.0,4.0,6.0,11.0,8.0,92.0,29.0,50.0,79.0,63.3,681.0,203.0,29.0,40.0,72.5,16.0,18.0,88.9,2.0,9.0,22.2,0.9,-0.1,3.0,4.0,1.0,0.0,77,1,0,0,0,6,1,0,0,0,0,1,6,7,2.12,6,0,0,0,1,0,1,0.3,1,0,0,0,0,0,9.0,7.0,6.0,1.0,2.0,6.0,11.0,54.5,3.0,0.0,3.0,0.0,9.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,15.0,2.0,50.0,Eredivisie
3,Bobby Adekanye,nl NED,FW,Go Ahead Eag,24,1999,28,23,1804,20.0,0,5,5,0,0,0,4,1,2.6,2.6,5.7,8.3,67,51,114,0.0,0.25,0.25,0.0,0.25,0.13,0.29,0.41,0.13,0.41,40.0,10.0,25.0,2.0,0.5,0.0,0.0,18.7,0.0,0.06,-2.6,-2.6,734.0,12.0,102.0,300.0,355.0,85.0,734.0,108.0,48.0,44.4,48.0,44.4,450.0,3299.0,1620.0,67.0,35.0,38.0,65.0,31.0,507.0,114.0,314.0,453.0,69.3,5075.0,1606.0,169.0,205.0,82.4,104.0,147.0,70.7,33.0,61.0,54.1,4.0,-0.7,31.0,27.0,24.0,12.0,448,3,2,5,4,46,1,0,0,0,0,2,15,71,3.54,45,0,11,7,8,0,12,0.6,6,0,3,2,1,0,18.0,11.0,6.0,9.0,3.0,8.0,27.0,29.6,18.0,2.0,16.0,4.0,22.0,12.0,0.0,0.0,21.0,4.0,0.0,0.0,0.0,91.0,8.0,28.6,Eredivisie
4,Shawn Adewoye,be BEL,DF,RKC Waalwijk,23,2000,29,26,2338,26.0,1,0,1,1,0,0,1,1,1.3,1.3,0.4,1.7,26,97,3,0.04,0.0,0.04,0.04,0.04,0.05,0.02,0.07,0.05,0.07,16.0,4.0,25.0,0.62,0.15,0.06,0.25,10.8,0.0,0.08,-0.3,-0.3,1489.0,283.0,789.0,638.0,69.0,26.0,1489.0,8.0,7.0,87.5,0.0,0.0,740.0,5351.0,3241.0,26.0,10.0,0.0,19.0,10.0,817.0,3.0,972.0,1129.0,86.1,19469.0,6137.0,253.0,283.0,89.4,605.0,649.0,93.2,101.0,158.0,63.9,0.4,-0.4,6.0,73.0,3.0,0.0,1084,45,31,1,5,1,7,0,0,0,0,0,17,21,0.81,17,1,0,2,0,1,2,0.08,1,0,0,0,0,1,36.0,24.0,23.0,9.0,4.0,22.0,33.0,66.7,57.0,36.0,21.0,32.0,68.0,169.0,1.0,0.0,17.0,0.0,0.0,1.0,0.0,127.0,71.0,64.5,Eredivisie
5,Nikolas Agrafiotis,nl NED,FW,Excelsior,23,2000,18,10,758,8.4,5,2,7,5,0,0,0,0,4.0,4.0,1.3,5.3,7,12,44,0.59,0.24,0.83,0.59,0.83,0.47,0.16,0.62,0.47,0.62,29.0,13.0,44.8,3.44,1.54,0.17,0.38,14.7,0.0,0.14,1.0,1.0,304.0,13.0,22.0,132.0,153.0,46.0,304.0,16.0,3.0,18.8,13.0,81.3,155.0,582.0,206.0,7.0,4.0,4.0,31.0,8.0,215.0,44.0,115.0,185.0,62.2,1604.0,266.0,69.0,100.0,69.0,34.0,49.0,69.4,5.0,11.0,45.5,1.2,0.7,11.0,8.0,2.0,1.0,178,7,0,0,0,3,1,0,0,0,0,0,9,22,2.6,19,0,1,1,0,1,4,0.47,3,0,0,1,0,0,12.0,7.0,2.0,6.0,4.0,4.0,8.0,50.0,3.0,1.0,2.0,1.0,13.0,13.0,0.0,0.0,13.0,1.0,0.0,0.0,0.0,25.0,39.0,52.7,Eredivisie
6,Chuba Akpom,eng ENG,"FW,MF",Ajax,27,1995,25,8,1049,11.7,11,3,14,11,0,0,0,0,9.0,9.0,1.9,10.9,23,38,74,0.94,0.26,1.2,0.94,1.2,0.77,0.16,0.93,0.77,0.93,41.0,21.0,51.2,3.52,1.8,0.27,0.52,10.7,0.0,0.22,2.0,2.0,415.0,15.0,33.0,178.0,210.0,78.0,415.0,24.0,12.0,50.0,8.0,33.3,256.0,1481.0,611.0,23.0,13.0,8.0,35.0,24.0,309.0,74.0,187.0,252.0,74.2,2752.0,655.0,107.0,136.0,78.7,60.0,73.0,82.2,10.0,12.0,83.3,2.1,1.1,17.0,19.0,12.0,0.0,244,6,0,1,2,5,0,1,0,0,0,2,12,29,2.49,22,0,1,5,1,0,4,0.34,3,0,0,1,0,0,15.0,8.0,2.0,10.0,3.0,4.0,8.0,50.0,9.0,0.0,9.0,3.0,18.0,13.0,0.0,0.0,11.0,5.0,0.0,0.0,0.0,35.0,42.0,48.8,Eredivisie
7,Hamdi Akujobi,nl NED,"DF,FW",Almere City,23,2000,23,16,1557,17.3,0,2,2,0,0,0,4,0,1.0,1.0,1.4,2.5,29,50,64,0.0,0.12,0.12,0.0,0.12,0.06,0.08,0.14,0.06,0.14,,,,,,,,,,,,,900.0,54.0,243.0,386.0,282.0,22.0,900.0,29.0,10.0,34.5,14.0,48.3,384.0,2223.0,990.0,29.0,24.0,5.0,32.0,7.0,416.0,64.0,443.0,682.0,65.0,7259.0,2818.0,224.0,260.0,86.2,161.0,266.0,60.5,40.0,114.0,35.1,2.0,0.6,15.0,36.0,16.0,7.0,549,131,5,2,5,43,126,0,0,0,0,2,10,35,2.03,30,1,2,1,1,0,5,0.29,5,0,0,0,0,0,39.0,29.0,23.0,13.0,3.0,11.0,24.0,45.8,35.0,11.0,24.0,16.0,55.0,33.0,0.0,0.0,27.0,2.0,0.0,0.0,1.0,89.0,17.0,45.9,Eredivisie
8,Hussein Ali,iq IRQ,"DF,FW",Heerenveen,21,2002,15,8,699,7.8,0,0,0,0,0,0,0,0,0.4,0.4,0.7,1.1,13,16,34,0.0,0.0,0.0,0.0,0.0,0.06,0.09,0.15,0.06,0.15,6.0,1.0,16.7,0.77,0.13,0.0,0.0,17.4,0.0,0.07,-0.4,-0.4,359.0,36.0,132.0,147.0,84.0,10.0,359.0,6.0,3.0,50.0,3.0,50.0,161.0,892.0,446.0,13.0,8.0,1.0,15.0,3.0,177.0,34.0,196.0,261.0,75.1,3376.0,814.0,96.0,107.0,89.7,78.0,103.0,75.7,19.0,37.0,51.4,0.6,-0.7,5.0,8.0,4.0,2.0,209,52,3,1,0,20,49,0,0,0,0,0,10,11,1.41,9,0,0,1,1,0,1,0.13,0,0,0,0,1,0,11.0,7.0,8.0,2.0,1.0,8.0,14.0,57.1,7.0,2.0,5.0,12.0,23.0,25.0,0.0,0.0,6.0,1.0,0.0,1.0,0.0,32.0,8.0,50.0,Eredivisie
9,Jordi Altena,nl NED,MF,Vitesse,19,2003,2,0,5,0.1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Eredivisie


## Major League Soccer (MLS)

### Standard

In [132]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_standard' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_standard')

        mls_stand_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_stand_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [133]:

# Flattening multiIndex columns
mls_stand_df.columns = ['_'.join(col) for col in mls_stand_df.columns.values]

# Dropping useless columns
mls_stand_df = mls_stand_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 36_level_0_Matches'], axis=1)

# Renaming player ID columns
mls_stand_df = mls_stand_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_stand_df = mls_stand_df.rename(columns={'Playing Time_MP':'MP',
                                        'Playing Time_Starts':'Starts', 
                                        'Playing Time_Min':'Min', 
                                        'Playing Time_90s':'No_90s',
                                        'Performance_Gls':'Gls', 
                                        'Performance_Ast':'Ast',
                                        'Performance_G+A':'G+A',
                                        'Performance_G-PK':'G-PK', 
                                        'Performance_PK':'PK', 
                                        'Performance_PKatt':'PKatt',
                                        'Performance_CrdY':'CrdY',
                                        'Performance_CrdR':'CrdR',
                                        'Expected_xG':'xG', 
                                        'Expected_npxG':'npxG',
                                        'Expected_xAG':'xAG', 
                                        'Expected_npxG+xAG':'npxG+xAG',
                                        'Progression_PrgC':'Prg_Carr',
                                        'Progression_PrgP':'Prg_Pass', 
                                        'Progression_PrgR':'Prg_Pass_Rec', 
                                        'Per 90 Minutes_Gls':'Gls_90',
                                        'Per 90 Minutes_Ast':'Ast_90', 
                                        'Per 90 Minutes_G+A':'G+A_90', 
                                        'Per 90 Minutes_G-PK':'G-PK_90',
                                        'Per 90 Minutes_G+A-PK':'G+A-PK_90', 
                                        'Per 90 Minutes_xG':'xG_90', 
                                        'Per 90 Minutes_xAG':'xAG_90',
                                        'Per 90 Minutes_xG+xAG':'xG+xAG_90', 
                                        'Per 90 Minutes_npxG':'npxG_90',
                                        'Per 90 Minutes_npxG+xAG':'npxG+xAG_90'})

In [None]:
#Examining the Players that have null values

mls_stand_df[mls_stand_df.isnull().any(axis=1)]

In [134]:

# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_stand_df.loc[mls_stand_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_stand_df.loc[mls_stand_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_stand_df = pd.concat([mls_stand_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [135]:
# Dropping rows with NANs
mls_stand_df = mls_stand_df.dropna(axis=0, how='any')


# Removing column header rows
mls_stand_df = mls_stand_df[mls_stand_df["Player"] != "Player"]

convert_dict = {'No_90s':float,
                'Gls_90':float,
                'Ast_90':float,
                'G+A_90':float,
                'G-PK_90':float,
                'G+A_90':float,
                'G+A-PK_90':float,
                'xG':float,
                'npxG':float,
                'xAG':float,
                'npxG+xAG':float,
                'xG_90':float,
                'xAG_90':float,
                'xG+xAG_90':float,
                'npxG_90':float,
                'npxG+xAG_90':float,
                
                'MP':int, 
                'Starts':int, 
                'Min':int, 
                'Gls':int, 
                'Ast':int, 
                'G+A':int, 
                'G-PK':int, 
                'PK':int, 
                'PKatt':int,
                'CrdY':int,
                'CrdR':int,
                'Prg_Carr':int,
                'Prg_Pass':int,
                'Prg_Pass_Rec':int
    }

mls_stand_df = mls_stand_df.astype(convert_dict)

### Shooting

In [136]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/shooting/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_shooting' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_shooting')

        mls_shoot_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_shoot_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [137]:
# Flattening multiIndex columns
mls_shoot_df.columns = ['_'.join(col) for col in mls_shoot_df.columns.values]

# Dropping useless columns
mls_shoot_df = mls_shoot_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 25_level_0_Matches','Standard_Gls','Standard_PK',
                              'Standard_PKatt', 'Expected_xG', 'Expected_npxG'], axis=1)

# Renaming player ID columns
mls_shoot_df = mls_shoot_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_shoot_df = mls_shoot_df.rename(columns={'Standard_Sh':'Shots', 
                                        'Standard_SoT':'SoT',
                                        'Standard_SoT%':'SoT_pct', 
                                        'Standard_Sh/90':'Shots_90', 
                                        'Standard_SoT/90':'SoT_90', 
                                        'Standard_G/Sh':'Gls_per_Sh',
                                        'Standard_G/SoT':'Gls_per_SoT', 
                                        'Standard_Dist':'Avg_Sh_Dist', 
                                        'Standard_FK':'Sh_FK', 
                                        'Expected_npxG/Sh':'npxG_per_Sh',
                                        'Expected_G-xG':'G-xG', 
                                        'Expected_np:G-xG':'npG-npxG'})

In [138]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_shoot_df.loc[mls_shoot_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_shoot_df.loc[mls_shoot_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_shoot_df = pd.concat([mls_shoot_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [None]:
mls_shoot_df.tail()

In [139]:
# Dropping rows with NANs
mls_shoot_df = mls_shoot_df.dropna(axis=0, how='any')


# Removing column header rows
mls_shoot_df = mls_shoot_df[mls_shoot_df["Player"] != "Player"]

convert_dict = {
    'Shots':int,
    'SoT':int,
    'SoT_pct':float,
    'Shots_90':float,
    'SoT_90':float,
    'Gls_per_Sh':float,
    'Gls_per_SoT':float,
    'Avg_Sh_Dist':float,
    'Sh_FK':int,
    'npxG_per_Sh':float,
    'G-xG':float,
    'npG-npxG':float
    }

mls_shoot_df = mls_shoot_df.astype(convert_dict)

### Possession

In [140]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/possession/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_possession' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_possession')

        mls_poss_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_poss_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [141]:
# Flattening multiIndex columns
mls_poss_df.columns = ['_'.join(col) for col in mls_poss_df.columns.values]

# Dropping useless columns
mls_poss_df = mls_poss_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 30_level_0_Matches'], axis=1)

# Renaming player ID columns
mls_poss_df = mls_poss_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_poss_df = mls_poss_df.rename(columns={'Touches_Touches':'Touches','Carries_Carries':'Carries'})

In [142]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_poss_df.loc[mls_poss_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_poss_df.loc[mls_poss_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_poss_df = pd.concat([mls_poss_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [None]:
mls_poss_df.tail()

In [143]:
# Dropping rows with NANs
mls_poss_df = mls_poss_df.dropna(axis=0, how='any')


# Removing column header rows
mls_poss_df = mls_poss_df[mls_poss_df["Player"] != "Player"]

convert_dict = {
                'Touches':int, 
                'Touches_Def Pen':int, 
                'Touches_Def 3rd':int, 
                'Touches_Mid 3rd':int,
                'Touches_Att 3rd':int, 
                'Touches_Att Pen':int, 
                'Touches_Live':int, 
                'Take-Ons_Att':int,
                'Take-Ons_Succ':int, 
                'Take-Ons_Succ%':float, 
                'Take-Ons_Tkld':int, 
                'Take-Ons_Tkld%':float,
                'Carries':int, 
                'Carries_TotDist':int, 
                'Carries_PrgDist':int, 
                'Carries_PrgC':int,
                'Carries_1/3':int, 
                'Carries_CPA':int, 
                'Carries_Mis':int, 
                'Carries_Dis':int,
                'Receiving_Rec':int, 
                'Receiving_PrgR':int
    }

mls_poss_df = mls_poss_df.astype(convert_dict)

### Passing

In [144]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/passing/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing')

        mls_pass_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_pass_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [145]:
# Flattening multiIndex columns
mls_pass_df.columns = ['_'.join(col) for col in mls_pass_df.columns.values]

# Dropping useless columns
mls_pass_df = mls_pass_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 22_level_0_Ast',
                                            'Unnamed: 31_level_0_Matches','Unnamed: 23_level_0_xAG','Unnamed: 30_level_0_PrgP'], axis=1)

# Renaming player ID columns
mls_pass_df = mls_pass_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_pass_df = mls_pass_df.rename(columns={'Expected_xA':'xA',
                                      'Expected_A-xAG':'A-xAG',
                                      'Unnamed: 26_level_0_KP':'KP',
                                      'Unnamed: 27_level_0_1/3':'Pass_Fin_3rd',
                                      'Unnamed: 28_level_0_PPA':'Pass_Pen_Area',
                                      'Unnamed: 29_level_0_CrsPA':'Cross_Pen_Area'})

In [146]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_pass_df.loc[mls_pass_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_pass_df.loc[mls_pass_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_pass_df = pd.concat([mls_pass_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [None]:
mls_pass_df.tail()

In [147]:
# Dropping rows with NANs
mls_pass_df = mls_pass_df.dropna(axis=0, how='any')


# Removing column header rows
mls_pass_df = mls_pass_df[mls_pass_df["Player"] != "Player"]

convert_dict = {
                'Total_Cmp':int, 
                'Total_Att':int, 
                'Total_Cmp%':float, 
                'Total_TotDist':int,
                'Total_PrgDist':int, 
                'Short_Cmp':int, 
                'Short_Att':int, 
                'Short_Cmp%':float, 
                'Medium_Cmp':int,
                'Medium_Att':int, 
                'Medium_Cmp%':float, 
                'Long_Cmp':int, 
                'Long_Att':int, 
                'Long_Cmp%':float, 
                'xA':float,
                'A-xAG':float, 
                'KP':int, 
                'Pass_Fin_3rd':int, 
                'Pass_Pen_Area':int, 
                'Cross_Pen_Area':int
    }

mls_pass_df = mls_pass_df.astype(convert_dict)

### Passing Types

In [148]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/passing_types/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_passing_types' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_passing_types')

        mls_ptype_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_ptype_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [149]:
# Flattening multiIndex columns
mls_ptype_df.columns = ['_'.join(col) for col in mls_ptype_df.columns.values]

# Dropping useless columns
mls_ptype_df = mls_ptype_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 23_level_0_Matches','Outcomes_Cmp',
                                                    'Unnamed: 8_level_0_Att'], axis=1)

# Renaming player ID columns
mls_ptype_df = mls_ptype_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_ptype_df = mls_ptype_df.rename(columns={'Outcomes_Blocks':'Pass_Blocked',
                                      'Outcomes_Off':'Pass_Offsides'})

In [150]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_ptype_df.loc[mls_ptype_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_ptype_df.loc[mls_ptype_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_ptype_df = pd.concat([mls_ptype_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [151]:
# Dropping rows with NANs
mls_ptype_df = mls_ptype_df.dropna(axis=0, how='any')


# Removing column header rows
mls_ptype_df = mls_ptype_df[mls_ptype_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Pass Types_Live':int, 
                'Pass Types_Dead':int, 
                'Pass Types_FK':int, 
                'Pass Types_TB':int,
                'Pass Types_Sw':int, 
                'Pass Types_Crs':int, 
                'Pass Types_TI':int, 
                'Pass Types_CK':int,
                'Corner Kicks_In':int, 
                'Corner Kicks_Out':int, 
                'Corner Kicks_Str':int,
                'Pass_Offsides':int, 
                'Pass_Blocked':int  
}
mls_ptype_df = mls_ptype_df.astype(convert_dict)

### Goal and Shot Creation (GCA)

In [152]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/gca/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_gca' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_gca')

        mls_gca_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_gca_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [153]:
# Flattening multiIndex columns
mls_gca_df.columns = ['_'.join(col) for col in mls_gca_df.columns.values]

# Dropping useless columns
mls_gca_df = mls_gca_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches'], axis=1)

# Renaming player ID columns
mls_gca_df = mls_gca_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_gca_df = mls_gca_df.rename(columns={'SCA_SCA':'SCA',
                                    'SCA_SCA90':'SCA_90',
                                    'SCA_Types_PassLive':'SCA_PassLive',
                                    'SCA_Types_PassDead':'SCA_PassDead',
                                    'SCA_Types_TO':'SCA_TO',
                                    'SCA_Types_Sh':'SCA_Shot',
                                    'SCA_Types_Fld':'SCA_Fouls_Drawn',
                                    'SCA_Types_Def':'SCA_Def_Action',
                                    'GCA_GCA':'GCA',
                                    'GCA_GCA90':'GCA_90',
                                    'GCA_Types_PassLive':'GCA_PassLive',
                                    'GCA_Types_PassDead':'GCA_PassDead',
                                    'GCA_Types_TO':'GCA_TO',
                                    'GCA_Types_Sh':'GCA_Shot',
                                    'GCA_Types_Fld':'GCA_Fouls_Drawn',
                                    'GCA_Types_Def':'GCA_Def_Action'})

In [154]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_gca_df.loc[mls_gca_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_gca_df.loc[mls_gca_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_gca_df = pd.concat([mls_gca_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [None]:
mls_gca_df.tail()

In [155]:
mls_gca_df = mls_gca_df.dropna(axis=0, how='any')


# Removing column header rows
mls_gca_df = mls_gca_df[mls_gca_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'SCA':int,
                'SCA_90':float, 
                'SCA Types_PassLive':int, 
                'SCA Types_PassDead':int, 
                'SCA Types_TO':int,
                'SCA Types_Sh':int, 
                'SCA Types_Fld':int, 
                'SCA Types_Def':int, 
                'GCA':int, 
                'GCA_90':float,
                'GCA Types_PassLive':int, 
                'GCA Types_PassDead':int, 
                'GCA Types_TO':int,
                'GCA Types_Sh':int, 
                'GCA Types_Fld':int, 
                'GCA Types_Def':int  
}
mls_gca_df = mls_gca_df.astype(convert_dict)

### Defensive Actions

In [157]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/defense/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_defense' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_defense')

        mls_def_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_def_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [158]:
# Flattening multiIndex columns
mls_def_df.columns = ['_'.join(col) for col in mls_def_df.columns.values]

# Dropping useless columns
mls_def_df = mls_def_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches', 'Challenges_Lost'], axis=1)

# Renaming player ID columns
mls_def_df = mls_def_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_def_df = mls_def_df.rename(columns={'Tackles_Tkl':'Tkls',
                                    'Tackles_TklW':'Tkls_Won',
                                    'Tackles_Def_3rd':'Tkl_Def_3rd',
                                    'Tackles_Mid_3rd':'Tkl_Mid_3rd',
                                    'Tackles_Att_3rd':'Tkl_Att_3rd',
                                    'Challenges_Tkl':'Drib_Tkl',
                                    'Challenges_Att':'Drib_Tkl_Att',
                                    'Challenges_Tkl%':'Drib_Tkl%',
                                    'Blocks_Blocks':'Def_Blocks',
                                    'Blocks_Sh':'Def_Shot_Blocks',
                                    'Blocks_Pass':'Def_Pass_Blocks',
                                    'Unnamed: 20_level_0_Int':'Int',
                                    'Unnamed: 21_level_0_Tkl+Int':'Tkl+Int',
                                    'Unnamed: 22_level_0_Clr':'Clearances',
                                    'Unnamed: 23_level_0_Err':'Errors'})

In [159]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_def_df.loc[mls_def_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_def_df.loc[mls_def_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_def_df = pd.concat([mls_def_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [None]:
mls_def_df.tail()

In [160]:
mls_def_df = mls_def_df.dropna(axis=0, how='any')


# Removing column header rows
mls_def_df = mls_def_df[mls_def_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'Tkls':int,
                'Tkls_Won':int, 
                'Tackles_Def 3rd':int, 
                'Tackles_Mid 3rd':int, 
                'Tackles_Att 3rd':int,
                'Drib_Tkl':int, 
                'Drib_Tkl_Att':int, 
                'Drib_Tkl%':float, 
                'Def_Blocks':int,
                'Def_Shot_Blocks':int, 
                'Def_Pass_Blocks':int, 
                'Int':int, 
                'Tkl+Int':int, 
                'Clearances':int,
                'Errors':int 
}
mls_def_df = mls_def_df.astype(convert_dict)

### Miscellaneous Stats

In [161]:
# To scrape leagues outside of the Big 5. USE THIS STRUCTURE
url = 'https://fbref.com/en/comps/22/misc/Major-League-Soccer-Stats'

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

comments = soup.find_all(string=lambda text: isinstance(text, Comment))

for comment in comments:
    # Check if the comment contains the target <div>.
    if 'div_stats_misc' in comment:
        # Parse the comment as HTML and extract target <div>.
        div = BeautifulSoup(comment, 'html.parser').find(id='div_stats_misc')

        mls_misc_df = pd.read_html(StringIO(str(div)))[0]
        print(mls_misc_df.iloc[:, :6])
else:
    print("Unable to find table.")

    Unnamed: 0_level_0    Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk                Player             Nation   
0                    1            Liel Abada             il ISR   
1                    2  Jose Casas de Abadal             us USA   
2                    3            Luis Abram             pe PER   
3                    4        Lalas Abubakar             gh GHA   
4                    5         Kellyn Acosta             us USA   
..                 ...                   ...                ...   
753                 Rk                Player             Nation   
754                726      Walker Zimmerman             us USA   
755                727           Rida Zouhir             ca CAN   
756                728         Dario Župarić             ba BIH   
757                729       Nökkvi Þórisson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  
                   Pos              Squad                Age  
0 

In [162]:
# Flattening multiIndex columns
mls_misc_df.columns = ['_'.join(col) for col in mls_misc_df.columns.values]

# Dropping useless columns
mls_misc_df = mls_misc_df.drop(['Unnamed: 0_level_0_Rk','Unnamed: 7_level_0_90s','Unnamed: 24_level_0_Matches','Aerial Duels_Lost',
                         'Performance_CrdY','Performance_CrdR','Performance_Crs','Performance_Int','Performance_TklW',
                         'Performance_Fld'], axis=1)

# Renaming player ID columns
mls_misc_df = mls_misc_df.rename(columns={'Unnamed: 1_level_0_Player':'Player',
                                        'Unnamed: 2_level_0_Nation':'Nation', 
                                        'Unnamed: 3_level_0_Pos':'Position',
                                        'Unnamed: 4_level_0_Squad':'Squad',
                                        'Unnamed: 5_level_0_Age':'Age',
                                        'Unnamed: 6_level_0_Born':'Born'})

# Renaming stat group columns
mls_misc_df = mls_misc_df.rename(columns={'Performance_2CrdY':'CrdY2',
                                    'Performance_Fls':'Fls_Comm',
                                    'Performance_Off':'Offsides',
                                    'Performance_PKwon':'PK_Won',
                                    'Performance_PKcon':'PK_Conv',
                                    'Performance_OG':'Own_Goal',
                                    'Performance_Recov':'Ball_Recoveries'})

In [163]:
# Fixing nan values for players that we want to include in our dataset (to be used for all stat groups)

# Tyrese Spicer - Toronto FC
tyrese_spicer = mls_misc_df.loc[mls_misc_df['Player']== 'Tyrese Spicer']
tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')

# Tristan Muyumba- Atlanta Utd
tristan_muyumba = mls_misc_df.loc[mls_misc_df['Player']== 'Tristan Muyumba'] 
tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


nan_players = pd.concat([tyrese_spicer,tristan_muyumba], ignore_index=True)

mls_misc_df = pd.concat([mls_misc_df, nan_players], ignore_index=True)
# pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tyrese_spicer.Nation = tyrese_spicer.Nation.fillna('tt TTG')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tristan_muyumba.Nation = tristan_muyumba.Nation.fillna('fr FRA')


In [164]:
mls_misc_df = mls_misc_df.dropna(axis=0, how='any')


# Removing column header rows
mls_misc_df = mls_misc_df[mls_misc_df["Player"] != "Player"]

#Need to update
convert_dict = {
                'CrdY2':int,
                'Fls_Comm':int, 
                'Offsides':int, 
                'PK_Won':int, 
                'PK_Conv':int, 
                'Own_Goal':int,
                'Ball_Recoveries':int, 
                'Aerial Duels_Won':int, 
                'Aerial Duels_Won%':float 
}
mls_misc_df = mls_misc_df.astype(convert_dict)

### Merging stat group dataframes

In [165]:
mls_conditions_join = ['Player','Nation','Position','Squad','Age','Born']

mls_outfield_df = pd.merge(mls_stand_df, mls_shoot_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

mls_outfield_df = pd.merge(mls_outfield_df, mls_poss_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

mls_outfield_df = pd.merge(mls_outfield_df, mls_pass_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

mls_outfield_df = pd.merge(mls_outfield_df, mls_ptype_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

mls_outfield_df = pd.merge(mls_outfield_df, mls_gca_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

mls_outfield_df = pd.merge(mls_outfield_df, mls_def_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

mls_outfield_df = pd.merge(mls_outfield_df, mls_misc_df, left_on=mls_conditions_join, right_on=mls_conditions_join,
                          how='left')

In [166]:
mls_outfield_df['League'] = 'Major League Soccer'

In [167]:
mls_outfield_df.head(20)

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Liel Abada,il ISR,FW,Charlotte,22-291,2001,15,13,1091,12.1,4,1,5,4,0,0,1,0,4.6,4.6,1.7,6.3,56,40,106,0.33,0.08,0.41,0.33,0.41,0.38,0.14,0.52,0.38,0.52,35.0,19.0,54.3,2.89,1.57,0.11,0.21,16.4,0.0,0.13,-0.6,-0.6,419.0,4.0,38.0,124.0,266.0,64.0,419.0,35.0,13.0,37.1,21.0,60.0,282.0,2182.0,1282.0,56.0,23.0,26.0,22.0,26.0,311.0,106.0,206.0,299.0,68.9,2909.0,994.0,120.0,155.0,77.4,63.0,91.0,69.2,10.0,27.0,37.0,2.5,-0.7,16.0,16.0,19.0,3.0,283,16,0,2,2,30,2,14,6,4,1,0,8,40,3.3,32,2,2,2,2,0,3,0.25,3,0,0,0,0,0,9.0,7.0,2.0,3.0,4.0,5.0,15.0,33.3,6.0,0.0,6.0,5.0,14.0,3.0,0.0,0.0,2.0,7.0,0.0,0.0,0.0,43.0,1.0,7.1,Major League Soccer
1,Jose Casas de Abadal,us USA,"FW,DF",Inter Miami,24-030,2000,2,0,31,0.3,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,33.0,0.0,4.0,20.0,10.0,0.0,33.0,1.0,1.0,100.0,0.0,0.0,19.0,133.0,40.0,1.0,0.0,0.0,0.0,0.0,25.0,3.0,23.0,30.0,76.7,362.0,114.0,9.0,12.0,75.0,12.0,13.0,92.3,1.0,2.0,50.0,0.0,0.0,0.0,4.0,0.0,0.0,29,1,0,0,0,1,0,1,0,1,0,0,1,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,,,,,,,,,,Major League Soccer
2,Luis Abram,pe PER,DF,Atlanta Utd,28-144,1996,14,11,1035,11.5,0,0,0,0,0,0,2,0,0.7,0.7,0.1,0.8,2,31,3,0.0,0.0,0.0,0.0,0.0,0.06,0.01,0.07,0.06,0.07,8.0,1.0,12.5,0.7,0.09,0.0,0.0,9.4,0.0,0.09,-0.7,-0.7,729.0,111.0,408.0,300.0,25.0,19.0,729.0,3.0,2.0,66.7,1.0,33.3,404.0,1835.0,1102.0,2.0,3.0,0.0,3.0,1.0,509.0,3.0,563.0,628.0,89.6,11471.0,4671.0,167.0,174.0,96.0,321.0,349.0,92.0,71.0,98.0,72.4,0.1,-0.1,3.0,26.0,0.0,0.0,599,28,9,0,5,0,0,0,0,0,0,1,2,12,1.04,10,0,0,1,0,1,0,0.0,0,0,0,0,0,0,8.0,5.0,4.0,1.0,3.0,3.0,8.0,37.5,10.0,8.0,2.0,8.0,16.0,49.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,39.0,15.0,50.0,Major League Soccer
3,Lalas Abubakar,gh GHA,DF,Rapids,29-208,1994,11,8,693,7.7,0,0,0,0,0,0,2,0,0.3,0.3,0.0,0.3,2,20,0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.04,0.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,337.0,379.0,88.9,6247.0,1834.0,109.0,121.0,90.1,205.0,224.0,91.5,20.0,29.0,69.0,0.0,0.0,0.0,17.0,0.0,0.0,373,5,5,0,0,0,0,0,0,0,0,1,0,1,0.13,1,0,0,0,0,0,1,0.13,1,0,0,0,0,0,4.0,3.0,3.0,1.0,0.0,2.0,2.0,100.0,5.0,5.0,0.0,11.0,15.0,38.0,0.0,0.0,6.0,0.0,0.0,1.0,1.0,22.0,22.0,64.7,Major League Soccer
4,Kellyn Acosta,us USA,MF,Fire,28-362,1995,24,20,1690,18.8,2,0,2,2,0,0,5,0,1.0,1.0,1.1,2.0,13,76,29,0.11,0.0,0.11,0.11,0.11,0.05,0.06,0.11,0.05,0.11,17.0,5.0,29.4,0.91,0.27,0.12,0.4,26.2,3.0,0.06,1.0,1.0,1090.0,14.0,205.0,639.0,254.0,9.0,1090.0,22.0,12.0,54.5,10.0,45.5,608.0,2617.0,1024.0,13.0,14.0,1.0,23.0,14.0,736.0,29.0,729.0,937.0,77.8,12661.0,3528.0,341.0,388.0,87.9,317.0,368.0,86.1,60.0,146.0,41.1,1.5,-1.1,18.0,61.0,12.0,3.0,832,101,45,2,6,75,5,51,29,16,0,4,9,41,2.18,28,8,2,1,1,1,2,0.11,2,0,0,0,0,0,33.0,25.0,12.0,18.0,3.0,14.0,36.0,38.9,15.0,2.0,13.0,26.0,59.0,8.0,1.0,0.0,29.0,0.0,0.0,0.0,0.0,100.0,9.0,42.9,Major League Soccer
5,Luciano Acosta,ar ARG,MF,FC Cincinnati,30-050,1994,23,22,1986,22.1,11,15,26,9,2,3,6,0,8.2,5.8,9.9,15.7,109,195,161,0.5,0.68,1.18,0.41,1.09,0.37,0.45,0.82,0.26,0.71,61.0,20.0,32.8,2.76,0.91,0.15,0.45,20.9,10.0,0.1,2.8,3.2,1544.0,4.0,86.0,660.0,818.0,122.0,1541.0,173.0,73.0,42.2,95.0,54.9,944.0,5298.0,2924.0,109.0,74.0,50.0,58.0,35.0,1127.0,161.0,896.0,1219.0,73.5,15447.0,5615.0,473.0,569.0,83.1,298.0,370.0,80.5,103.0,189.0,54.5,10.5,5.1,82.0,116.0,81.0,7.0,1021,192,42,19,23,120,19,89,48,20,0,6,40,174,7.89,107,31,20,8,6,2,27,1.22,15,4,7,0,1,0,32.0,21.0,11.0,18.0,3.0,15.0,27.0,55.6,20.0,0.0,20.0,11.0,43.0,4.0,0.0,0.0,36.0,16.0,1.0,0.0,0.0,91.0,7.0,50.0,Major League Soccer
6,Jordan Adebayo-Smith,ng NGA,"FW,MF",Minnesota Utd,24-191,2000,11,1,254,2.8,0,0,0,0,0,0,2,0,0.4,0.4,0.0,0.4,3,3,15,0.0,0.0,0.0,0.0,0.0,0.14,0.01,0.15,0.14,0.15,5.0,3.0,60.0,1.77,1.06,0.0,0.0,14.3,0.0,0.08,-0.4,-0.4,80.0,0.0,8.0,37.0,36.0,6.0,80.0,3.0,1.0,33.3,2.0,66.7,37.0,169.0,78.0,3.0,1.0,2.0,6.0,0.0,50.0,15.0,,,,,,,,,,,,,,,,,,,,,51,1,0,0,0,3,0,0,0,0,0,0,1,4,1.42,2,0,1,1,0,0,1,0.35,1,0,0,0,0,0,5.0,3.0,5.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,4.0,2.0,7.0,1.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,5.0,5.0,26.3,Major League Soccer
7,Sam Adekugbe,ca CAN,"MF,FW",Vancouver W'caps,29-186,1995,7,3,246,2.7,0,1,1,0,0,0,2,0,0.2,0.2,1.1,1.3,10,15,34,0.0,0.37,0.37,0.0,0.37,0.06,0.42,0.47,0.06,0.47,,,,,,,,,,,,,187.0,5.0,20.0,94.0,73.0,11.0,187.0,1.0,0.0,0.0,0.0,0.0,129.0,515.0,248.0,10.0,7.0,2.0,6.0,0.0,155.0,34.0,133.0,164.0,81.1,2115.0,595.0,61.0,68.0,89.7,64.0,76.0,84.2,3.0,9.0,33.3,1.0,-0.1,7.0,9.0,7.0,3.0,159,4,2,0,0,12,2,0,0,0,0,1,4,8,2.91,7,0,0,1,0,0,2,0.73,2,0,0,0,0,0,3.0,1.0,2.0,1.0,0.0,1.0,1.0,100.0,1.0,0.0,1.0,2.0,5.0,4.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,11.0,4.0,66.7,Major League Soccer
8,Samuel Adeniran,us USA,"FW,MF",St. Louis,25-294,1998,17,4,596,6.6,2,1,3,2,0,0,2,0,2.2,2.2,0.8,3.0,16,9,51,0.3,0.15,0.45,0.3,0.45,0.34,0.12,0.45,0.34,0.45,26.0,6.0,23.1,3.93,0.91,0.08,0.33,16.6,0.0,0.09,-0.2,-0.2,222.0,6.0,15.0,76.0,135.0,39.0,222.0,17.0,6.0,35.3,9.0,52.9,130.0,919.0,478.0,16.0,11.0,10.0,29.0,11.0,163.0,51.0,89.0,125.0,71.2,1250.0,216.0,50.0,65.0,76.9,28.0,41.0,68.3,5.0,8.0,62.5,0.6,0.2,8.0,8.0,5.0,1.0,106,19,1,0,1,4,4,0,0,0,0,0,2,12,1.81,7,0,2,3,0,0,2,0.3,1,0,1,0,0,0,,,,,,,,,,,,,,,,0.0,12.0,3.0,0.0,0.0,0.0,14.0,20.0,55.6,Major League Soccer
9,Leonardo Afonso,br BRA,"FW,MF",Inter Miami,23-007,2001,10,1,226,2.5,2,0,2,2,0,0,0,0,0.7,0.7,0.2,0.9,7,8,30,0.8,0.0,0.8,0.8,0.8,0.28,0.08,0.36,0.28,0.36,5.0,2.0,40.0,1.99,0.8,0.4,1.0,15.9,0.0,0.14,1.3,1.3,132.0,3.0,18.0,58.0,59.0,11.0,132.0,7.0,5.0,71.4,2.0,28.6,75.0,414.0,211.0,7.0,4.0,1.0,8.0,1.0,89.0,30.0,59.0,93.0,63.4,858.0,192.0,28.0,40.0,70.0,21.0,27.0,77.8,4.0,8.0,50.0,0.3,-0.2,4.0,4.0,3.0,0.0,90,1,0,0,2,5,0,0,0,0,0,2,6,11,4.38,8,0,2,0,1,0,2,0.8,0,0,1,0,1,0,8.0,6.0,5.0,2.0,1.0,2.0,6.0,33.3,2.0,0.0,2.0,5.0,13.0,2.0,0.0,0.0,4.0,1.0,1.0,0.0,0.0,10.0,4.0,30.8,Major League Soccer


In [170]:
# outfield_df = pd.concat([b5_outfield_df, championship_outfield_df],ignore_index=True)
# outfield_df = pd.concat([outfield_df, primeiraliga_outfield_df],ignore_index=True)
# outfield_df = pd.concat([outfield_df, eredivisie_outfield_df],ignore_index=True)
# outfield_df = pd.concat([outfield_df, mls_outfield_df],ignore_index=True)

outfield_df = pd.concat([b5_outfield_df, championship_outfield_df, primeiraliga_outfield_df,
                         eredivisie_outfield_df, mls_outfield_df],ignore_index=True)

# outfield_df = outfield_df.append(primeiraliga_outfield_df)
# outfield_df = outfield_df.append(eredivisie_outfield_df)



In [171]:
outfield_df.shape

(5350, 143)

In [173]:
outfield_df.tail(10)

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
5340,Maya Yoshida,jp JPN,DF,LA Galaxy,35-331,1988,25,25,2225,24.7,2,0,2,2,0,0,1,0,1.2,1.2,0.1,1.2,1,44,4,0.08,0.0,0.08,0.08,0.08,0.05,0.0,0.05,0.05,0.05,16.0,6.0,37.5,0.65,0.24,0.13,0.33,10.2,0.0,0.07,0.8,0.8,1909.0,288.0,1105.0,767.0,43.0,31.0,1909.0,3.0,2.0,66.7,1.0,33.3,1097.0,3935.0,1765.0,1.0,2.0,0.0,7.0,3.0,1415.0,4.0,1555.0,1683.0,92.4,28589.0,10770.0,575.0,604.0,95.2,830.0,858.0,96.7,126.0,182.0,69.2,0.4,-0.1,2.0,56.0,1.0,0.0,1625,58,38,0,8,0,2,0,0,0,0,0,7,17,0.69,11,0,0,6,0,0,3,0.12,2,0,0,1,0,0,17.0,10.0,10.0,6.0,1.0,8.0,17.0,47.1,33.0,25.0,8.0,34.0,51.0,102.0,3.0,0.0,17.0,1.0,0.0,0.0,0.0,89.0,52.0,69.3,Major League Soccer
5341,Jackson Yueill,us USA,MF,SJ Earthquakes,27-123,1997,22,21,1843,20.5,0,0,0,0,0,0,3,1,1.3,1.3,2.0,3.3,20,113,30,0.0,0.0,0.0,0.0,0.0,0.06,0.1,0.16,0.06,0.16,16.0,3.0,18.8,0.78,0.15,0.0,0.0,21.2,0.0,0.08,-1.3,-1.3,1267.0,45.0,284.0,775.0,217.0,17.0,1267.0,14.0,8.0,57.1,6.0,42.9,789.0,3522.0,1731.0,20.0,20.0,2.0,21.0,9.0,925.0,30.0,995.0,1119.0,88.9,16470.0,5775.0,504.0,531.0,94.9,400.0,440.0,90.9,71.0,107.0,66.4,1.7,-2.0,19.0,109.0,15.0,4.0,1072,46,40,1,15,10,5,0,0,0,0,1,12,41,2.0,39,1,0,1,0,0,1,0.05,1,0,0,0,0,0,39.0,21.0,8.0,27.0,4.0,19.0,37.0,51.4,9.0,1.0,8.0,16.0,55.0,29.0,1.0,1.0,13.0,0.0,0.0,0.0,0.0,95.0,7.0,43.8,Major League Soccer
5342,Gyasi Zardes,us USA,FW,Austin,32-322,1991,24,7,879,9.8,3,2,5,3,0,0,2,0,2.8,2.8,0.3,3.1,9,12,35,0.31,0.2,0.51,0.31,0.51,0.29,0.03,0.32,0.29,0.32,21.0,7.0,33.3,2.15,0.72,0.14,0.43,12.1,0.0,0.14,0.2,0.2,305.0,16.0,63.0,121.0,124.0,39.0,305.0,16.0,4.0,25.0,10.0,62.5,129.0,609.0,210.0,9.0,1.0,4.0,18.0,16.0,205.0,35.0,126.0,185.0,68.1,1716.0,305.0,80.0,102.0,78.4,32.0,51.0,62.7,8.0,10.0,80.0,0.4,1.7,5.0,6.0,2.0,1.0,181,4,0,0,0,5,0,0,0,0,0,0,5,14,1.44,8,0,1,5,0,0,4,0.41,2,0,1,1,0,0,16.0,12.0,8.0,7.0,1.0,6.0,9.0,66.7,6.0,2.0,4.0,5.0,21.0,15.0,1.0,0.0,18.0,8.0,0.0,0.0,0.0,28.0,31.0,37.8,Major League Soccer
5343,Eriq Zavaleta,sv SLV,DF,LA Galaxy,31-353,1992,5,1,161,1.8,1,0,1,1,0,0,0,1,0.1,0.1,0.0,0.1,0,1,0,0.56,0.0,0.56,0.56,0.56,0.06,0.0,0.06,0.06,0.06,2.0,1.0,50.0,1.12,0.56,0.5,1.0,11.1,0.0,0.06,0.9,0.9,116.0,23.0,89.0,24.0,3.0,3.0,116.0,1.0,1.0,100.0,0.0,0.0,68.0,205.0,84.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,87.0,97.0,89.7,1350.0,508.0,43.0,43.0,100.0,42.0,45.0,93.3,1.0,7.0,14.3,0.0,0.0,0.0,0.0,0.0,0.0,89,8,2,0,0,0,0,0,0,0,0,0,0,1,0.56,1,0,0,0,0,0,0,0.0,0,0,0,0,0,0,3.0,2.0,3.0,0.0,0.0,2.0,2.0,100.0,3.0,3.0,0.0,2.0,5.0,9.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,5.0,1.0,25.0,Major League Soccer
5344,Sean Zawadzki,us USA,"MF,DF",Crew,24-090,2000,20,14,1318,14.6,1,1,2,1,0,0,3,0,0.6,0.6,1.3,1.8,20,71,20,0.07,0.07,0.14,0.07,0.14,0.04,0.09,0.13,0.04,0.13,10.0,2.0,20.0,0.68,0.14,0.1,0.5,16.9,0.0,0.06,0.4,0.4,1058.0,51.0,300.0,617.0,148.0,11.0,1058.0,19.0,7.0,36.8,11.0,57.9,615.0,2545.0,1313.0,20.0,15.0,4.0,7.0,8.0,767.0,20.0,838.0,925.0,90.6,13666.0,4588.0,428.0,460.0,93.0,349.0,367.0,95.1,49.0,67.0,73.1,0.6,-0.3,6.0,73.0,6.0,0.0,886,36,19,2,3,1,16,0,0,0,0,3,12,30,2.05,22,0,0,2,2,4,5,0.34,2,0,0,2,0,1,36.0,20.0,16.0,14.0,6.0,16.0,26.0,61.5,16.0,4.0,12.0,20.0,56.0,28.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,73.0,18.0,75.0,Major League Soccer
5345,Walker Zimmerman,us USA,DF,Nashville,31-062,1993,16,13,1118,12.4,1,1,2,1,0,0,0,0,0.6,0.6,0.4,1.0,2,36,5,0.08,0.08,0.16,0.08,0.16,0.05,0.03,0.08,0.05,0.08,7.0,2.0,28.6,0.56,0.16,0.14,0.5,9.6,0.0,0.09,0.4,0.4,899.0,133.0,517.0,352.0,32.0,18.0,899.0,2.0,2.0,100.0,0.0,0.0,520.0,2402.0,1274.0,2.0,3.0,0.0,4.0,2.0,608.0,5.0,668.0,765.0,87.3,14083.0,5882.0,165.0,177.0,93.2,412.0,447.0,92.2,88.0,131.0,67.2,0.5,0.6,4.0,40.0,3.0,0.0,736,28,24,0,3,1,0,0,0,0,0,1,4,14,1.13,10,1,0,0,2,1,2,0.16,1,0,0,0,1,0,17.0,8.0,10.0,5.0,2.0,9.0,16.0,56.3,14.0,10.0,4.0,29.0,46.0,55.0,0.0,0.0,6.0,1.0,1.0,0.0,1.0,47.0,35.0,68.6,Major League Soccer
5346,Rida Zouhir,ca CAN,"MF,DF",CF Montréal,20-240,2003,7,1,178,2.0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,1,16,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,152.0,4.0,31.0,87.0,34.0,1.0,152.0,3.0,2.0,66.7,1.0,33.3,88.0,376.0,157.0,1.0,1.0,0.0,2.0,2.0,119.0,1.0,120.0,138.0,87.0,1832.0,473.0,63.0,68.0,92.6,45.0,50.0,90.0,6.0,10.0,60.0,0.1,0.0,0.0,10.0,1.0,1.0,131,6,2,2,0,7,0,4,2,1,0,1,2,4,2.08,4,0,0,0,0,0,0,0.0,0,0,0,0,0,0,5.0,3.0,1.0,2.0,2.0,2.0,3.0,66.7,2.0,0.0,2.0,2.0,7.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,7.0,1.0,33.3,Major League Soccer
5347,Dario Župarić,ba BIH,DF,Portland Timbers,32-078,1992,19,15,1439,16.0,0,0,0,0,0,0,4,0,0.2,0.2,0.0,0.3,2,38,4,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.01,0.02,3.0,1.0,33.3,0.19,0.06,0.0,0.0,17.2,0.0,0.08,-0.2,-0.2,1030.0,99.0,524.0,490.0,19.0,5.0,1030.0,2.0,1.0,50.0,0.0,0.0,564.0,2888.0,1388.0,2.0,2.0,0.0,7.0,0.0,637.0,4.0,738.0,853.0,86.5,15124.0,5444.0,201.0,226.0,88.9,433.0,480.0,90.2,96.0,129.0,74.4,0.6,0.0,1.0,35.0,4.0,0.0,813,38,34,1,3,0,1,0,0,0,0,2,5,12,0.75,10,0,0,1,1,0,1,0.06,1,0,0,0,0,0,30.0,19.0,23.0,5.0,2.0,15.0,23.0,65.2,27.0,13.0,14.0,19.0,49.0,80.0,1.0,0.0,15.0,0.0,0.0,0.0,0.0,56.0,41.0,68.3,Major League Soccer
5348,Nökkvi Þórisson,is ISL,"FW,MF",St. Louis,24-342,1999,20,7,793,8.8,2,0,2,2,0,0,1,0,4.5,4.5,1.4,5.9,19,18,68,0.23,0.0,0.23,0.23,0.23,0.51,0.15,0.67,0.51,0.67,37.0,9.0,24.3,4.2,1.02,0.05,0.22,13.5,0.0,0.12,-2.5,-2.5,317.0,7.0,38.0,110.0,171.0,56.0,317.0,21.0,11.0,52.4,10.0,47.6,179.0,986.0,530.0,19.0,10.0,11.0,21.0,9.0,224.0,68.0,143.0,211.0,67.8,2093.0,505.0,91.0,125.0,72.8,33.0,48.0,68.8,14.0,20.0,70.0,0.9,-1.4,12.0,9.0,6.0,0.0,198,12,1,4,1,5,0,0,0,0,0,1,5,27,3.06,17,0,3,6,1,0,2,0.23,0,0,0,2,0,0,10.0,6.0,4.0,3.0,3.0,2.0,7.0,28.6,8.0,0.0,8.0,2.0,12.0,7.0,0.0,0.0,18.0,9.0,0.0,1.0,0.0,22.0,26.0,59.1,Major League Soccer
5349,Tristan Muyumba,fr FRA,MF,Atlanta Utd,27-135,1997,24,22,1823,20.3,0,2,2,0,0,0,4,0,1.3,1.3,1.0,2.3,32,90,31,0.0,0.1,0.1,0.0,0.1,0.07,0.05,0.11,0.07,0.11,26.0,1.0,3.8,1.28,0.05,0.0,0.0,20.4,0.0,0.05,-1.3,-1.3,1275.0,39.0,296.0,699.0,291.0,45.0,1275.0,59.0,36.0,61.0,22.0,37.3,722.0,3687.0,1590.0,32.0,13.0,13.0,24.0,17.0,922.0,31.0,937.0,1060.0,88.4,14939.0,3923.0,470.0,514.0,91.4,361.0,392.0,92.1,63.0,82.0,76.8,1.5,1.0,18.0,75.0,15.0,0.0,1025,34,25,4,10,3,8,0,0,0,0,1,24,49,2.42,37,1,4,4,2,1,3,0.15,3,0,0,0,0,0,65.0,46.0,29.0,31.0,5.0,30.0,50.0,60.0,19.0,5.0,14.0,17.0,82.0,17.0,0.0,0.0,27.0,0.0,0.0,1.0,0.0,102.0,4.0,23.5,Major League Soccer


In [174]:
outfield_df = outfield_df.fillna(0)

outfield_df[outfield_df.isnull().any(axis=1)]

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League


In [175]:
outfield_df.head()

Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,xG,npxG,xAG,npxG+xAG,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Gls_90,Ast_90,G+A_90,G-PK_90,G+A-PK_90,xG_90,xAG_90,xG+xAG_90,npxG_90,npxG+xAG_90,Shots,SoT,SoT_pct,Shots_90,SoT_90,Gls_per_Sh,Gls_per_SoT,Avg_Sh_Dist,Sh_FK,npxG_per_Sh,G-xG,npG-npxG,Touches,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Touches_Live,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Succ%,Take-Ons_Tkld,Take-Ons_Tkld%,Carries,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,xA,A-xAG,KP,Pass_Fin_3rd,Pass_Pen_Area,Cross_Pen_Area,Pass Types_Live,Pass Types_Dead,Pass Types_FK,Pass Types_TB,Pass Types_Sw,Pass Types_Crs,Pass Types_TI,Pass Types_CK,Corner Kicks_In,Corner Kicks_Out,Corner Kicks_Str,Pass_Offsides,Pass_Blocked,SCA,SCA_90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA_90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Tkls,Tkls_Won,Tackles_Def 3rd,Tackles_Mid 3rd,Tackles_Att 3rd,Drib_Tkl,Drib_Tkl_Att,Drib_Tkl%,Def_Blocks,Def_Shot_Blocks,Def_Pass_Blocks,Int,Tkl+Int,Clearances,Errors,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,1237,13.7,0,1,1,0,0,0,1,0,0.0,0.0,0.8,0.9,22,43,26,0.0,0.07,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,711.0,43.0,252.0,303.0,165.0,11.0,711.0,34.0,14.0,41.2,12.0,35.3,364.0,2174.0,1121.0,22.0,12.0,7.0,13.0,8.0,371.0,26.0,450,581,77.5,7402,2789,220,248,88.7,188,235,80.0,34,63,54.0,0.9,0.2,7,25,13,2,453,127,11,2,3,13,116,0,0,0,0,1,23,23,1.68,16,4,0,0,3,0,2,0.15,2,0,0,0,0,0,29.0,19.0,20.0,7.0,2.0,20.0,34.0,58.8,9.0,5.0,4.0,8.0,37.0,27.0,0.0,0.0,12.0,2.0,0.0,1.0,0.0,75.0,5.0,31.3,Premier League
1,Brenden Aaronson,us USA,"MF,FW",Union Berlin,22,2000,30,14,1267,14.1,2,2,4,2,0,0,3,1,2.0,2.0,1.9,3.8,37,56,91,0.14,0.14,0.28,0.14,0.28,0.14,0.13,0.27,0.14,0.27,18.0,7.0,38.9,1.28,0.5,0.11,0.29,18.4,0.0,0.11,0.0,0.0,675.0,11.0,108.0,301.0,293.0,47.0,675.0,77.0,34.0,44.2,41.0,53.2,406.0,2721.0,1387.0,37.0,29.0,9.0,41.0,38.0,457.0,91.0,365,472,77.3,4890,1506,206,240,85.8,105,130,80.8,19,32,59.4,2.0,0.1,22,30,14,3,439,29,3,5,1,22,12,6,2,3,0,4,21,53,3.76,41,1,8,3,0,0,8,0.57,6,0,2,0,0,0,32.0,18.0,13.0,13.0,6.0,16.0,32.0,50.0,26.0,1.0,25.0,2.0,34.0,4.0,0.0,1.0,15.0,5.0,0.0,0.0,0.0,88.0,13.0,44.8,Bundesliga
2,Paxten Aaronson,us USA,MF,Eint Frankfurt,19,2003,7,1,101,1.1,0,1,1,0,0,0,0,0,0.1,0.1,0.1,0.2,2,5,7,0.0,0.89,0.89,0.0,0.89,0.11,0.07,0.19,0.11,0.19,2.0,2.0,100.0,1.78,1.78,0.0,0.0,15.1,0.0,0.06,-0.1,-0.1,72.0,0.0,6.0,39.0,28.0,5.0,72.0,7.0,2.0,28.6,4.0,57.1,43.0,193.0,50.0,2.0,0.0,1.0,5.0,4.0,46.0,7.0,41,50,82.0,576,71,20,25,80.0,20,22,90.9,0,2,0.0,0.1,0.9,1,4,2,0,48,2,1,0,0,0,1,0,0,0,0,0,0,1,0.89,1,0,0,0,0,0,1,0.89,1,0,0,0,0,0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,100.0,2.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,3.0,100.0,Bundesliga
3,Keyliane Abdallah,fr FRA,FW,Marseille,17,2006,1,0,4,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ligue 1
4,Yunis Abdelhamid,ma MAR,DF,Reims,35,1987,31,31,2781,30.9,4,0,4,3,1,1,5,0,3.4,2.6,0.3,2.9,36,137,9,0.13,0.0,0.13,0.1,0.1,0.11,0.01,0.12,0.09,0.09,21.0,7.0,33.3,0.68,0.23,0.14,0.43,15.0,0.0,0.13,0.6,0.4,2185.0,293.0,976.0,1119.0,114.0,35.0,2184.0,15.0,8.0,53.3,7.0,46.7,1506.0,8663.0,4921.0,36.0,19.0,0.0,23.0,4.0,1403.0,9.0,1552,1836,84.5,29618,9672,487,548,88.9,893,976,91.5,141,252,56.0,0.6,-0.3,8,129,3,0,1650,178,75,2,13,3,29,0,0,0,0,8,20,24,0.78,19,1,0,3,1,0,1,0.03,0,0,0,1,0,0,64.0,35.0,36.0,23.0,5.0,26.0,45.0,57.8,51.0,32.0,19.0,39.0,103.0,109.0,2.0,0.0,26.0,0.0,0.0,0.0,1.0,149.0,61.0,62.2,Ligue 1


In [176]:
outfield_df.to_csv('fbref_outfield_players.csv',index=False)

In [177]:
pwd

'/Users/aaronwoodward'

In [None]:
leaguenames = ['eflchamp', 'primliga', 'eredivisie', 'jupiler', 'mls']
fbrefleagues = ['Championship', 'Primeira-Liga', 'Eredivisie', 'Belgian-Pro-League', 'Major-League-Soccer']
fbrefleagueid = ['10', '32', '23', '37', '22']
statcomment = ['standard','shooting', 'passession', 'passing', 'passing_types', 'gca', 'defense', 'misc']
statgroupid = ['stats', 'shooting', 'possession', 'passing', 'passing_types', 'gca', 'defense', 'misc']
statshortnames = ['stand', 'shoot', 'poss', 'passing', 'ptype', 'gca', 'defense', 'misc']

In [None]:

for i,j in fbrefleagueid,fbrefleagues:
    url = f'https://fbref.com/en/comps/{i}/stats/{j}'
    response = requests.get(url)
    html_content = response.text

    soup = BeautifulSoup(html_content, 'html.parser')

    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    for l,m,n in leaguenames,statshortnames,statcomment:
        for comment in comments:
        # Check if the comment contains the target <div>.
            if f'div_stats_{n}' in comment:
                # Parse the comment as HTML and extract target <div>.
                div = BeautifulSoup(comment, 'html.parser').find(id=f'div_stats_{n}')

                f'{l}_{m}_df' == pd.read_html(StringIO(str(div)))[0]
                #print(championship_stand_df.iloc[:, :6])
    else:
        print("Unable to find table.")

