In [22]:
import pandas as pd
import utils

# Folder containing the cleaned FBref tables, relative to this notebook.
PATH_TO_FILES = "../data/fbref/cleaned/"

# Player-identifying columns; defined here but not read by the cells shown below.
composite_key = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']

# utils.get_all_attributes is project-local; presumably it returns a mapping
# of table name -> DataFrame (TODO confirm against utils).
dataframes = utils.get_all_attributes(PATH_TO_FILES)

# 'standard_stats' is used as the base table later on, so it is excluded from
# the list of tables that get merged onto it.
datafset_names = [*dataframes.keys()]
datafset_names.remove('standard_stats')
datafset_names

['advanced_goalkeeping',
 'defensive_actions',
 'goalkeeping',
 'goal_and_shot_creation',
 'miscellaneous_stats',
 'passing',
 'playing_time',
 'possession',
 'shooting']

### Create composite id to overcome non-unique player names
Player names can be non-unique because of a transfer during the winter period or because different players have similar names.

In [23]:
# `df` is the very frame stored under 'standard_stats', so the key column and
# the new index below are applied to that stored frame in place.
df = dataframes['standard_stats']

# Composite key "<Player>_<Season>_<League>_<Team>"
df['unique_id'] = df['Player'].str.cat(df[['Season', 'League', 'Team']], sep='_')
df.set_index('unique_id', inplace=True)

df.shape

(5071, 36)

### Define which columns to keep

In [24]:
# Columns to retain from each table. The names are the ones that exist AFTER
# the per-table renaming has been applied in the merge cell.

# --- outfield tables ---------------------------------------------------------
def_cols = [
    'Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd',
    'Interceptions', 'Clearances', 'Errors',
    'Dribblers_Tkl_Succ', 'Dribblers_Tkl_Att', 'Dribblers_Tkl_Lost',
    'Blocks_Total', 'Blocks_Shots',
]

gsc_cols = [
    'SCA', 'SCA90',
    'SCA Types_PassLive', 'SCA Types_PassDead', 'SCA Types_TO',
    'SCA Types_Sh', 'SCA Types_Fld', 'SCA Types_Def',
    'GCA', 'GCA90',
    'GCA Types_PassLive', 'GCA Types_PassDead', 'GCA Types_TO',
    'GCA Types_Sh', 'GCA Types_Fld', 'GCA Types_Def',
]

misc_cols = [
    '2CrdY', 'Fls', 'Fld', 'Off', 'Crs', 'PKwon', 'PKcon', 'OG', 'Recov',
    'Aerial Duels_Won', 'Aerial Duels_Lost',
]

passing_cols = [
    'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist', 'Total_PrgDist',
    'Short_Cmp', 'Short_Att', 'Short_Cmp%',
    'Medium_Cmp', 'Medium_Att', 'Medium_Cmp%',
    'Long_Cmp', 'Long_Att', 'Long_Cmp%',
    'Assists', 'Key Passes', 'Passes_to_1/3',
    'Passes_to_Penalt_Area', 'Crosses_into_Penalty_Area', 'Progressive Passes',
]

playing_time_cols = [
    'Playing Time_Minutes', 'Playing Time_Mn/MP', 'Starts', 'Mn/Start',
    'Compl', 'Subs', 'unSub', 'PPM', 'onG', 'onGA', 'On-Off',
]

possession_cols = [
    'Touches_Number', 'Touches_Def Pen', 'Touches_Def 3rd',
    'Touches_Mid 3rd', 'Touches_Att 3rd', 'Touches_Att Pen',
    'Take-Ons_Att', 'Take-Ons_Succ', 'Take-Ons_Tkld', 'Take-Ons_Tkld%',
    'Carries_Number', 'Carries_TotDist', 'Carries_PrgDist', 'Carries_PrgC',
    'Carries_1/3', 'Carries_CPA', 'Carries_Mis', 'Carries_Dis',
    'Receiving_Succ', 'Receiving_PrgR',
]

shooting_cols = [
    'Goals', 'Shots', 'SoT', 'SoT%', 'Shots/90', 'Goals/Shot', 'Goals/SoT',
]

# --- goalkeeper tables -------------------------------------------------------
adv_goalkeeping = [
    'Goals_GA', 'Goals_PKA', 'Goals_FK', 'Goals_CK', 'Goals_OG',
    'PSxG', 'PSxG/SoT', 'PSxG+/-', '/90',
    'Launched_Cmp', 'Launched_Att', 'Launched_Cmp%',
    'Passes_Att', 'Passes_Thr', 'Passes_Launch%', 'Passes_AvgLen',
    'Goal Kicks_Att', 'Goal Kicks_Launch%', 'Goal Kicks_AvgLen',
    'Opp', 'Stp', 'Stp%', '#OPA', '#OPA/90', 'AvgDist',
]

goalkeeping_cols = [
    'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%',
    'Penalty Kicks_PKatt', 'Penalty Kicks_PKA', 'Penalty Kicks_PKsv',
    'Penalty Kicks_PKm', 'Penalty Kicks_Save%',
]

# Table name -> list of columns to keep from that table.
dic_col_to_keep = {
    'defensive_actions': def_cols,
    'goal_and_shot_creation': gsc_cols,
    'miscellaneous_stats': misc_cols,
    'passing': passing_cols,
    'playing_time': playing_time_cols,
    'possession': possession_cols,
    'shooting': shooting_cols,
    'advanced_goalkeeping': adv_goalkeeping,
    'goalkeeping': goalkeeping_cols,
}

### Define the renaming of certain columns

In [25]:
# Per-table rename maps: {column name in the cleaned table: name used in the
# keep-lists}. They are applied before the keep-lists select columns.
rename_def = {
    'Tackles_Tkl': 'Tackles_Att',
    'Challenges_Tkl': 'Dribblers_Tkl_Succ',
    'Challenges_Att': 'Dribblers_Tkl_Att',
    'Challenges_Lost': 'Dribblers_Tkl_Lost',
    'Blocks_Blocks': 'Blocks_Total',
    'Blocks_Sh': 'Blocks_Shots',
}

rename_gsc = {
    'SCA_SCA': 'SCA',
    'SCA_SCA90': 'SCA90',
    'GCA_GCA': 'GCA',
    'GCA_GCA90': 'GCA90',
}

rename_playing_time = {
    'Playing Time_Min': 'Playing Time_Minutes',
}

rename_possession = {
    'Touches_Touches': 'Touches_Number',
    'Carries_Carries': 'Carries_Number',
    'Receiving_Rec': 'Receiving_Succ',
}

rename_shooting = {
    'Gls': 'Goals',
    'Sh': 'Shots',
    'Sh/90': 'Shots/90',
    'G/Sh': 'Goals/Shot',
    'G/SoT': 'Goals/SoT',
}

# Tables with no renames still get their own (empty) map so that every table
# can be looked up uniformly in dic_col_to_rename.
rename_misc = {}
rename_passing = {}
rename_advgoal = {}
rename_goalkeeping = {}

# Table name -> rename map for that table.
dic_col_to_rename = {
    'defensive_actions': rename_def,
    'goal_and_shot_creation': rename_gsc,
    'miscellaneous_stats': rename_misc,
    'passing': rename_passing,
    'playing_time': rename_playing_time,
    'possession': rename_possession,
    'shooting': rename_shooting,
    'advanced_goalkeeping': rename_advgoal,
    'goalkeeping': rename_goalkeeping,
}

### Merge all dataframes

In [26]:
# Left-join the selected columns of every remaining table onto the base frame
# `df`, matching rows on the composite 'unique_id' index.
for name in datafset_names:
    # df_temp is the very frame stored in `dataframes`; the next two statements
    # therefore add the key column and re-index that stored frame in place.
    df_temp = dataframes[name]
    df_temp['unique_id'] = df_temp['Player'] + '_' + df_temp['Season'] + '_' + df_temp['League'] + '_' + df_temp['Team']
    df_temp.set_index('unique_id',inplace=True)

    # Harmonise the table's column names first: the keep-lists are written in
    # terms of the renamed columns.
    columns_to_rename = dic_col_to_rename[name]
    df_temp = df_temp.rename(columns=columns_to_rename)

    columns_to_keep = dic_col_to_keep[name]
    df_temp = df_temp[columns_to_keep]

    # A 'unique_id' that matches more than one row makes the left join emit
    # extra rows without any warning. Fail loudly instead of letting duplicated
    # player rows flow into the stored dataset.
    df_joined = df.join(df_temp)
    if len(df_joined) != len(df):
        raise ValueError(
            f"Joining '{name}' changed the row count from {len(df)} to "
            f"{len(df_joined)}: 'unique_id' is not unique in one of the tables."
        )
    df = df_joined

In [27]:
# Display the merged frame to inspect the result of the joins.
df

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,Playing Time_Starts,Playing Time_Min,Playing Time_90s,...,Carries_Dis,Receiving_Succ,Receiving_PrgR,Goals,Shots,SoT,SoT%,Shots/90,Goals/Shot,Goals/SoT
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ederson_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Ederson,BRA,GK,28.0,35.0,3150.0,35.0,...,0.0,833.0,0.0,0.0,0.0,0.0,,0.00,,
Rodri_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Rodri,ESP,MF,26.0,34.0,2911.0,32.3,...,19.0,2511.0,37.0,2.0,50.0,14.0,28.0,1.55,0.04,0.14
Erling Haaland_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Erling Haaland,NOR,FW,22.0,33.0,2769.0,30.8,...,26.0,583.0,151.0,36.0,116.0,53.0,45.7,3.77,0.25,0.55
Kevin De Bruyne_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Kevin De Bruyne,BEL,MF,31.0,28.0,2417.0,26.9,...,29.0,1445.0,245.0,7.0,65.0,21.0,32.3,2.42,0.11,0.33
İlkay Gündoğan_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,İlkay Gündoğan,GER,MF,31.0,27.0,2353.0,26.1,...,27.0,1425.0,124.0,8.0,53.0,20.0,37.7,2.03,0.15,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
João Afonso_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,João Afonso,POR,DF,32.0,0.0,,,...,,,,,,,,,,
Marcos Guillermo Díaz_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,Marcos Guillermo Díaz,ARG,GK,36.0,0.0,,,...,,,,,,,,,,
Ricardo Manuel Silva_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,Ricardo Manuel Silva,POR,GK,23.0,0.0,,,...,,,,,,,,,,
Squad Total_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,Squad Total,,,25.0,374.0,3060.0,34.0,...,273.0,9214.0,908.0,26.0,353.0,111.0,31.4,10.38,0.06,0.18


### Assign Global Positions


In [28]:
def map_position(pos):
    """Collapse a dual-position label to its first-listed position.

    The six combinations 'DF,MF', 'DF,FW', 'MF,DF', 'MF,FW', 'FW,MF' and
    'FW,DF' map to 'DF', 'MF' or 'FW' respectively. Every other value
    (single positions such as 'GK', missing values, unknown labels) is
    returned unchanged.
    """
    hybrids_by_global_pos = (
        ('DF', ('DF,MF', 'DF,FW')),
        ('MF', ('MF,DF', 'MF,FW')),
        ('FW', ('FW,MF', 'FW,DF')),
    )
    # First group that contains `pos` wins; fall back to `pos` itself.
    return next(
        (global_pos for global_pos, hybrids in hybrids_by_global_pos if pos in hybrids),
        pos,
    )

# Add a 'Global Pos' column by applying map_position to every 'Pos' value.
df['Global Pos'] = df['Pos'].apply(map_position)

In [29]:
# Spot-check: identity columns next to 'Matches Played' (from the base table)
# and 'Goals' (joined in from the shooting table).
t = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age','Matches Played','Goals']
df.loc[:, t]

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,Matches Played,Goals
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ederson_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Ederson,BRA,GK,28.0,35.0,0.0
Rodri_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Rodri,ESP,MF,26.0,36.0,2.0
Erling Haaland_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Erling Haaland,NOR,FW,22.0,35.0,36.0
Kevin De Bruyne_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Kevin De Bruyne,BEL,MF,31.0,32.0,7.0
İlkay Gündoğan_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,İlkay Gündoğan,GER,MF,31.0,31.0,8.0
...,...,...,...,...,...,...,...,...,...
João Afonso_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,João Afonso,POR,DF,32.0,0.0,
Marcos Guillermo Díaz_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,Marcos Guillermo Díaz,ARG,GK,36.0,0.0,
Ricardo Manuel Silva_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,Ricardo Manuel Silva,POR,GK,23.0,0.0,
Squad Total_2022-2023_PrimeiraLiga_Santa Clara,2022-2023,PrimeiraLiga,Santa Clara,Squad Total,,,25.0,34.0,26.0


### Arrange feature space

In [30]:
# Iterating over a DataFrame yields its column labels, one per iteration.
for c in df:
    print(c)

Season
League
Team
Player
Nation
Pos
Age
Playing Time_Starts
Playing Time_Min
Playing Time_90s
Gls
Ast
G+A
G-PK
PK
PKatt
CrdY
CrdR
PrgC
PrgP
PrgR
Per 90 Minutes_Gls
Per 90 Minutes_Ast
Per 90 Minutes_G+A
Per 90 Minutes_G-PK
Per 90 Minutes_G+A-PK
xG
npxG
xAG
npxG+xAG
Per 90 Minutes_xG
Per 90 Minutes_xAG
Per 90 Minutes_xG+xAG
Per 90 Minutes_npxG
Per 90 Minutes_npxG+xAG
Matches Played
Goals_GA
Goals_PKA
Goals_FK
Goals_CK
Goals_OG
PSxG
PSxG/SoT
PSxG+/-
/90
Launched_Cmp
Launched_Att
Launched_Cmp%
Passes_Att
Passes_Thr
Passes_Launch%
Passes_AvgLen
Goal Kicks_Att
Goal Kicks_Launch%
Goal Kicks_AvgLen
Opp
Stp
Stp%
#OPA
#OPA/90
AvgDist
Tackles_Att
Tackles_Def 3rd
Tackles_Mid 3rd
Tackles_Att 3rd
Interceptions
Clearances
Errors
Dribblers_Tkl_Succ
Dribblers_Tkl_Att
Dribblers_Tkl_Lost
Blocks_Total
Blocks_Shots
GA
GA90
SoTA
Saves
Save%
W
D
L
CS
CS%
Penalty Kicks_PKatt
Penalty Kicks_PKA
Penalty Kicks_PKsv
Penalty Kicks_PKm
Penalty Kicks_Save%
SCA
SCA90
SCA Types_PassLive
SCA Types_PassDead
SCA Types_TO

In [31]:
# List the columns of the 'standard_stats' table held in `dataframes`.
dataframes['standard_stats'].columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age',
       'Playing Time_Starts', 'Playing Time_Min', 'Playing Time_90s', 'Gls',
       'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'PrgC', 'PrgP',
       'PrgR', 'Per 90 Minutes_Gls', 'Per 90 Minutes_Ast',
       'Per 90 Minutes_G+A', 'Per 90 Minutes_G-PK', 'Per 90 Minutes_G+A-PK',
       'xG', 'npxG', 'xAG', 'npxG+xAG', 'Per 90 Minutes_xG',
       'Per 90 Minutes_xAG', 'Per 90 Minutes_xG+xAG', 'Per 90 Minutes_npxG',
       'Per 90 Minutes_npxG+xAG', 'Matches Played'],
      dtype='object')

In [32]:
# Identity and playing-time context columns; they lead the feature space.
player_info_cols = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos','Global Pos', 'Age','Matches Played','Playing Time_Starts','Playing Time_Min','Playing Time_90s']

# The per-table column lists (def_cols, possession_cols, ...) are the ones
# defined in the "Define which columns to keep" cell. They are reused here
# rather than copy-pasted, so the merge step and the final column order can
# never drift apart.
feature_vector = (player_info_cols + def_cols + possession_cols + passing_cols + 
               gsc_cols + shooting_cols + adv_goalkeeping + goalkeeping_cols + 
               misc_cols + playing_time_cols)

# Keep only the selected columns, in feature_vector order. Base-table columns
# that are not listed in player_info_cols are dropped by this selection.
df = df[feature_vector]

### Find possession indices in df

In [33]:
# Positional (0-based) location of the first and last possession column in df.
start_idx = df.columns.get_loc(possession_cols[0])
end_idx = df.columns.get_loc(possession_cols[-1])
# The message names the list the indices were actually computed from.
print(f"The 'possession_cols' are located from index {start_idx} to {end_idx} in df")
df.shape

The 'playing_time_cols' are located from index 24 to 43 in df


(5071, 149)

### Store dataset

In [34]:
# Preview the first rows of the final feature table before it is written to disk.
df.head()

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Global Pos,Age,Matches Played,Playing Time_Starts,...,Playing Time_Mn/MP,Starts,Mn/Start,Compl,Subs,unSub,PPM,onG,onGA,On-Off
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ederson_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Ederson,BRA,GK,GK,28.0,35.0,35.0,...,90.0,35.0,90.0,35.0,0.0,3.0,2.34,89.0,32.0,0.3
Rodri_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Rodri,ESP,MF,MF,26.0,36.0,34.0,...,81.0,34.0,85.0,23.0,2.0,1.0,2.39,83.0,26.0,1.06
Erling Haaland_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Erling Haaland,NOR,FW,FW,22.0,35.0,33.0,...,79.0,33.0,83.0,21.0,2.0,1.0,2.37,84.0,26.0,1.47
Kevin De Bruyne_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,Kevin De Bruyne,BEL,MF,MF,31.0,32.0,28.0,...,76.0,28.0,84.0,16.0,4.0,3.0,2.31,69.0,23.0,0.37
İlkay Gündoğan_2022-2023_EPL_Manchester City,2022-2023,EPL,Manchester City,İlkay Gündoğan,GER,MF,MF,31.0,31.0,27.0,...,76.0,27.0,85.0,17.0,4.0,7.0,2.39,71.0,20.0,1.11


In [35]:
# store squad opponent statistics
squad_opponent_stats = df[(df['Player'] == 'Squad Total') | (df['Player'] == 'Opponent Total') ] 
squad_opponent_stats = squad_opponent_stats.reset_index().drop(columns='unique_id').rename(columns={'Player': 'Status'})
squad_opponent_stats.to_csv('../data/fbref/squad_opponent_stats.csv',index=True)

# store player statistics
player_stats = df[(df['Player'] != 'Squad Total') & (df['Player'] != 'Opponent Total')]
player_stats.to_csv('../data/fbref/player_stats.csv',index=True)