### Import

In [1]:
import pandas as pd
import utils

PATH_TO_FILES = "../data/fbref/cleaned/"
dataframes = utils.get_all_attributes(PATH_TO_FILES)
standard_stats = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s']

### Load and merge datasets

#### Defensive Action

In [2]:
df = dataframes['defensive_actions']
df = df.sort_values('Player')

rename = { 'Tackles_Tkl': 'Tackles_Att',
           'Challenges_Tkl': 'Dribblers_Tkl_Succ',
           'Challenges_Att': 'Dribblers_Tkl_Att',
           'Challenges_Lost': 'Dribblers_Tkl_Lost',
           'Blocks_Blocks': 'Blocks_Total',
           'Blocks_Sh': 'Blocks_Shots',
          }
df = df.rename(columns=rename)
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'Tackles_Att', 'Tackles_TklW', 'Tackles_Def 3rd', 'Tackles_Mid 3rd',
       'Tackles_Att 3rd', 'Dribblers_Tkl_Succ', 'Dribblers_Tkl_Att',
       'Challenges_Tkl%', 'Dribblers_Tkl_Lost', 'Blocks_Total', 'Blocks_Shots',
       'Blocks_Pass', 'Interceptions', 'Tkl+Interceptions', 'Clearances',
       'Errors'],
      dtype='object')

In [3]:
# columns to keep
def_cols = ['Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd',
        'Tackles_Att 3rd','Interceptions', 'Clearances','Errors',
        'Dribblers_Tkl_Succ','Dribblers_Tkl_Att','Dribblers_Tkl_Lost',
        'Blocks_Total', 'Blocks_Shots',
        ]
columns_to_keep = standard_stats + def_cols

main_df = df[columns_to_keep]
main_df.columns, main_df.shape

(Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
        'Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd',
        'Interceptions', 'Clearances', 'Errors', 'Dribblers_Tkl_Succ',
        'Dribblers_Tkl_Att', 'Dribblers_Tkl_Lost', 'Blocks_Total',
        'Blocks_Shots'],
       dtype='object'),
 (3996, 20))

#### Goal and Shot Creation

In [4]:
df = dataframes['goal_and_shot_creation']

df = df.sort_values('Player')

rename = {'SCA_SCA': 'SCA',
          'SCA_SCA90' : 'SCA90',
          'GCA_GCA':'GCA',
          'GCA_GCA90': 'GCA90'
          }
df = df.rename(columns=rename)
df.columns


Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'SCA', 'SCA90', 'SCA Types_PassLive', 'SCA Types_PassDead',
       'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld', 'SCA Types_Def', 'GCA',
       'GCA90', 'GCA Types_PassLive', 'GCA Types_PassDead', 'GCA Types_TO',
       'GCA Types_Sh', 'GCA Types_Fld', 'GCA Types_Def'],
      dtype='object')

In [5]:
# columns to keep
gsc_cols = ['SCA', 'SCA90', 'SCA Types_PassLive', 'SCA Types_PassDead',
       'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld', 'SCA Types_Def', 'GCA',
       'GCA90', 'GCA Types_PassLive', 'GCA Types_PassDead', 'GCA Types_TO',
       'GCA Types_Sh', 'GCA Types_Fld', 'GCA Types_Def'
        ]

df[gsc_cols]

Unnamed: 0,SCA,SCA90,SCA Types_PassLive,SCA Types_PassDead,SCA Types_TO,SCA Types_Sh,SCA Types_Fld,SCA Types_Def,GCA,GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def
365,73.0,2.94,43.0,27.0,0.0,2.0,1.0,0.0,1.0,0.04,0.0,1.0,0.0,0.0,0.0,0.0
223,20.0,0.94,14.0,1.0,1.0,2.0,2.0,0.0,2.0,0.09,1.0,0.0,0.0,1.0,0.0,0.0
3054,8.0,2.44,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.31,1.0,0.0,0.0,0.0,0.0,0.0
24,3.0,0.08,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0
1915,62.0,3.52,42.0,14.0,0.0,5.0,1.0,0.0,4.0,0.23,2.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,90.0,3.44,71.0,7.0,2.0,4.0,4.0,2.0,17.0,0.65,13.0,1.0,0.0,1.0,2.0,0.0
361,2.0,0.06,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0
1724,9.0,3.10,8.0,0.0,0.0,0.0,0.0,1.0,2.0,0.69,1.0,0.0,0.0,0.0,0.0,1.0
2508,4.0,0.11,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
main_df = pd.concat([main_df, df[gsc_cols]], axis=1)
main_df.columns, main_df.shape

(Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
        'Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd',
        'Interceptions', 'Clearances', 'Errors', 'Dribblers_Tkl_Succ',
        'Dribblers_Tkl_Att', 'Dribblers_Tkl_Lost', 'Blocks_Total',
        'Blocks_Shots', 'SCA', 'SCA90', 'SCA Types_PassLive',
        'SCA Types_PassDead', 'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld',
        'SCA Types_Def', 'GCA', 'GCA90', 'GCA Types_PassLive',
        'GCA Types_PassDead', 'GCA Types_TO', 'GCA Types_Sh', 'GCA Types_Fld',
        'GCA Types_Def'],
       dtype='object'),
 (3996, 36))

#### Miscellaneous Stats

In [7]:
df = dataframes['miscellaneous_stats']

df = df.sort_values('Player')

rename = {}
df = df.rename(columns=rename)
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs', 'Int', 'TklW',
       'PKwon', 'PKcon', 'OG', 'Recov', 'Aerial Duels_Won',
       'Aerial Duels_Lost', 'Aerial Duels_Won%'],
      dtype='object')

In [8]:
# columns to keep
misc_cols = ['CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs','PKwon', 
            'PKcon', 'OG', 'Recov', 'Aerial Duels_Won',
            'Aerial Duels_Lost'
        ]

df[misc_cols]

Unnamed: 0,CrdY,CrdR,2CrdY,Fls,Fld,Off,Crs,PKwon,PKcon,OG,Recov,Aerial Duels_Won,Aerial Duels_Lost
365,3.0,0.0,0.0,9.0,9.0,2.0,133.0,0.0,0.0,0.0,120.0,28.0,27.0
223,7.0,0.0,0.0,20.0,35.0,1.0,27.0,0.0,1.0,0.0,111.0,14.0,22.0
3054,0.0,0.0,0.0,4.0,7.0,0.0,15.0,0.0,0.0,0.0,32.0,6.0,4.0
24,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,33.0,6.0,0.0
1915,3.0,0.0,0.0,23.0,18.0,3.0,62.0,1.0,0.0,0.0,115.0,6.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0.0,0.0,0.0,23.0,25.0,6.0,30.0,2.0,0.0,0.0,155.0,19.0,12.0
361,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,17.0,15.0,1.0
1724,1.0,0.0,0.0,9.0,2.0,0.0,6.0,0.0,0.0,0.0,12.0,3.0,1.0
2508,4.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,1.0,0.0,53.0,6.0,1.0


In [9]:
main_df = pd.concat([main_df, df[misc_cols]], axis=1)
main_df.columns, main_df.shape

(Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
        'Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd',
        'Interceptions', 'Clearances', 'Errors', 'Dribblers_Tkl_Succ',
        'Dribblers_Tkl_Att', 'Dribblers_Tkl_Lost', 'Blocks_Total',
        'Blocks_Shots', 'SCA', 'SCA90', 'SCA Types_PassLive',
        'SCA Types_PassDead', 'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld',
        'SCA Types_Def', 'GCA', 'GCA90', 'GCA Types_PassLive',
        'GCA Types_PassDead', 'GCA Types_TO', 'GCA Types_Sh', 'GCA Types_Fld',
        'GCA Types_Def', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs',
        'PKwon', 'PKcon', 'OG', 'Recov', 'Aerial Duels_Won',
        'Aerial Duels_Lost'],
       dtype='object'),
 (3996, 49))

#### Passing

In [10]:
df = dataframes['passing']

df = df.sort_values('Player')

rename = {}
df = df.rename(columns=rename)
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist',
       'Total_PrgDist', 'Short_Cmp', 'Short_Att', 'Short_Cmp%', 'Medium_Cmp',
       'Medium_Att', 'Medium_Cmp%', 'Long_Cmp', 'Long_Att', 'Long_Cmp%',
       'Assists', 'xAG', 'Key Passes', 'Passes_to_1/3',
       'Passes_to_Penalt_Area', 'Crosses_into_Penalty_Area',
       'Progressive Passes', 'Expected_xA', 'Expected_A-xAG'],
      dtype='object')

In [11]:
# columns to keep
passing_cols = ['Total_Cmp', 'Total_Att', 'Total_Cmp%', 'Total_TotDist',
       'Total_PrgDist','Short_Cmp', 'Short_Att', 'Short_Cmp%', 'Medium_Cmp',
       'Medium_Att', 'Medium_Cmp%', 'Long_Cmp', 'Long_Att', 'Long_Cmp%','Assists', 'Key Passes', 'Passes_to_1/3',
       'Passes_to_Penalt_Area', 'Crosses_into_Penalty_Area','Progressive Passes']

df[passing_cols]

Unnamed: 0,Total_Cmp,Total_Att,Total_Cmp%,Total_TotDist,Total_PrgDist,Short_Cmp,Short_Att,Short_Cmp%,Medium_Cmp,Medium_Att,Medium_Cmp%,Long_Cmp,Long_Att,Long_Cmp%,Assists,Key Passes,Passes_to_1/3,Passes_to_Penalt_Area,Crosses_into_Penalty_Area,Progressive Passes
365,1127.0,1448.0,77.8,18285.0,8587.0,626.0,670.0,93.4,387.0,497.0,77.9,100.0,206.0,48.5,1.0,40.0,91.0,28.0,15.0,145.0
223,695.0,838.0,82.9,10995.0,3282.0,373.0,411.0,90.8,265.0,310.0,85.5,46.0,80.0,57.5,1.0,6.0,62.0,13.0,4.0,81.0
3054,179.0,233.0,76.8,3203.0,1226.0,84.0,91.0,92.3,78.0,96.0,81.3,16.0,35.0,45.7,0.0,4.0,13.0,3.0,2.0,12.0
24,761.0,1111.0,68.5,18974.0,13109.0,190.0,196.0,96.9,383.0,390.0,98.2,186.0,519.0,35.8,0.0,0.0,30.0,2.0,0.0,8.0
1915,915.0,1088.0,84.1,16069.0,3809.0,398.0,431.0,92.3,382.0,444.0,86.0,100.0,159.0,62.9,1.0,28.0,88.0,20.0,3.0,126.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,1374.0,1578.0,87.1,21396.0,4941.0,746.0,819.0,91.1,492.0,543.0,90.6,99.0,130.0,76.2,4.0,37.0,143.0,35.0,1.0,170.0
361,598.0,930.0,64.3,18882.0,15094.0,117.0,117.0,100.0,225.0,228.0,98.7,254.0,579.0,43.9,0.0,0.0,12.0,0.0,0.0,0.0
1724,116.0,143.0,81.1,1866.0,539.0,52.0,63.0,82.5,48.0,57.0,84.2,9.0,13.0,69.2,1.0,5.0,12.0,5.0,2.0,20.0
2508,957.0,1216.0,78.7,24937.0,16773.0,197.0,199.0,99.0,426.0,430.0,99.1,306.0,554.0,55.2,0.0,0.0,4.0,0.0,0.0,0.0


In [12]:
main_df = pd.concat([main_df, df[passing_cols]], axis=1)
main_df.columns, main_df.shape

(Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
        'Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd',
        'Interceptions', 'Clearances', 'Errors', 'Dribblers_Tkl_Succ',
        'Dribblers_Tkl_Att', 'Dribblers_Tkl_Lost', 'Blocks_Total',
        'Blocks_Shots', 'SCA', 'SCA90', 'SCA Types_PassLive',
        'SCA Types_PassDead', 'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld',
        'SCA Types_Def', 'GCA', 'GCA90', 'GCA Types_PassLive',
        'GCA Types_PassDead', 'GCA Types_TO', 'GCA Types_Sh', 'GCA Types_Fld',
        'GCA Types_Def', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs',
        'PKwon', 'PKcon', 'OG', 'Recov', 'Aerial Duels_Won',
        'Aerial Duels_Lost', 'Total_Cmp', 'Total_Att', 'Total_Cmp%',
        'Total_TotDist', 'Total_PrgDist', 'Short_Cmp', 'Short_Att',
        'Short_Cmp%', 'Medium_Cmp', 'Medium_Att', 'Medium_Cmp%', 'Long_Cmp',
        'Long_Att', 'Long_Cmp%', 'Assists', 'Key Passes', 'Passes_to_

#### Playing Time

In [13]:
df = dataframes['playing_time']

df = df.sort_values('Player')

rename = {'Playing Time_Min': 'Playing Time_Minutes'
          }
df = df.rename(columns=rename)
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age',
       'Playing Time_Minutes', 'Playing Time_Mn/MP', 'Playing Time_Min%',
       'Playing Time_90s', 'Starts', 'Mn/Start', 'Compl', 'Subs', 'Mn/Sub',
       'unSub', 'PPM', 'onG', 'onGA', '+/-', '+/-90', 'On-Off', '(xG)_onxG',
       '(xG)_onxGA', '(xG)_xG+/-', '(xG)_xG+/-90', '(xG)_On-Off'],
      dtype='object')

In [14]:
# columns to keep
playing_time_cols = ['Player','Playing Time_Minutes', 'Playing Time_Mn/MP','Starts', 'Mn/Start', 'Compl',
                     'Subs', 'unSub', 'PPM','onG', 'onGA','On-Off'
                    ]

df[playing_time_cols].dropna()

Unnamed: 0,Player,Playing Time_Minutes,Playing Time_Mn/MP,Starts,Mn/Start,Compl,Subs,unSub,PPM,onG,onGA,On-Off
467,Aaron Cresswell,2235.0,80.0,24.0,88.0,20.0,4.0,8.0,1.11,26.0,32.0,0.29
285,Aaron Hickey,1916.0,74.0,23.0,81.0,8.0,3.0,2.0,1.46,33.0,30.0,-0.40
3731,Aaron Meijers,294.0,23.0,1.0,82.0,0.0,12.0,21.0,1.88,8.0,3.0,0.94
2331,Aaron Ramsey,1584.0,59.0,18.0,74.0,6.0,9.0,0.0,1.59,25.0,10.0,1.05
78,Aaron Wan-Bissaka,1435.0,76.0,16.0,87.0,12.0,3.0,7.0,2.21,28.0,10.0,1.26
...,...,...,...,...,...,...,...,...,...,...,...,...
3336,Þórir Jóhann Helgason,222.0,19.0,2.0,45.0,0.0,10.0,24.0,0.58,1.0,4.0,-0.93
4,İlkay Gündoğan,2353.0,76.0,27.0,85.0,17.0,4.0,7.0,2.39,71.0,20.0,1.11
463,Łukasz Fabiański,3111.0,86.0,36.0,86.0,33.0,0.0,0.0,1.08,37.0,48.0,0.26
2103,Łukasz Poręba,260.0,26.0,3.0,67.0,0.0,7.0,27.0,2.30,6.0,2.0,0.39


In [15]:
main_df = pd.concat([main_df, df[playing_time_cols]], axis=1)
main_df.columns, main_df.shape

(Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
        'Tackles_Att', 'Tackles_Def 3rd', 'Tackles_Mid 3rd', 'Tackles_Att 3rd',
        'Interceptions', 'Clearances', 'Errors', 'Dribblers_Tkl_Succ',
        'Dribblers_Tkl_Att', 'Dribblers_Tkl_Lost', 'Blocks_Total',
        'Blocks_Shots', 'SCA', 'SCA90', 'SCA Types_PassLive',
        'SCA Types_PassDead', 'SCA Types_TO', 'SCA Types_Sh', 'SCA Types_Fld',
        'SCA Types_Def', 'GCA', 'GCA90', 'GCA Types_PassLive',
        'GCA Types_PassDead', 'GCA Types_TO', 'GCA Types_Sh', 'GCA Types_Fld',
        'GCA Types_Def', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs',
        'PKwon', 'PKcon', 'OG', 'Recov', 'Aerial Duels_Won',
        'Aerial Duels_Lost', 'Total_Cmp', 'Total_Att', 'Total_Cmp%',
        'Total_TotDist', 'Total_PrgDist', 'Short_Cmp', 'Short_Att',
        'Short_Cmp%', 'Medium_Cmp', 'Medium_Att', 'Medium_Cmp%', 'Long_Cmp',
        'Long_Att', 'Long_Cmp%', 'Assists', 'Key Passes', 'Passes_to_

#### Possession

In [16]:
df = dataframes['possession']

df = df.sort_values('Player')

rename = {'Touches_Touches': 'Touches_Number',
          'Carries_Carries': 'Carries_Number',
          'Receiving_Rec': 'Receiving_Succ'}
df = df.rename(columns=rename)
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'Touches_Number', 'Touches_Def Pen', 'Touches_Def 3rd',
       'Touches_Mid 3rd', 'Touches_Att 3rd', 'Touches_Att Pen', 'Touches_Live',
       'Take-Ons_Att', 'Take-Ons_Succ', 'Take-Ons_Succ%', 'Take-Ons_Tkld',
       'Take-Ons_Tkld%', 'Carries_Number', 'Carries_TotDist',
       'Carries_PrgDist', 'Carries_PrgC', 'Carries_1/3', 'Carries_CPA',
       'Carries_Mis', 'Carries_Dis', 'Receiving_Succ', 'Receiving_PrgR'],
      dtype='object')

In [17]:
# columns to keep
possession_cols = ['Touches_Number', 'Touches_Def Pen', 'Touches_Def 3rd',
       'Touches_Mid 3rd', 'Touches_Att 3rd', 'Touches_Att Pen','Take-Ons_Att', 'Take-Ons_Succ',
       'Take-Ons_Tkld','Take-Ons_Tkld%','Carries_Number', 'Carries_TotDist',
       'Carries_PrgDist', 'Carries_PrgC', 'Carries_1/3', 'Carries_CPA',
       'Carries_Mis', 'Carries_Dis','Receiving_Succ', 'Receiving_PrgR'
        ]

df[possession_cols]

Unnamed: 0,Touches_Number,Touches_Def Pen,Touches_Def 3rd,Touches_Mid 3rd,Touches_Att 3rd,Touches_Att Pen,Take-Ons_Att,Take-Ons_Succ,Take-Ons_Tkld,Take-Ons_Tkld%,Carries_Number,Carries_TotDist,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Succ,Receiving_PrgR
365,1607.0,97.0,507.0,668.0,438.0,13.0,7.0,2.0,4.0,57.1,766.0,3497.0,1751.0,35.0,26.0,1.0,6.0,6.0,960.0,82.0
223,1028.0,47.0,328.0,499.0,211.0,21.0,45.0,17.0,18.0,40.0,616.0,3295.0,1802.0,41.0,17.0,3.0,16.0,13.0,624.0,47.0
3054,268.0,13.0,87.0,110.0,72.0,2.0,6.0,2.0,1.0,16.7,142.0,810.0,342.0,9.0,7.0,0.0,1.0,2.0,151.0,18.0
24,1196.0,872.0,1170.0,28.0,0.0,0.0,1.0,1.0,0.0,0.0,722.0,3934.0,2459.0,0.0,1.0,0.0,0.0,0.0,629.0,0.0
1915,1279.0,43.0,208.0,723.0,361.0,50.0,24.0,14.0,5.0,20.8,980.0,4866.0,1934.0,29.0,34.0,8.0,24.0,32.0,902.0,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,1806.0,31.0,243.0,975.0,596.0,88.0,42.0,16.0,18.0,42.9,1072.0,5457.0,2628.0,53.0,44.0,7.0,28.0,27.0,1425.0,124.0
361,1005.0,871.0,1000.0,5.0,0.0,0.0,0.0,0.0,0.0,,506.0,3330.0,2146.0,0.0,0.0,0.0,3.0,0.0,350.0,0.0
1724,170.0,4.0,20.0,93.0,57.0,1.0,2.0,1.0,1.0,50.0,126.0,625.0,223.0,3.0,8.0,0.0,5.0,2.0,120.0,18.0
2508,1311.0,1189.0,1306.0,5.0,0.0,0.0,0.0,0.0,0.0,,707.0,3512.0,2253.0,0.0,0.0,0.0,1.0,0.0,553.0,0.0


In [18]:
main_df = df[possession_cols].join(main_df)
main_df.columns, main_df.shape

(Index(['Touches_Number', 'Touches_Def Pen', 'Touches_Def 3rd',
        'Touches_Mid 3rd', 'Touches_Att 3rd', 'Touches_Att Pen', 'Take-Ons_Att',
        'Take-Ons_Succ', 'Take-Ons_Tkld', 'Take-Ons_Tkld%',
        ...
        'Playing Time_Mn/MP', 'Starts', 'Mn/Start', 'Compl', 'Subs', 'unSub',
        'PPM', 'onG', 'onGA', 'On-Off'],
       dtype='object', length=101),
 (3996, 101))

#### Shooting

In [19]:
df = dataframes['shooting']

df = df.sort_values('Player')

rename = {'Gls': 'Goals',
          'Sh': 'Shots',
          'Sh/90': 'Shots',
          'G/Sh': 'Goals/Shot',
          'G/SoT': 'Goals/SoT'
          }

df = df.rename(columns=rename)
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'Goals', 'Shots', 'SoT', 'SoT%', 'Shots', 'SoT/90', 'Goals/Shot',
       'Goals/SoT', 'Dist', 'PK', 'PKatt', 'FK', 'xG', 'npxG', 'npxG/Sh',
       'G-xG', 'np:G-xG'],
      dtype='object')

In [20]:
# columns to keep
shooting_cols = ['Goals', 'Shots', 'SoT', 'SoT%', 'Shots','Goals/Shot', 'Goals/SoT']

df[shooting_cols]

Unnamed: 0,Goals,Shots,Shots.1,SoT,SoT%,Shots.2,Shots.3,Goals/Shot,Goals/SoT
365,0.0,9.0,0.36,1.0,11.1,9.0,0.36,0.00,0.00
223,0.0,11.0,0.52,2.0,18.2,11.0,0.52,0.00,0.00
3054,0.0,1.0,0.31,1.0,100.0,1.0,0.31,0.00,0.00
24,0.0,0.0,0.00,0.0,,0.0,0.00,,
1915,1.0,22.0,1.25,6.0,27.3,22.0,1.25,0.05,0.17
...,...,...,...,...,...,...,...,...,...
4,8.0,53.0,2.03,20.0,37.7,53.0,2.03,0.15,0.40
361,0.0,0.0,0.00,0.0,,0.0,0.00,,
1724,0.0,3.0,1.04,0.0,0.0,3.0,1.04,0.00,
2508,0.0,0.0,0.00,0.0,,0.0,0.00,,


In [21]:
main_df = pd.concat([main_df, df[shooting_cols]], axis=1)
main_df.columns, main_df.shape

(Index(['Touches_Number', 'Touches_Def Pen', 'Touches_Def 3rd',
        'Touches_Mid 3rd', 'Touches_Att 3rd', 'Touches_Att Pen', 'Take-Ons_Att',
        'Take-Ons_Succ', 'Take-Ons_Tkld', 'Take-Ons_Tkld%',
        ...
        'On-Off', 'Goals', 'Shots', 'Shots', 'SoT', 'SoT%', 'Shots', 'Shots',
        'Goals/Shot', 'Goals/SoT'],
       dtype='object', length=110),
 (3996, 110))

### Handle Duplicates
- some player transfered during winter period and show records for two different teams
- not all players have unique names

In [55]:
## TODO

### Assign Global Positions


### Store dataset

In [46]:
main_df.to_csv('../data/fbref/player_stats.csv')