# 01. Normalization of selected columns

Select relevant columns and normalize per team

## Combine all FBRef data into one dataframe

In [1]:
# change wd to repo home
import os
# Compare only the last path component. os.path.basename follows the running
# platform's separator rules, so the check works for '/'-separated paths as
# well as for '\\'-separated ones, and re-running the cell does not keep
# moving one directory further up.
if os.path.basename(os.getcwd()) != 'FM skills prediction':
    os.chdir('..')

In [2]:
import pandas as pd

In [3]:
# read every file listed in data/FBRef_split into its own dataframe,
# keyed by the file name; the first CSV column becomes the index
fbref_dfs = dict(
    (filename, pd.read_csv(f'data/FBRef_split/{filename}', index_col=0))
    for filename in os.listdir('data/FBRef_split')
)

In [4]:
# group the per-file frames by table type, then stack each group into one frame
table_types = [
    'defense', 'gca', 'keepersadv', 'keepers', 'misc', 'passingtypes',
    'passing', 'playingtime', 'possession', 'shooting', 'stats',
]

fbref_dfs_types = {}

for table_type in table_types:
    # file names are split on '_': the second piece is compared with the table
    # type and the first piece becomes the key of the matching frame
    frames_for_type = {
        file_key.split('_')[0]: frame
        for file_key, frame in fbref_dfs.items()
        if file_key.split('_')[1] == table_type
    }
    # rows are stacked with a fresh 0..n-1 index before the player/squad/birth
    # columns are promoted to the index
    stacked = pd.concat(frames_for_type, axis=0, ignore_index=True)
    fbref_dfs_types[table_type] = stacked.set_index(['_Player', '_Squad', '_Born'])

In [6]:
# place the per-type frames side by side and flatten the column labels
def _flatten_column_label(label):
    """Join the pieces of one column label with '_' after casting each to str."""
    return '_'.join(str(piece) for piece in label)


fbref_combined_df = pd.concat(fbref_dfs_types, axis=1).reset_index()
fbref_combined_df.columns = fbref_combined_df.columns.map(_flatten_column_label)

In [7]:
# write the full combined FBRef table to disk (index included)
fbref_combined_df.to_csv(path_or_buf='data/FBRef_combined.csv')

## Select relevant columns

In [8]:
# column selections used in the rest of the notebook

# FM_columns: 'Name' and 'Position' first, then the attribute names.
# NOTE(review): 'Aggressiion' and 'Decision' look like possible misspellings —
# confirm against the actual column names in the data before changing them.
FM_columns = [
    'Name', 'Position',
    'Corners', 'Crossing', 'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking',
    'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Passing', 'Penalty Taking',
    'Tackling', 'Technique',
    'Aggressiion', 'Anticipation', 'Bravery', 'Composure', 'Concentration', 'Vision',
    'Decision', 'Determination', 'Flair', 'Leadership', 'Off The Ball', 'Teamwork',
    'Work Rate', 'Positioning',
    'Acceleration', 'Agility', 'Balance', 'Jumping Reach', 'Natural Fitness', 'Pace',
    'Stamina', 'Strength',
]

# FBRef columns, named '<table type>_<column group>_<column>'
defense_columns = [
    'defense_Blocks_Blocks',
    'defense_Tackles_Tkl', 'defense_Tackles_TklW',
    'defense_Tackles_Def 3rd', 'defense_Tackles_Mid 3rd', 'defense_Tackles_Att 3rd',
    'defense__Int', 'defense__Clr',
]
touches_columns = [
    'possession_Touches_Def Pen',
    'possession_Touches_Def 3rd', 'possession_Touches_Mid 3rd', 'possession_Touches_Att 3rd',
    'possession_Touches_Att Pen',
    'possession_Touches_Touches',
]
passing_columns = [
    'passing_Total_Att', 'passing_Short_Att', 'passing_Medium_Att', 'passing_Long_Att',
    'passing__KP', 'passing__CrsPA', 'passing__PrgP',
]
progres_columns = [
    'possession_Carries_PrgDist', 'possession_Carries_TotDist',
    'possession_Receiving_Rec', 'possession_Receiving_PrgR',
    'possession_Take-Ons_Att', 'possession_Take-Ons_Succ',
]
attack_columns = [
    'gca_SCATypes_TO', 'gca_SCATypes_Sh', 'gca_SCATypes_Fld', 'gca_SCATypes_Def',
    'gca_SCATypes_PassLive', 'gca_SCATypes_PassDead',
    'shooting_Standard_Dist', 'shooting_Expected_npxG',
]
misc_columns = [
    'playingtime_PlayingTime_Min',
    'misc_Performance_Fls', 'misc_Performance_Fld', 'misc_Performance_Off',
    'misc_AerialDuels_Won', 'misc_AerialDuels_Lost',
]

In [66]:
# every selected FBRef stat column, in selection order
all_stats_columns = [
    *defense_columns,
    *touches_columns,
    *passing_columns,
    *progres_columns,
    *attack_columns,
    *misc_columns,
]

## Normalize each column selection by team

In [18]:
from sklearn.preprocessing import MinMaxScaler

Drop players with fewer than 500 minutes

In [52]:
# keep only the rows with at least 500 minutes played
minutes_played = fbref_combined_df['playingtime_PlayingTime_Min']
fbref_combined_filtered_df = fbref_combined_df[minutes_played >= 500]

In [100]:
# function to apply Min-Max scaling to per-minute stats within the selected group
def minmax_scaling(group, columns=None):
    """Return a copy of ``group`` with its stat columns min-max scaled to [0, 1].

    Every stat column except 'playingtime_PlayingTime_Min' is first divided by
    the minutes played, so rows are compared on rates rather than totals.
    Min-max scaling is unaffected by a constant factor, so these per-minute
    rates scale to the same values that per-90 rates would. The minutes column
    itself is min-max scaled but not divided.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to scale together (one squad when used with ``groupby.apply``).
        The minutes column is expected to be non-zero; the notebook filters to
        at least 500 minutes before calling this.
    columns : list of str, optional
        Stat columns to scale. Defaults to the notebook-level
        ``all_stats_columns``. Columns absent from ``group`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``group``; the frame
        passed in is left unmodified.
    """
    minutes_column = 'playingtime_PlayingTime_Min'
    if columns is None:
        columns = all_stats_columns

    # groupby.apply does not support functions that modify the group they are
    # handed, so every change is made on a copy
    scaled = group.copy()

    # one column list drives both the division and the scaling, so a column
    # missing from the group is skipped in both steps
    present = [column for column in columns if column in scaled.columns]
    if scaled.empty or not present:
        # nothing to fit the scaler on
        return scaled

    # Divide each column by the minutes played, except the minutes column itself
    rate_columns = [column for column in present if column != minutes_column]
    if rate_columns:
        minutes = scaled[minutes_column]
        for column in rate_columns:
            scaled[column] = scaled[column] / minutes

    # Apply Min-Max scaling
    scaled[present] = MinMaxScaler().fit_transform(scaled[present])
    return scaled

In [101]:
# group_keys=True puts the squad name on the outer index level, so one team's
# rows can be selected with .loc['<squad>']. This is the default behaviour from
# pandas 2.0; passing it explicitly also gives that layout on pandas 1.5.
fbref_normalized_df = (
    fbref_combined_filtered_df
    .groupby('_Squad_', group_keys=True)
    .apply(minmax_scaling)
)

In [105]:
# spot-check: scaled defensive stats for one squad
fbref_normalized_df.loc['Chelsea', ['_Player_', *defense_columns]]

Unnamed: 0,_Player_,defense_Blocks_Blocks,defense_Tackles_Tkl,defense_Tackles_TklW,defense_Tackles_Def 3rd,defense_Tackles_Mid 3rd,defense_Tackles_Att 3rd,defense__Int,defense__Clr
546,Kepa Arrizabalaga,0.0,0.019741,0.038301,0.026839,0.0,0.0,0.022052,0.112183
547,Pierre-Emerick Aubameyang,0.142731,0.045701,0.0,0.062132,0.0,0.0,0.204197,0.133478
555,César Azpilicueta,0.673184,0.712955,0.804211,0.518457,0.508841,0.430255,0.70379,0.743548
556,Benoît Badiashile,0.218434,0.391662,0.434221,0.380343,0.22895,0.0,1.0,0.79136
605,Trevoh Chalobah,0.417986,0.488099,0.61096,0.470935,0.193284,0.245149,0.844216,0.731895
608,Ben Chilwell,0.589679,0.377615,0.593083,0.293361,0.294318,0.093324,0.7231,0.377255
628,Marc Cucurella,0.662492,1.0,0.970078,0.823963,0.464991,0.864991,0.541592,0.599651
684,João Félix,0.126046,0.376679,0.365408,0.182896,0.330287,0.418916,0.060109,0.0
687,Enzo Fernández,0.817822,0.736469,0.857319,0.556255,0.569231,0.254816,0.585003,0.145113
692,Wesley Fofana,0.56815,0.64205,0.747406,0.552832,0.306509,0.444294,0.956255,0.581656


In [104]:
# raw (unscaled) touches columns next to the squad name
fbref_combined_df[['_Squad_', *touches_columns]]

Unnamed: 0,_Squad_,possession_Touches_Def Pen,possession_Touches_Def 3rd,possession_Touches_Mid 3rd,possession_Touches_Att 3rd,possession_Touches_Att Pen,possession_Touches_Touches
0,Eint Frankfurt,2.0,6.0,41.0,53.0,17.0,99.0
1,Köln,6.0,36.0,134.0,147.0,48.0,310.0
2,Dortmund,11.0,68.0,240.0,384.0,65.0,676.0
3,Leverkusen,9.0,74.0,394.0,327.0,67.0,786.0
4,Werder Bremen,1.0,7.0,13.0,14.0,2.0,33.0
...,...,...,...,...,...,...,...
3522,Salernitana,,,,,,
3523,Sampdoria,,,,,,
3524,Torino,,,,,,
3525,Sassuolo,,,,,,


In [4]:
# read the combined table; the first CSV column becomes the index
combined_data = pd.read_csv('data/FB_FBRef_combined.csv', index_col=0)

# remove the rows whose Position is 'GK', then show the remaining shape
is_keeper = combined_data['Position'] == 'GK'
combined_data = combined_data[~is_keeper]
combined_data.shape

(1269, 384)

## Processing of stats columns
We normalize the stats within each team. A defensive player on a stronger team may record more attacking stats than an attacking player on a weaker team simply because of the team's overall possession and tactics. Team information will be passed separately as a cluster type; for the stats themselves it is more informative to describe each player relative to their own teammates, i.e. how important their contribution is to the team.

### defense

In [17]:
# distinct squad names, in order of first appearance
combined_data.loc[:, '_Squad_'].unique()

array(['Dortmund', 'Leverkusen', 'Stuttgart', 'Mainz 05', 'Hoffenheim',
       'Bochum', 'Wolfsburg', 'Augsburg', 'Union Berlin', 'Werder Bremen',
       'Hertha BSC', 'Schalke 04', 'Eint Frankfurt', 'Bayern Munich',
       'Köln', 'Freiburg', 'Gladbach', 'RB Leipzig', 'Leeds United',
       'Southampton', 'Fulham', 'West Ham', 'Wolves', 'Manchester City',
       'Liverpool', 'Newcastle Utd', 'Leicester City', 'Crystal Palace',
       'Bournemouth', 'Manchester Utd', "Nott'ham Forest", 'Chelsea',
       'Aston Villa', 'Tottenham', 'Brighton', 'Everton', 'Brentford',
       'Arsenal', 'Sevilla', 'Valladolid', 'Celta Vigo', 'Almería',
       'Real Madrid', 'Barcelona', 'Villarreal', 'Cádiz', 'Getafe',
       'Osasuna', 'Mallorca', 'Rayo Vallecano', 'Athletic Club', 'Elche',
       'Espanyol', 'Girona', 'Betis', 'Atlético Madrid', 'Valencia',
       'Real Sociedad', 'Reims', 'Lens', 'Lorient', 'Troyes', 'Monaco',
       'Strasbourg', 'Lille', 'Auxerre', 'Marseille', 'Lyon', 'Nice',
      

In [21]:
# number of rows per squad
combined_data.groupby('_Squad_')['_Player_'].size()

_Squad_
Ajaccio           6
Almería          11
Angers           12
Arsenal          10
Aston Villa      12
                 ..
Villarreal       13
Werder Bremen    12
West Ham         14
Wolfsburg        15
Wolves           15
Name: _Player_, Length: 98, dtype: int64

### touches

### passing

### progression

### attack

### miscellaneous