# 01. Data Preparation

1. Merge separate files for each league and statistic type
2. Calculate percentage above/below team average for each player statistic
3. Aggregate original pre-transformation data per team for team-level statistics
4. Apply Principal Component Analysis (PCA) to reduce the dimensionality of player statistics separately for each statistic group
5. Apply PCA to reduce the dimensionality of team statistics
6. Create tags to combine FBRef statistics with Football Manager and FC24 ratings

In [2]:
# change wd to repo home
import os
# basename() honours the platform's path separator; splitting on '\\' only
# worked on Windows and left the cwd unchanged everywhere else.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

In [72]:
# NOTE(review): the star import presumably supplies pd, tqdm, get_best_match and the
# *_columns lists used below (none of them is imported explicitly in this notebook)
# — confirm in modules/fm_functions.py.
from modules.fm_functions import *
import warnings
# Suppress all warnings emitted by the cells that follow.
warnings.filterwarnings(action='ignore')

In [4]:
# Enable DataFrame.progress_apply, which the FM/FC name-matching cells rely on.
tqdm.pandas()

## 01. Merge separate files for each league and statistic type

In [5]:
season = '2022'

# load FBRef data in separate dfs, keyed by filename; a file is read only when
# the first four characters of its last '_'-separated token equal the season
fbref_dfs = {}
for filename in os.listdir('data/FBRef_split'):
    season_tag = filename.rsplit('_', 1)[-1][:4]
    if season_tag == season:
        fbref_dfs[filename] = pd.read_csv(f'data/FBRef_split/{filename}', index_col=0)

In [6]:
# combine FBRef dfs by type first
table_types = ['defense', 'gca', 'keepersadv', 'keepers', 'misc', 'passingtypes',
               'passing', 'playingtime', 'possession', 'shooting', 'stats']

fbref_dfs_types = {}

for t in table_types:
    # The second '_'-separated filename token is the table type; the first token
    # (presumably the league — confirm against data/FBRef_split) keys each frame.
    frames_for_type = {k.split('_')[0]: df
                       for k, df in fbref_dfs.items()
                       if k.split('_')[1] == t}
    try:
        fbref_dfs_types[t] = (
            pd.concat(frames_for_type, axis=0, ignore_index=True)
            .set_index(['_Player', '_Squad', '_Born'])
        )
    except (ValueError, KeyError) as err:
        # ValueError: no file of this type was loaded (nothing to concatenate).
        # KeyError: one of the index columns is missing from the table.
        # Report the type and leave it out, so fbref_dfs_types only ever holds
        # DataFrames; a bare `except:` used to leave a raw dict behind here.
        print(f'{t}: skipped ({err})')

In [7]:
# combine FBRef dfs into one: place the per-type tables side by side, then move
# the (_Player, _Squad, _Born) index back into ordinary columns
fbref_combined_df = pd.concat(fbref_dfs_types, axis=1).reset_index()
# flatten each column key into a single '_'-joined string
fbref_combined_df.columns = ['_'.join(str(level) for level in col)
                             for col in fbref_combined_df.columns]

In [46]:
# Filter in one chain so every step returns a new frame; the original assigned
# the 'yob' column onto a filtered slice (chained-assignment hazard).
fbref_combined_df = (
    fbref_combined_df
    # drop keepers
    .loc[lambda df: df['stats__Pos'] != 'GK']
    # drop unknown YOB
    .dropna(subset=['_Born_'])
    .assign(yob=lambda df: df['_Born_'].astype('int'))
    # drop players with less than 500 minutes
    .loc[lambda df: df['playingtime_PlayingTime_Min'] >= 500]
)
# The FM/FC matching cells further down read this frame under the name
# `fbref_combined_outfielders_df`, which no cell defines; bind it here so a
# fresh Restart & Run All does not raise NameError.
# NOTE(review): confirm the 500-minute filter is meant to apply to that frame too.
fbref_combined_outfielders_df = fbref_combined_df

## 02. Calculate percentage above/below team average for each player statistic

In [47]:
def percent_above_below(group):
    """Express each value as its relative deviation from the group's mean.

    Returns ``(group - mean) / mean``, i.e. 0.25 for a value 25% above the
    mean and -0.25 for a value 25% below it.
    """
    group_mean = group.mean()
    deviation = group - group_mean
    return deviation / group_mean

In [58]:
# Work on a copy so fbref_combined_df keeps the untransformed values.
fbref_meandiff_df = fbref_combined_df.copy()

# Replace every statistic with its relative deviation from the squad mean,
# one column group at a time.
for group_columns in (defense_columns, touches_columns, passing_columns,
                      progres_columns, attack_columns, misc_columns):
    per_squad = fbref_meandiff_df.groupby('_Squad_')[group_columns]
    fbref_meandiff_df[group_columns] = per_squad.transform(percent_above_below)

In [144]:
# Preview the first rows of the squad-relative player statistics.
fbref_meandiff_df.head()

Unnamed: 0,_Player_,_Squad_,_Born_,defense__Rk,defense__Nation,defense__Pos,defense__Age,defense__90s,defense_Tackles_Tkl,defense_Tackles_TklW,...,stats_Per90Minutes_G+A,stats_Per90Minutes_G-PK,stats_Per90Minutes_G+A-PK,stats_Per90Minutes_xG,stats_Per90Minutes_xAG,stats_Per90Minutes_xG+xAG,stats_Per90Minutes_npxG,stats_Per90Minutes_npxG+xAG,stats_Matches,yob
1,Sargis Adamyan,Köln,1993,2.0,am ARM,FWMF,29.0,7.2,-0.573604,-0.646067,...,0.42,0.14,0.42,0.47,0.11,0.58,0.47,0.58,Matches,1993
2,Karim Adeyemi,Dortmund,2002,3.0,de GER,FWMF,20.0,15.4,-0.443223,-0.334395,...,0.71,0.39,0.71,0.33,0.24,0.57,0.28,0.52,Matches,2002
3,Amine Adli,Leverkusen,2000,4.0,ma MAR,FWMF,22.0,15.9,-0.331878,-0.348659,...,0.5,0.31,0.5,0.31,0.11,0.42,0.31,0.42,Matches,2000
5,Naouirou Ahamada,Stuttgart,2002,6.0,fr FRA,MF,20.0,15.5,-0.404814,-0.338346,...,0.26,0.13,0.26,0.05,0.08,0.13,0.05,0.13,Matches,2002
6,Ludovic Ajorque,Mainz 05,1994,7.0,fr FRA,FWMF,28.0,13.5,-0.788104,-0.716418,...,0.52,0.45,0.52,0.34,0.13,0.47,0.34,0.47,Matches,1994


In [158]:
# List the statistic columns that still contain at least one missing value.
null_counts = fbref_meandiff_df[all_stats_columns].isnull().sum()
null_counts.index[null_counts > 0]

Index(['shooting_Standard_Dist'], dtype='object')

In [160]:
# Number of rows with a missing shooting_Standard_Dist value.
fbref_meandiff_df['shooting_Standard_Dist'].isna().sum()

15

In [163]:
# Drop the rows with a missing shooting_Standard_Dist. Reassign rather than use
# inplace=True, which pandas discourages and which prevents method chaining.
fbref_meandiff_df = fbref_meandiff_df.dropna(subset=['shooting_Standard_Dist'])

In [164]:
# Keep only the identifying columns followed by the statistic columns.
fbref_meandiff_df = fbref_meandiff_df.loc[
    :, ['_Player_', '_Born_', '_Squad_'] + all_stats_columns
]

In [166]:
# Persist the squad-relative player statistics for this season.
fbref_meandiff_df.to_csv(path_or_buf=f'data/{season}_FBRef_meandiff.csv')

## 03. Aggregate original pre-transformation data per team for team-level statistics.

In [167]:
# Per-squad mean of the untransformed statistics (fbref_combined_df, not the
# squad-relative frame).
team_stats_df = fbref_combined_df.groupby('_Squad_')[all_stats_columns].agg('mean')

In [169]:
# List the team-level columns that contain at least one missing value.
team_stats_df.columns[team_stats_df.isna().any()]

Index([], dtype='object')

In [168]:
# Persist the team-level averages for this season.
team_stats_df.to_csv(path_or_buf=f'data/{season}_FBRef_teams.csv')

## 04. Apply PCA to reduce the dimensionality of player statistics separately for each statistic group:
- defense
- touches
- passing
- progression
- attack
- miscellaneous

In [119]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep as many components as
# are needed to explain at least that fraction of the variance (here 80%).
pca = PCA(n_components=0.8)

In [125]:
# Trial fit on the attack column group. PCA rejects NaN input, so these columns
# of fbref_meandiff_df must be free of missing values before this cell runs.
pca.fit_transform(fbref_meandiff_df[attack_columns])

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [124]:
# Fraction of the total variance explained by each retained component.
pca.explained_variance_ratio_

array([0.62728758, 0.23231409])

In [None]:
# Reduce each statistic group to its principal components, fitted separately
# per group. The previous cell body was an unfinished assignment (SyntaxError)
# followed by a copy of the squad-mean transform.
pca_column_groups = {
    'defense': defense_columns,
    'touches': touches_columns,
    'passing': passing_columns,
    'progression': progres_columns,
    'attack': attack_columns,
    'misc': misc_columns,
}

# Start from the identifying columns, then append one block of components per group.
pca_frames = [fbref_meandiff_df[['_Player_', '_Born_', '_Squad_']]]

for group_name, column_group in pca_column_groups.items():
    # Fresh estimator per group so the number of retained components (80% of
    # the variance, as in the trial fit above) is chosen independently.
    # NOTE(review): PCA raises on NaN/inf input — confirm the squad-relative
    # transform cannot produce inf (e.g. a squad mean of 0) for these columns.
    scores = PCA(n_components=0.8).fit_transform(fbref_meandiff_df[column_group])
    pca_frames.append(pd.DataFrame(
        scores,
        index=fbref_meandiff_df.index,
        columns=[f'{group_name}_pc{i + 1}' for i in range(scores.shape[1])],
    ))

fbref_pca_df = pd.concat(pca_frames, axis=1)

## 06. Combine FBRef statistics with Football Manager (FM) and FC24 (FC) data

In [8]:
# load FM data; derive date and year of birth from the 'Date of birth' column
FM_data = (
    pd.read_csv('./data/FM 2023.csv')
    .assign(dob=lambda df: pd.to_datetime(df['Date of birth']),
            yob=lambda df: df['dob'].dt.year)
)
# load FC data; keep the 2022-09-26 update only, then parse dob and derive yob
FC_data = (
    pd.read_csv('./data/FC24.csv')
    .loc[lambda df: df['update_as_of'] == '2022-09-26']
    .assign(dob=lambda df: pd.to_datetime(df['dob']),
            yob=lambda df: df['dob'].dt.year)
)

In [9]:
# Group the FBRef outfield players by year of birth; the grouped object is passed
# to get_best_match below — presumably so candidates are restricted to the same
# birth year (confirm in modules/fm_functions.py).
fbref_combined_outfielders_grouped = fbref_combined_outfielders_df.groupby('yob')

In [10]:
def _match_fm_row(row):
    """Look up the FBRef player name matching one FM row via get_best_match."""
    return get_best_match(
        row['Name'], row['yob'],
        grouped_df=fbref_combined_outfielders_grouped,
        player_col='_Player_', threshold=80)


# Row-wise matching with a progress bar; the result is stored as 'name_match'.
FM_data['name_match'] = FM_data.progress_apply(_match_fm_row, axis=1)

100%|██████████| 8452/8452 [02:04<00:00, 68.16it/s] 


In [11]:
def _match_fc_row(row):
    """Look up the FBRef player name matching one FC row via get_best_match."""
    return get_best_match(
        row['long_name'], row['yob'],
        grouped_df=fbref_combined_outfielders_grouped,
        player_col='_Player_', threshold=80)


# Row-wise matching with a progress bar; the result is stored as 'name_match'.
FC_data['name_match'] = FC_data.progress_apply(_match_fc_row, axis=1)

100%|██████████| 18533/18533 [06:41<00:00, 46.20it/s] 


In [12]:
# combine FM and FC dataframes with FBRef
# Inner joins: a row survives only when the FBRef player name equals the matched
# name and the FBRef birth year equals the game data's year of birth.
FM_fbref_combined_df = pd.merge(
    fbref_combined_outfielders_df, FM_data,
    how='inner',
    left_on=['_Player_', '_Born_'],
    right_on=['name_match', 'yob'],
)
FC_fbref_combined_df = pd.merge(
    fbref_combined_outfielders_df, FC_data,
    how='inner',
    left_on=['_Player_', '_Born_'],
    right_on=['name_match', 'yob'],
)

In [13]:
# Keep the player name, the FBRef statistics and the game-specific rating columns.
FM_fbref_combined_df = FM_fbref_combined_df.loc[
    :, ['_Player_'] + all_stats_columns + FM_columns
]
FC_fbref_combined_df = FC_fbref_combined_df.loc[
    :, ['_Player_'] + all_stats_columns + FC_columns
]

In [14]:
# store the combined files, one CSV per rating source for this season
FM_fbref_combined_df.to_csv(path_or_buf=f'data/FM_fbref_combined_data_{season}.csv')
FC_fbref_combined_df.to_csv(path_or_buf=f'data/FC_fbref_combined_data_{season}.csv')