# 01. Data Consolidation

1. Merge separate files for each league and statistic type
2. Combine FBRef statistics with Football Manager and FC data
3. Ensure consistent player and team naming across all datasets

In [1]:
# Change the working directory to the repo home when the notebook is started
# from inside the `notebooks` folder, so the relative `data/...` paths resolve.
import os
# os.path.basename handles the platform's path separator; splitting on '\\'
# only worked for Windows-style paths.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

In [2]:
from modules.fm_functions import *
import warnings
warnings.filterwarnings(action='ignore')

## Combine all FBRef data into one dataframe

In [3]:
import pandas as pd

In [4]:
season = '2023'

# Read every file in data/FBRef_split that belongs to the chosen season into
# its own DataFrame, keyed by file name.
fbref_dfs = {}
for filename in os.listdir('data/FBRef_split'):
    # the last '_'-separated token of the file name must start with the season
    if filename.split('_')[-1][:4] != season:
        continue
    fbref_dfs[filename] = pd.read_csv(f'data/FBRef_split/{filename}', index_col=0)

In [5]:
# Combine FBRef dfs by table type first: for every type, stack the matching
# per-file frames and index the result by player / squad / year of birth.
table_types = ['defense', 'gca', 'keepersadv', 'keepers', 'misc', 'passingtypes',
               'passing', 'playingtime', 'possession', 'shooting', 'stats']
index_cols = ['_Player', '_Squad', '_Born']

fbref_dfs_types = {}

for t in table_types:
    # File names are split on '_': the second token is the table type and the
    # first token is used as the per-file key (presumably the league — confirm
    # against the files in data/FBRef_split).
    frames_for_type = {}
    for k, df in fbref_dfs.items():
        parts = k.split('_')
        if len(parts) > 1 and parts[1] == t:
            frames_for_type[parts[0]] = df

    if not frames_for_type:
        # pd.concat raises on an empty mapping; report the type and move on
        print(f'{t}: no files found, skipped')
        continue

    try:
        fbref_dfs_types[t] = pd.concat(
            frames_for_type, axis=0, ignore_index=True
        ).set_index(index_cols)
    except (KeyError, ValueError) as err:
        # Only successfully combined types are stored, so the later
        # column-wise concat receives DataFrames exclusively.
        print(f'{t}: could not be combined ({err}), skipped')

In [6]:
# Put all table types side by side (one column group per type), then flatten
# the two-level column labels into single strings joined by '_'.
wide_df = pd.concat(fbref_dfs_types, axis=1)
fbref_combined_df = wide_df.reset_index()
fbref_combined_df.columns = [
    '_'.join(str(level) for level in column_key)
    for column_key in fbref_combined_df.columns
]

## Combine with FM and FC data

In [186]:
# Keep non-goalkeeper rows with a known year of birth and add an integer
# `yob` column. The column is added with `assign` on the filtered result, so
# it lands on a fresh frame instead of being set on a slice of
# `fbref_combined_df` (the chained-assignment pattern pandas warns about).
is_outfielder = fbref_combined_df['stats__Pos'] != 'GK'
fbref_combined_outfielders_df = (
    fbref_combined_df[is_outfielder]
    # drop unknown YOB
    .dropna(subset=['_Born_'])
    .assign(yob=lambda df: df['_Born_'].astype('int'))
)

In [188]:
# load FM data and derive date / year of birth
FM_data = pd.read_csv('./data/FM 2023.csv')
FM_data['dob'] = pd.to_datetime(FM_data['Date of birth'])
FM_data['yob'] = FM_data['dob'].dt.year
# load FC data, keeping only the rows whose `update_as_of` is '2022-09-26'
FC_data = pd.read_csv('./data/FC24.csv')
# .copy() makes the filtered frame independent, so the derived columns below
# are not written onto a slice of the frame that was read from disk
FC_data = FC_data[FC_data['update_as_of'] == '2022-09-26'].copy()
FC_data['dob'] = pd.to_datetime(FC_data['dob'])
FC_data['yob'] = FC_data['dob'].dt.year

In [201]:
from fuzzywuzzy import process

# Group the FBRef outfielders by year of birth once, so every lookup only
# fuzzy-matches against players born in the same year.
fbref_combined_outfielders_grouped = fbref_combined_outfielders_df.groupby('yob')


def get_best_match(name, yob, threshold=80):
    """Return the FBRef player name that best fuzzy-matches `name`.

    Candidates are the `_Player_` values of the FBRef outfielders whose
    `yob` equals the given year.

    Args:
        name: Player name to look up.
        yob: Year of birth used to select the candidate group.
        threshold: Minimum fuzzy-match score for a match to be accepted.

    Returns:
        The best-matching FBRef player name, or None when the name is not a
        string, no FBRef player has that year of birth, or the best score is
        below `threshold`.
    """
    # A missing or non-string name (e.g. NaN from an empty cell) cannot be scored.
    if not isinstance(name, str):
        return None
    try:
        choices = fbref_combined_outfielders_grouped.get_group(yob)['_Player_']
    except KeyError:
        # no FBRef group for this year of birth
        return None
    best = process.extractOne(name, choices.values)
    if best is None:
        # extractOne found nothing to return
        return None
    match, score = best
    return match if score >= threshold else None

In [217]:
def _match_fm_row(row):
    """Look up the best FBRef name for one FM row (by `Name` and `yob`)."""
    return get_best_match(row['Name'], row['yob'], threshold=80)


# one fuzzy lookup per FM player; None where nothing scores high enough
FM_data['name_match'] = FM_data.apply(_match_fm_row, axis=1)

In [218]:
def _match_fc_row(row):
    """Look up the best FBRef name for one FC row (by `long_name` and `yob`)."""
    return get_best_match(row['long_name'], row['yob'], threshold=80)


# one fuzzy lookup per FC player; None where nothing scores high enough
FC_data['name_match'] = FC_data.apply(_match_fc_row, axis=1)

In [223]:
# Join FBRef rows to FM and to FC rows: the FBRef player name / birth year
# must equal the fuzzy-matched name / birth year; unmatched rows are dropped.
merge_keys = {
    'left_on': ['_Player_', '_Born_'],
    'right_on': ['name_match', 'yob'],
    'how': 'inner',
}
FM_fbref_combined_df = fbref_combined_outfielders_df.merge(right=FM_data, **merge_keys)
FC_fbref_combined_df = fbref_combined_outfielders_df.merge(right=FC_data, **merge_keys)

In [225]:
# Write both combined tables to CSV, with the season in the file name.
for combined_frame, csv_path in (
    (FM_fbref_combined_df, f'data/FM_fbref_combined_data_{season}.csv'),
    (FC_fbref_combined_df, f'data/FC_fbref_combined_data_{season}.csv'),
):
    combined_frame.to_csv(csv_path)