In [None]:
import pandas as pd
import numpy as np
import os
import glob
from pandas.api.types import is_numeric_dtype

# Combine the fbref data into a single dataframe and standardise stats to per 90

- We only keep players who have played in one of the in-scope leagues in the final year (2024).
- We keep one row per player and aggregate their stats over all the seasons in the data.

In [None]:
# Create the output folder tree: the root, one folder per season, and
# 'all_years' for the merged per-season files. exist_ok=True means an
# already-existing folder is not an error, so the cell can be re-run.
clean_root = os.path.join('data', 'fbref_clean')
os.makedirs(clean_root, exist_ok=True)
for subfolder in ('2024', '2023', '2022', 'all_years'):
    os.makedirs(os.path.join(clean_root, subfolder), exist_ok=True)

In [None]:
# Every sub-folder of data/fbref is treated as one league.
LEAGUES = [path for path in glob.glob(os.path.join('data', 'fbref', '*'))
           if os.path.isdir(path)]
# The stat tables read for every league and season.
FILES = ['keepersadv.parquet', 'gca.parquet', 'defense.parquet', 'playingtime.parquet',
         'passing.parquet', 'passing_types.parquet', 'stats.parquet', 'shooting.parquet',
         'keepers.parquet', 'misc.parquet', 'possession.parquet']
YEARS = [2022, 2023, 2024]

# For each season and stat table, stack the table from every league into one
# file and tag each row with the league (folder name) it was read from.
for year in YEARS:
    season = str(year)
    for file in FILES:
        league_tables = [
            pd.read_parquet(os.path.join(league, season, file))
              .assign(league=os.path.basename(league))
            for league in LEAGUES
        ]
        df = pd.concat(league_tables)
        df.to_parquet(os.path.join('data', 'fbref_clean', season, file))

In [None]:
# Build one wide table per season: the playing-time table is the base and
# every other stat table is left-joined onto it.
for year in YEARS:
    season_files = glob.glob(os.path.join('data', 'fbref_clean', str(year), '*'))
    playing_time_files = [path for path in season_files if 'playingtime' in path]
    other_files = [path for path in season_files if 'playingtime' not in path]

    # The first playing-time match is used as the left-hand side of every join.
    df = pd.read_parquet(playing_time_files[0])
    for path in other_files:
        # Columns that already exist keep their name; the repeated copy coming
        # from the right-hand table is tagged '_to_remove' and dropped below.
        df = df.merge(pd.read_parquet(path), on=['player_link', 'squad', 'league'],
                      how='left', suffixes=['', '_to_remove'])
    df = df.drop(columns=[col for col in df.columns if '_to_remove' in col])

    # NOTE(review): presumably these repeat stats that are kept under another
    # column name - confirm against the raw tables.
    df = df.drop(columns=[
        '90s', 'att', 'expected_xag', 'penalty_kicks_pka',
        'performance_ast', 'performance_crs', 'performance_ga',
        'performance_gls', 'performance_int', 'performance_pk',
        'performance_pkatt', 'performance_tklw',
        'progression_prgc', 'progression_prgp',
        'progression_prgr', 'starts_starts',
    ])

    # These columns are either impossible to re-create as averages or have to
    # be re-created later from the summed stats, so they are dropped here.
    drop_list = ['playing_time_min_percent', 'starts_mn_start', 'subs_mn_sub', 'team_success_xg_on_minus_off',
                 'team_success_on_minus_off', 'team_success_ppm', 'playing_time_mn_mp',
                 # these are ratios
                 'standard_g_sh', 'standard_g_sot', 'expected_npxg_sh']
    # Any column with '90' in its name goes too, except playing_time_90s.
    cols90 = [col for col in df.columns if '90' in col and col != 'playing_time_90s']
    # So does every column with 'percent' in its name.
    colspct = [col for col in df.columns if 'percent' in col]
    df = df.drop(columns=cols90 + colspct + drop_list)

    df['year'] = year
    df.to_parquet(os.path.join('data', 'fbref_clean', 'all_years', f'fbref_{year}.parquet'))

In [None]:
# Load every per-season table and stack them into one long frame
# (one row per player / squad / league / year at this point).
all_clean_files = glob.glob(os.path.join('data', 'fbref_clean', 'all_years', '*'))
df = pd.read_parquet(all_clean_files)
# NOTE(review): the notebook intro says only players who played in the final
# year are kept, but no such filter is applied in this cell - confirm where
# (or whether) that filter happens.

# An average cannot be summed across rows, so turn the average shot distance
# into a total distance; the average is rebuilt after aggregation.
df['shot_total_distance'] = df['standard_dist'] * df['standard_sh']

# Numeric columns are summed across a player's rows; age/born/year are numeric
# but describe the player or the season, so they are carried over instead.
cols_to_sum = [c for c in df.columns if is_numeric_dtype(df[c]) and c not in ['age', 'born', 'year']]
other_cols = [col for col in df.columns if col not in cols_to_sum]

# Split players into those with exactly one row and those with several.
has_multiple_rows = df.duplicated('player_link', keep=False)
df_one_club = df[~has_multiple_rows].copy()
df_multi_club = df[has_multiple_rows].sort_values(['year', 'playing_time_min']).copy()

# min_count=1 keeps a stat as NaN when it is missing in every one of the
# player's rows. The default (min_count=0) would turn it into 0, which is
# inconsistent with single-row players, whose missing stats stay NaN.
df_multi_club_sum = df_multi_club.groupby('player_link')[cols_to_sum].sum(min_count=1).reset_index()

# For the non-summed columns keep the last row after sorting, i.e. the latest
# year and, within that year, the row with the most minutes.
df_multi_club = df_multi_club[other_cols].drop_duplicates('player_link', keep='last').copy()
df_multi_club = df_multi_club.merge(df_multi_club_sum, on='player_link', validate='1:1')
df = pd.concat([df_one_club, df_multi_club])

# Rebuild the average shot distance from the summed total distance and shots;
# a player with no shots gets 0.
df['standard_dist'] = df['shot_total_distance'].divide(df['standard_sh']).round(1).fillna(0)
df.drop('shot_total_distance', axis='columns', inplace=True)

Add the percent/ratio columns

In [None]:
def _percent(numerator, denominator):
    """Return numerator / denominator as a percentage rounded to one decimal."""
    return (numerator.divide(denominator) * 100.).round(1)


# Aerial duels have no 'attempted' column, so the denominator is won + lost.
df['aerial_duels_won_percent'] = _percent(df['aerial_duels_won'],
                                          df['aerial_duels_won'] + df['aerial_duels_lost'])

# new percent column -> (numerator column, denominator column)
percent_columns = {
    'long_cmp_percent': ('long_cmp', 'long_att'),
    'medium_cmp_percent': ('medium_cmp', 'medium_att'),
    'short_cmp_percent': ('short_cmp', 'short_att'),
    'standard_sot_percent': ('standard_sot', 'standard_sh'),
    'total_cmp_percent': ('total_cmp', 'total_att'),
    'challenges_tkl_percent': ('challenges_tkl', 'challenges_att'),
    'crosses_stp_percent': ('crosses_stp', 'crosses_opp'),
    'launched_cmp_percent': ('launched_cmp', 'launched_att'),
    'penalty_kicks_save_percent': ('penalty_kicks_pksv', 'penalty_kicks_pkatt'),
    'performance_cs_percent': ('performance_cs', 'playing_time_mp'),
    'take_ons_succ_percent': ('take_ons_succ', 'take_ons_att'),
}
for new_col, (num_col, den_col) in percent_columns.items():
    df[new_col] = _percent(df[num_col], df[den_col])

# The goal ratios use goals with penalties subtracted.
goals_excl_pens = df['standard_gls'] - df['standard_pk']
df['standard_g/sh'] = goals_excl_pens.divide(df['standard_sh']).round(2)
df['expected_npxg/sh'] = df['expected_npxg'].divide(df['standard_sh']).round(2)
df['standard_g/sot'] = goals_excl_pens.divide(df['standard_sot']).round(2)

# fbref only lists a player as a leader above .395 shots per game; the same
# threshold is applied here, plus a minimum of 10 shots. Players below either
# threshold get no shot ratios.
too_few_shots = (df['standard_sh'].divide(df['playing_time_mp']).lt(.395)
                 | df['standard_sh'].lt(10))
df.loc[too_few_shots, ['standard_g/sh', 'expected_npxg/sh',
                       'standard_sot_percent']] = np.nan

# Same idea for shots on target: fbref's leader threshold is .111 shots on
# target per game, plus a minimum of 4 shots on target.
too_few_sot = (df['standard_sot'].divide(df['playing_time_mp']).lt(.111)
               | df['standard_sot'].lt(4))
df.loc[too_few_sot, 'standard_g/sot'] = np.nan

Standardise columns to per-90 stats unless they are ratio/percent stats

In [None]:
# Columns that must not be rescaled to per-90 values.
exclude_per_90 = ['playing_time_mp', 'playing_time_min', 'playing_time_90s',
                  'starts_compl', 'subs_subs', 'subs_unsub',
                  # average shot distance is a ratio, not a counting stat,
                  # so dividing it by minutes played would make it meaningless
                  'standard_dist']
# Build a new list instead of calling cols_to_sum.remove(...): mutating the
# list from the earlier cell made this cell raise ValueError when re-run.
# The 'in df.columns' check skips 'shot_total_distance', which was dropped
# from the frame after the aggregation.
cols_per_90 = [c for c in cols_to_sum
               if c not in exclude_per_90 and c in df.columns]
# NOTE: this rescales df in place, so run the cell only once per kernel -
# a second run would divide the already-rescaled values again.
df[cols_per_90] = (df[cols_per_90].divide(df['playing_time_min'], axis='rows') * 90.).round(3)

Save the dataset

In [None]:
# Replace the index carried over from concatenating the one-club and
# multi-club frames with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [None]:
# Write the final combined table next to the per-season folders.
combined_path = os.path.join('data', 'fbref_clean', 'fbref_combined.parquet')
df.to_parquet(combined_path)