In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.ensemble._iforest import _average_path_length
import shap
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
import os
from pandas.api.types import is_numeric_dtype
pd.set_option('display.max_columns', None)

# Load the combined fbref/transfermarkt data

In [None]:
# Read the merged fbref + transfermarkt player table from the local data folder.
combined_path = os.path.join('data', 'fbref_tm_combined.parquet')
df = pd.read_parquet(combined_path)

In [None]:
# Columns removed before outlier detection:
#   * "negative" stats we do not want to single players out for (cards, fouls, ...)
#   * penalty stats: penalties are rare, so they dominate the outliers and hide
#     more interesting findings
# Candidates considered but currently kept in the data:
#   'vs_dribbles_att', 'playing_time_mn/mp', 'outcomes_out', 'outcomes_int',
#   'pass_types_press', 'body_parts_right', 'body_parts_other',
#   'body_parts_head', 'a-xa', 'expected_np:g-xg'
to_drop = [
    'performance_off',
    'performance_crdy',
    'performance_2crdy',
    'performance_crdr',
    'performance_fls',
    'performance_og',
    'performance_pkcon',
    'performance_pkwon',
    'standard_fk',
    'pass_types_fk',
    'outcomes_off',
    'outcomes_blocks',
    'standard_pk',
    'short_att',
    'challenges_lost',
    'year',
    'take_ons_tkld',
    'expected_np_g_minus_xg',
    'aerial_duels_lost',
    'carries_mis',
    'carries_dis',
    'err',
    'standard_pkatt',
]
df = df.drop(columns=to_drop)

# Drop centre backs / goalkeepers

Centre-back stats are notoriously hard to interpret, and goalkeepers are judged on a different set of stats, so we drop both here.

In [None]:
# A player is removed only when both sources agree: transfermarkt lists them
# as a centre-back or goalkeeper AND the fbref position string contains GK or DF.
is_cb_or_gk_tm = df.position_tm.isin(['Centre-Back', 'Goalkeeper'])
is_gk_or_df_fbref = df.pos.str.contains('GK') | df.pos.str.contains('DF')
mask_remove = is_cb_or_gk_tm & is_gk_or_df_fbref
df = df.loc[~mask_remove].copy()

# Goalkeeper-only stats are dropped along with the goalkeepers.
gk_cols = [
    'performance_sota', 'performance_saves',
    'performance_w', 'performance_d', 'performance_l',
    'performance_cs',
    'penalty_kicks_pkatt', 'penalty_kicks_pksv', 'penalty_kicks_pkm',
    'goals_ga', 'goals_pka', 'goals_fk', 'goals_ck', 'goals_og',
    'expected_psxg', 'expected_psxg_sot', 'expected_psxg_plus_minus',
    'launched_cmp', 'launched_att',
    'passes_att_gk', 'passes_thr', 'passes_avglen',
    'goal_kicks_att', 'goal_kicks_avglen',
    'crosses_opp', 'crosses_stp',
    'sweeper_opa', 'sweeper_avgdist',
    'crosses_stp_percent', 'launched_cmp_percent',
    'performance_cs_percent', 'penalty_kicks_save_percent',
]
df = df.drop(columns=gk_cols)

# Drop players with fewer than 675 minutes played over last 3 years

This is an arbitrary cut-off point (around 7.5 games)

In [None]:
# Intentionally disabled: the minimum-minutes filter below is already applied
# in the 05_merge_transfermarkt_and_fbref notebook, so it is kept here for reference only.
#df = df[df.playing_time_min >= 675].copy()
#df.reset_index(drop=True, inplace=True)

# Add a code for the transfermarkt position

Note that fbref and transfermarkt sometimes disagree on a player's position, but the two sources are broadly consistent.

In [None]:
# Numeric position group derived from the transfermarkt position string:
# 0 = midfield, 1 = back, 2 = striker / winger / forward.
# Positions matching none of the keywords keep a missing (NaN) code.
position_keyword_codes = (
    ('Midfield', 0),
    ('Back', 1),
    ('Striker', 2),
    ('Winger', 2),
    ('Forward', 2),
)
for keyword, code in position_keyword_codes:
    df.loc[df.position_tm.str.contains(keyword), 'position_code'] = code

# Reset the index

In [None]:
# Renumber the rows 0..n-1 after the filtering above; the old index is discarded.
df = df.reset_index(drop=True)

# Check for highly correlated columns and drop them

In [None]:
# Columns judged to be highly correlated with a stat that is kept. They are
# highlighted in red on the dendrogram and then dropped from the data.
corr_cols = [
    'pass_types_live', 'medium_att', 'total_att', 'receiving_rec',
    'long_att', 'medium_cmp_percent', 'pass_types_crs', 'touches_def_pen',
    'tackles_tkl', 'blocks_pass', 'tackles_mid_3rd', 'expected_xg',
    'standard_sh', 'standard_g/sot', 'expected_g_minus_xg', 'kp',
    'sca_types_passlive', 'team_success_plus_minus', 'gca_types_passlive',
    'team_success_ong', 'carries_totdist', 'medium_cmp', 'total_cmp',
    'tkl_plus_int', 'team_success_xg_onxg', 'standard_sot',
    'touches_mid_3rd', 'corner_kicks_in', 'corner_kicks_out',
    'corner_kicks_str', 'touches_def_3rd', 'team_success_onga',
    'take_ons_succ', 'gca_gca', 'gca_types_passdead', 'sca_sca',
    'standard_gls', 'ast', 'pass_types_dead', 'gca_types_to',
    'gca_types_sh', 'gca_types_fld', 'short_cmp_percent', 'total_totdist',
    'total_cmp_percent', 'challenges_att',
    # gca_types_def is dropped because every other gca stat is already gone.
    # It is probably less correlated with sca_types_def than the others are
    # with their sca counterparts: there are relatively few sca_types_def
    # actions, so the resulting goals vary more.
    'gca_types_def',
    'performance_g_plus_a', 'expected_npxg_plus_xag',
    'performance_g_minus_pk', 'touches_att_pen', 'expected_a_minus_xag',
    'carries_1_3', 'carries_prgdist', 'long_cmp', 'touches_live',
    'take_ons_att', 'pass_types_ck', 'pass_types_ti', 'outcomes_cmp',
    'touches_touches',
]

In [None]:
# Collect every numeric column, then take out the ones that describe the player
# or their playing time rather than their performance.
numeric_cols = []
for col in df.columns:
    if is_numeric_dtype(df[col]):
        numeric_cols.append(col)

ignore_cols = [
    'age', 'born',
    'playing_time_mp', 'playing_time_min', 'playing_time_90s',
    'starts_compl', 'subs_subs', 'subs_unsub',
    'height_tm', 'market_value_euro_millions_tm', 'age_at_2024_07_01_tm',
    'position_code', 'playing_time_starts',
]
# list.remove raises ValueError if an ignored column is not present, which
# surfaces upstream schema changes early.
for col in ignore_cols:
    numeric_cols.remove(col)

In [None]:
# Hierarchical clustering of the stats using (1 - correlation) as the distance.
# Approach from:
# https://stackoverflow.com/questions/34175462/dendrogram-using-pandas-and-scipy
feature_corr = df[numeric_cols].corr()
corr_condensed = squareform(1 - feature_corr)
z = linkage(corr_condensed, method='average')

fig, ax = plt.subplots(figsize=(16, 50))
labels = feature_corr.columns.tolist()
dend = dendrogram(z, orientation='left', labels=labels, leaf_font_size=16, ax=ax)

# Highlight in red the stats that are about to be dropped as highly correlated.
cols_marked_for_drop = set(corr_cols)
for tick_label in ax.get_yticklabels():
    if tick_label.get_text() in cols_marked_for_drop:
        tick_label.set_color('red')

In [None]:
# Remove the highly correlated stats highlighted in red on the dendrogram.
df = df.drop(columns=corr_cols)

# Rename columns to more descriptive names

In [None]:
# Mapping from raw stat column names to the human-readable labels used from
# here on. The values of this dict also define which columns (and in which
# order) form the feature matrix for the isolation forest.
# NOTE: the order is load-bearing. 'position' must stay the LAST entry because
# later cells select the stat columns with `X_copy.columns[:-1]`.
rename_cols = {'team_success_xg_onxga': 'xG allowed by team while on pitch',
               'team_success_xg_xg_plus_minus': 'xG scored minus allowed by team while on pitch',               
               'tackles_tklw': 'tackles won',
               'tackles_def_3rd': 'tackles (def 3rd)',
               'tackles_att_3rd': 'tackles (att 3rd)',
               'challenges_tkl': 'dribblers tackled',
               'blocks_blocks': 'blocks',
               'blocks_sh': 'shots blocked',
               'int': 'interceptions',
               'clr': 'clearances',
               'sca_types_passdead': 'dead-ball pass leading to a shot attempt',
               'sca_types_to': 'dribbles leading to a shot attempt',
               'sca_types_sh': 'shots leading to another shot attempt',
               'sca_types_fld': 'fouls drawn leading to a shot attempt',
               'sca_types_def': 'defensive actions leading to a shot attempt',
               'performance_fld': 'fouls drawn',
               'performance_recov': 'ball recoveries',
               'aerial_duels_won': 'aerial duels won',
               'total_prgdist': 'progressive passing distance',
               'short_cmp': 'short passes completed (#)',               
               'expected_xa': 'expected assists',  
               'xag': 'expected assisted goals',    
               '1_3': 'passes into final third',
               'ppa': 'passes into penalty area',               
               'crspa': 'crosses into penalty area',
               'pass_types_tb': 'through balls',
               'pass_types_sw': 'switches',
               'touches_att_3rd': 'touches (# att 3rd)',
               'carries_carries': 'carries (#)',
               # NOTE(review): 'carries_prgc' looks like a count of progressive carries
               # (the distance column 'carries_prgdist' is dropped as correlated), so
               # this label may be misleading. The same string is used in the rounding
               # cell, so any relabel has to change both places together -- TODO confirm.
               'carries_prgc': 'carries progressive distance',              
               'carries_cpa': 'carries into penalty area',
               'receiving_prgr': 'progressive passes received (#)',
               'prgp': 'progressive passes (#)',
               'expected_npxg': 'xG (non-penalty)',
               'standard_dist': 'shot distance (average)',
               'challenges_tkl_percent': '% of dribblers tackled',
               'aerial_duels_won_percent': '% of aerials won',
               'long_cmp_percent': '% of long passes completed',
               'take_ons_succ_percent': '% of dribbles successful',
               'standard_g/sh': 'goals per shot',
               'expected_npxg/sh': 'xG per shot (non-penalty)',
               'standard_sot_percent': '% of shots on target',
               # keep last: the numeric position group (0 midfield, 1 back, 2 forward)
               'position_code': 'position',
              }
df.rename(rename_cols, axis=1, inplace=True)

# Truncate poor stats so that players are not flagged as outliers for being bad (clip the bottom quantile where more is better, and the top quantile where fewer is better)

In [None]:
# Build the feature matrix X for the isolation forest and clip "bad" values so
# that only unusually GOOD stats can make a player stand out.
X = df[list(rename_cols.values())]
X_copy = X.copy() # Untouched copy (still contains NaN) used later for display values and percentiles
X = X.fillna(0)
cols_fewer_better = ['xG allowed by team while on pitch', 'shot distance (average)']
cols_more_better = list(set(X.columns) - set(cols_fewer_better) - set(['position']))
# for the fewer-is-better stats: values above the median (top 50%) are replaced by the median
value_1 = 0.5
mask1 = X[cols_fewer_better] > X[cols_fewer_better].quantile(value_1)
# for all other stats: values below the 40% quantile (bottom 40%) are replaced by that quantile
value_2 = 0.4
mask2 = X[cols_more_better] < X[cols_more_better].quantile(value_2)
# 'position' is in neither mask; it appears to be left unchanged by the in-place
# mask below -- NOTE(review): this relies on how pandas aligns a partial mask
# when inplace=True, so confirm before switching to a non-inplace call.
mask = pd.concat([mask1, mask2], axis='columns')
quantile_values = pd.concat([X[cols_fewer_better].quantile(value_1), X[cols_more_better].quantile(value_2)])
X.mask(mask, quantile_values, axis='columns', inplace=True)
# replace the few average shot distances that are zero (players with zero shots)
# with the maximum distance so they are not flagged as outliers.
# The maximum is taken AFTER the clipping above, so it is the capped value.
max_dist = X['shot distance (average)'].max()
X.loc[X['shot distance (average)'] == 0, 'shot distance (average)'] = max_dist

# Find outliers using isolation forest

In [None]:
# Fit the isolation forest on the clipped feature matrix. The seed is fixed so
# the outlier ranking is reproducible.
clf = IsolationForest(random_state=42, contamination=0.1)
# IsolationForest.fit returns the fitted estimator itself, not predictions, so
# its return value is not bound to a (misleadingly named) prediction variable.
clf.fit(X)
# Anomaly score per player; the notebook later sorts ascending and treats the
# lowest scores as the strongest outliers.
forest_score = clf.score_samples(X)

# Calculate shap values (to explain the outliers)

In [None]:
# Per-player, per-stat SHAP contributions for the fitted isolation forest,
# wrapped in a frame that has one column per feature of X.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X)
df_shap = pd.DataFrame(data=shap_values, columns=X.columns)

# Round to 1 or 2 decimal places

In [None]:
# Display rounding for the untouched stat values.
# Despite its name, cols_round_int is rounded to ONE decimal place; every other
# stat (except the position code) is rounded to two.
cols_round_int = [
    'progressive passing distance',
    'short passes completed (#)',
    'touches (# att 3rd)',
    'carries (#)',
    'carries progressive distance',
    'progressive passes received (#)',
    'shot distance (average)',
    '% of dribblers tackled',
    '% of aerials won',
    '% of long passes completed',
    '% of dribbles successful',
    '% of shots on target',
]
other_cols = [col for col in X_copy.columns
              if col not in cols_round_int and col != 'position']
for columns, decimals in ((cols_round_int, 1), (other_cols, 2)):
    X_copy[columns] = X_copy[columns].round(decimals)

# Calculate percentiles for each player based on position (back, midfield, forward)

In [None]:
# Percentile rank (0-100) of every stat within the player's position group.
# Missing stats are ranked as zeros. Every column except the trailing
# 'position' column is ranked.
stat_cols = X_copy.columns[:-1]
X_position_percentile = ((X_copy
                         .fillna(0)
                         .groupby('position')[stat_cols]
                         .rank(pct=True) * 100).round(0).astype(np.int32))
# The position column has no percentile; it is kept only so the column layout
# matches X_copy.
X_position_percentile['position'] = np.nan
# reverse where better is lower
for lower_is_better in ('xG allowed by team while on pitch', 'shot distance (average)'):
    X_position_percentile[lower_is_better] = 100 - X_position_percentile[lower_is_better]
# A player with no shots has no meaningful shot distance, so their percentile
# is forced to zero. X_copy is the pre-fillna copy, so a missing distance can
# be stored as NaN as well as 0. Both must be caught: otherwise the NaN rows
# (ranked as 0 above, then reversed) would end up near the 100th percentile.
no_shot_distance = X_copy['shot distance (average)'].fillna(0) == 0
X_position_percentile.loc[no_shot_distance, 'shot distance (average)'] = 0

# Get lower (0.05) and upper (0.95) quantile values for each position

In [None]:
# 5% and 95% quantile of every stat per position group (all columns except the
# trailing 'position' column).
stat_cols = X_copy.columns[:-1]
stats_by_position = X_copy.groupby('position')[stat_cols]
lower = stats_by_position.quantile(0.05)
upper = stats_by_position.quantile(0.95)

# The groups come out sorted by position code: 0 = midfield, 1 = back, 2 = forward.
position_names = ['midfield', 'back', 'forward']
lower.index = [f'{name}_lower' for name in position_names]
upper.index = [f'{name}_upper' for name in position_names]

# swap xG allowed as lower is better!
# NOTE(review): 'shot distance (average)' is also treated as lower-is-better
# elsewhere in this notebook but is not swapped here -- confirm this is intended.
swap_col = 'xG allowed by team while on pitch'
swap_lower, swap_upper = lower[swap_col].values.copy(), upper[swap_col].values.copy()
lower[swap_col] = swap_upper
upper[swap_col] = swap_lower

# One frame with both bounds, rows ordered alphabetically by label.
df_quantile = pd.concat([lower, upper]).sort_index()

# Get the names of the stats with the smallest (most negative) SHAP values, along with their values and percentiles

In [None]:
# Build one wide row per player listing their stats ordered from the most
# negative SHAP value to the most positive, as (name, value, percentile) triples.
# Row-wise argsort approach from:
# https://stackoverflow.com/questions/48764923/find-the-column-name-which-has-the-2nd-maximum-value-for-each-row-pandas
# sort_arr[r, k] = column position of the k-th smallest SHAP value in row r
sort_arr = np.argsort(df_shap.values, axis=1)
df_stat_name = pd.DataFrame(np.array(df_shap.columns)[sort_arr], index=df_shap.index,
                            columns=[f'stat_{i}_name' for i in range(1, sort_arr.shape[1] + 1)])
# then get values
# Turn the per-row column positions into positions in the row-major flattened
# array (row * n_columns + column) so a single fancy-index lookup reorders every row.
# Assumes X_copy and X_position_percentile have the same column order as df_shap.
sort_arr = (sort_arr.T + (np.arange(0, sort_arr.shape[0]) * sort_arr.shape[1])).T
df_stat_values = pd.DataFrame(X_copy.values.ravel()[sort_arr].reshape(X_copy.shape),
                              columns=[f'stat_{i}_value' for i in range(1, sort_arr.shape[1] + 1)])
df_percentile_values = pd.DataFrame(X_position_percentile.values.ravel()[sort_arr].reshape(X_position_percentile.shape),
                                    columns=[f'stat_{i}_percentile' for i in range(1, sort_arr.shape[1] + 1)])

# merge them together (index-on-index join; layout becomes all names, then all values, then all percentiles)
df_stat_values = df_stat_name.merge(df_stat_values, left_index=True, right_index=True)
df_stat_values = df_stat_values.merge(df_percentile_values, left_index=True, right_index=True)
# reorder the columns so it goes stat name, value, percentile, then the next stat's name, value, percentile, etc.
num_col = sort_arr.shape[1]
# positions i, i + num_col, i + 2 * num_col for i = 0 .. num_col - 1
col_order = np.repeat(np.arange(num_col), 3) + np.tile(np.array([0, num_col, num_col * 2]), num_col)
col_order = df_stat_values.columns[col_order]
df_stat_values = df_stat_values[col_order].copy()

# Create final dataframe

In [None]:
# Keep only the metadata columns (everything that was not used as a model
# feature), then attach the outlier score and the ordered stat table.
feature_labels = set(rename_cols.values())
keep_cols = [c for c in df.columns if c not in feature_labels]
df = df[keep_cols].copy()
df['outlier_score'] = forest_score
df = df.merge(df_stat_values, left_index=True, right_index=True, validate='1:1')

# Human-readable position group derived from the transfermarkt position string.
position_keyword_labels = (
    ('Midfield', 'midfield'),
    ('Back', 'back'),
    ('Striker', 'forward'),
    ('Winger', 'forward'),
    ('Forward', 'forward'),
)
for keyword, label in position_keyword_labels:
    df.loc[df.position_tm.str.contains(keyword), 'position'] = label

# Backs where their position contributes towards them being an outlier

In [None]:
# Backs whose 'position' feature has a negative SHAP value, i.e. being a back
# pushes them towards being flagged as an outlier.
is_back = df['position'] == 'back'
position_pushes_outlier = df_shap['position'] < 0
index_interesting_defenders = df_shap[is_back & position_pushes_outlier].index
df[df.index.isin(index_interesting_defenders)]

# Midfield where their position contributes most towards them being an outlier

In [None]:
# Midfielders whose 'position' feature has a negative SHAP value (it pushes
# them towards being an outlier), strongest contribution first.
midfield_position_shap = df_shap.loc[(df['position'] == 'midfield') &
                                     (df_shap['position'] < 0), 'position']
index_interesting_midfielders = midfield_position_shap.sort_values().index
# Select rows by label in SHAP order. Filtering with `df.index.isin(...)` would
# silently fall back to the frame's own row order, so head(10) would not show
# the ten strongest contributions. Labels missing from df are skipped, as
# isin would have done.
index_interesting_midfielders = index_interesting_midfielders[
    index_interesting_midfielders.isin(df.index)]
df.loc[index_interesting_midfielders].head(10)

# Wingers/ forwards where their position contributes most towards them being an outlier

In [None]:
# Wingers/forwards whose 'position' feature has a negative SHAP value (it
# pushes them towards being an outlier), strongest contribution first.
forward_position_shap = df_shap.loc[(df['position'] == 'forward') &
                                    (df_shap['position'] < 0), 'position']
index_interesting_forwards = forward_position_shap.sort_values().index
# Select rows by label in SHAP order. Filtering with `df.index.isin(...)` would
# silently fall back to the frame's own row order, so head(10) would not show
# the ten strongest contributions. Labels missing from df are skipped, as
# isin would have done.
index_interesting_forwards = index_interesting_forwards[
    index_interesting_forwards.isin(df.index)]
df.loc[index_interesting_forwards].head(10)

# Sort by the score

In [None]:
# Lowest score first, so the strongest outliers sit at the top; rows are then
# renumbered 0..n-1.
df = df.sort_values('outlier_score').reset_index(drop=True)

# Rename some columns

In [None]:
# Give the metadata columns explicit names that say which source (fbref or
# transfermarkt) they came from.
output_column_names = {
    # transfermarkt
    'jersey_number_tm': 'jersey_number_transfermarkt',
    'player_tm': 'player_name_transfermarkt',
    'nationality_tm': 'nationality_transfermarkt',
    'signed_from_tm': 'signed_from_transfermarkt',
    # fbref
    'age': 'age_fbref',
    'player': 'player_name_fbref',
    # transfermarkt
    'dob_tm': 'dob_transfermarkt',
    'position_tm': 'position_transfermarkt',
    # fbref
    'pos': 'position_fbref',
    # transfermarkt
    'height_tm': 'height_transfermarkt',
    'foot_tm': 'foot_transfermarkt',
    'squad_tm': 'squad_transfermarkt',
    'league_tm': 'league_transfermarkt',
    'joined_tm': 'joined_transfermarkt',
    'market_value_euro_millions_tm': 'market_value_euros_millions_transfermarkt',
    'player_link_tm': 'player_link_transfermarkt',
    # fbref (the first entry maps the name to itself, i.e. it is a no-op)
    'player_link_fbref': 'player_link_fbref',
    'nation': 'nation_fbref',
    'squad': 'squad_fbref',
    'league': 'league_fbref',
    'match_link': 'match_link_fbref',
    # NOTE(review): the source column is dated 2024-07-01 but the output name
    # says 2021-07-01. Later cells (and possibly downstream consumers of the
    # saved files) use the 2021 name, so it is kept as-is -- confirm which date is correct.
    'age_at_2024_07_01_tm': 'age_years_at_2021_07_01_transfermarkt',
    'born': 'born_fbref',
}
df = df.rename(columns=output_column_names)

# Show top 30 outliers

In [None]:
# The frame is sorted by outlier score, so the first 30 rows are the strongest outliers.
df.iloc[:30]

# Show top 30 outliers aged 26 or under and valued at 15 mill or under

In [None]:
# Strongest outliers among players aged 26 or under and valued at 15 million euros or less.
is_young = df.age_years_at_2021_07_01_transfermarkt <= 26
is_affordable = df.market_value_euros_millions_transfermarkt <= 15
df[is_young & is_affordable].head(30)

# Show older outliers with a low valuation

In [None]:
# Strongest outliers among players older than 26 but younger than 30 who are
# valued below 5 million euros.
is_cheap = df.market_value_euros_millions_transfermarkt < 5
is_over_26 = df.age_years_at_2021_07_01_transfermarkt > 26
is_under_30 = df.age_years_at_2021_07_01_transfermarkt < 30
df[is_cheap & is_over_26 & is_under_30].head(15)

# Save to Excel and parquet

In [None]:
# Write the ranked outlier table as a spreadsheet in the data folder.
excel_path = os.path.join('data', 'outliers.xlsx')
df.to_excel(excel_path)

In [None]:
# Write the ranked outlier table and the per-position quantile bounds as parquet files.
parquet_outputs = (
    (df, 'outliers.parquet'),
    (df_quantile, 'quantiles.parquet'),
)
for frame, filename in parquet_outputs:
    frame.to_parquet(os.path.join('data', filename))