In [16]:
import pandas as pd 
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [3]:
# Load the three inputs used by the rest of the notebook.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'rapm_input.pkl' if it was produced by a trusted step of this project.
df = pd.read_pickle('rapm_input.pkl')
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell shows pandas emitting
# a warning that points at this call (presumably the mixed-dtype DtypeWarning —
# confirm); reading the file in one pass avoids chunk-dependent dtypes.
gk = pd.read_csv('top5_standard_24.csv', low_memory=False)
player_mins = pd.read_csv('playermins.1724.csv')

  gk = pd.read_csv('top5_standard_24.csv')


In [12]:
# Force the minutes column to numeric; anything unparseable becomes NaN.
gk['Min_Playing'] = pd.to_numeric(gk['Min_Playing'], errors='coerce')


def _join_unique_positions(positions):
    """Return the distinct non-null position labels, sorted and comma-joined."""
    labels = positions.dropna().astype(str).unique()
    return ', '.join(sorted(labels))


# One row per (Player, Url): summed minutes and every position label seen.
gk_grouped = gk.groupby(['Player', 'Url'], as_index=False).agg(
    Min_Playing=('Min_Playing', 'sum'),
    Pos=('Pos', _join_unique_positions),
)

# Any player whose combined position string mentions 'GK' is treated as a keeper.
is_keeper = gk_grouped['Pos'].str.contains('GK', na=False)
gk_players = gk_grouped.loc[is_keeper, 'Player'].tolist()
print(f"Goalkeeper players to remove: {gk_players}")

# Keep only the indicator columns that actually exist in the design frame.
off_cols = [c for c in (f"{p}_offense" for p in gk_players) if c in df.columns]
def_cols = [c for c in (f"{p}_defense" for p in gk_players) if c in df.columns]
to_drop = [*off_cols, *def_cols]
print(f"Columns to drop: {to_drop}")

df = df.drop(columns=to_drop, errors='ignore')

Goalkeeper players to remove: ['Aaron Ramsdale', 'Aarón Escandell', 'Abdoulaye Diallo', 'Adrian Rodriguez', 'Adrian Šemper', 'Adrián', 'Agustín Marchesín', 'Aitor Fernández', 'Alaa Bellaarouch', 'Alban Lafont', 'Albano Bizzarri', 'Alberto Brignoli', 'Alberto Cifuentes', 'Alberto García', 'Alberto Paleari', 'Alejandro Primo', 'Aleksandar Jovanović', 'Alen Sherri', 'Alessandro Berardi', 'Alessandro Russo', 'Alessandro Sorrentino', 'Alessio Cragno', 'Alex Cordaz', 'Alex McCarthy', 'Alex Meret', 'Alex Padilla', 'Alex Palmer', 'Alexander Brunst', 'Alexander Meyer', 'Alexander Nübel', 'Alexander Schwolow', 'Alexandre Letellier', 'Alexandre Olliero', 'Alexandre Oukidja', 'Alfred Gomis', 'Alisson', 'Alphonse Areola', 'Altay Bayındır', 'Andrea Consigli', 'Andrea Seculin', 'Andreas Linde', 'Andreas Luthe', 'Andrey Lunyov', 'Andriy Lunin', 'André Ferreira', 'André Onana', 'Andrés Fernández', 'Andrés Prieto', 'Angelo da Costa Júnior', 'Angus Gunn', 'Anthony Lopes', 'Anthony Mandrea', 'Anthony Raci

In [13]:
# Players below this Min_Playing value are pooled into a single "replacement"
# pseudo-player instead of getting their own coefficients.
threshold = 2000
# NOTE(review): this compares each row of player_mins on its own. If the file
# holds several rows per player, one row below the threshold is enough to mark
# the player as replacement-level — confirm that is intended. It also assumes
# Min_Playing is already numeric in this file.
replacement_players = set(
    player_mins.loc[player_mins['Min_Playing'] < threshold, 'Player']
)
print(f"Found {len(replacement_players)} replacement-level players")

# Per-player indicator columns that are still present in the design frame.
off_cols = [f"{p}_offense" for p in replacement_players if f"{p}_offense" in df.columns]
def_cols = [f"{p}_defense" for p in replacement_players if f"{p}_defense" in df.columns]


def _pooled_indicator(frame, member_cols, pooled_col):
    """Return a 0/1 Series: 1 where any column in ``member_cols`` equals 1.

    If ``pooled_col`` already exists in ``frame`` its current flags are OR-ed
    in. Without this, re-running the cell after the per-player columns have
    been dropped would find no member columns and silently reset the pooled
    indicator to all zeros.
    """
    flag = (frame[member_cols] == 1).any(axis=1)
    if pooled_col in frame.columns:
        flag = flag | (frame[pooled_col] == 1)
    return flag.astype(int)


df['replacement_offense'] = _pooled_indicator(df, off_cols, 'replacement_offense')
df['replacement_defense'] = _pooled_indicator(df, def_cols, 'replacement_defense')

# The individual columns are now represented by the pooled indicators.
df = df.drop(columns=off_cols + def_cols)

Found 3638 replacement-level players


In [None]:
# Target variable: total_xG divided by minutes_played, scaled by 90.
# Rows where minutes_played is 0 or missing yield inf/NaN here.
xg_per_minute = df['total_xG'] / df['minutes_played']
df['xG_per_90'] = xg_per_minute * 90


In [15]:
# Every column whose name ends in '_offense' or '_defense' is a model feature.
feature_cols = [col for col in df.columns if col.endswith(('_offense', '_defense'))]

# Sparse design matrix built from the selected columns, plus the target vector.
X = csr_matrix(df.loc[:, feature_cols].values)
y = df['xG_per_90'].values

In [17]:
# Keep only the rows whose target is a finite number (drops NaN and +/-inf).
mask = np.isfinite(y)
X_clean, y_clean = X[mask], y[mask]

In [18]:
# Ridge regression settings, gathered in one place so they are easy to tune.
ridge_params = {
    'alpha': 2000,
    'solver': 'sparse_cg',
    'max_iter': 10000,
    'tol': 1e-3,
}
model = Ridge(**ridge_params)
model.fit(X_clean, y_clean)

In [21]:
# Fitted coefficients, labelled by the feature column they belong to.
coef_series = pd.Series(model.coef_, index=feature_cols)

# Long table: one row per feature column.
coef_df = pd.DataFrame({
    'feature': coef_series.index,
    'coefficient': coef_series.values,
})

# Split '<player>_<side>' on the LAST underscore so names containing
# underscores stay intact.
name_parts = coef_df['feature'].str.rsplit(pat='_', n=1, expand=True)
coef_df['player_name'] = name_parts[0]
coef_df['offense_or_defense'] = name_parts[1]
coef_df = coef_df[['player_name', 'offense_or_defense', 'coefficient']]

# Wide table: one row per player with separate offense / defense columns.
wide = coef_df.pivot(
    index='player_name', columns='offense_or_defense', values='coefficient'
).reset_index()

# A player missing one side gets a coefficient of 0 for it.
side_cols = ['offense', 'defense']
wide[side_cols] = wide[side_cols].fillna(0)

# Net rating is the offense coefficient minus the defense coefficient.
wide['net'] = wide['offense'].sub(wide['defense'])
wide = wide[['player_name', 'offense', 'defense', 'net']]

top_by_net = wide.sort_values('net', ascending=False).head(10)
print(top_by_net)

offense_or_defense         player_name   offense   defense       net
2588                Robert Lewandowski  0.361335 -0.142414  0.503748
1684                     Kylian Mbappé  0.426097 -0.037718  0.463815
3033                   Virgil van Dijk  0.311477 -0.147900  0.459377
2759                      Serge Gnabry  0.391004 -0.051533  0.442537
205                   Andrew Robertson  0.214139 -0.220923  0.435062
1736                        Leroy Sané  0.276371 -0.156439  0.432811
1044                      Gerard Piqué  0.312873 -0.114306  0.427178
1485                    Joshua Kimmich  0.250162 -0.168708  0.418870
2170                     Mohamed Salah  0.245528 -0.162247  0.407774
3206                    İlkay Gündoğan  0.291305 -0.113863  0.405168
