In [1]:
import os
import sys

# Make the directory above the kernel's working directory importable.
# NOTE(review): this relies on the kernel being started from a direct
# subdirectory of the project root (the recorded output shows `.../baseline`);
# presumably that is where the `preprocessing` module imported later lives.

# Working directory of the running kernel
current_dir = os.getcwd()

# One level up from the working directory
root_dir = os.path.dirname(current_dir)

# Guarded append: re-running this cell must not stack duplicate entries
# onto sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

print(f"Current directory: {current_dir}")
print(f"Added to path: {root_dir}")

Current directory: /Users/arshsingh/Documents/Projects/soccer-value-added/baseline
Added to path: /Users/arshsingh/Documents/Projects/soccer-value-added


In [2]:
import pandas as pd 
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, GridSearchCV

import warnings
warnings.filterwarnings('ignore')

from preprocessing import preprocess_rapm_data

In [3]:
# All inputs are read from <root_dir>/data.
data_dir = os.path.join(root_dir, 'data')

# Keyword argument -> file name for each input that preprocess_rapm_data reads.
input_files = {
    'rapm_path': 'rapm_input.pkl',
    'gk_path': 'top5_standard_24.csv',
    'player_mins_path': 'playermins.1724.csv',
}
input_paths = {
    kwarg: os.path.join(data_dir, file_name)
    for kwarg, file_name in input_files.items()
}

# Per the log printed by this call, the threshold is in minutes and players
# below it are treated as replacement level and their columns dropped.
df = preprocess_rapm_data(
    **input_paths,
    replacement_threshold=2000,
    include_xg_per_90=True,
)

Loaded RAPM data with shape: (299332, 13108)
Loaded goalkeeper data with shape: (22370, 39)
Loaded player minutes data with shape: (7149, 4)
Identified 540 goalkeeper players to remove
Dropping 992 goalkeeper columns
Found 3638 replacement-level players (threshold: 2000 minutes)
Dropping 5684 replacement player columns
Added xG_per_90 feature
Final preprocessed dataframe shape: (299332, 6435)


In [4]:
# Fit a ridge regression of xG_per_90 on the per-player offense/defense
# columns, then reshape the coefficients into one row per player.

# --- Design matrix and target ------------------------------------------------
# Feature columns are those named "<player>_offense" or "<player>_defense".
feature_cols = [col for col in df.columns if col.endswith(('_offense', '_defense'))]

# `.values` materialises the selected columns as a dense array before the
# conversion to CSR.
X = csr_matrix(df[feature_cols].values)
y = df['xG_per_90'].values

# Keep only rows whose target is finite (drops NaN and +/-inf).
mask = np.isfinite(y)

X_clean = X[mask]
y_clean = y[mask]

# --- Model -------------------------------------------------------------------
# Regularisation strength, solver and tolerances are fixed by hand here.
# NOTE(review): GridSearchCV / KFold / r2_score are imported at the top of the
# notebook but are not used in the cells visible here.
model = Ridge(alpha=2000, solver='sparse_cg', max_iter=10000, tol=1e-3)
model.fit(X_clean, y_clean)

# --- Coefficients -> per-player table -----------------------------------------
coef_series = pd.Series(model.coef_, index=feature_cols)

coef_df = coef_series.reset_index()
coef_df.columns = ['feature', 'coefficient']

# Split on the LAST underscore only, so player names that themselves contain
# underscores stay intact.
coef_df[['player_name', 'offense_or_defense']] = (
    coef_df['feature'].str.rsplit(pat='_', n=1, expand=True)
)

coef_df = coef_df[['player_name', 'offense_or_defense', 'coefficient']]

wide = (
    coef_df.pivot(
        index='player_name',
        columns='offense_or_defense',
        values='coefficient'
    )
    # Guarantee both columns exist even if one side is absent from the data;
    # a missing side then becomes NaN instead of raising KeyError below.
    .reindex(columns=['offense', 'defense'])
    # A player with only one of the two features gets 0 for the other.
    .fillna(0)
    # Drop the leftover columns-axis name ('offense_or_defense') so it does
    # not appear as a spurious header label when the table is printed.
    .rename_axis(columns=None)
    .reset_index()
)

# Net rating is offense minus defense.
# NOTE(review): presumably a lower defensive coefficient means less xG
# conceded, hence the subtraction — confirm against the preprocessing step.
wide['net'] = wide['offense'] - wide['defense']

wide = wide[['player_name', 'offense', 'defense', 'net']]

# Top 10 players by net rating.
print(wide.sort_values('net', ascending=False).head(10))

offense_or_defense         player_name   offense   defense       net
2588                Robert Lewandowski  0.361335 -0.142414  0.503748
1684                     Kylian Mbappé  0.426097 -0.037718  0.463815
3033                   Virgil van Dijk  0.311477 -0.147900  0.459377
2759                      Serge Gnabry  0.391004 -0.051533  0.442537
205                   Andrew Robertson  0.214139 -0.220923  0.435062
1736                        Leroy Sané  0.276371 -0.156439  0.432811
1044                      Gerard Piqué  0.312873 -0.114306  0.427178
1485                    Joshua Kimmich  0.250162 -0.168708  0.418870
2170                     Mohamed Salah  0.245528 -0.162247  0.407774
3206                    İlkay Gündoğan  0.291305 -0.113863  0.405168


In [5]:
# Persist the per-player table. The path is relative, so the file is written
# to the kernel's current working directory.
# NOTE(review): the row index is written as an unnamed first column
# (index=True is the pandas default, spelled out here); pass index=False
# if downstream readers do not need it.
wide.to_csv('baseline_model.csv', index=True)