In [1]:
import pandas as pd
import numpy as np

from group_lasso import GroupLasso

from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, OneHotEncoder

from tqdm import tqdm

## Build models

In [3]:
# Load the modelling table; the first CSV column becomes the row index.
df = pd.read_csv('../data/final/player-model-data.csv', index_col=0)

# The column layout is positional: the first 5 columns are skipped, the last
# 10 are the regression targets, and everything in between is a predictor.
all_cols = df.columns.tolist()
predictor_cols = all_cols[5:-10]
target_cols = all_cols[-10:]

In [4]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across kernel restarts.
features = df[predictor_cols]
targets = df[target_cols]

X_train, X_test, Y_train, Y_test = train_test_split(
    features, targets, test_size=0.2, random_state=1897,
)

In [5]:
# The first four predictors are one-hot encoded; everything from index 5 on is
# power-transformed.  predictor_cols[4] is in neither list, so the
# ColumnTransformer (remainder='passthrough') forwards it untransformed as the
# last output column.
# NOTE(review): presumably that column is Age, matching the trailing -1 entry
# in `groups` -- confirm against the CSV header.
onehot_cols, numeric_cols = predictor_cols[:4], predictor_cols[5:]

In [6]:
# Group id for each column handed to GroupLasso, written as
# (group id, number of consecutive columns) and expanded to one id per column.
# NOTE(review): the widths must line up with the column order produced by the
# pipeline's ColumnTransformer -- re-check them whenever the predictors change.
groups = [
    group_id
    for group_id, width in (
        (0, 6),
        (1, 5),
        (2, 5),
        (3, 3),
        (4, 1),
        (5, 17),
        (6, 16),
        (7, 22),
        (8, 23),
        (9, 23),
        (10, 24),
        (11, 3),
        (12, 21),
        (-1, 1),  # Don't regularize Age
    )
    for _ in range(width)
]

In [84]:
# Seven log-spaced regularisation strengths: 5e-6, 5e-5, ..., 5.
reg = 5 * np.logspace(start=-6, stop=0, num=7)

# Both penalties of the 'group_lasso_regression' pipeline step are searched
# over the same values (a 7 x 7 grid).
param_grid = dict.fromkeys(
    (
        'group_lasso_regression__group_reg',
        'group_lasso_regression__l1_reg',
    ),
    reg,
)

In [85]:
# Fit one cross-validated group-lasso pipeline per target column and collect
# the fitted searches in `models`, keyed by the bare stat name.

# Target columns carry this suffix; the scoring cell looks them up again as
# f'{stat}_tgt'.
TARGET_SUFFIX = '_tgt'

# Preprocessing: one-hot encode the categorical predictors, power-transform
# the numeric ones, and pass any remaining column through unchanged (appended
# after the transformed columns).
# NOTE(review): `groups` has a fixed length, so this presumably relies on every
# CV training fold containing every category of each one-hot column -- confirm.
col_tf = ColumnTransformer([
    ('one_hot_encoder', OneHotEncoder(), onehot_cols),
    ('power_transformer', PowerTransformer(), numeric_cols)
], remainder='passthrough')

# Unfitted pipeline template.  GridSearchCV clones the estimator it is given
# for every candidate fit and for the final refit, so a single template can be
# shared by all targets instead of being rebuilt inside the loop.
pl = Pipeline([
    ('column_transformer', col_tf),
    ('group_lasso_regression', GroupLasso(
        groups=groups,
        n_iter=10_000,
        tol=1e-3,
        scale_reg=None,
        frobenius_lipschitz=False,
        fit_intercept=True,
        random_state=1897,
        supress_warning=True,  # spelling as passed in the original call
    ))
])

models = {}

# TODO(review): the recorded run took ~45 min for 10 targets; consider passing
# n_jobs to GridSearchCV if the fits can run in parallel on your machine.
for target_col in tqdm(target_cols):
    y_train = Y_train[target_col]

    grid_cv = GridSearchCV(pl, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_cv.fit(X_train, y_train)

    # Drop the trailing suffix so the key is the stat name (e.g. 'Gls').
    models[target_col[:-len(TARGET_SUFFIX)]] = grid_cv

100%|██████████| 10/10 [45:25<00:00, 272.58s/it]


## Model Results

In [119]:
# Show the regularisation strengths selected by cross-validation per target.
for stat_name, search in models.items():
    print(f'{stat_name:14s}', search.best_params_)

Gls            {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 4.9999999999999996e-06}
G-xG           {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 0.0005}
SCA            {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 0.0005}
Ast            {'group_lasso_regression__group_reg': 5e-05, 'group_lasso_regression__l1_reg': 0.0005}
xA             {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 4.9999999999999996e-06}
TklW           {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 0.0005}
Int            {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 5e-05}
PressuresSucc  {'group_lasso_regression__group_reg': 5e-05, 'group_lasso_regression__l1_reg': 0.0005}
Clr            {'group_lasso_regression__group_reg': 5e-05, 'group_lasso_regression__l1_reg': 0.0005}
AerialDuelsWon {'group_lasso_regression__grou

In [132]:
# Inspect which transformed features the best model for one stat reports in
# its sparsity mask.
stat = 'SCA'

best_pipeline = models[stat].best_estimator_
feature_names = best_pipeline['column_transformer'].get_feature_names_out()
sparsity_mask = best_pipeline['group_lasso_regression'].sparsity_mask_

# One line per feature: the name padded to 51 characters, then its mask value.
for feature, mask in zip(feature_names, sparsity_mask):
    print(f'{feature:51s}', mask)

one_hot_encoder__prev_season_squad_play_style_0     True
one_hot_encoder__prev_season_squad_play_style_1     False
one_hot_encoder__prev_season_squad_play_style_2     True
one_hot_encoder__prev_season_squad_play_style_3     True
one_hot_encoder__prev_season_squad_play_style_4     True
one_hot_encoder__prev_season_squad_play_style_5     True
one_hot_encoder__prev_season_league_Bundesliga      True
one_hot_encoder__prev_season_league_La Liga         True
one_hot_encoder__prev_season_league_Ligue 1         True
one_hot_encoder__prev_season_league_Premier League  True
one_hot_encoder__prev_season_league_Serie A         True
one_hot_encoder__League_Bundesliga                  True
one_hot_encoder__League_La Liga                     True
one_hot_encoder__League_Ligue 1                     True
one_hot_encoder__League_Premier League              True
one_hot_encoder__League_Serie A                     True
one_hot_encoder__Pos_DF                             True
one_hot_encoder__Pos_FW       

In [139]:
# Held-out mean squared error for each target, using the fitted search's
# predict() on the test split.
for m, search in models.items():
    y_test = Y_test[f'{m}_tgt']
    y_pred = search.predict(X_test)

    mse_test = mean_squared_error(y_test, y_pred)
    print(f'{m:14s}', f'{mse_test:5.4f}')

Gls            0.0125
G-xG           0.0061
SCA            0.3230
Ast            0.0067
xA             0.0024
TklW           0.1104
Int            0.1241
PressuresSucc  0.8217
Clr            0.4467
AerialDuelsWon 0.3938


## 2021-22 Predictions

In [90]:
# Load the 2021-22 table and discard its last 10 columns.
# NOTE(review): presumably these are the (not yet known) target columns, as in
# the training table -- confirm against the CSV header.
df_2122 = (
    pd.read_csv('../data/final/player-model-data_2021-2022.csv', index_col=0)
    .iloc[:, :-10]
)

# Everything after the first 5 columns is fed to the models.
X_2122 = df_2122.iloc[:, 5:]

In [91]:
# One prediction column per stat, aligned on the 2021-22 row index.
preds_2122 = pd.concat(
    [
        pd.Series(models[stat].predict(X_2122), index=X_2122.index, name=stat)
        for stat in models
    ],
    axis=1,
)

In [92]:
# Min-max rescale every prediction column to the range 0-100.
# A column whose predictions are all identical has max == min, so it becomes
# NaN (0 / 0) rather than raising.
col_min = preds_2122.min()
col_range = preds_2122.max() - col_min

preds_2122_scaled = (preds_2122 - col_min) / col_range * 100

In [93]:
# Prepend the first 5 columns of the 2021-22 table to both prediction frames
# (joined on the row index).
# NOTE(review): presumably these are the player identifier columns -- confirm.
id_cols = df_2122.iloc[:, :5]

# Select only the stat columns before joining.  This cell rebinds
# `preds_2122` / `preds_2122_scaled`, so without the selection a second run
# would join frames that already contain the identifier columns and
# DataFrame.join would raise on the overlapping names.
stat_cols = list(models)

preds_2122 = id_cols.join(preds_2122[stat_cols])
preds_2122_scaled = id_cols.join(preds_2122_scaled[stat_cols])

In [94]:
# Write the raw and the 0-100 scaled predictions to CSV.  index=False means
# the row index is not written.
# NOTE(review): confirm the index carries nothing needed downstream.
for frame, path in (
    (preds_2122, '../data/output/predictions_2021-2022.csv'),
    (preds_2122_scaled, '../data/output/predictions-scaled_2021-2022.csv'),
):
    frame.to_csv(path, index=False)