In [1]:
import pandas as pd
import numpy as np

from group_lasso import GroupLasso

from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, OneHotEncoder

from tqdm import tqdm

## Build models

In [5]:
df = pd.read_csv('../data/final/player-model-data.csv', index_col=0)

predictor_cols = df.columns[9:-10].tolist()
target_cols = df.columns[-10:].tolist()

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(
    df.loc[:, predictor_cols],
    df.loc[:, target_cols],
    test_size=0.2,
    random_state=1897,
)

In [9]:
onehot_cols = predictor_cols[:1]
numeric_cols = predictor_cols[1:]

In [12]:
groups = (
    [1] * 3  # Position group
    + [2]  # 90s
    + [3] * 17  # Shooting 
    + [4] * 16  # Goal / shot creation
    + [5] * 22  # Passing
    + [6] * 23  # Pass types
    + [7] * 23  # Defense
    + [8] * 24  # Possession
    + [9] * 3  # Miscellaneous
    + [10] * 21  # Playing time
    + [-1]  # Don't regularize Age
)

In [13]:
reg = np.logspace(-6, 0, 7) * 5

param_grid = {
    'group_lasso_regression__group_reg': reg,
    'group_lasso_regression__l1_reg': reg,
}

In [14]:
models = {}

for target_col in tqdm(target_cols):
    y_train = Y_train[target_col]
    y_test = Y_test[target_col]

    col_tf = ColumnTransformer([
        ('one_hot_encoder', OneHotEncoder(), onehot_cols),
        ('power_transformer', PowerTransformer(), numeric_cols)
    ], remainder='passthrough')

    pl = Pipeline([
        ('column_transformer', col_tf),
        ('group_lasso_regression', GroupLasso(
            groups=groups,
            n_iter=10_000,
            tol=1e-3,
            scale_reg=None,
            frobenius_lipschitz=False,
            fit_intercept=True,
            random_state=1897,
            supress_warning=True,
        ))
    ])

    grid_cv = GridSearchCV(pl, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_cv.fit(X_train, y_train)

    models[target_col[:-4]] = grid_cv

100%|██████████| 10/10 [56:25<00:00, 338.60s/it]


## Model Results

In [15]:
for m in models:
    print(f'{m:14s}', models[m].best_params_)

Gls            {'group_lasso_regression__group_reg': 4.9999999999999996e-06, 'group_lasso_regression__l1_reg': 5e-05}
G-xG           {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 0.0005}
SCA            {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 0.0005}
Ast            {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 5e-05}
xA             {'group_lasso_regression__group_reg': 0.0005, 'group_lasso_regression__l1_reg': 4.9999999999999996e-06}
TklW           {'group_lasso_regression__group_reg': 5e-05, 'group_lasso_regression__l1_reg': 0.0005}
Int            {'group_lasso_regression__group_reg': 4.9999999999999996e-06, 'group_lasso_regression__l1_reg': 0.0005}
PressuresSucc  {'group_lasso_regression__group_reg': 0.005, 'group_lasso_regression__l1_reg': 4.9999999999999996e-06}
Clr            {'group_lasso_regression__group_reg': 5e-05, 'group_lasso_regression__l1_reg': 0.0005}
AerialDuelsWon

In [16]:
stat = 'Ast'

feature_names = models[stat].best_estimator_['column_transformer'].get_feature_names_out()
sparsity_mask = models[stat].best_estimator_['group_lasso_regression'].sparsity_mask_

for feature, mask in zip(feature_names, sparsity_mask):
    print(f'{feature:51s}', mask)

one_hot_encoder__Pos_DF                             False
one_hot_encoder__Pos_FW                             False
one_hot_encoder__Pos_MF                             False
power_transformer__Age                              False
power_transformer__90s                              True
power_transformer__Gls_shoot                        True
power_transformer__Sh_shoot                         True
power_transformer__SoT_shoot                        True
power_transformer__SoT%_shoot                       True
power_transformer__Sh/90_shoot                      True
power_transformer__SoT/90_shoot                     True
power_transformer__G/Sh_shoot                       True
power_transformer__G/SoT_shoot                      True
power_transformer__Dist_shoot                       True
power_transformer__FK_shoot                         True
power_transformer__PK_shoot                         True
power_transformer__PKatt_shoot                      True
power_transformer__xG_shoot

In [17]:
for m in models:
    y_pred = models[m].predict(X_test)
    y_test = Y_test[f'{m}_tgt']

    mse_test = mean_squared_error(y_test, y_pred)
    print(f'{m:14s}', f'{mse_test:5.4f}')

Gls            0.0135
G-xG           0.0061
SCA            0.3628
Ast            0.0063
xA             0.0028
TklW           0.1185
Int            0.1197
PressuresSucc  0.8643
Clr            0.4983
AerialDuelsWon 0.3882
