In [1]:
import os;
import pandas as pd
import numpy as np
import tqdm
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import features as fs
import labels as lab

In [2]:
datafolder = os.getcwd()+'\\spadl\\'
spadl_h5 = os.path.join(datafolder,"spadl.h5")
features_h5 = os.path.join(datafolder,"features.h5")
labels_h5 = os.path.join(datafolder,"labels.h5")
predictions_h5 = os.path.join(datafolder,"predictions.h5")

In [3]:
# Create a train and test set of games
games = pd.read_hdf(spadl_h5,"games")
traingames = games[:len(games)//2]
testgames = games[len(games)//2:]
print(len(traingames),len(testgames))

672 672


In [4]:
# Select shots from the data and all available info about these shots

def get_shots(games):
    shots = []
    with pd.HDFStore(spadl_h5) as spadlstore,\
         pd.HDFStore(features_h5) as featurestore:
        actiontypes = spadlstore["actiontypes"]
        for game_id in tqdm.tqdm(games.game_id,desc="selecting features"):
            ai = (spadlstore[f"actions/game_{game_id}"]
                  .merge(actiontypes,how="left")
                 )
            shot_idx = ai.type_name.str.contains("shot")
            Xi = featurestore[f"game_{game_id}"]
            shots.append(Xi[shot_idx])
    return pd.concat(shots)

train_shots = get_shots(traingames)
test_shots = get_shots(testgames)

selecting features: 100%|████████████████████████████████████████████████████████████| 672/672 [00:11<00:00, 58.04it/s]
selecting features: 100%|████████████████████████████████████████████████████████████| 672/672 [00:11<00:00, 56.05it/s]


In [6]:
xfns = [fs.actiontype_onehot,
       fs.bodypart_onehot,
       fs.goalscore,
       fs.startlocation,
       fs.movement,
       fs.space_delta,
       fs.startpolar,
       fs.team,
       fs.time,
       fs.time_delta,
       ]
nb_prev_actions = 2

f = fs.feature_column_names(xfns,nb_prev_actions)
f.remove("dx_a0")
f.remove("dy_a0")
f

['type_pass_a0',
 'type_cross_a0',
 'type_throw_in_a0',
 'type_freekick_crossed_a0',
 'type_freekick_short_a0',
 'type_corner_crossed_a0',
 'type_corner_short_a0',
 'type_take_on_a0',
 'type_foul_a0',
 'type_tackle_a0',
 'type_interception_a0',
 'type_shot_a0',
 'type_shot_penalty_a0',
 'type_shot_freekick_a0',
 'type_keeper_save_a0',
 'type_keeper_claim_a0',
 'type_keeper_punch_a0',
 'type_keeper_pick_up_a0',
 'type_clearance_a0',
 'type_bad_touch_a0',
 'type_non_action_a0',
 'type_dribble_a0',
 'type_goalkick_a0',
 'type_ball_recovery_a0',
 'type_pass_a1',
 'type_cross_a1',
 'type_throw_in_a1',
 'type_freekick_crossed_a1',
 'type_freekick_short_a1',
 'type_corner_crossed_a1',
 'type_corner_short_a1',
 'type_take_on_a1',
 'type_foul_a1',
 'type_tackle_a1',
 'type_interception_a1',
 'type_shot_a1',
 'type_shot_penalty_a1',
 'type_shot_freekick_a1',
 'type_keeper_save_a1',
 'type_keeper_claim_a1',
 'type_keeper_punch_a1',
 'type_keeper_pick_up_a1',
 'type_clearance_a1',
 'type_bad_touch

In [7]:
# Create features-matrix X and label-vector y.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss
from xgboost import XGBClassifier

def Xy(f,shots):
    return shots[f],shots.result_success_a0

In [8]:
# Logistic regression
X,y = Xy(f,train_shots)
model = LogisticRegression().fit(X,y)

X,y = Xy(f,test_shots)
pred = [p[1] for p in model.predict_proba(X)]

print("ROC AUC: %.3f" % roc_auc_score(y,pred))
print("Brier score: %.3f" % brier_score_loss(y,pred))
print("Log loss: %.3f" % log_loss(y,pred))

ROC AUC: 0.775
Brier score: 0.085
Log loss: 0.294


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
# XGBoost
X,y = Xy(f,train_shots)
model = XGBClassifier().fit(X,y)

X,y = Xy(f,test_shots)
pred = [p[1] for p in model.predict_proba(X)]

print("ROC AUC: %.3f" % roc_auc_score(y,pred))
print("Brier score: %.3f" % brier_score_loss(y,pred))
print("Log loss: %.3f" % log_loss(y,pred))

ROC AUC: 0.924
Brier score: 0.060
Log loss: 0.200


In [10]:
# Naive baseline, always predict class distribution
X,y = Xy(f,train_shots)
avgP = np.mean(y)

X,y = Xy(f,test_shots)
pred = [avgP for _i in y]

print("ROC AUC: %.3f" % roc_auc_score(y,pred))
print("Brier score: %.3f" % brier_score_loss(y,pred))
print("Log loss: %.3f" % log_loss(y,pred))

ROC AUC: 0.500
Brier score: 0.097
Log loss: 0.344
