In [1]:
import os
import warnings
import pandas as pd
pd.set_option('display.max_columns', None)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")
import tqdm

In [2]:
%load_ext autoreload
%autoreload 2
from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl

In [3]:
# Create the StatsBomb loader that fetches data remotely. Both credential
# fields are None, i.e. no account is supplied. NOTE(review): the warning
# message silenced in the first cell suggests this means open-data access
# only -- confirm if licensed data is expected.
SBL = StatsBombLoader(getter="remote", creds={"user": None, "passwd": None})

In [5]:
# load cmps.pkl as dataframe
# NOTE(review): unpickling can execute arbitrary code; only load cmps.pkl if
# it comes from a trusted source. The path is relative to the working directory.
cmps = pd.read_pickle('cmps.pkl')

# create competition_ids and season_ids pairs
# (one [competition_id, season_id] list per unique combination found in cmps)
cmps_ids = cmps[['competition_id', 'season_id']].drop_duplicates().values.tolist()

# show the pairs as the cell output
cmps_ids

[[9, 27],
 [16, 4],
 [16, 1],
 [16, 2],
 [16, 27],
 [16, 26],
 [16, 25],
 [16, 24],
 [16, 23],
 [16, 22],
 [16, 21],
 [16, 41],
 [16, 39],
 [16, 37],
 [16, 44],
 [16, 277],
 [16, 71],
 [16, 276],
 [87, 84],
 [87, 268],
 [87, 279],
 [1470, 274],
 [43, 106],
 [43, 3],
 [43, 55],
 [43, 54],
 [43, 51],
 [43, 272],
 [43, 270],
 [43, 269],
 [1238, 108],
 [11, 90],
 [11, 42],
 [11, 4],
 [11, 1],
 [11, 2],
 [11, 27],
 [11, 26],
 [11, 25],
 [11, 24],
 [11, 23],
 [11, 22],
 [11, 21],
 [11, 41],
 [11, 40],
 [11, 39],
 [11, 38],
 [11, 37],
 [11, 278],
 [81, 48],
 [81, 275],
 [7, 27],
 [116, 68],
 [2, 27],
 [2, 44],
 [12, 27],
 [12, 86],
 [55, 43],
 [35, 75]]

In [6]:
# Fetch the table of competitions exposed by the loader.
competitions2 = SBL.competitions()

# create competition_ids and season_ids pairs
cmps_ids = cmps[['competition_id', 'season_id']].drop_duplicates().values.tolist()

# Load selected competitions based on competition_ids and season_ids from cmps_ids.
# A set of tuples gives constant-time membership checks, and zipping the two id
# columns avoids a row-wise DataFrame.apply (which builds a Series for every row).
wanted_pairs = {tuple(pair) for pair in cmps_ids}
# Explicit boolean Series aligned on the original index, so the selection is
# always treated as a row mask (even when competitions2 has no rows).
is_selected = pd.Series(
    [
        pair in wanted_pairs
        for pair in zip(competitions2['competition_id'], competitions2['season_id'])
    ],
    index=competitions2.index,
    dtype=bool,
)
selected_competitions = competitions2[is_selected]

# Preview (at most 20 rows) as the cell output.
selected_competitions.head(20)

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
0,27,9,1. Bundesliga,Germany,male,2015/2016
1,4,16,Champions League,Europe,male,2018/2019
2,1,16,Champions League,Europe,male,2017/2018
3,2,16,Champions League,Europe,male,2016/2017
4,27,16,Champions League,Europe,male,2015/2016
5,26,16,Champions League,Europe,male,2014/2015
6,25,16,Champions League,Europe,male,2013/2014
7,24,16,Champions League,Europe,male,2012/2013
8,23,16,Champions League,Europe,male,2011/2012
9,22,16,Champions League,Europe,male,2010/2011


In [7]:
# Get games from all selected competitions:
# one SBL.games call per (competition_id, season_id) pair, stacked into one frame.
games = pd.concat([
    SBL.games(competition_id, season_id)
    for competition_id, season_id in zip(
        selected_competitions.competition_id,
        selected_competitions.season_id,
    )
])

In [8]:
# Preview the first two rows of the games table.
games.head(2)

Unnamed: 0,game_id,season_id,competition_id,competition_stage,game_day,game_date,home_team_id,away_team_id,home_score,away_score,venue,referee
0,3890561,27,9,Regular Season,34,2016-05-14 15:30:00,175,181,1,4,PreZero Arena,Felix Brych
1,3890505,27,9,Regular Season,28,2016-04-02 15:30:00,169,184,1,0,Allianz Arena,Florian Meyer


In [9]:
# Number of games that were loaded.
len(games)

2690

In [10]:
# For every game: fetch its teams, players and events through the loader and
# convert the events into SPADL actions, stored in `actions` under the game id.
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams_per_game = []
players_per_game = []
actions = {}
for game in games_verbose:
    # load data
    teams_per_game.append(SBL.teams(game.game_id))
    players_per_game.append(SBL.players(game.game_id))
    events = SBL.events(game.game_id)
    # convert data
    actions[game.game_id] = spadl.statsbomb.convert_to_actions(events, game.home_team_id)

# Stack the per-game tables; teams keep a single row per team_id, players keep
# every per-game row.
teams = pd.concat(teams_per_game).drop_duplicates(subset="team_id")
players = pd.concat(players_per_game)

Loading game data:   0%|          | 0/2690 [00:00<?, ?it/s]

Loading game data: 100%|██████████| 2690/2690 [57:37<00:00,  1.29s/it]


In [11]:
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
datafolder = "/home/msai/adnan002/repos/xB-360/data"

# Create data folder if it doesn't exist. makedirs also creates any missing
# parent directories (os.mkdir would raise FileNotFoundError), and exist_ok
# avoids a failure if the folder appears between the check and the call.
if not os.path.exists(datafolder):
    os.makedirs(datafolder, exist_ok=True)
    print(f"Directory {datafolder} created.")

spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")

# Store all spadl data in h5-file
with pd.HDFStore(spadl_h5) as spadlstore:
    spadlstore["competitions"] = selected_competitions
    spadlstore["games"] = games
    spadlstore["teams"] = teams
    # One row per player (identity columns only) ...
    spadlstore["players"] = players[['player_id', 'player_name', 'nickname']].drop_duplicates(subset='player_id')
    # ... and the per-game appearance columns in a separate table.
    spadlstore["player_games"] = players[['player_id', 'game_id', 'team_id', 'is_starter', 'starting_position_id', 'starting_position_name', 'minutes_played']]
    # One key per game holding that game's actions.
    for game_id, game_actions in actions.items():
        spadlstore[f"actions/game_{game_id}"] = game_actions

In [12]:
# Paths of the HDF5 files used for the per-game features and labels.
features_h5 = os.path.join(datafolder, "features.h5")
labels_h5 = os.path.join(datafolder, "labels.h5")

In [13]:
# Reload the games table from the SPADL store (key "games") and report its size.
games = pd.read_hdf(spadl_h5, "games")
print("nb of games:", len(games))

nb of games: 2690


In [14]:
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

# Feature generators; each is called on the game states of one game and the
# results are concatenated column-wise into that game's feature table.
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    fs.time,
    fs.time_delta
]

for game in tqdm.tqdm(list(games.itertuples()), desc=f"Generating and storing features in {features_h5}"):
    # Use a dedicated name for the per-game frame so the global `actions`
    # mapping (game id -> actions) built while loading is not overwritten.
    game_actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
    # NOTE(review): 3 is presumably the number of consecutive actions per game
    # state -- confirm against the socceraction documentation.
    gamestates = fs.gamestates(spadl.add_names(game_actions), 3)
    gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    # `key` is passed by keyword: positional use is deprecated in recent
    # pandas releases and no longer accepted once it becomes keyword-only.
    X.to_hdf(features_h5, key=f"game_{game.game_id}")

Generating and storing features in /home/msai/adnan002/repos/xB-360/data/features.h5: 100%|██████████| 2690/2690 [41:17<00:00,  1.09it/s]


In [15]:
# Label generators; each is called on one game's named actions and the
# results are concatenated column-wise into that game's label table.
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

for game in tqdm.tqdm(list(games.itertuples()), desc=f"Computing and storing labels in {labels_h5}"):
    # Add the name columns once per game instead of once per label function,
    # under a dedicated name so the global `actions` mapping is not overwritten.
    game_actions = spadl.add_names(pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}"))
    Y = pd.concat([fn(game_actions) for fn in yfns], axis=1)
    # `key` is passed by keyword: positional use is deprecated in recent
    # pandas releases and no longer accepted once it becomes keyword-only.
    Y.to_hdf(labels_h5, key=f"game_{game.game_id}")

Computing and storing labels in /home/msai/adnan002/repos/xB-360/data/labels.h5: 100%|██████████| 2690/2690 [28:14<00:00,  1.59it/s]


In [16]:
# Path of the HDF5 file used for the per-game predictions.
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [17]:
# NOTE(review): both names are bound to the same games table, so there is no
# held-out set at this point.
traingames = games
testgames = games

# 1. Select feature set X
# Commented-out entries are generators deliberately left out of the model input.
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    #fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    #fs.time,
    fs.time_delta,
    #fs.actiontype_result_onehot
]
# Value passed to feature_column_names below; the stored features were
# generated with 3 as the gamestates argument, so this selects a subset.
nb_prev_actions = 1

# Column names to pick out of the stored per-game feature tables.
Xcols = fs.feature_column_names(xfns, nb_prev_actions)

def getXY(games,Xcols):
    """Assemble the feature and label matrices for a set of games.

    For every id in ``games.game_id`` the table stored under ``game_<id>`` is
    read from ``features_h5`` (keeping only ``Xcols``) and from ``labels_h5``
    (keeping only the "scores" and "concedes" columns). The per-game tables
    are stacked in iteration order and given a fresh 0..n-1 index.

    Parameters
    ----------
    games : object with a ``game_id`` iterable (e.g. the games DataFrame)
    Xcols : list of feature column names to keep

    Returns
    -------
    (X, Y) : tuple of DataFrames with the selected features and labels
    """
    label_columns = ["scores","concedes"]

    # 1. features: one table per game, restricted to the requested columns
    feature_frames = [
        pd.read_hdf(features_h5, f"game_{game_id}")[Xcols]
        for game_id in tqdm.tqdm(games.game_id, desc="Selecting features")
    ]
    X = pd.concat(feature_frames).reset_index(drop=True)

    # 2. labels: same games, same order
    label_frames = [
        pd.read_hdf(labels_h5, f"game_{game_id}")[label_columns]
        for game_id in tqdm.tqdm(games.game_id, desc="Selecting label")
    ]
    Y = pd.concat(label_frames).reset_index(drop=True)

    return X, Y

# Build the training matrices from the stored per-game tables and list their columns.
X, Y = getXY(traingames,Xcols)
print("X:", list(X.columns))
print("Y:", list(Y.columns))

Selecting features:   0%|          | 0/2690 [00:00<?, ?it/s]

Selecting features: 100%|██████████| 2690/2690 [00:48<00:00, 54.97it/s]
Selecting label: 100%|██████████| 2690/2690 [00:18<00:00, 149.06it/s]


X: ['type_id_a0', 'type_pass_a0', 'type_cross_a0', 'type_throw_in_a0', 'type_freekick_crossed_a0', 'type_freekick_short_a0', 'type_corner_crossed_a0', 'type_corner_short_a0', 'type_take_on_a0', 'type_foul_a0', 'type_tackle_a0', 'type_interception_a0', 'type_shot_a0', 'type_shot_penalty_a0', 'type_shot_freekick_a0', 'type_keeper_save_a0', 'type_keeper_claim_a0', 'type_keeper_punch_a0', 'type_keeper_pick_up_a0', 'type_clearance_a0', 'type_bad_touch_a0', 'type_non_action_a0', 'type_dribble_a0', 'type_goalkick_a0', 'bodypart_foot_a0', 'bodypart_head_a0', 'bodypart_other_a0', 'bodypart_head/other_a0', 'result_id_a0', 'result_fail_a0', 'result_success_a0', 'result_offside_a0', 'result_owngoal_a0', 'result_yellow_card_a0', 'result_red_card_a0', 'goalscore_team', 'goalscore_opponent', 'goalscore_diff', 'start_x_a0', 'start_y_a0', 'end_x_a0', 'end_y_a0', 'dx_a0', 'dy_a0', 'movement_a0', 'start_dist_to_goal_a0', 'start_angle_to_goal_a0', 'end_dist_to_goal_a0', 'end_angle_to_goal_a0']
Y: ['scores

In [18]:
# 3. train classifiers F(X) = Y
import xgboost

# Y_hat is created empty here; this cell does not fill it.
Y_hat = pd.DataFrame()
# One classifier per label column, fitted independently and keyed by column name.
models = {}
for col in list(Y.columns):
    model = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1)
    model.fit(X, Y[col])
    models[col] = model

In [19]:
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss

# NOTE(review): the evaluation data is the same X, Y the models were fitted
# on, so metrics computed from testX/testY are in-sample.
testX, testY = X, Y

def evaluate(y, y_hat):
    """Print Brier score, log loss and ROC AUC of ``y_hat`` against ``y``.

    The Brier score and the log loss are each followed, in parentheses, by
    their ratio to the same metric for a constant baseline that predicts the
    mean of ``y`` for every sample.
    """
    base_rate = sum(y) / len(y)
    baseline = [base_rate] * len(y)

    brier = brier_score_loss(y, y_hat)
    brier_ratio = brier / brier_score_loss(y, baseline)
    print(f"  Brier score: {brier:.5f} ({brier_ratio:.5f})")

    ll = log_loss(y, y_hat)
    ll_ratio = ll / log_loss(y, baseline)
    print(f"  log loss score: {ll:.5f} ({ll_ratio:.5f})")

    auc = roc_auc_score(y, y_hat)
    print(f"  ROC AUC: {auc:.5f}")

# For each label: predict with the matching model, store the result in Y_hat
# under the label's name, and print the evaluation for that label.
for col in testY.columns:
    # Keep the second entry of each predict_proba row (presumably the
    # probability of the positive class -- confirm against the class order).
    Y_hat[col] = [p[1] for p in models[col].predict_proba(testX)]
    print(f"### Y: {col} ###")
    evaluate(testY[col], Y_hat[col])


### Y: scores ###
  Brier score: 0.00950 (0.84705)
  log loss score: 0.04845 (0.78021)
  ROC AUC: 0.81999
### Y: concedes ###
  Brier score: 0.00214 (0.94734)
  log loss score: 0.01349 (0.84069)
  ROC AUC: 0.80793


In [20]:
# get rows with game id per action
A = []
for game_id in tqdm.tqdm(games.game_id, "Loading game ids"):
    Ai = pd.read_hdf(spadl_h5, f"actions/game_{game_id}")
    A.append(Ai[["game_id"]])
A = pd.concat(A)
A = A.reset_index(drop=True)

# The column-wise concat below aligns on the index, so a row-count mismatch
# would silently pair predictions with the wrong game or introduce NaNs.
if len(A) != len(Y_hat):
    raise ValueError(
        f"Row count mismatch: {len(A)} actions vs {len(Y_hat)} predictions"
    )

# concatenate action game id rows with predictions and save per game
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
for k, df in tqdm.tqdm(grouped_predictions, desc="Saving predictions per game"):
    df = df.reset_index(drop=True)
    # `key` is passed by keyword: positional use is deprecated in recent
    # pandas releases and no longer accepted once it becomes keyword-only.
    df[Y_hat.columns].to_hdf(predictions_h5, key=f"game_{int(k)}")

Loading game ids: 100%|██████████| 2690/2690 [01:12<00:00, 37.30it/s]
Saving predictions per game: 100%|██████████| 2690/2690 [21:57<00:00,  2.04it/s]


In [21]:
# show spadl_h5
# Prints every key in the store; with one actions key per game this output is
# very long.
with pd.HDFStore(spadl_h5) as spadlstore:
    print(spadlstore.keys())

# show features_h5
# Prints every key in the feature store (one key is written per game).
with pd.HDFStore(features_h5) as featuresstore:
    print(featuresstore.keys())

['/competitions', '/games', '/player_games', '/players', '/teams', '/actions/game_15946', '/actions/game_15956', '/actions/game_15973', '/actions/game_15978', '/actions/game_15986', '/actions/game_15998', '/actions/game_16010', '/actions/game_16023', '/actions/game_16029', '/actions/game_16056', '/actions/game_16073', '/actions/game_16079', '/actions/game_16086', '/actions/game_16095', '/actions/game_16109', '/actions/game_16120', '/actions/game_16131', '/actions/game_16136', '/actions/game_16149', '/actions/game_16157', '/actions/game_16173', '/actions/game_16182', '/actions/game_16190', '/actions/game_16196', '/actions/game_16205', '/actions/game_16215', '/actions/game_16231', '/actions/game_16240', '/actions/game_16248', '/actions/game_16265', '/actions/game_16275', '/actions/game_16289', '/actions/game_16306', '/actions/game_16317', '/actions/game_18235', '/actions/game_18236', '/actions/game_18237', '/actions/game_18240', '/actions/game_18241', '/actions/game_18242', '/actions/gam