In [1]:
import os
import warnings
import tqdm
import pandas as pd

import socceraction.spadl as spadl
import socceraction.vaep.formula as vaepformula

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
datafolder = "/home/msai/adnan002/repos/xB-360/data"
# create data folder if it does not exist
if not os.path.exists(datafolder):
    os.makedirs(datafolder)
    
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [3]:
with pd.HDFStore(spadl_h5) as spadlstore:
    games = (
        spadlstore["games"]
        .merge(spadlstore["competitions"], how='left')
        .merge(spadlstore["teams"].add_prefix('home_'), how='left')
        .merge(spadlstore["teams"].add_prefix('away_'), how='left'))
    players = spadlstore["players"]
    teams = spadlstore["teams"]
print("nb of games:", len(games))

nb of games: 245


In [4]:
A = []
for game in tqdm.tqdm(list(games.itertuples()), desc="Rating actions"):
    actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
    actions = (
        spadl.add_names(actions)
        .merge(players, how="left")
        .merge(teams, how="left")
        .sort_values(["game_id", "period_id", "action_id"])
        .reset_index(drop=True)
    )
    preds = pd.read_hdf(predictions_h5, f"game_{game.game_id}")
    values = vaepformula.value(actions, preds.scores, preds.concedes)
    A.append(pd.concat([actions, preds, values], axis=1))
A = pd.concat(A).sort_values(["game_id", "period_id", "time_seconds"]).reset_index(drop=True)
A.columns

Rating actions: 100%|██████████| 245/245 [00:11<00:00, 20.98it/s]


Index(['game_id', 'original_event_id', 'period_id', 'time_seconds', 'team_id',
       'player_id', 'start_x', 'start_y', 'end_x', 'end_y', 'type_id',
       'result_id', 'bodypart_id', 'action_id', 'type_name', 'result_name',
       'bodypart_name', 'player_name', 'nickname', 'team_name', 'scores',
       'concedes', 'offensive_value', 'defensive_value', 'vaep_value'],
      dtype='object')

In [5]:
# Load all_fouls_advanced.pkl
all_fouls_advanced = pd.read_pickle("all_fouls_advanced.pkl")

In [6]:
import numpy as np

all_fouls_advanced['vaep_value_offensive'] = np.nan


# Make original_event_id column in A dataframe as it's index
A = A.set_index('original_event_id')

# all_fouls_advanced['id'] has values from 'original_event_id' column in A dataframe. Fill the values of 'vaep_value_offensive' column in all_fouls_advanced dataframe with the values of 'offensive_value' column in A dataframe
for i in range(len(all_fouls_advanced)):
    all_fouls_advanced['vaep_value_offensive'][i] = A['offensive_value'][all_fouls_advanced['id'][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_fouls_advanced['vaep_value_offensive'][i] = A['offensive_value'][all_fouls_advanced['id'][i]]


In [7]:
print (len(all_fouls_advanced))
# Remove the rows in all_fouls_advanced and testing_fouls_advanced where vaep_value_offensive is NaN
all_fouls_advanced = all_fouls_advanced.dropna(subset=['vaep_value_offensive'])
print (len(all_fouls_advanced))

1684
1684


In [18]:

all_fouls_advanced.to_pickle("all_fouls_advanced_with_vaep.pkl")


In [19]:
all_fouls_advanced.head(1)

Unnamed: 0,foul_committed_advantage,foul_committed_offensive,foul_committed_penalty,foul_committed_card,foul_committed_counterpress,foul_committed_type,foul_won_advantage,foul_won_defensive,foul_won_penalty,id,...,team_id,timestamp,type,seconds_till_now,scoreline_till_now,distance_to_goal,angle_to_goal,foul_count_player_till_now,foul_count_team_till_now,vaep_value_offensive
0,,,,Yellow Card,,,,,,9221da02-9c87-4f98-a8ea-de9994ad562b,...,773,00:14:27.221,Foul Committed,867,"{'Serbia': 0, 'Switzerland': 0, 'Argentina': 0...",37.968408,1.282367,1,1,-0.000196
