In [1]:
# pandas, configured to display every column of a DataFrame (no column truncation)
import pandas as pd
pd.set_option('display.max_columns', None)

# Filesystem helpers (checking/creating directories, building paths)
import os

# Silence two specific warnings: pandas' PerformanceWarning category, and any
# warning whose message starts with the text below (presumably emitted by
# statsbombpy when no API credentials are configured — TODO confirm)
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")

# Progress bar for long-running loops
import tqdm

# Load IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before code is executed
%load_ext autoreload
%autoreload 2

# Socceraction modules: the StatsBomb data loader, the SPADL action
# representation, and the expected-threat (xT) model
from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl
import socceraction.xthreat as xthreat

# Statsbombpy, used to load the StatsBomb 360 frames
from statsbombpy import sb

In [2]:
# Instantiate the StatsBomb loader with its default settings (no arguments passed)
SBL = StatsBombLoader()

# Uncomment the next line to list all available competitions:
# SBL.competitions()

In [3]:
# Euro 2020 is identified by competition_id=55 and season_id=43

# Load the table of all Euro 2020 games
games = SBL.games(competition_id=55, season_id=43)

In [5]:
# Load every Euro 2020 game (teams, players, events, StatsBomb 360 frames),
# convert the events to SPADL actions enriched with an xT value, and store
# everything in a single HDF5 file.

# Choose the folder's path. The folder is prepared *before* the slow download
# loop so that an unusable path fails immediately instead of after every game
# has already been fetched.
datafolder = "../statsbomb"

# Create data folder (including any missing parent folders) if it doesn't exist
if not os.path.isdir(datafolder):
    os.makedirs(datafolder, exist_ok=True)
    print(f"Directory {datafolder} created.")

euro2020 = os.path.join(datafolder, "euro2020.h5")

# Load the xT grid once: it is identical for every game, so fetching it inside
# the loop would repeat the same network request for each match.
url_grid = "https://karun.in/blog/data/open_xt_12x8_v1.json"
xT_model = xthreat.load_model(url_grid)

# Create progress bar and iterator
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")

# Lists and dictionaries to store information
# (teams/players: one DataFrame per game; events/actions/frames: keyed by game_id)
teams, players = [], []
events, actions, frames = {}, {}, {}

# Iterate over games
for game in games_verbose:
    game_id = game.game_id

    # Load data for this match
    teams.append(SBL.teams(game_id))
    players.append(SBL.players(game_id))
    game_events = SBL.events(game_id)
    game_frames = sb.frames(match_id=game_id, fmt='dataframe')

    # Convert data to spadl and attach the human-readable name columns
    game_actions = spadl.statsbomb.convert_to_actions(game_events, game.home_team_id)
    game_actions = spadl.add_names(game_actions)

    # Add xT value to each action: the model rates the actions after
    # play_left_to_right has been applied; values are rounded to 3 decimals
    # and assigned positionally (one value per action row).
    df_actions_ltr = spadl.play_left_to_right(game_actions, game.home_team_id)
    xT_list = [round(num, 3) for num in xT_model.rate(df_actions_ltr)]
    game_actions['xT'] = xT_list

    # Add supplementary information to each action that isn't included in the SPADL format.
    # NOTE(review): this is pandas' default inner merge, so any action whose
    # original_event_id has no matching event_id (e.g. actions synthesised
    # during the SPADL conversion) is dropped here — confirm this is intended.
    df_additional = game_events[['event_id', 'under_pressure', 'duration']]
    game_actions = (
        game_actions
        .merge(df_additional, left_on='original_event_id', right_on='event_id')
        .drop(columns='event_id')
    )

    # Split location in the SB360 dataframe and convert to a 105 m x 68 m pitch.
    # Rows with an empty/missing location fall back to 1 on both axes; x is
    # clipped to [1, 120] and y to [1, 80] before rescaling, and the y axis is
    # flipped (68 - ...).
    game_frames['loc_x'] = game_frames.location.apply(lambda x: x[0] if x else 1).clip(1, 120)
    game_frames['loc_y'] = game_frames.location.apply(lambda x: x[1] if x else 1).clip(1, 80)
    game_frames['loc_x'] = ((game_frames['loc_x'] - 1) / 119) * 105
    game_frames['loc_y'] = 68 - ((game_frames['loc_y'] - 1) / 79) * 68

    events[game_id] = game_events
    actions[game_id] = game_actions
    frames[game_id] = game_frames

# Create teams and players dataframe (teams de-duplicated; players keep one row per game)
teams = pd.concat(teams).drop_duplicates(subset="team_id")
players = pd.concat(players)

# Store all data in h5-file
with pd.HDFStore(euro2020) as spadlstore:
    spadlstore["games"] = games
    spadlstore["teams"] = teams
    # One row per player (id + name) ...
    spadlstore["players"] = players[['player_id', 'player_name']].drop_duplicates(subset='player_id')
    # ... and one row per player appearance in a game
    spadlstore["player_games"] = players[['player_id', 'game_id', 'team_id', 'is_starter', 'minutes_played','starting_position_name']]
    for game_id in actions.keys():
        spadlstore[f"events/game_{game_id}"] = events[game_id]
        spadlstore[f"actions/game_{game_id}"] = actions[game_id]
        spadlstore[f"frames/game_{game_id}"] = frames[game_id]

Loading game data: 100%|███████████████████████████████████████████████████████████████| 51/51 [03:31<00:00,  4.14s/it]
