## The analysis for this thesis was run from commit:
https://github.com/statsbomb/open-data/commit/20863db06d85306bd56f122a67fb7d03f2d15b70

This notebook converts the StatsBomb JSON files into parquet files. Parquet files are extremely fast to load, which makes them well suited to this kind of prototyping analysis.

In [None]:
import mplsoccer.statsbomb as sbapi
import pandas as pd
import os
import glob

# Change these paths/ parameters
You will need to change these paths/parameters depending on where the StatsBomb open-data is located, how and where you want to save the resulting data, and whether you only want new files to be processed.

In [None]:
# Location of the StatsBomb open-data 'data' folder, relative to the working directory:
# three levels up, then open-data/data. Change this if the notebook is run from elsewhere.
STATSBOMB_DATA = os.path.join(os.pardir, os.pardir, os.pardir, 'open-data', 'data')
# Where the parquet files are written, relative to the working directory:
# two levels up, then data/statsbomb. Change this to save the output elsewhere.
DATA_FOLDER = os.path.join(os.pardir, os.pardir, 'data', 'statsbomb')

# Get the data file paths

In [None]:
# Recursively collect every json file below the events, lineups and matches folders.
# The three lists are built with the same glob pattern, differing only in the sub-folder.
event_links, lineup_links, match_links = (
    glob.glob(os.path.join(STATSBOMB_DATA, subfolder, '**', '*.json'), recursive=True)
    for subfolder in ('events', 'lineups', 'matches')
)
# the competitions are stored in a single json file at the top of the data folder
competition_path = os.path.join(STATSBOMB_DATA, 'competitions.json')

# Make the directory structure

In [None]:
# Make the directory structure: one sub-folder of DATA_FOLDER per type of raw parquet file.
# os.makedirs also creates DATA_FOLDER (and any missing parents), so the cell works on a
# fresh checkout; exist_ok=True makes it safe to re-run when the folders already exist.
for folder in ['event_raw', 'related_event_raw', 'freeze_frame_raw', 'tactic_raw', 'lineup_raw']:
    os.makedirs(os.path.join(DATA_FOLDER, folder), exist_ok=True)

# Read the competition data

In [None]:
# Load the competitions json into a dataframe and store it as a single parquet file.
competition_save_path = os.path.join(DATA_FOLDER, 'competition.parquet')
df_competition = sbapi.read_competition(competition_path, warn=False)
# Saving may slightly reduce timestamp precision (pandas holds nanoseconds, which the
# parquet file does not keep), so truncation is explicitly allowed.
# The timestamps are not relevant for the analysis.
df_competition.to_parquet(competition_save_path, allow_truncated_timestamps=True)
df_competition.info()

# Read the match data

In [None]:
# Read each match json file into its own dataframe, then stack them into one dataframe.
match_dfs = [sbapi.read_match(match_file, warn=False) for match_file in match_links]
match_save_path = os.path.join(DATA_FOLDER, 'match.parquet')
df_match = pd.concat(match_dfs)
# As with the competition data, timestamp truncation is allowed when saving;
# the slight loss of precision is only relevant for last_updated.
df_match.to_parquet(match_save_path, allow_truncated_timestamps=True)
df_match.info()

# Read the lineup data

In [None]:
LINEUP_FOLDER = os.path.join(DATA_FOLDER, 'lineup_raw')
# Set to True to only process files that do not already have a saved parquet file.
# The default (False) re-processes every lineup file.
SKIP_EXISTING_LINEUPS = False

# loop through the links and store as parquet files - small and fast files
for file in lineup_links:
    # same file name as the json file, but with a .parquet extension
    save_path = f'{os.path.splitext(os.path.basename(file))[0]}.parquet'

    if SKIP_EXISTING_LINEUPS and os.path.isfile(os.path.join(LINEUP_FOLDER, save_path)):
        continue

    try:
        print('Trying:', file)
        df_lineup = sbapi.read_lineup(file, warn=False)
        df_lineup.to_parquet(os.path.join(LINEUP_FOLDER, save_path))
    except Exception as error:
        # Best effort: report the failing file and why it failed, then carry on.
        # Catching Exception (not a bare except) keeps the loop interruptible with Ctrl-C.
        print('Skipping:', file, '-', repr(error))

Convert to a single dataframe

In [None]:
# Combine the per-match lineup parquet files into a single dataframe.
lineup_files = glob.glob(os.path.join(LINEUP_FOLDER, '*.parquet'))
df_lineup = pd.concat([pd.read_parquet(file) for file in lineup_files])
# Replace some ids that appear to be duplicated (same player under two ids).
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# column accessed from the dataframe is a chained in-place update, which pandas does not
# guarantee will modify df_lineup (it has no effect under Copy-on-Write).
# NOTE(review): the original comment also said "Then de-duplicate", but no rows are
# dropped in this cell - confirm whether de-duplication is expected to happen elsewhere.
df_lineup['player_id'] = df_lineup['player_id'].replace({18103: 38522,  # Dietmar Hamann
                                                         17275: 4656,  # Hannah Jayne Blundell
                                                         17524: 4655,  # Jennifer Beattie
                                                         10172: 4644,  # Jill Scott
                                                         4634: 5088,  # Crystal Dunn
                                                         })
df_lineup.to_parquet(os.path.join(DATA_FOLDER, 'lineup.parquet'))
df_lineup.info()

# Read the event data

In [None]:
# Set to True to only process files that do not already have a saved event parquet file.
# The default (False) re-processes every event file.
SKIP_EXISTING_EVENTS = False
# key of the dictionary returned by read_event -> folder its dataframe is saved in
EVENT_FOLDERS = {'event': 'event_raw',
                 'related_event': 'related_event_raw',
                 'shot_freeze_frame': 'freeze_frame_raw',
                 'tactics_lineup': 'tactic_raw'}

# loop through the links and store as parquet files - small and fast files
for file in event_links:
    # same file name as the json file, but with a .parquet extension
    save_path = f'{os.path.splitext(os.path.basename(file))[0]}.parquet'

    if SKIP_EXISTING_EVENTS and os.path.isfile(os.path.join(DATA_FOLDER, 'event_raw', save_path)):
        continue

    try:
        print('Trying:', file)
        dict_event = sbapi.read_event(file, warn=False)
        # save to parquet files
        # using the dictionary key to access the dataframes from the dictionary
        for key, folder in EVENT_FOLDERS.items():
            dict_event[key].to_parquet(os.path.join(DATA_FOLDER, folder, save_path))
    except Exception as error:
        # Best effort: report the failing file and why it failed, then carry on.
        # Catching Exception (not a bare except) keeps the loop interruptible with Ctrl-C.
        print('Skipping:', file, '-', repr(error))

Single dataframe events

In [None]:
# Combine the per-match event parquet files into a single dataframe and save it.
event_files = glob.glob(os.path.join(DATA_FOLDER, 'event_raw', '*.parquet'))
df_event = pd.concat([pd.read_parquet(file) for file in event_files])
df_event.to_parquet(os.path.join(DATA_FOLDER, 'event.parquet'))
# show_counts replaces the null_counts argument, which pandas deprecated and later removed
df_event.info(verbose=True, show_counts=True)

Single dataframe shot freeze frames

In [None]:
# Combine the per-match shot freeze frame parquet files into a single dataframe and save it.
freeze_pattern = os.path.join(DATA_FOLDER, 'freeze_frame_raw', '*.parquet')
freeze_files = glob.glob(freeze_pattern)
freeze_frames = [pd.read_parquet(freeze_file) for freeze_file in freeze_files]
df_freeze = pd.concat(freeze_frames)
df_freeze.to_parquet(os.path.join(DATA_FOLDER, 'freeze.parquet'))
df_freeze.info()

Single dataframe tactics

In [None]:
# Combine the per-match tactics parquet files into a single dataframe and save it.
tactic_pattern = os.path.join(DATA_FOLDER, 'tactic_raw', '*.parquet')
tactic_files = glob.glob(tactic_pattern)
tactic_frames = [pd.read_parquet(tactic_file) for tactic_file in tactic_files]
df_tactic = pd.concat(tactic_frames)
df_tactic.to_parquet(os.path.join(DATA_FOLDER, 'tactic.parquet'))
df_tactic.info()

Single dataframe related events

In [None]:
# Combine the per-match related event parquet files into a single dataframe and save it.
related_files = glob.glob(os.path.join(DATA_FOLDER, 'related_event_raw', '*.parquet'))
df_related = pd.concat([pd.read_parquet(file) for file in related_files])
df_related.to_parquet(os.path.join(DATA_FOLDER, 'related.parquet'))
# show_counts replaces the null_counts argument, which pandas deprecated and later removed
df_related.info(verbose=True, show_counts=True)