In [1]:
import pandas as pd
import numpy as np
import glob
import os

This notebook converts the StatsBomb JSON files into feather files. Feather files are extremely fast to load, which makes them a good fit for this kind of prototyping analysis. See: https://medium.com/@steven.p.dye/feather-files-faster-than-the-speed-of-light-d4666ce24387.

They are not really meant for long-term storage though. Each match's event file is processed on its own, and the per-match files are then combined across all the matches.

# Change these paths/ parameters
You will need to change these paths/parameters depending on where the StatsBomb open-data is located, where you want to save the resulting data, and whether you only want new files to be processed.

In [2]:
# The StatsBomb open-data folder is expected one level up from the working directory
# ('../open-data/data'). Change this if the notebook is run from elsewhere.
STATSBOMB_DATA = os.path.join('..','open-data','data')
# Processed files are saved in a 'data' folder inside the current working directory.
# Change this if you want to save them elsewhere.
DATA_PATH = os.path.join(os.getcwd(),'data')
# If True, event files whose feather output already exists are not processed again
process_new_only = True

# Delete event data included in error

One match (7298) seems to have been added to the StatsBomb data in error. See: https://github.com/statsbomb/open-data/issues/13. Its event and lineup files are deleted here for consistency.

In [3]:
# match 7298 has a file in both the events and the lineups folder
ERROR_FILES = [os.path.join(STATSBOMB_DATA,folder,'7298.json') for folder in ('events','lineups')]
# delete whichever of the two are still on disk and report each deletion
for file in ERROR_FILES:
    if not os.path.isfile(file):
        continue
    os.remove(file)
    print(file,'removed')

../open-data/data/events/7298.json removed
../open-data/data/lineups/7298.json removed


# Setup folders

I set up the following folders in a new data directory (location set above). This is where the data extracted from the event json files is saved, in feather format. <br>
├── data <br>
│   ├── events_raw            <- Data from the event file <br>
│   ├── related_events_raw    <- Data with the info on how events are connected. <br>
│   ├── shot_freeze_raw       <- Data with the individual shot freeze frames <br>
│   └── tactics_raw           <- Data with the lineup tactics. <br>

In [4]:
def make_dir(PATH):
    '''Create the directory PATH if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat.
    '''
    # exist_ok avoids the check-then-create race of testing isdir before mkdir
    os.makedirs(PATH, exist_ok=True)

In [5]:
# locations of the new folders, all directly inside DATA_PATH
# each folder receives one feather file per match
RAW_EVENT_PATH = os.path.join(DATA_PATH,'events_raw')
RAW_RELATED_PATH = os.path.join(DATA_PATH,'related_events_raw')
RAW_SHOT_PATH = os.path.join(DATA_PATH,'shot_freeze_raw')
RAW_TACTICS_PATH = os.path.join(DATA_PATH,'tactics_raw')

In [6]:
# making directories - DATA_PATH goes first because the other folders live inside it
for folder in (DATA_PATH,RAW_EVENT_PATH,RAW_RELATED_PATH,RAW_SHOT_PATH,RAW_TACTICS_PATH):
    make_dir(folder)

# Get file paths

Retrieve a list of json file paths from which we will extract the information.

In [7]:
# '**' together with recursive=True matches json files directly in the folder
# as well as in any of its sub-folders. The order of the returned paths is arbitrary.
MATCH_PATH = glob.glob(os.path.join(STATSBOMB_DATA,'matches','**','*.json'),recursive=True)
LINEUP_PATH = glob.glob(os.path.join(STATSBOMB_DATA,'lineups','**','*.json'),recursive=True)
EVENT_PATH = glob.glob(os.path.join(STATSBOMB_DATA,'events','**','*.json'),recursive=True)
# the competitions are stored in a single json file
COMPETITION_PATH = os.path.join(STATSBOMB_DATA,'competitions.json')

# Format competition data

Get the competition data and save in feather format.

In [8]:
# read the competitions (parsing the two date columns), order them by competition and
# season, and give them a fresh 0..n-1 index: ready for exporting to feather
df_competition = (
    pd.read_json(COMPETITION_PATH,convert_dates=['match_updated','match_available'])
    .sort_values(['competition_id','season_id'])
    .reset_index(drop=True)
)
print('Number of competitions in data:',len(df_competition))

Number of competitions in data: 20


In [9]:
# save to feather-format (a file named 'competition' inside DATA_PATH) and show column info
df_competition.to_feather(os.path.join(DATA_PATH,'competition'))
df_competition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
competition_id        20 non-null int64
season_id             20 non-null int64
country_name          20 non-null object
competition_name      20 non-null object
competition_gender    20 non-null object
season_name           20 non-null object
match_updated         20 non-null datetime64[ns]
match_available       20 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(2), object(4)
memory usage: 1.4+ KB


# Format match data

Get the match data and save in feather format.

In [10]:
print('Number of match files in data:',len(MATCH_PATH))
# read every match json into its own dataframe, parsing the two date columns
match_list_dfs = [pd.read_json(file,convert_dates=['match_date','last_updated']) for file in MATCH_PATH]
# stack the files into one dataframe. The index is not reset here, so index labels
# may repeat across files until the index is reset further down.
df_match = pd.concat(match_list_dfs,sort=False)
print('Number of matches in data:',len(df_match))

Number of match files in data: 20
Number of matches in data: 778


In [11]:
def split_dict_col(df,col):
    '''Split a column of dictionaries into one column per (nested) key.

    Missing values in the column are treated as empty dictionaries, so they end
    up as missing values in every new column. New column names are prefixed with
    the original column name unless the key already starts with it, and the dots
    that separate nested keys are replaced by underscores.

    The dataframe is modified in place (new columns added, `col` dropped) and is
    also returned.
    '''
    # pandas.io.json.json_normalize was deprecated in pandas 1.0 in favour of the
    # top-level pandas.json_normalize: use whichever this pandas version provides
    normalize = getattr(pd,'json_normalize',None)
    if normalize is None:
        normalize = pd.io.json.json_normalize
    # handle missings by filling with an empty dictionary
    df[col] = df[col].apply(lambda x: {} if pd.isna(x) else x)
    # flatten the dictionaries from a plain list and line the result up with the original rows
    df_temp_cols = normalize(df[col].tolist())
    df_temp_cols.index = df.index
    # note add column description to column name if doesn't already contain it
    col_names = [(c if c.startswith(col) else col+'_'+c).replace('.','_') for c in df_temp_cols.columns]
    for new_name, old_name in zip(col_names,df_temp_cols.columns):
        df[new_name] = df_temp_cols[old_name]
    # drop old column
    df.drop(col,axis=1,inplace=True)
    return df

In [12]:
# NOTE: this cell consumes the dictionary columns of df_match (split_dict_col drops them),
# so it can only be run once per load - re-run the cell that builds df_match before re-running it.
# loop through the columns that are still dictionary columns and add them as separate cols to the dataframe
dictionary_columns = ['competition','season','home_team','away_team','metadata','competition_stage',
                      'stadium','referee']
for col in dictionary_columns:
    df_match = split_dict_col(df_match,col)
# convert kickoff to datetime - date + kickoff time
df_match['kick_off'] = pd.to_datetime(df_match.match_date.astype(str) +' '+ df_match.kick_off)
# drop one gender column as always equal to the other
# drop match status as always available
df_match.drop(['away_team_gender','match_status'],axis=1,inplace=True)
df_match.rename({'home_team_gender':'competition_gender'},axis=1,inplace=True)
# managers is a list containing a dictionary, so split it into columns.
# Only the first entry is kept (.str[0]); the lists are expected to have length 1.
df_match['home_team_managers'] = df_match.home_team_managers.str[0]
df_match = split_dict_col(df_match,'home_team_managers')
df_match['away_team_managers'] = df_match.away_team_managers.str[0]
df_match = split_dict_col(df_match,'away_team_managers')
df_match['home_team_managers_dob'] = pd.to_datetime(df_match['home_team_managers_dob'])
df_match['away_team_managers_dob'] = pd.to_datetime(df_match['away_team_managers_dob'])
# make sure the id columns are stored as integers (this fails if any of them has missing values)
# NOTE(review): away_team_id is not in this list - confirm whether that is intentional
for col in ['competition_id','season_id','home_team_id','competition_stage_id']:
    df_match[col] = df_match[col].astype(np.int64)
# sort and reset index: ready for exporting to feather
df_match.sort_values('kick_off',inplace=True)
df_match.reset_index(inplace=True,drop=True)

In [13]:
# save to feather-format (a file named 'match' inside DATA_PATH) and show column info
df_match.to_feather(os.path.join(DATA_PATH,'match'))
df_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 778 entries, 0 to 777
Data columns (total 48 columns):
match_id                           778 non-null int64
match_date                         778 non-null datetime64[ns]
kick_off                           778 non-null datetime64[ns]
home_score                         778 non-null int64
away_score                         778 non-null int64
last_updated                       778 non-null datetime64[ns]
match_week                         778 non-null int64
competition_id                     778 non-null int64
competition_country_name           778 non-null object
competition_name                   778 non-null object
season_id                          778 non-null int64
season_name                        778 non-null object
home_team_id                       778 non-null int64
home_team_name                     778 non-null object
competition_gender                 778 non-null object
home_team_group                    100 non-null objec

# Format lineup data

Get the lineup data and save in feather format.

In [14]:
print('Number of lineup files in data:',len(LINEUP_PATH))
# read as dataframe can't use list comprehension to read files as need to create the match_id from the file name
lineup_list_dfs = []
for file in LINEUP_PATH:
    df_temp = pd.read_json(file)
    # the match id is the file name without its '.json' extension (still a string here)
    df_temp['match_id'] = os.path.basename(file[:-5])
    lineup_list_dfs.append(df_temp)
df_lineup = pd.concat(lineup_list_dfs,sort=False)
df_lineup.reset_index(inplace=True,drop=True)
# each row has a column named lineup that contains a list of dictionaries (one per player)
# we split the list into separate columns and then create a new row for each player using melt
df_lineup_players = df_lineup.lineup.apply(pd.Series)
df_lineup = df_lineup.merge(df_lineup_players,left_index=True,right_index=True)
df_lineup.drop('lineup',axis=1,inplace=True)
df_lineup = df_lineup.melt(id_vars = ['team_id','team_name','match_id'], value_name = 'player')
# the melt variable is only the position of the player within the list, which is not needed
df_lineup.drop('variable',axis=1,inplace=True)
# shorter lineups were padded with missing values when the list was split into columns
df_lineup = df_lineup[df_lineup.player.notnull()].copy()
df_lineup = split_dict_col(df_lineup,'player')
# turn ids to integers if no missings
df_lineup['match_id'] = df_lineup.match_id.astype(np.int64)
df_lineup['player_id'] = df_lineup.player_id.astype(np.int64)
# sort and reset index: ready for exporting to feather
df_lineup.sort_values('player_id',inplace=True)
df_lineup.reset_index(inplace=True,drop=True)

Number of lineup files in data: 778


In [15]:
# save to feather-format (a file named 'lineup' inside DATA_PATH) and show column info
df_lineup.to_feather(os.path.join(DATA_PATH,'lineup'))
df_lineup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21416 entries, 0 to 21415
Data columns (total 9 columns):
team_id                 21416 non-null int64
team_name               21416 non-null object
match_id                21416 non-null int64
player_id               21416 non-null int64
player_name             21416 non-null object
player_nickname         12156 non-null object
player_jersey_number    21409 non-null float64
player_country_id       21328 non-null float64
player_country_name     21328 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 1.5+ MB


# Format event data

Get the event data and save in feather format:
    - an events dataframe
    - a related events dataframe
    - a shot freeze frame dataframe
    - a tactics lineup dataframe
    
Each match is stored in a separate dataframe (one feather file per match in each of the four folders).

In [16]:
def list_dictionary_to_df(df,col,value_name,var_name):
    '''Some columns hold a list per row. This turns them into a new dataframe with one row per list item.

    Only rows where `col` is not missing are used. The result has three columns:
    'id' (taken from df), `var_name` (position of the item within its list,
    counted from 1) and `value_name` (the item itself). Rows are ordered by
    position first and by the original row order within each position.
    If no row holds any item, an empty dataframe with these three columns is
    returned. The input dataframe is not modified.
    '''
    out_cols = ['id',var_name,value_name]
    lists = df.loc[df[col].notnull(),['id',col]].set_index('id')[col]
    if lists.empty:
        return pd.DataFrame(columns=out_cols)
    # one column per list position; building the frame from a plain list of lists
    # avoids creating a Series for every single row, which is very slow
    wide = pd.DataFrame(lists.tolist(),index=lists.index)
    if wide.shape[1] == 0:
        return pd.DataFrame(columns=out_cols)
    long = wide.reset_index().melt(id_vars='id',value_name=value_name,var_name=var_name)
    # positions are counted from 1 rather than 0
    long[var_name] = long[var_name] + 1
    # shorter lists were padded with missing values when spread over the columns
    long = long[long[value_name].notnull()].copy()
    long.reset_index(inplace=True,drop=True)
    return long

In [17]:
def split_location_cols(df,col,new_cols):
    ''' Location is stored as a list. Split it into one column per coordinate.

    Does nothing if `col` is not a column of df. Otherwise the columns named in
    `new_cols` are added to df in place and `col` is dropped; nothing is returned.
    The number of new columns is always len(new_cols): rows with a missing
    location, or with fewer coordinates than names (e.g. an end location without z),
    are padded with NaN. The new columns are always floats.

    Raises ValueError if a location holds more coordinates than there are names.
    '''
    if col not in df.columns:
        return
    width = len(new_cols)
    # start from all-missing so that the result never depends on which rows happen to be filled
    coords = np.full((len(df),width),np.nan)
    for row, value in enumerate(df[col]):
        if isinstance(value,(list,tuple,np.ndarray)):
            if len(value) > width:
                raise ValueError('%s holds %d values but only %d column names were given'
                                 % (col,len(value),width))
            coords[row,:len(value)] = value
    for pos, name in enumerate(new_cols):
        df[name] = coords[:,pos]
    df.drop(col,axis=1,inplace=True)

In [18]:
# one event json per match is expected
print('Number of event files in data:',len(EVENT_PATH))

Number of event files in data: 778


In [19]:
# match ids of the event files: the file names without the '.json' extension, as integers
EVENT_FILE_NAMES = np.array([os.path.basename(file)[:-5] for file in EVENT_PATH]).astype(int)
# quick check that all events have matches and vice versa (both lists should be empty).
print('Matches with no event file:',list(set(df_match.match_id) - set(EVENT_FILE_NAMES)))
print('Events with no match file:',list(set(EVENT_FILE_NAMES) - set(df_match.match_id)))

Matches with no event file: []
Events with no match file: []


In [20]:
# if you set process_new_only to True then we will not process event jsons which already have feather files
if process_new_only:
    event_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_EVENT_PATH,'*'))])
    related_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_RELATED_PATH,'*'))])
    shot_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_SHOT_PATH,'*'))])
    tactics_set = set([os.path.basename(file) for file in glob.glob(os.path.join(RAW_TACTICS_PATH,'*'))])
    to_delete = set.intersection(event_set,related_set,shot_set,tactics_set)
    mask_delete = [False if file in to_delete else True for file in (EVENT_FILE_NAMES).astype(str)]
    EVENT_PATH = np.array(EVENT_PATH)[mask_delete].tolist()
    print('Event files to process:',np.array(mask_delete).sum())

Event files to process: 1


In [21]:
def create_event_feather_files(PATH):
    ''' Extracts a single match's event json and saves it as four feather-format files: events,
    related events, shot freeze frames, and tactics lineups.

    PATH is the path of one event json file; the match id is taken from its file name
    (the name without the '.json' extension). The four files are written to RAW_EVENT_PATH,
    RAW_RELATED_PATH, RAW_SHOT_PATH and RAW_TACTICS_PATH, each named after the match id.
    Nothing is returned.'''
    # the timestamp column is read as a datetime (defaulting to today's date), so keep only
    # the time part and store it as a string - feather can't store time objects
    df = pd.read_json(PATH,encoding='utf-8')
    df['timestamp'] = df['timestamp'].dt.time.astype(str)
    
    # get match id: the file name without the trailing '.json'
    match_id = int(os.path.basename(PATH)[:-5])
    
    # loop through the columns that are still dictionary columns and add them as separate cols to the dataframe
    # these are nested dataframes in the docs - although dribbled_past/ pressure isn't needed here?
    # also some others are needed: type, possession_team, play_pattern, team, tactics, player, position
    dictionary_columns = ['50_50','bad_behaviour','ball_receipt','ball_recovery','block','carry',
                          'clearance','dribble','duel','foul_committed','foul_won','goalkeeper',
                          'half_end','half_start','injury_stoppage','interception',
                          'miscontrol','pass','play_pattern','player','player_off','position',
                          'possession_team','shot','substitution','tactics','team','type',] 
    # not every match contains every event type, so only split the columns that are present
    for col in dictionary_columns:
        if col in df.columns:
            df = split_dict_col(df,col)
    
    # sort and reset index: ready for exporting to feather
    df.sort_values(['minute','second','timestamp','possession'],inplace=True)
    df.reset_index(inplace=True,drop=True)
    
    # split location info to x, y and (z for shot) columns and drop old columns
    # (split_location_cols skips columns that are not present in this match)
    split_location_cols(df,'location',['x','y'])
    split_location_cols(df,'pass_end_location',['pass_end_x','pass_end_y'])
    split_location_cols(df,'carry_end_location',['carry_end_x','carry_end_y'])
    split_location_cols(df,'shot_end_location',['shot_end_x','shot_end_y','shot_end_z'])
    split_location_cols(df,'goalkeeper_end_location',['goalkeeper_end_x','goalkeeper_end_y'])
    
    # replace weird * character in the type_name for ball receipt
    df['type_name'] = df['type_name'].replace({'Ball Receipt*':'Ball Receipt'})
    
    # create a related events dataframe: one row per (event id, related event id) pair
    df_related_events = list_dictionary_to_df(df,col='related_events',
                                              value_name='related_event',var_name='event_related_id')
    # some carries don't have the corresponding events. This makes sure all events are linked both ways:
    # add every pair again with the two ids swapped, then drop the pairs that now appear twice
    df_related_events.drop('event_related_id',axis=1,inplace=True)
    df_related_events_reverse = df_related_events.rename({'related_event':'id','id':'related_event'},axis=1)
    df_related_events = pd.concat([df_related_events,df_related_events_reverse],sort=False)
    df_related_events.drop_duplicates(inplace=True)
    # and add on the type_names, index for easier lookups of how the events are related
    # first for the event itself, then (with the suffix '_related') for the related event.
    # validate='m:1' makes the merge fail if an event id is not unique in the lookup table
    df_event_type = df[['id','type_name','index']].copy()
    df_related_events = df_related_events.merge(df_event_type,on='id',how='left',validate='m:1')
    df_event_type.rename({'id':'related_event'},axis=1,inplace=True)
    df_related_events = df_related_events.merge(df_event_type,on='related_event',
                                                 how='left',validate='m:1',suffixes=['','_related'])
    df_related_events.rename({'related_event':'id_related'},axis=1,inplace=True)
    
    # create a shot freeze frame dataframe - also splits dictionary of player details into columns
    # one row per player in a freeze frame, keyed by the id of the shot event
    df_shot_freeze = list_dictionary_to_df(df,col='shot_freeze_frame',
                                           value_name='player',var_name='event_freeze_id')
    df_shot_freeze = split_dict_col(df_shot_freeze,'player')
    split_location_cols(df_shot_freeze,'player_location',['x','y'])

    # create a tactics lineup frame dataframe - also splits dictionary of player details into columns
    # one row per player in a tactics lineup, keyed by the id of the event that holds the lineup
    df_tactics_lineup = list_dictionary_to_df(df,col='tactics_lineup',
                                           value_name='player',var_name='event_tactics_id')
    df_tactics_lineup = split_dict_col(df_tactics_lineup,'player')
    
    # drop columns stored as a separate table 
    # (this raises a KeyError if one of the three columns is absent from the match)
    df.drop(['related_events','shot_freeze_frame','tactics_lineup'],axis=1,inplace=True)
    
    # add match id to dataframes
    df['match_id'] = match_id
    df_related_events['match_id'] = match_id
    df_shot_freeze['match_id'] = match_id    
    df_tactics_lineup['match_id'] = match_id
    
    # save as feather files, named after the match id (no file extension)
    df.to_feather(os.path.join(RAW_EVENT_PATH,str(match_id)))
    df_related_events.to_feather(os.path.join(RAW_RELATED_PATH,str(match_id)))
    df_shot_freeze.to_feather(os.path.join(RAW_SHOT_PATH,str(match_id)))
    df_tactics_lineup.to_feather(os.path.join(RAW_TACTICS_PATH,str(match_id)))

In [22]:
# loop through and save all the event jsons as 4 seperate feather-files
for i, file in enumerate(EVENT_PATH):
    create_event_feather_files(file)
    if i%10 == 0:
        print(i,os.path.basename(file))

0 2275036.json


# Combine the raw dataframes and save as a single dataframe

Combine the event dataframes into a single dataframe for each type:
- events
- related_events
- shot freeze frame
- tactics

Note that the resulting feather file will be large (3 GB+).

In [23]:
def combine_single_file(PATH,SAVE_PATH):
    ''' Loads the individual feather files in the folder PATH and combines them into one mega feather file.

    The files are read in file-name order and stacked. If the combined data has an
    'index' column, the rows are sorted by match_id and index. A summary of the
    combined dataframe is printed and the dataframe is saved to SAVE_PATH.
    Nothing is returned.

    Raises ValueError if the folder PATH contains no files.
    '''
    # sorted so that the row order does not depend on the order in which the file system lists the files
    files = sorted(glob.glob(os.path.join(PATH,'*')))
    if not files:
        raise ValueError('No feather files found in ' + str(PATH))
    df = pd.concat([pd.read_feather(file) for file in files],sort=False)
    if 'index' in df.columns:
        df.sort_values(['match_id','index'],inplace=True)
    df.reset_index(drop=True,inplace=True)
    # DataFrame.info prints the summary itself and returns None, so it is not wrapped in print
    try:
        df.info(verbose=True,show_counts=True)
    except TypeError:
        # older pandas versions only know this keyword under its previous name
        df.info(verbose=True,null_counts=True)
    df.to_feather(SAVE_PATH)

In [24]:
# combine the per-match event files into a single file named 'events' inside DATA_PATH
combine_single_file(RAW_EVENT_PATH,SAVE_PATH=os.path.join(DATA_PATH,'events'))

  labels, = index.labels


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2797557 entries, 0 to 2797556
Data columns (total 153 columns):
id                                  2797557 non-null object
index                               2797557 non-null int64
period                              2797557 non-null int64
timestamp                           2797557 non-null object
minute                              2797557 non-null int64
second                              2797557 non-null int64
possession                          2797557 non-null int64
duration                            2046570 non-null float64
off_camera                          27283 non-null float64
out                                 16363 non-null float64
under_pressure                      604676 non-null float64
counterpress                        86916 non-null float64
ball_receipt_outcome_id             110299 non-null float64
ball_receipt_outcome_name           110299 non-null object
ball_recovery_offensive             298 non-null objec

In [25]:
# combine the per-match related-event files into a single file named 'related_events' inside DATA_PATH
combine_single_file(RAW_RELATED_PATH,SAVE_PATH=os.path.join(DATA_PATH,'related_events'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5450328 entries, 0 to 5450327
Data columns (total 7 columns):
id                   5450328 non-null object
id_related           5450328 non-null object
type_name            5450328 non-null object
index                5450328 non-null int64
type_name_related    5450328 non-null object
index_related        5450328 non-null int64
match_id             5450328 non-null int64
dtypes: int64(3), object(4)
memory usage: 291.1+ MB
None


In [26]:
# combine the per-match shot freeze frame files into a single file named 'shot_freeze_frame' inside DATA_PATH
combine_single_file(RAW_SHOT_PATH,SAVE_PATH=os.path.join(DATA_PATH,'shot_freeze_frame'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244803 entries, 0 to 244802
Data columns (total 10 columns):
id                      244803 non-null object
event_freeze_id         244803 non-null int64
player_teammate         244803 non-null bool
player_id               244803 non-null int64
player_name             244803 non-null object
player_position_id      244803 non-null int64
player_position_name    244803 non-null object
x                       244803 non-null float64
y                       244803 non-null float64
match_id                244803 non-null int64
dtypes: bool(1), float64(2), int64(4), object(3)
memory usage: 17.0+ MB
None


In [27]:
# combine the per-match tactics lineup files into a single file named 'tactics' inside DATA_PATH
combine_single_file(RAW_TACTICS_PATH,SAVE_PATH=os.path.join(DATA_PATH,'tactics'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32211 entries, 0 to 32210
Data columns (total 8 columns):
id                      32211 non-null object
event_tactics_id        32211 non-null int64
player_jersey_number    32204 non-null float64
player_id               32211 non-null int64
player_name             32211 non-null object
player_position_id      32211 non-null int64
player_position_name    32211 non-null object
match_id                32211 non-null int64
dtypes: float64(1), int64(4), object(3)
memory usage: 2.0+ MB
None
