In [None]:
import pandas as pd
import numpy as np
import glob
import os

# TO CHANGE! Set the paths below depending on where the open-data folder is located and where you want to save the resulting data

In [None]:
# the open-data folder is expected in the parent folder ('..') of the folder the notebook is run from.
# To change if run elsewhere
STATSBOMB_DATA = os.path.join('..','open-data')
# save files in a folder named 'data' inside the current working directory. To change if want to save elsewhere
DATA_PATH = os.path.join(os.getcwd(),'data')

# Setup folders

In [None]:
def make_dir(PATH):
    '''Create the folder PATH if it does not exist yet.

    Missing parent folders are created as well, and calling the function on a
    folder that already exists does nothing. A FileExistsError is still raised
    when PATH exists but is a file rather than a folder.
    '''
    # makedirs with exist_ok avoids the separate "does it exist" check and also
    # copes with nested paths whose parent folders have not been created yet
    os.makedirs(PATH,exist_ok=True)

In [None]:
# the output folder itself has to exist before its sub-folders (one per raw table) are created
make_dir(DATA_PATH)
for raw_folder in ('events_raw','related_events_raw','short_freeze_raw','tactics_raw'):
    make_dir(os.path.join(DATA_PATH,raw_folder))

# Get file paths

In [None]:
# folder holding the raw StatsBomb json files. It gets its own variable so that DATA_PATH
# (the output folder, whose sub-folders were created in the setup cell) is not overwritten
# and the feather files are still written to the output folder
STATSBOMB_JSON_PATH = os.path.join(STATSBOMB_DATA,'data')
# glob returns files in arbitrary order: sort so that the order (and therefore EVENT_PATH[0])
# is the same on every run
MATCH_PATH = sorted(glob.glob(os.path.join(STATSBOMB_JSON_PATH,'matches','**','*.json'),recursive=True))
LINEUP_PATH = sorted(glob.glob(os.path.join(STATSBOMB_JSON_PATH,'lineups','**','*.json'),recursive=True))
EVENT_PATH = sorted(glob.glob(os.path.join(STATSBOMB_JSON_PATH,'events','**','*.json'),recursive=True))
COMPETITION_PATH = os.path.join(STATSBOMB_JSON_PATH,'competitions.json')

# Format competition data

In [None]:
# read the competitions file (parsing the two update columns as dates), order it by
# competition and season and give it a clean 0..n-1 index
df_competition = (pd.read_json(COMPETITION_PATH,convert_dates=['match_updated','match_available'])
                  .sort_values(['competition_id','season_id'])
                  .reset_index(drop=True))
print('Number of competitions in data:',len(df_competition))

In [None]:
# print a summary (columns, dtypes, non-null counts) of the competition dataframe
df_competition.info()

In [None]:
# save the competition dataframe in feather format as 'competition' inside DATA_PATH
df_competition.to_feather(os.path.join(DATA_PATH,'competition'))

# Format match data

In [None]:
print('Number of match files in data:',len(MATCH_PATH))
# one dataframe per match file, with the two date columns parsed on read
match_list_dfs = [
    pd.read_json(match_file,convert_dates=['match_date','last_updated'])
    for match_file in MATCH_PATH
]
# stack the per-file dataframes on top of each other, keeping the column order as read
df_match = pd.concat(match_list_dfs,sort=False)
print('Number of matches in data:',len(df_match))

In [None]:
def split_dict_col(df,col):
    '''Split a column of (possibly nested) dictionaries into separate columns.

    Each key becomes a new column named '<col>_<key>'; keys of nested
    dictionaries are joined with underscores ('<col>_<outer>_<inner>').
    Rows without a dictionary (e.g. missing values) get missing values in
    all new columns. The original column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame containing the column to split. It is modified in place.
    col : name of the dictionary column.

    Returns
    -------
    The same dataframe with the new columns added and col removed.
    '''
    # handle missings (anything that is not a dictionary) by filling with an empty dictionary.
    # isinstance is used rather than pd.isna, as pd.isna is ambiguous for list-like values
    df[col] = df[col].apply(lambda x: x if isinstance(x,dict) else {})
    # pd.io.json.json_normalize is deprecated and not available in recent pandas versions:
    # use pd.json_normalize when it exists and only fall back to the old location otherwise
    normalize = getattr(pd,'json_normalize',None)
    if normalize is None:
        normalize = pd.io.json.json_normalize
    # split the dictionaries (as a plain list, so the result does not depend on the index of df)
    df_temp_cols = normalize(df[col].tolist())
    # change column names: prefix with the old column name and use underscores for nested keys
    col_names = [(col+'_'+c).replace('.','_') for c in df_temp_cols.columns]
    # add the new columns by position, so a duplicated index in df is not a problem and
    # nothing is added when all dictionaries were empty
    for new_name,old_name in zip(col_names,df_temp_cols.columns):
        df[new_name] = df_temp_cols[old_name].values
    # drop old column
    df.drop(col,axis=1,inplace=True)
    return df

In [None]:
# loop through the columns that are still dictionary columns and add them as seperate cols to the dataframe
dictionary_columns = ['competition','season','home_team','away_team','metadata','competition_stage',
                      'stadium','referee']
for col in dictionary_columns:
    df_match = split_dict_col(df_match,col)
df_match['kick_off'] = pd.to_datetime(df_match.match_date.astype(str) +' '+ df_match.kick_off)
# rename some of the id columns with repeated names, as we added the column name infront of the new cols
df_match.rename({'season_season_id':'season_id',
                 'season_season_name':'season_name',
                 'competition_competition_id':'competition_id',
                 'home_team_home_team_id':'home_team_id',
                 'away_team_away_team_id':'away_team_id',
                 'competition_competition_name':'competition_name',           
                 'home_team_home_team_name':'home_team_name',
                 'home_team_home_team_gender':'home_team_gender',
                 'home_team_home_team_group':'home_team_group',
                 'away_team_away_team_name':'away_team_name',
                 'away_team_away_team_gender':'away_team_gender',
                 'away_team_away_team_group':'away_team_group'},axis=1,inplace=True)
# drop one gender column as always equal to the other
# drop match status as always available
df_match.drop(['away_team_gender','match_status'],axis=1,inplace=True)
df_match.rename({'home_team_gender':'competition_gender'},axis=1,inplace=True)
# manager is a list (len=1) containing a dictionary so lets split into columns
df_match['home_team_managers'] = df_match.home_team_managers.str[0]
df_match = split_dict_col(df_match,'home_team_managers')
df_match['away_team_managers'] = df_match.away_team_managers.str[0]
df_match = split_dict_col(df_match,'away_team_managers')
df_match['home_team_managers_dob'] = pd.to_datetime(df_match['home_team_managers_dob'])
df_match['away_team_managers_dob'] = pd.to_datetime(df_match['away_team_managers_dob'])
for col in ['competition_id','season_id','home_team_id','competition_stage_id']:
    df_match[col] = df_match[col].astype(np.int64)
# sort and reset index: ready for exporting to feather
df_match.sort_values('kick_off',inplace=True)
df_match.reset_index(inplace=True,drop=True)

In [None]:
# print a summary (columns, dtypes, non-null counts) of the match dataframe
df_match.info()

In [None]:
# save the match dataframe in feather format as 'match' inside DATA_PATH
df_match.to_feather(os.path.join(DATA_PATH,'match'))

# Format lineup data

In [None]:
print('Number of lineup files in data:',len(LINEUP_PATH))
# read the files in a loop rather than a list comprehension, as the match_id has to be created from the file name
lineup_list_dfs = []
for file in LINEUP_PATH:
    df_temp = pd.read_json(file)
    # the file name without its last 5 characters (the '.json' extension) is used as the match id
    df_temp['match_id'] = os.path.basename(file[:-5])
    lineup_list_dfs.append(df_temp)
df_lineup = pd.concat(lineup_list_dfs,sort=False)
df_lineup.reset_index(inplace=True,drop=True)
# each row has a column named lineup that contains a list of dictionaries
# we split the list into separate columns and then create a new row for each player using melt
df_lineup_players = df_lineup.lineup.apply(pd.Series)
df_lineup = df_lineup.merge(df_lineup_players,left_index=True,right_index=True)
df_lineup.drop('lineup',axis=1,inplace=True)
df_lineup = df_lineup.melt(id_vars = ['team_id','team_name','match_id'], value_name = 'player')
# the melt 'variable' column only holds the position in the original list, which is not needed
df_lineup.drop('variable',axis=1,inplace=True)
# rows with no player come from lists shorter than the longest one
df_lineup = df_lineup[df_lineup.player.notnull()].copy()
df_lineup = split_dict_col(df_lineup,'player')
# rename columns with repeated words: remove the first 7 characters ('player_') of every
# column that starts with 'player', as split_dict_col put the old column name in front
cols = df_lineup.columns
cols = [col[7:] if col[:6]=='player' else col for col in cols]
df_lineup.columns = cols
# turn ids to integers if no missings
df_lineup['match_id'] = df_lineup.match_id.astype(np.int64)
df_lineup['player_id'] = df_lineup.player_id.astype(np.int64)
# sort and reset index: ready for exporting to feather
df_lineup.sort_values('player_id',inplace=True)
df_lineup.reset_index(inplace=True,drop=True)

In [None]:
# print a summary (columns, dtypes, non-null counts) of the lineup dataframe
df_lineup.info()

In [None]:
# save the lineup dataframe in feather format as 'lineup' inside DATA_PATH
df_lineup.to_feather(os.path.join(DATA_PATH,'lineup'))

# Format event data

In [None]:
# EVENT_PATH is the list of event json files found with glob
print('Number of event files in data:',len(EVENT_PATH))

In [None]:
def _split_location_col(df,col,new_cols):
    '''Split a column of coordinate lists into one column per name in new_cols and drop the old column.'''
    if col in df.columns and len(df) > 0:
        coords = df[col].apply(pd.Series)
        # force exactly one column per new name: missing coordinates (e.g. no z in any row)
        # become empty columns, and coordinates beyond the expected ones are not kept
        coords = coords.reindex(columns=range(len(new_cols)))
        for position,new_col in enumerate(new_cols):
            df[new_col] = coords[position]
    else:
        # the column only exists when the file contains the matching event type:
        # still add the new columns so every file produces the same columns
        for new_col in new_cols:
            df[new_col] = np.nan
    return df.drop(col,axis=1,errors='ignore')


def _explode_list_col(df,col,value_name,var_name):
    '''Create one row per element of the lists stored in df[col].

    The result has the event id ('id'), the 1-based position of the element
    in its list (var_name) and the element itself (value_name). When the
    column is absent or only holds missing values an empty dataframe with
    these three columns is returned.
    '''
    if col not in df.columns or not df[col].notnull().any():
        return pd.DataFrame(columns=['id',var_name,value_name])
    # copy so that the slice of df is never modified
    df_long = df.loc[df[col].notnull(),['id',col]].copy()
    df_long.set_index('id',inplace=True)
    # one column per list position, then one row per (event, position) using melt
    df_long = df_long[col].apply(pd.Series)
    df_long.reset_index(inplace=True)
    df_long = df_long.melt(id_vars='id',value_name=value_name,var_name=var_name)
    df_long[var_name] = df_long[var_name] + 1
    # positions beyond the end of a shorter list are missing: remove them
    df_long = df_long[df_long[value_name].notnull()].copy()
    df_long.reset_index(inplace=True,drop=True)
    return df_long


def create_event_dfs(PATH):
    '''Read one event json file and return it as four flat dataframes.

    Parameters
    ----------
    PATH : path to an event json file. The file name without its extension is used as the match id.

    Returns
    -------
    df : one row per event, with the dictionary columns and locations split into separate columns.
    df_related_events : one row per (event id, related event id).
    df_shot_freeze : one row per player in the freeze frame of a shot.
    df_tactics_lineup : one row per player in the lineup of a tactics event.
    All four dataframes have a match_id column.
    '''
    df = pd.read_json(PATH,convert_dates=['timestamp'],encoding='utf-8')

    # get match id from the file that is being read (not from the first file in EVENT_PATH)
    match_id = os.path.splitext(os.path.basename(PATH))[0]

    # loop through the columns that are still dictionary columns and add them as separate cols to the dataframe
    # these are nested dataframes in the docs - although dribbled_past/ pressure isn't needed here?
    # also some others are needed: type, possession_team, play_pattern, team, tactics, player, position
    dictionary_columns = ['50_50','bad_behaviour','ball_receipt','ball_recovery','block','carry',
                          'clearance','dribble','duel','foul_committed','foul_won','goalkeeper',
                          'half_end','half_start','injury_stoppage','interception',
                          'miscontrol','pass','play_pattern','player','player_off','position',
                          'possession_team','shot','substitution','tactics','team','type',]
    for col in dictionary_columns:
        if col in df.columns:
            df = split_dict_col(df,col)

    # sort and reset index: ready for exporting to feather
    df.sort_values(['minute','second','timestamp','possession'],inplace=True)
    df.reset_index(inplace=True,drop=True)

    # split location info to x, y, z and drop old columns
    location_columns = [('location',['x','y']),
                        ('pass_end_location',['pass_end_x','pass_end_y']),
                        ('carry_end_location',['carry_end_x','carry_end_y']),
                        ('shot_end_location',['shot_end_x','shot_end_y','shot_end_z']),
                        ('goalkeeper_end_location',['goalkeeper_end_x','goalkeeper_end_y'])]
    for col,new_cols in location_columns:
        df = _split_location_col(df,col,new_cols)

    # replace weird * character in the type_name for ball receipt
    df['type_name'] = df['type_name'].replace({'Ball Receipt*':'Ball Receipt'})

    # create a related events dataframe
    df_related_events = _explode_list_col(df,'related_events','related_event','event_related_id')

    # create a shot freeze frame dataframe
    # (the player dictionaries are only split when there is at least one freeze frame)
    df_shot_freeze = _explode_list_col(df,'shot_freeze_frame','player','event_freeze_id')
    if len(df_shot_freeze) > 0:
        df_shot_freeze = split_dict_col(df_shot_freeze,'player')
        df_shot_freeze = _split_location_col(df_shot_freeze,'player_location',['x','y'])
        df_shot_freeze.reset_index(inplace=True,drop=True)

    # create a tactics lineup dataframe
    df_tactics_lineup = _explode_list_col(df,'tactics_lineup','player','event_tactics_id')
    if len(df_tactics_lineup) > 0:
        df_tactics_lineup = split_dict_col(df_tactics_lineup,'player')
        df_tactics_lineup.sort_values(['id','event_tactics_id'],inplace=True)
        df_tactics_lineup.reset_index(inplace=True,drop=True)

    # drop columns stored as a separate table
    df = df.drop(['related_events','shot_freeze_frame','tactics_lineup'],axis=1,errors='ignore')

    # add match id to dataframes
    df['match_id'] = match_id
    df_related_events['match_id'] = match_id
    df_shot_freeze['match_id'] = match_id
    df_tactics_lineup['match_id'] = match_id

    return df, df_related_events, df_shot_freeze, df_tactics_lineup

In [None]:
# create the four dataframes for the first event file only
df_event, df_related_events, df_shot_freeze, df_tactics_lineup = create_event_dfs(EVENT_PATH[0])

In [None]:
# each table of the first event file goes into its own sub-folder of DATA_PATH, saved in
# feather format under the name of the json file without its last 5 characters ('.json')
first_event_name = os.path.basename(EVENT_PATH[0][:-5])
for raw_folder,df_raw in [('events_raw',df_event),
                          ('related_events_raw',df_related_events),
                          ('short_freeze_raw',df_shot_freeze),
                          ('tactics_raw',df_tactics_lineup)]:
    df_raw.to_feather(os.path.join(DATA_PATH,raw_folder,first_event_name))

# !! TO DO SIMPLIFY THE DATAFRAME CREATION AS A LOT OF DUPLICATED CODE IN ABOVE FUNCTION. ALSO USED ABOVE TOO

# !!! TO DO: process each event file individually and save it as feather in the correct folder. Check which files have already been saved as feather and only process the new ones

# Combine the raw dataframes and save as a single dataframe

In [None]:
# display the event dataframe created from the first event file
df_event

In [None]:
# display the related events dataframe created from the first event file
df_related_events

In [None]:
# display the shot freeze frame dataframe created from the first event file
df_shot_freeze

In [None]:
# display the tactics lineup dataframe created from the first event file
df_tactics_lineup

In [None]:
# print a summary of every column of the event dataframe, including the non-null counts.
# the keyword that switches on the counts was renamed from null_counts to show_counts in
# pandas 1.2 and the old name was later removed, so try the new name first
try:
    df_event.info(verbose=True,show_counts=True)
except TypeError:
    # older pandas versions only know the keyword under its previous name
    df_event.info(verbose=True,null_counts=True)

# !! TO DO save as separate tables
# related_events
# shot freeze frame
# tactics lineup

In [None]:
#cols = df_event.columns
#cols[12]

# Check if a match is missing any metadata or vice versa

# Lineup to df

In [None]:
# number of lineup json files found
len(LINEUP_PATH)

# Event to df

In [None]:
# number of event json files found
len(EVENT_PATH)