In [24]:
import pandas as pd
import numpy as np
import os
import json
import yaml

In [14]:
# Column specification keyed by section name, read from events.yaml in the working directory.
# safe_load only builds plain Python containers (a Loader-less yaml.load call is rejected by
# PyYAML >= 6), and the context manager closes the file handle once parsing is done.
with open(os.path.join(os.getcwd(), 'events.yaml')) as config_file:
    columns = yaml.safe_load(config_file)


def get_event_name(dictionary: dict):
    """Return the value stored under `name`, otherwise None.

    None is returned both when the key is absent and when `dictionary` has no
    `.get` method at all (for example None or NaN).
    """
    try:
        name = dictionary.get('name')
    except AttributeError:
        name = None
    return name

In [183]:
# Path shortcuts: the competitions index file and the three data folders
competition = 'competitions.json'
events = 'data/events'      # joined with a match id to locate that match's events file
lineups = 'data/lineups'
matches = 'data/matches'    # directory listed by get_file to find the match files

In [147]:
# NOTE: this cell redefines get_event_name with the same behaviour as the earlier definition.
def get_event_name(dictionary: dict):
    """Look up `name` on a dict-like value; return None when it cannot be found.

    Values without a `.get` method (for example None or NaN) also give None.
    """
    try:
        name = dictionary.get('name')
    except AttributeError:
        name = None
    return name

In [275]:
def get_df(match_id: int, event_type: str, event_data: list) -> pd.DataFrame:
    """Build a flat DataFrame holding every event of one type from a match's raw records.

    Parameters
    ----------
    match_id : int
        Value stored in the `match_id` column of every row.
    event_type : str
        Key into the module-level `columns` mapping (e.g. 'shot'). A record is kept when
        its `type.name` equals `event_type.title()`; its type-specific payload is read
        from the record under `event_type` with spaces replaced by underscores.
    event_data : list
        Event records (dicts), as loaded from a match's events JSON file.

    Returns
    -------
    pd.DataFrame
        Columns are `match_id`, `event_type`, then `columns['common']`, then
        `columns[event_type]`. Columns listed in `columns['name_cols']` are reduced to
        their `name` entry via `get_event_name`. `location` is replaced by
        `start_location_x/y` and, when present, `end_location` by `end_location_x/y`
        (plus `end_location_z` if any row has a third coordinate). These coordinate
        columns are appended at the end. An empty frame with the same columns is
        returned when no record matches.
    """
    type_name = event_type.title()
    event_key = event_type.replace(' ', '_')
    common_keys = list(columns['common'])
    specific_keys = list(columns[event_type])

    records = []
    for event in event_data:
        if event['type']['name'] != type_name:
            continue
        # A record without a (dict) payload still yields a row; its specific fields are None.
        details = event.get(event_key)
        if not isinstance(details, dict):
            details = {}
        record = {key: event.get(key) for key in common_keys}
        record.update({key: details.get(key) for key in specific_keys})
        record['event_type'] = event_type
        record['match_id'] = match_id
        records.append(record)

    # reindex fixes the column order and also creates the columns when `records` is empty.
    ordered = ['match_id', 'event_type'] + common_keys + specific_keys
    df = pd.DataFrame(records).reindex(columns=ordered)

    # Series.map is used per column because DataFrame.applymap is deprecated in recent pandas.
    for col in list(df.columns):
        if col in columns['name_cols']:
            df[col] = df[col].map(get_event_name)

    if 'location' in df.columns:
        df = _split_location(df, 'location', ['start_location_x', 'start_location_y'], required=2)
    if 'end_location' in df.columns:
        end_names = ['end_location_x', 'end_location_y', 'end_location_z']
        df = _split_location(df, 'end_location', end_names, required=2)
    return df


def _split_location(df: pd.DataFrame, column: str, names: list, required: int) -> pd.DataFrame:
    """Replace the list-valued `column` with one scalar column per coordinate.

    The first `required` entries of `names` are always created (NaN where a row has no
    such coordinate); later entries are created only when at least one row carries that
    many coordinates. Coordinates beyond `len(names)` are discarded. The new columns are
    appended after the remaining columns of `df`.
    """
    # Missing locations (None/NaN) become empty rows so the constructor pads them with NaN.
    rows = [list(value) if isinstance(value, (list, tuple)) else [] for value in df[column]]
    coords = pd.DataFrame(rows, index=df.index)
    width = min(max(coords.shape[1], required), len(names))
    coords = coords.reindex(columns=range(width))
    coords.columns = names[:width]
    return pd.concat([df.drop(columns=column), coords], axis=1)

In [201]:
def get_file(file_dir: str) -> list:
    """List the full paths of all entries in `file_dir`, resolved against the working directory.

    Entries come back in the order `os.listdir` yields them, which is not sorted.
    """
    base = os.path.join(os.getcwd(), file_dir)
    return [os.path.join(base, entry) for entry in os.listdir(base)]

In [266]:
# Collect the ids of every match with competition_id 43 (presumably the 2018 World Cup,
# going by the `wc_` names). Every match file is scanned rather than only match_files[-1],
# because os.listdir gives no ordering guarantee, so "the last file" is not reproducible.
match_files = get_file(matches)
wc_match_id = []
for match_file in match_files:
    # Skip stray directory entries (e.g. hidden files) that are not JSON match lists.
    if not match_file.endswith('.json'):
        continue
    with open(match_file) as file:
        data = json.load(file)
    wc_match_id.extend(
        str(match['match_id'])
        for match in data
        if match['competition']['competition_id'] == 43
    )
# Event file paths are stored without the '.json' extension.
wc_event_files = [os.path.join(events, match_id) for match_id in wc_match_id]

In [285]:
# Build one shot frame per match file and combine them with a single concat; growing a
# frame with DataFrame.append inside the loop recopies it each time, and that method no
# longer exists in pandas 2.x.
shot_frames = []
for event_file in wc_event_files:
    # The file stem is the match id. Take it whole instead of slicing the last four
    # characters, which breaks for ids that are not exactly four digits long.
    match_id = int(os.path.basename(event_file))
    with open(event_file + '.json') as file:
        data = json.load(file)
    shot_frames.append(get_df(match_id, 'shot', data))
# pd.concat rejects an empty list, so fall back to an empty frame when nothing was loaded.
df = pd.concat(shot_frames) if shot_frames else pd.DataFrame()
print('Load {} events in WC2018'.format(df.shape[0]))

Load 1706 events in WC2018


In [307]:
# Inspect the column layout of the combined shot frame
df.columns

Index(['match_id', 'event_type', 'id', 'index', 'period', 'timestamp',
       'minute', 'second', 'possession', 'possession_team', 'play_pattern',
       'off_camera', 'team', 'player', 'position', 'duration',
       'under_pressure', 'statsbomb_xg', 'key_pass_id', 'body_part', 'type',
       'outcome', 'technique', 'first_time', 'follows_dribble', 'redirect',
       'one_on_one', 'open_goal', 'deflected', 'start_location_x',
       'start_location_y', 'end_location_x', 'end_location_y',
       'end_location_z'],
      dtype='object')

In [336]:
# Per-team mean statsbomb_xg and number of xG values, teams with the most values first
xg_by_team = df.groupby('team')['statsbomb_xg']
mean_xg = xg_by_team.agg(['mean', 'count']).sort_values(by='count', ascending=False)

# Number of shot ids for every (team, outcome) pair, as a flat three-column frame
shot_outcome = df.groupby(['team', 'outcome'])['id'].count().reset_index()

In [34]:
# Event type whose records the exploratory cells below pull out of `data`
event_type = 'shot'

In [35]:
# Keep only the records of the chosen type, then project each onto the common columns
all_events = [
    record
    for record in data
    if record['type']['name'] == event_type.title()
]
common_elements = [
    {field: record.get(field) for field in columns['common']}
    for record in all_events
]

In [42]:
# For each event, pull the type-specific fields out of its payload, which is stored
# under the event type with spaces replaced by underscores.
event_objects = []
for event in all_events:
    try:
        payload = event[event_type.replace(' ', '_')]
    except KeyError:
        # No payload on this event: every type-specific field becomes None.
        payload = {}
    object_dict = {key: payload.get(key) for key in columns[event_type]}
    event_objects.append(object_dict)

In [48]:
# Columns whose values get_df reduces to their `name` entry
columns['name_cols']

['body_part',
 'play_pattern',
 'player',
 'position',
 'possession_team',
 'team',
 'technique',
 'type',
 'height',
 'recipient',
 'outcome',
 'card']