In [24]:
import pandas as pd
import numpy as np
import os
import json
import yaml

In [14]:
columns = yaml.load(open(os.path.join(os.getcwd(), 'events.yaml')))


def get_event_name(dictionary: dict):
    """Gets value from dictionary for key `name` other returns None"""
    try:
        return dictionary.get('name', None)
    except AttributeError:
        return None

In [183]:
# shortcuts to path
matches = 'data/matches'
lineups = 'data/lineups'
events = 'data/events'
competition = 'competitions.json'

In [147]:
def get_event_name(dictionary: dict):
    """Gets value from dictionary for key `name` otherwise returns None"""
    try:
        return dictionary.get('name', None)
    except AttributeError:
        return None

In [275]:
def get_df(match_id: int, event_type: str, event_data: list) -> pd.DataFrame:
    events = [item for item in event_data if item['type']['name'] == event_type.title()]
    common_elements = [{key: event.get(key, None) for key in columns['common']} for event in events]
    event_objects = []
    for event in events:
        object_dict = {}
        for key in columns[event_type]:
            try:
                object_dict[key] = event[event_type].get(key, None)
            except KeyError:
                object_dict[key] = None
        event_objects.append(object_dict)
        
    df = pd.DataFrame([{**i, **j} for i, j in zip(common_elements, event_objects)])
    df['event_type'] = event_type
    df['match_id'] = match_id
    df = df[['match_id'] + ['event_type'] + columns['common'] + columns[event_type]]
        
    name_cols = [col for col in df.columns if col in columns['name_cols']]
    df[name_cols] = df[name_cols].applymap(get_event_name)
#     df[name_cols] = df[name_cols].applymap(lambda d: d.get('name', None))
        
    try:
        df[['start_location_x', 'start_location_y']] = df['location'].apply(pd.Series)
    except ValueError:
        pass
    df.drop(columns='location', inplace=True)
        
    if 'end_location' in df.columns:
        end_location_cols = ['end_location_x', 'end_location_y', 'end_location_z']
        try:
            df[end_location_cols] = df['end_location'].apply(pd.Series)
        except ValueError:
            df[['end_location_x', 'end_location_y']] = df['end_location'].apply(pd.Series)
        df.drop(columns='end_location', inplace=True)
    return df

In [201]:
def get_file(file_dir: str) -> list:
    dir_path = os.path.join(os.getcwd(), file_dir)
    file_path = os.listdir(dir_path)
    files = [os.path.join(dir_path, file) for file in file_path]
    return files

In [266]:
match_files = get_file(matches)
with open(match_files[-1]) as file:
    data = json.load(file)
wc_match_id = [str(match['match_id']) for match in data if match['competition']['competition_id'] == 43]
wc_event_files = [os.path.join(events, str(match_id)) for match_id in wc_match_id]

In [285]:
df = pd.DataFrame()
for event_file in wc_event_files:
    with open(event_file + '.json') as file: 
        data = json.load(file)
        df = df.append(get_df(int(event_file[-4:]), 'shot', data))
print('Load {} events in WC2018'.format(df.shape[0]))

Load 1706 events in WC2018


In [307]:
df.columns

Index(['match_id', 'event_type', 'id', 'index', 'period', 'timestamp',
       'minute', 'second', 'possession', 'possession_team', 'play_pattern',
       'off_camera', 'team', 'player', 'position', 'duration',
       'under_pressure', 'statsbomb_xg', 'key_pass_id', 'body_part', 'type',
       'outcome', 'technique', 'first_time', 'follows_dribble', 'redirect',
       'one_on_one', 'open_goal', 'deflected', 'start_location_x',
       'start_location_y', 'end_location_x', 'end_location_y',
       'end_location_z'],
      dtype='object')

In [524]:
df.groupby(['team', 'outcome']).get_group(('Russia', 'Goal'))[['period', 'match_id']]

Unnamed: 0,period,match_id
4,1,7582
33,5,7582
35,5,7582
37,5,7582
39,5,7582
17,2,7540
18,2,7540
4,1,7525
8,1,7525
13,2,7525


In [482]:
mean_xg = df[df['period'] < 5].groupby('team')['statsbomb_xg'].agg(['mean', 'count']).sort_values(by='count', ascending=False)
shot_outcome = pd.DataFrame(df[df['period'] < 5].groupby(['team', 'outcome'])['id'].count()).reset_index()

In [483]:
df['shot_distance'] = np.sqrt(((df['start_location_x'] - df['end_location_x']) ** 2) + ((df['start_location_x'] - df['end_location_x']) ** 2))

In [484]:
shot_outcome = shot_outcome.pivot(index='team', columns='outcome', values='id')
shot_outcome['match_played'] = df[['team', 'match_id']].groupby('team')['match_id'].nunique()

In [485]:
shot_outcome.fillna(0, inplace=True)

In [486]:
shot_outcome['total_shot'] = shot_outcome.iloc[:, -6:].apply(sum, axis=1)
shot_outcome['goal_conversion'] = shot_outcome['Goal']/shot_outcome['total_shot']

In [487]:
shot_outcome['shot_per_game'] = shot_outcome['total_shot']/shot_outcome['match_played']
shot_outcome['goal_per_game'] = shot_outcome['Goal']/shot_outcome['match_played']
shot_outcome['conversion_per_game'] = shot_outcome['goal_conversion']/(shot_outcome['match_played'])

In [494]:
shot_outcome.sort_values(by=['goal_conversion'], ascending=False)

outcome,Blocked,Goal,Off T,Post,Saved,Wayward,match_played,total_shot,goal_conversion,shot_per_game,goal_per_game,conversion_per_game
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Russia,14.0,10.0,15.0,0.0,9.0,2.0,5,41.0,0.243902,8.2,2.0,0.04878
Colombia,13.0,6.0,12.0,0.0,6.0,4.0,4,32.0,0.1875,8.0,1.5,0.046875
Belgium,27.0,15.0,33.0,2.0,21.0,10.0,7,88.0,0.170455,12.571429,2.142857,0.024351
France,17.0,12.0,32.0,1.0,17.0,3.0,7,72.0,0.166667,10.285714,1.714286,0.02381
Japan,11.0,6.0,11.0,1.0,10.0,5.0,4,37.0,0.162162,9.25,1.5,0.040541
England,28.0,12.0,35.0,1.0,13.0,7.0,7,75.0,0.16,10.714286,1.714286,0.022857
Argentina,23.0,6.0,14.0,1.0,11.0,3.0,4,39.0,0.153846,9.75,1.5,0.038462
Tunisia,8.0,5.0,10.0,0.0,7.0,8.0,3,33.0,0.151515,11.0,1.666667,0.050505
Croatia,29.0,13.0,47.0,4.0,16.0,5.0,7,92.0,0.141304,13.142857,1.857143,0.020186
Portugal,18.0,6.0,21.0,0.0,7.0,6.0,4,44.0,0.136364,11.0,1.5,0.034091


In [48]:
columns['name_cols']

['body_part',
 'play_pattern',
 'player',
 'position',
 'possession_team',
 'team',
 'technique',
 'type',
 'height',
 'recipient',
 'outcome',
 'card']