In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch

Load the data

In [2]:
# Relative path to the folder that holds the StatsBomb parquet files.
STATSBOMB = os.path.join('..', '..', 'data', 'statsbomb')
# Read the event and match tables from that folder.
event_path = os.path.join(STATSBOMB, 'event.parquet')
match_path = os.path.join(STATSBOMB, 'match.parquet')
df_statsbomb_event = pd.read_parquet(event_path)
df_statsbomb_match = pd.read_parquet(match_path)

Pitches for coordinate conversion

In [3]:
# One Pitch object per coordinate system; their dimensions are used further down
# to rescale the StatsBomb coordinates.
pitch_statsbomb, pitch_statsperform = (Pitch(pitch_type=pitch_kind, figsize=(16, 9))
                                       for pitch_kind in ('statsbomb', 'statsperform'))

Create a dataframe of the strongest foot

Assumptions:
- based on all events with body_part_name left or right
- both feet if the shares of events on the left and right foot are within 10 percentage points of each other (only for players with more than 20 events)
- left foot if left is used more often (other than when marked both feet)
- otherwise right foot

In [4]:
# Build one row per player with the share of left/right-footed events and the
# inferred preferred foot ('both', 'left' or 'right').
mask_foot_event = df_statsbomb_event.body_part_name.isin(['Right Foot', 'Left Foot'])
df_foot = (df_statsbomb_event[mask_foot_event]
           .groupby('player_id').body_part_name.value_counts()
           .unstack()
           .reset_index(drop=False)
           .rename({'Left Foot': 'left', 'Right Foot': 'right'}, axis=1)
           .fillna(0))
df_foot.index.name = ''
# number of footed events per player, then convert the counts to proportions
n_total = df_foot[['left', 'right']].sum(axis=1)
df_foot['left'] = df_foot['left'] / n_total
df_foot['right'] = df_foot['right'] / n_total
# Each comparison needs its own parentheses: `&` binds tighter than `<` / `>`,
# so `a & n_total > 20` would be evaluated as `(a & n_total) > 20`.
mask_both = (np.abs(df_foot.left - df_foot.right) < 0.1) & (n_total > 20)
# default to right, switch to left when left is used more, and let 'both' take priority
df_foot['foot'] = 'right'
df_foot.loc[df_foot.left > df_foot.right, 'foot'] = 'left'
df_foot.loc[mask_both, 'foot'] = 'both'

Fast attack, win ball in own third, shoot in last quarter in 7-25 seconds

In [5]:
# Event time in seconds, built from the minute / second / millisecond timestamp parts.
df_statsbomb_event['eventSec'] = (df_statsbomb_event.timestamp_minute * 60
                                  + df_statsbomb_event.timestamp_second
                                  + df_statsbomb_event.timestamp_millisecond / 1000)

# Events counted as the defence winning the ball:
# goalkeeper events other than facing/conceding a shot or conceding a penalty ...
gk_excluded = ['Shot Faced', 'Goal Conceded', 'Penalty Conceded']
mask_gk = (df_statsbomb_event.type_name.eq('Goal Keeper')
           & ~df_statsbomb_event.goalkeeper_type_name.isin(gk_excluded))
# ... 50/50s with one of the listed outcomes ...
mask_50 = (df_statsbomb_event.type_name.eq('50/50')
           & df_statsbomb_event.outcome_name.isin(['Success To Team', 'Won', 'Success To Opposition']))
# ... and interceptions or tackles.
mask_other = df_statsbomb_event.type_name.eq('Interception') | df_statsbomb_event.duel_type_name.eq('Tackle')
# only wins with x below 40.08 count (the section heading describes this as the own third)
mask_defence_win = (df_statsbomb_event.x < 40.08) & (mask_gk | mask_50 | mask_other)

# Record the winning team and time on those rows, then carry them forward
# within each match period so later events can be compared against them.
for target_col, source_col in (('defence_win', 'team_id'), ('defence_sec', 'eventSec')):
    df_statsbomb_event.loc[mask_defence_win, target_col] = df_statsbomb_event.loc[mask_defence_win, source_col]
group_match = df_statsbomb_event.groupby(['match_id', 'period'])
df_statsbomb_event[['defence_win', 'defence_sec']] = group_match[['defence_win', 'defence_sec']].ffill()

# Fast break: a shot with x > 90 by the team that last won the ball, at most 25 seconds later.
# NOTE(review): the section heading says 7-25 seconds, but only the 25 second upper
# bound is applied here - confirm whether a 7 second lower bound is intended.
seconds_since_win = df_statsbomb_event.eventSec - df_statsbomb_event.defence_sec
mask_fast = (df_statsbomb_event.type_name.eq('Shot')
             & (df_statsbomb_event.x > 90)
             & (seconds_since_win <= 25)
             & (df_statsbomb_event.team_id == df_statsbomb_event.defence_win))
df_statsbomb_event['fast_break'] = mask_fast

Add on previous info

In [6]:
# Keep only on-the-ball events so that shifting by one row gives the previous
# on-ball action (needed to derive the assist type later).
on_ball = ['Ball Recovery', 'Block', 'Clearance', 'Dispossessed', 'Dribble', 'Referee Ball-Drop', 'Shield',
           'Error', 'Foul Won', 'Goal Keeper', 'Interception', 'Miscontrol', 'Own Goal Against', 'Pass', 'Shot']
# 50/50 events with one of the listed outcomes are kept as well
mask_50_50_success = (df_statsbomb_event.type_name.eq('50/50')
                      & df_statsbomb_event.outcome_name.isin(['Won', 'Success To Team', 'Success To Opposition']))
# goalkeeper events treated as off the ball
mask_goalkeeper_off_ball = df_statsbomb_event.goalkeeper_type_name.isin(
    ['Shot Faced', 'Goal Conceded', 'Penalty Conceded', 'Smother'])
# combine: listed on-ball types that are not off-ball goalkeeper events, plus the 50/50s
mask_on_ball = (df_statsbomb_event.type_name.isin(on_ball) & ~mask_goalkeeper_off_ball) | mask_50_50_success
df_statsbomb_event = df_statsbomb_event.loc[mask_on_ball].copy()

# For each of the three preceding events in the same match period, store its
# id, type, player and team alongside the current event.
match_group = df_statsbomb_event.groupby(['match_id', 'period'])
for i in range(1, 4):
    for prev_col in ('id', 'type_name', 'player_id', 'team_id'):
        df_statsbomb_event[f'prev_{prev_col}_{i}'] = match_group[prev_col].shift(i)

Add set piece column

In [7]:
# Label set-piece events: passes that are throw-ins, corners or free kicks, and
# shots taken as free kicks. The label is copied from the matching type column;
# free-kick shots are written last.
mask_set_piece1 = df_statsbomb_event.pass_type_name.isin(['Throw-in', 'Corner', 'Free Kick'])
mask_set_piece2 = df_statsbomb_event.shot_type_name.eq('Free Kick')
for mask_set_piece, type_col in ((mask_set_piece1, 'pass_type_name'),
                                 (mask_set_piece2, 'shot_type_name')):
    df_statsbomb_event.loc[mask_set_piece, 'set_piece'] = df_statsbomb_event.loc[mask_set_piece, type_col]

Flag events within 10 seconds of a corner or free kick, or within 20 seconds of a throw-in

In [8]:
# For every event, work out how long ago (within the same match period) the last
# corner / throw-in / free kick happened and which team took it.
group_match = df_statsbomb_event.groupby(['match_id', 'period'])
for set_piece in ['Corner', 'Throw-in', 'Free Kick']:
    mask = df_statsbomb_event.set_piece == set_piece
    # e.g. 'Throw-in' -> 'throw_in', 'Free Kick' -> 'free_kick'
    name = set_piece.replace(' ', '_').replace('-', '_').lower()
    df_statsbomb_event.loc[mask, f'{name}_sec'] = df_statsbomb_event.loc[mask, 'eventSec']
    df_statsbomb_event.loc[mask, f'{name}_team'] = df_statsbomb_event.loc[mask, 'team_id']
    # carry the time/team of the last set piece forward to the following events
    df_statsbomb_event[f'{name}_sec'] = group_match[f'{name}_sec'].ffill()
    df_statsbomb_event[f'{name}_team'] = group_match[f'{name}_team'].ffill()
    # turn the set-piece time into seconds elapsed since that set piece
    df_statsbomb_event[f'{name}_sec'] = df_statsbomb_event.eventSec - df_statsbomb_event[f'{name}_sec']

# blank out elapsed times outside the window: 20 seconds for throw-ins, 10 for free kicks and corners
for sec_col, max_seconds in (('throw_in_sec', 20), ('free_kick_sec', 10), ('corner_sec', 10)):
    df_statsbomb_event.loc[df_statsbomb_event[sec_col] > max_seconds, sec_col] = np.nan

# The play type is the most recent set piece still inside its window
# (column name with the trailing '_sec' removed). idxmin is only evaluated on
# rows that have at least one value, because calling it on all-NaN rows is
# deprecated in recent pandas; rows without a recent set piece stay NaN.
sec_cols = ['throw_in_sec', 'free_kick_sec', 'corner_sec']
mask_recent_set_piece = df_statsbomb_event[sec_cols].notnull().any(axis=1)
play_type = pd.Series(np.nan, index=df_statsbomb_event.index, dtype=object)
if mask_recent_set_piece.any():
    play_type.loc[mask_recent_set_piece] = (df_statsbomb_event.loc[mask_recent_set_piece, sec_cols]
                                            .idxmin(axis=1).str[:-4].to_numpy())
df_statsbomb_event['play_type'] = play_type

# if throw-in, separate defensive from offensive (event team differs from the throwing team)
mask_defensive = ((df_statsbomb_event.play_type == 'throw_in') &
                  (df_statsbomb_event['throw_in_team'] != df_statsbomb_event.team_id))
df_statsbomb_event.loc[mask_defensive, 'play_type'] = 'defensive_' + df_statsbomb_event.loc[mask_defensive, 'play_type']

Add player name

In [9]:
# Split the player name on spaces into first / middle / last parts (double quotes
# are replaced by single quotes first). Rows without a name get None in each part.
player_name_series = df_statsbomb_event.player_name.str.strip().str.replace(pat='"', repl="'").str.split(' ')
df_statsbomb_event['firstName'] = player_name_series.apply(lambda x: x[0] if isinstance(x, list) else None)
df_statsbomb_event['middleName'] = player_name_series.apply(lambda x: ' '.join(x[1:-1]) if isinstance(x, list) else None)
df_statsbomb_event['middleName'] = df_statsbomb_event['middleName'].str.strip()
df_statsbomb_event['lastName'] = player_name_series.apply(lambda x: x[-1] if isinstance(x, list) else None)
# Re-assemble the display name; the inner strip removes the trailing space when there is no middle name.
full_name = ((df_statsbomb_event['firstName'] + ' ' + df_statsbomb_event['middleName']).str.strip()
             + ' ' + df_statsbomb_event['lastName'])
# A single-word name is both firstName and lastName, so concatenating the parts
# would repeat it ("X X"); use the single word on its own in that case.
mask_single_name = player_name_series.str.len() == 1
df_statsbomb_event['Name'] = full_name.where(~mask_single_name, df_statsbomb_event['firstName'])

### Filter Shots

In [10]:
# Keep shot events, excluding penalties and shots whose type is 'Corner',
# and drop the columns that are empty for every remaining row.
mask_shot = df_statsbomb_event.type_name == 'Shot'
mask_excluded_shot_type = df_statsbomb_event.shot_type_name.isin(['Penalty', 'Corner'])
df_statsbomb_shots = df_statsbomb_event.loc[mask_shot & ~mask_excluded_shot_type]
df_statsbomb_shots = df_statsbomb_shots.dropna(how='all', axis=1).copy()
print('Number of shots:', len(df_statsbomb_shots))
print('Number of goals:', df_statsbomb_shots.outcome_name.eq('Goal').sum())

Number of shots: 21532
Number of goals: 2470


Add on competition gender

In [11]:
# Attach the competition gender of each shot's match. validate='m:1' makes the
# merge fail loudly if a match_id were duplicated in the match table, instead of
# silently duplicating shots (same check as the shot-fidelity merge).
df_statsbomb_shots = df_statsbomb_shots.merge(df_statsbomb_match[['match_id', 'competition_gender']],
                                              on='match_id', how='left', validate='m:1')
# fill in when the match dataset is missing (match ids listed by hand, without repeats)
women = [7298, 22536, 265905, 266234, 266466, 266574, 266933, 267161, 267405, 267609, 267679]
men = [18235, 18236, 18237, 18240, 18241, 18242, 18243, 18244, 18245]
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(women), 'competition_gender'] = 'female'
df_statsbomb_shots.loc[df_statsbomb_shots.match_id.isin(men), 'competition_gender'] = 'male'

StatsBomb: Add on shot fidelity

In [12]:
# Matches that are labelled with shot fidelity version '2' by hand after the merge.
additional_higher_fidelity_shots = [
    22536, 265905, 266234, 266466, 266574,
    266933, 267161, 267405, 267609, 267679,
]
# Bring in the fidelity version from the match table (at most one match row per shot).
df_match_fidelity = df_statsbomb_match[['match_id', 'metadata_shot_fidelity_version']]
df_statsbomb_shots = df_statsbomb_shots.merge(df_match_fidelity, on='match_id', how='left', validate='m:1')
mask_higher_fidelity = df_statsbomb_shots.match_id.isin(additional_higher_fidelity_shots)
df_statsbomb_shots.loc[mask_higher_fidelity, 'metadata_shot_fidelity_version'] = '2'

Add on pass assist information

In [13]:
# Attach the details of the assisting pass to each shot. Passes are linked to
# shots through pass_assisted_shot_id, which is renamed to 'id' for the merge.
pass_cols = ['pass_assisted_shot_id', 'end_x', 'end_y', 'pass_length', 'pass_angle', 'pass_height_name',
             'pass_type_name', 'pass_switch', 'pass_through_ball',
             'pass_technique_name', 'pass_cross', 'pass_cut_back']
# rename() returns a new dataframe, so the assignments below do not write to a
# slice of df_statsbomb_event
df_pass = (df_statsbomb_event.loc[df_statsbomb_event.pass_assisted_shot_id.notnull(), pass_cols]
           .rename({'pass_assisted_shot_id': 'id', 'end_x': 'pass_end_x', 'end_y': 'pass_end_y'}, axis=1))
# Missing flags mean False. The result is assigned back rather than using
# fillna(inplace=True) on a column selection, which may act on a temporary copy.
pass_flag_cols = ['pass_switch', 'pass_cross', 'pass_cut_back']
df_pass[pass_flag_cols] = df_pass[pass_flag_cols].fillna(False)
df_statsbomb_shots = df_statsbomb_shots.merge(df_pass, on='id', how='left')
# one through ball not included in the pass_technique name
df_statsbomb_shots.loc[df_statsbomb_shots.pass_through_ball == True, 'pass_technique_name'] = 'Through Ball'
df_statsbomb_shots = df_statsbomb_shots.drop('pass_through_ball', axis=1)

Simplify the pass height name column

In [14]:
# Merge 'Ground Pass' and 'Low Pass' into a single category. The result is
# assigned back to the column: replace(..., inplace=True) on a column selection
# may modify a temporary copy and leave the dataframe unchanged.
df_statsbomb_shots['pass_height_name'] = df_statsbomb_shots.pass_height_name.replace(
    {'Ground Pass': 'Ground/ Low Pass', 'Low Pass': 'Ground/ Low Pass'})

Add carry length

In [15]:
# Straight-line distance between the end of the assisting pass and the shot location.
# Each axis is rescaled first (x: /120*115, y: /80*74) before taking the Euclidean norm.
carry_dx = (df_statsbomb_shots.x - df_statsbomb_shots.pass_end_x) / 120 * 115
carry_dy = (df_statsbomb_shots.y - df_statsbomb_shots.pass_end_y) / 80 * 74
df_statsbomb_shots['carry_length'] = (carry_dx ** 2 + carry_dy ** 2) ** 0.5

Simplify the body part name, as Wyscout only has either foot or other

In [16]:
# relabel headers as 'Other'; every other body part value is left untouched
df_statsbomb_shots['body_part_name'] = df_statsbomb_shots['body_part_name'].replace('Head', 'Other')

Merge on strongest foot

In [17]:
# add the columns of df_foot to each shot, matched on the shooter's player_id
df_statsbomb_shots = pd.merge(df_statsbomb_shots, df_foot, on='player_id', how='left')

Flag for whether shot is on the strongest foot

In [18]:
# A shot is on the strong foot when the foot used matches the player's preferred
# foot; players marked 'both' count for either foot.
mask_left_shot = df_statsbomb_shots.body_part_name.eq('Left Foot')
mask_right_shot = df_statsbomb_shots.body_part_name.eq('Right Foot')
mask_prefers_left = df_statsbomb_shots.foot.isin(['left', 'both'])
mask_prefers_right = df_statsbomb_shots.foot.isin(['right', 'both'])
df_statsbomb_shots['strong_foot'] = (mask_prefers_left & mask_left_shot) | (mask_prefers_right & mask_right_shot)

Side of field

In [19]:
# Bucket the shot's y coordinate: below 36 is 'left', 36 to 44 (inclusive) is
# 'center', above 44 is 'right'.
for side_label, mask_side in (('left', df_statsbomb_shots.y.lt(36)),
                              ('center', df_statsbomb_shots.y.between(36, 44)),
                              ('right', df_statsbomb_shots.y.gt(44))):
    df_statsbomb_shots.loc[mask_side, 'side'] = side_label

Convert coordinates to standard pitch size (105m * 68m)

In [20]:
# Rescale the shot and pass-end coordinates from the StatsBomb pitch dimensions
# to the statsperform pitch dimensions. The y axis is flipped in the process
# (the value is measured from the StatsBomb pitch's bottom edge).
x_cols = ['x', 'pass_end_x']
y_cols = ['y', 'pass_end_y']
statsbomb_right = float(pitch_statsbomb.right)
statsbomb_bottom = float(pitch_statsbomb.bottom)
df_statsbomb_shots[x_cols] = df_statsbomb_shots[x_cols].div(statsbomb_right).mul(pitch_statsperform.right)
df_statsbomb_shots[y_cols] = ((statsbomb_bottom - df_statsbomb_shots[y_cols])
                              .div(statsbomb_bottom)
                              .mul(pitch_statsperform.top))

Angles/ distance to goals

In [21]:
# Geometry of each shot relative to the right-hand goal of the statsperform pitch.
left_post, right_post = pitch_statsperform.goal_right
goal_width = abs(right_post - left_post)[1]
# absolute offsets from the shot to the centre of the goal line
dx = (pitch_statsperform.right - df_statsbomb_shots.x).abs()
dy = (pitch_statsperform.center_width - df_statsbomb_shots.y).abs()
distance_squared = dx ** 2 + dy ** 2
# angle subtended by the two posts as seen from the shot location
df_statsbomb_shots['visible_angle'] = np.arctan2(goal_width * dx, distance_squared - (goal_width / 2.) ** 2)
# angle between the shot-to-goal-centre line and the pitch's long axis
df_statsbomb_shots['middle_angle'] = np.arctan2(dy, dx)
# straight-line distance to the goal centre, rounded to one decimal
df_statsbomb_shots['distance_to_goal'] = (distance_squared ** 0.5).round(1)

Interaction between angle and distance

In [22]:
# products of the distance to goal with each of the two angle features
for interaction_col, angle_col in (('distance_mid_angle', 'middle_angle'),
                                   ('distance_visible_angle', 'visible_angle')):
    df_statsbomb_shots[interaction_col] = df_statsbomb_shots['distance_to_goal'] * df_statsbomb_shots[angle_col]

Log distance

In [23]:
# natural logarithm of the (rounded) distance to goal
df_statsbomb_shots['log_distance_to_goal'] = np.log(df_statsbomb_shots['distance_to_goal'])

Counter attack

In [24]:
# True when the shot's play pattern is 'From Counter'
df_statsbomb_shots['counter_attack'] = df_statsbomb_shots['play_pattern_name'].eq('From Counter')

Assist type

In [25]:
# Classify how the shot came about. Later assignments only fill rows that are
# still unlabelled, except 'direct', which overrides 'pass'.
# shots with an assisting pass attached
df_statsbomb_shots.loc[df_statsbomb_shots.pass_end_x.notnull(), 'assist_type'] = 'pass'
# shots taken directly from a dead ball
df_statsbomb_shots.loc[df_statsbomb_shots.shot_type_name.isin(['Free Kick', 'Corner', 'Kick Off']),
                       'assist_type'] = 'direct'
# otherwise look at the three preceding on-ball events: a shot or goalkeeper
# event means a rebound, a clearance means a clearance (rebound takes priority)
prev_type_cols = ['prev_type_name_3', 'prev_type_name_2', 'prev_type_name_1']
for assist_label, prev_types in (('rebound', ['Shot', 'Goal Keeper']),
                                 ('clearance', ['Clearance'])):
    mask_prev_type = df_statsbomb_shots[prev_type_cols].isin(prev_types).any(axis=1)
    df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull() & mask_prev_type, 'assist_type'] = assist_label
# everything still unlabelled
df_statsbomb_shots.loc[df_statsbomb_shots.assist_type.isnull(), 'assist_type'] = 'recovery'

Shot type name

In [26]:
# Recode the shot type: direct free kicks and kick offs become 'direct_set_piece',
# open play becomes missing so it can be filled from play_type below.
# The result is assigned back to the column: replace(..., inplace=True) on a
# column selection may modify a temporary copy and leave the dataframe unchanged.
df_statsbomb_shots['shot_type_name'] = df_statsbomb_shots.shot_type_name.replace(
    {'Open Play': np.nan,
     'Free Kick': 'direct_set_piece',
     'Kick Off': 'direct_set_piece'})
# open-play shots shortly after a set piece take that set piece's play type ...
mask_null = df_statsbomb_shots.shot_type_name.isnull()
df_statsbomb_shots.loc[mask_null, 'shot_type_name'] = df_statsbomb_shots.loc[mask_null, 'play_type']
# ... and whatever is still missing is plain open play
df_statsbomb_shots.loc[df_statsbomb_shots.shot_type_name.isnull(), 'shot_type_name'] = 'open_play'

Add goal boolean

In [27]:
# True when the shot outcome is 'Goal'
df_statsbomb_shots['goal'] = df_statsbomb_shots['outcome_name'].eq('Goal')

Turn some of the StatsBomb columns to boolean columns

In [28]:
# Missing values in these flag columns mean False; convert them to real booleans.
# - the result is assigned back, because fillna(inplace=True) on a column
#   selection may modify a temporary copy and leave the dataframe unchanged
# - the builtin bool is used, because the np.bool alias was removed from NumPy
for flag_col in ['shot_open_goal', 'under_pressure', 'shot_one_on_one']:
    df_statsbomb_shots[flag_col] = df_statsbomb_shots[flag_col].fillna(False).astype(bool)

Reduce columns

In [29]:
# Columns kept in the final shots dataset, in output order.
cols_to_keep = [
    # identifiers and outcome
    'match_id', 'id', 'goal', 'team_id', 'team_name', 'player_id',
    'firstName', 'middleName', 'lastName', 'Name',
    # shot context
    'shot_type_name', 'x', 'y', 'counter_attack', 'fast_break',
    'strong_foot', 'body_part_name', 'assist_type',
    # assisting pass
    'pass_end_y', 'pass_end_x', 'pass_switch', 'pass_cross', 'pass_cut_back',
    'pass_height_name', 'pass_technique_name', 'pass_length', 'pass_angle',
    # geometry
    'carry_length', 'side', 'visible_angle', 'middle_angle', 'distance_to_goal',
    'distance_mid_angle', 'distance_visible_angle', 'log_distance_to_goal',
    # match / StatsBomb flags
    'competition_gender', 'shot_one_on_one', 'shot_open_goal', 'under_pressure',
]
df_statsbomb_shots = df_statsbomb_shots.loc[:, cols_to_keep].copy()

Save dataset

In [30]:
# renumber the rows from 0 and write the shots next to the source parquet files
df_statsbomb_shots = df_statsbomb_shots.reset_index(drop=True)
shots_path = os.path.join(STATSBOMB, 'shots.parquet')
df_statsbomb_shots.to_parquet(shots_path)

Show information

In [31]:
# print the column names, non-null counts and dtypes of the saved shots dataframe
df_statsbomb_shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21532 entries, 0 to 21531
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_id                21532 non-null  int64  
 1   id                      21532 non-null  object 
 2   goal                    21532 non-null  bool   
 3   team_id                 21532 non-null  int64  
 4   team_name               21532 non-null  object 
 5   player_id               21532 non-null  float64
 6   firstName               21532 non-null  object 
 7   middleName              21532 non-null  object 
 8   lastName                21532 non-null  object 
 9   Name                    21532 non-null  object 
 10  shot_type_name          21532 non-null  object 
 11  x                       21532 non-null  float64
 12  y                       21532 non-null  float64
 13  counter_attack          21532 non-null  bool   
 14  fast_break              21532 non-null