In [None]:
import pandas as pd
import numpy as np
import os
from mplsoccer import Pitch, Standardizer
from shapely.geometry import MultiPoint, Polygon, Point
import geopandas as gpd
from tqdm import tqdm

Load the data

In [None]:
# Folder containing the StatsBomb parquet files, relative to this notebook
STATSBOMB = os.path.join('..', '..', 'data', 'statsbomb')

# Event data and the freeze frames
event_path, freeze_path = (os.path.join(STATSBOMB, name) for name in ('event.parquet', 'freeze.parquet'))
df_statsbomb_event = pd.read_parquet(event_path)
df_statsbomb_freeze = pd.read_parquet(freeze_path)

Filter shots

In [None]:
# Keep only the shot events; copy so the shot frame is independent of the event frame
shot_mask = df_statsbomb_event['type_name'].eq('Shot')
df_statsbomb_shot = df_statsbomb_event.loc[shot_mask].copy()

Statsbomb pitch

In [None]:
# Pitch in StatsBomb coordinates ('statsbomb' is mplsoccer's default pitch_type, spelled out here)
statsbomb_pitch = Pitch(pitch_type='statsbomb')

# Features based on StatsBomb freeze frame

Features based on the freeze frame - this takes a while as it loops over many thousands of shots:
- space around the goalkeeper
- space around the shooter
- number of defenders in the shot angle to goal

Filter out penalties from the freeze frames

In [None]:
# Ids of every shot that is not a penalty (shots with a missing sub type are kept as well)
not_penalty = df_statsbomb_shot['sub_type_name'].ne('Penalty')
non_penalty_id = df_statsbomb_shot.loc[not_penalty, 'id']

# Restrict the freeze frames to those shots
keep_frame = df_statsbomb_freeze['id'].isin(non_penalty_id)
df_statsbomb_freeze = df_statsbomb_freeze.loc[keep_frame].copy()

Add the shot taker to the freeze frame, as they are generally not included, although there are a few cases where they are

In [None]:
# Columns taken from the event data for the shot taker rows
cols_to_keep = ['id', 'player_id', 'player_name', 'position_id', 'position_name', 'x', 'y', 'match_id']
freeze_ids = df_statsbomb_freeze['id'].unique()

# One row per shot that has a freeze frame, built from the event data
df_shot_taker = df_statsbomb_shot.loc[df_statsbomb_shot['id'].isin(freeze_ids), cols_to_keep].copy()
df_shot_taker['teammate'] = True
df_shot_taker['event_freeze_id'] = 0  # 0 is used later to identify the shot taker row

# Append the shot takers and get rid of duplicated players (around 7 shot takers are already in
# the freeze frame). Sorting by event_freeze_id puts the row from the event dataframe
# (event_freeze_id 0) first, so keep='first' keeps that one.
# NOTE(review): this assumes the original freeze-frame rows have event_freeze_id > 0 - confirm.
# ignore_index / reset_index leave a clean, unique RangeIndex instead of the duplicated
# labels that concatenating two independently indexed frames would produce.
df_statsbomb_freeze = (
    pd.concat([df_statsbomb_freeze, df_shot_taker], ignore_index=True)
    .sort_values(['id', 'event_freeze_id'])
    .drop_duplicates(['id', 'player_id'], keep='first')
    .reset_index(drop=True)
)

Calculate number of defenders in the goal angle

In [None]:
# NOTE: this cell works in raw StatsBomb coordinates (it uses statsbomb_pitch.goal_right),
# so it must be run before the x / y columns are standardized to the 105 * 68 pitch.
shot_takers = (df_statsbomb_freeze[df_statsbomb_freeze['event_freeze_id'] == 0]  # we added this earlier
               .reset_index(drop=True))

# One triangle per shot: the shot location plus the two goal_right coordinates
# (presumably the two posts of the right-hand goal - see the mplsoccer docs)
triangle_coords = np.zeros((len(shot_takers), 3, 2))
triangle_coords[:, 0, :] = shot_takers[['x', 'y']].values
triangle_coords[:, 1:, :] = statsbomb_pitch.goal_right
# the angle to the goal polygon, buffer added as sometimes shot is on the goal line
goal_angles = gpd.GeoSeries([Polygon(coords).buffer(0) for coords in triangle_coords])
verts = gpd.GeoDataFrame({'id': shot_takers['id'], 'goal_angle': goal_angles})

player_points = gpd.GeoSeries.from_xy(df_statsbomb_freeze['x'], df_statsbomb_freeze['y'])
player_positions = gpd.GeoDataFrame({'id': df_statsbomb_freeze['id'],
                                     'event_freeze_id': df_statsbomb_freeze['event_freeze_id'],
                                     'position': player_points})

# merge the goal angle onto the player positions
player_positions = gpd.GeoDataFrame(player_positions.merge(verts, on='id'))

# add a mask for whether the player intersects with the goal angle
player_positions['n_angle'] = player_positions['position'].intersects(player_positions['goal_angle'])

# drop any n_angle left over from a previous run of this cell; otherwise the merge would
# create n_angle_x / n_angle_y and the aggregation below would fail with a KeyError
df_statsbomb_freeze = (df_statsbomb_freeze
                       .drop(columns='n_angle', errors='ignore')
                       .merge(player_positions[['id', 'event_freeze_id', 'n_angle']],
                              on=['id', 'event_freeze_id']))

# calculate the number of defenders in the goal angle
# Masking (rather than filtering out) the teammates keeps every shot in the result, so a
# shot with no opposition players in its freeze frame gets a count of 0 instead of being
# dropped (and turning into NaN when merged onto the other features).
defender_in_angle = df_statsbomb_freeze['n_angle'] & df_statsbomb_freeze['teammate'].eq(False)
num_in_goal_angle = (defender_in_angle
                     .groupby(df_statsbomb_freeze['id'])
                     .sum()
                     .rename('n_angle')
                     .reset_index())

Standardize the coordinates to 105 * 68 pitch

In [None]:
# Convert the freeze-frame locations from StatsBomb coordinates to the 'uefa' pitch.
# The x / y columns are overwritten with the converted values, so running this cell a
# second time would convert already-converted coordinates - run it once per fresh load.
standard = Standardizer(pitch_from='statsbomb', pitch_to='uefa')
x_std, y_std = standard.transform(df_statsbomb_freeze['x'], df_statsbomb_freeze['y'])
df_statsbomb_freeze = df_statsbomb_freeze.assign(x=x_std, y=y_std)

Use Voronoi to calculate the amount of space around the shot taker and goalkeeper

In [None]:
# The freeze-frame coordinates were converted to the 105 * 68 'uefa' pitch in the
# standardization step, so the Voronoi cells have to be bounded by a pitch of that size.
# Bounding them with the 120 * 80 StatsBomb pitch lets cells run past the real pitch
# edges, which inflates the areas (most of all for the goalkeeper next to the goal line).
uefa_pitch = Pitch(pitch_type='uefa')

voronoi_cols = ['x', 'y', 'teammate', 'event_freeze_id', 'position_id', 'position_name']
shot_ids = []
area_goal = []
area_shot = []

# Visit each shot once via groupby rather than re-filtering the whole frame for every shot.
# sort=False yields the shots in order of first appearance.
frames_by_shot = df_statsbomb_freeze.groupby('id', sort=False)
for shot_id, frame in tqdm(frames_by_shot, total=frames_by_shot.ngroups):
    subset = frame[voronoi_cols].copy()
    # the lookups below index team1 with teammates and team2 with opposition players
    team1, team2 = uefa_pitch.voronoi(subset['x'], subset['y'], subset['teammate'])
    # running count of each player within their own team = index into team1 / team2
    subset['rank'] = subset.groupby('teammate')['x'].cumcount()

    # space around the opposition goalkeeper (position_id 1, assumed to be the goalkeeper);
    # 0 when no opposition goalkeeper is in the freeze frame
    gk_rank = subset.loc[(subset['teammate'] == False) & (subset['position_id'] == 1), 'rank']
    if len(gk_rank) > 0:
        area_goal.append(Polygon(team2[gk_rank.iloc[0]]).area)
    else:
        area_goal.append(0)

    # space around the shot taker (the row with event_freeze_id 0)
    shot_taker_rank = subset.loc[subset['event_freeze_id'] == 0, 'rank'].iloc[0]
    area_shot.append(Polygon(team1[shot_taker_rank]).area)
    shot_ids.append(shot_id)

df_freeze_features = pd.DataFrame({'id': shot_ids,
                                   'area_shot': area_shot,
                                   'area_goal': area_goal})

Combine voronoi features, goalkeeper position and the number of players in the goal angle

In [None]:
# Location of the opposition goalkeeper for each shot, with x / y renamed so they can
# be merged onto the other features
is_opposition_gk = ((df_statsbomb_freeze['position_name'] == 'Goalkeeper') &
                    (df_statsbomb_freeze['teammate'] == False))
gk_position = (df_statsbomb_freeze
               .loc[is_opposition_gk, ['id', 'x', 'y']]
               .rename(columns={'x': 'goalkeeper_x', 'y': 'goalkeeper_y'}))

In [None]:
# Columns this cell adds. They are dropped first (if present) so that re-running the cell
# replaces them instead of creating suffixed duplicates such as n_angle_x / n_angle_y.
merged_cols = [col for col in [*num_in_goal_angle.columns, *gk_position.columns] if col != 'id']
df_freeze_features = (df_freeze_features
                      .drop(columns=merged_cols, errors='ignore')
                      # left joins keep every shot; validate raises if an id is duplicated on either side
                      .merge(num_in_goal_angle, how='left', on='id', validate='1:1')
                      .merge(gk_position, how='left', on='id', validate='1:1')
                     )

Save features

In [None]:
# Write the features next to the other StatsBomb files, then print a summary of the frame
features_path = os.path.join(STATSBOMB, 'freeze_features.parquet')
df_freeze_features.to_parquet(features_path)
df_freeze_features.info()