In [None]:
import pandas as pd
from mplsoccer import Pitch

In [None]:
# Pitch helper configured for Opta coordinates; later cells use it to flip
# sides and to standardize coordinates.
p = Pitch(pitch_type='opta')
# Load the event data that every following cell transforms.
df = pd.read_parquet('opta_events_112_2022.parquet')

Add the x/y coordinates relative to the camera view instead of the default left-to-right orientation

In [None]:
# Direction-of-play lookup: the rows that carry a direction_of_play value,
# keyed by match/team/period, plus a flag for 'Right to Left'.
direction_keys = ['match_id', 'team_id', 'period_id']
has_direction = df['direction_of_play'].notnull()
df_direction = df.loc[has_direction, direction_keys + ['direction_of_play']].copy()
df_direction['camera_right_to_left'] = df_direction['direction_of_play'].eq('Right to Left')

# Replace the partially filled column with the lookup joined onto every event.
# validate='m:1' raises if a match/team/period key appears twice in the lookup.
df = (df.drop(columns='direction_of_play')
        .merge(df_direction, on=direction_keys, how='left', validate='m:1'))

# fill missing
# Rows with a goal-mouth y coordinate get the pitch's right edge as the x coordinate.
has_goal_mouth_y = df['shot_goal_mouth_y_coordinate'].notnull()
df.loc[has_goal_mouth_y, 'shot_goal_mouth_x_coordinate'] = p.dim.right
# Foul throw-ins: the end location is set to the start location.
# NOTE(review): end_x/end_y are not read by the end-coordinate logic at the bottom
# of this cell (it uses pass_end_x/pass_end_y) - confirm which columns were intended.
is_foul_throw_in = df['type_name'] == 'foul_throw_in'
df.loc[is_foul_throw_in, 'end_x'] = df.loc[is_foul_throw_in, 'x']
df.loc[is_foul_throw_in, 'end_y'] = df.loc[is_foul_throw_in, 'y']

# flip coordinates for camera
# Each entry is ((source x, source y), (camera x, camera y)).
camera_pairs = [
    (('x', 'y'),
     ('camera_x', 'camera_y')),
    (('pass_end_x', 'pass_end_y'),
     ('camera_pass_end_x', 'camera_pass_end_y')),
    (('shot_blocked_x_coordinate', 'shot_blocked_y_coordinate'),
     ('camera_shot_blocked_x_coordinate', 'camera_shot_blocked_y_coordinate')),
    (('shot_goal_mouth_x_coordinate', 'shot_goal_mouth_y_coordinate'),
     ('camera_shot_goal_mouth_x_coordinate', 'camera_shot_goal_mouth_y_coordinate')),
]
for (source_x, source_y), (camera_x_col, camera_y_col) in camera_pairs:
    df[camera_x_col], df[camera_y_col] = p.flip_side(df[source_x], df[source_y], df['camera_right_to_left'])

# end coordinates
# Priority order: pass end, then blocked-shot location, then goal-mouth location.
end_sources = [
    ('camera_end_x', 'camera_pass_end_x',
     'camera_shot_blocked_x_coordinate', 'camera_shot_goal_mouth_x_coordinate'),
    ('camera_end_y', 'camera_pass_end_y',
     'camera_shot_blocked_y_coordinate', 'camera_shot_goal_mouth_y_coordinate'),
]
for end_col, pass_col, blocked_col, goal_mouth_col in end_sources:
    df[end_col] = df[pass_col].fillna(df[blocked_col]).fillna(df[goal_mouth_col])

Add boolean columns for passes, shots, goals, and set pieces

In [None]:
# Event-type groupings used for the boolean flags.
shot_types = ['goal', 'attempt_saved', 'miss', 'post']
pass_types = ['pass', 'offside_pass', 'foul_throw_in']
df['shot'] = df['type_name'].isin(shot_types)
df['goal'] = df['type_name'].eq('goal')
df['pass'] = df['type_name'].isin(pass_types)

# A set piece is any event carrying one of these qualifier columns, or a foul throw-in.
# The qualifiers are OR-ed together left to right, starting from free_kick_taken.
other_set_piece_flags = [
    'free_kick',  # includes tap pass can exclude with df['assisted'].isnull()
    'corner_taken',
    'shot_corner_direct',
    'goalkeeper_goal_kick',
    'kick_off',
    'throw_in',
]
set_piece_taken = df['free_kick_taken']
for flag_col in other_set_piece_flags:
    set_piece_taken = set_piece_taken | df[flag_col]
df['set_piece_taken'] = set_piece_taken | (df['type_name'] == 'foul_throw_in')

Remove events that do not belong to the offensive team or are not related to play (e.g. formation changes)

In [None]:
# Event types excluded from the actions table.
remove_events = [
    'attempted_tackle', 'card', 'challenge', 'chance_missed', 'coach_setup',
    'collection_end', 'contentious_referee_decision', 'coverage_interruption',
    'cross_not_claimed', 'delayed_start', 'deleted_after_review', 'end',
    'end_delay', 'formation_change', 'good_skill', 'injury_time_announcement',
    'obstacle', 'offside_provoked', 'penalty_faced', 'player_becomes_goalkeeper',
    'player_off', 'player_on', 'player_retired', 'referee_drop_ball',
    'shield_ball_opp', 'start', 'start_delay', 'take_on', 'team_set_up',
]
is_kept_type = ~df['type_name'].isin(remove_events)
# remove defensive duels
is_not_defensive_duel = df['duel_events_defensive'].isnull()
# Both filters are applied in one pass; the surviving rows keep their index labels.
df = df[is_kept_type & is_not_defensive_duel].copy()

In [None]:
# For these event types only the rows with outcome == 1 are kept.
multi_outcome_events = [
    'aerial', '50_50', 'foul', 'corner_awarded',
    'foul_throw_in', 'out', 'referee_drop_ball',
]
mask_multi = df['type_name'].isin(multi_outcome_events)
mask_success = df['outcome'].eq(1)
# Keep a row when it is not one of the listed types, or when its outcome is 1.
keep_rows = ~mask_multi | mask_success
df = df[keep_rows]
# Drop columns that are now entirely empty and renumber the rows.
df = df.dropna(how='all', axis='columns')
df = df.reset_index(drop=True).copy()

Attach 'out' events to the preceding event, then correct events where the pass or ball recovery went out of play or was offside but is recorded as successful

In [None]:
period_keys = ['match_id', 'period_id']
out_types = ['out', 'corner_awarded']

# Temporary flag: this row is itself an 'out' / 'corner_awarded' event.
df['out'] = df['type_name'].isin(out_types)
# Both shifts read the temporary flag, so compute them before 'out' is overwritten.
# Shifting happens within each match/period; `== True` turns the missing values
# at the period edges into False.
previous_is_out = df.groupby(period_keys)['out'].shift(1) == True
next_is_out = df.groupby(period_keys)['out'].shift(-1) == True
df['previous_out'] = previous_is_out
# From here on 'out' means: the next event in the same period is an out event.
df['out'] = next_is_out

# Start location of the next event in the same period.
for source_col, next_col in [('camera_x', 'next_camera_x'), ('camera_y', 'next_camera_y')]:
    df[next_col] = df.groupby(period_keys)[source_col].shift(-1)

# The out events themselves are no longer needed as rows.
is_out_event = df['type_name'].isin(out_types)
df = df[~is_out_event].dropna(how='all', axis='columns').reset_index(drop=True).copy()

In [None]:
# Passes recorded as successful (outcome == 1) whose 'out' flag is set are
# marked unsuccessful.
mask_change_outcome = (df['out']) & (df['type_name'] == 'pass') & (df['outcome'] == 1)
print('Number of outcomes changed:', mask_change_outcome.sum())
df.loc[mask_change_outcome, 'outcome'] = 0
# change outcome of offside pass to zero
mask_change_outcome2 = df['type_name'].isin(['offside_pass', 'foul_throw_in'])
print('Number of outcomes changed:', mask_change_outcome2.sum())
# Fix: apply the second mask here. The original reused mask_change_outcome, so
# offside passes and foul throw-ins were counted above but never updated.
df.loc[mask_change_outcome2, 'outcome'] = 0
# Ball recoveries whose 'out' flag is set are relabelled as ball touches.
mask_change_type = (df['type_name'] == 'ball_recovery') & (df['out'] == True)
print('Number of type_name changed', mask_change_type.sum())
df.loc[mask_change_type, 'type_name'] = 'ball_touch'

Fix the x/y end coordinates of events where the ball went out of play but the event does not record the final coordinate

In [None]:
# Case 1: the 'out' flag is set but the event has no end coordinate at all.
mask_missing = df['out'] & (df['camera_end_x'].isnull())

# Case 2: the 'out' flag is set, the recorded end point is strictly inside the
# 0-100 coordinate range, and the next event starts on or beyond one of the four
# boundaries.
end_inside_pitch = ((df['camera_end_x'] > 0) & (df['camera_end_x'] < 100)
                    & (df['camera_end_y'] > 0) & (df['camera_end_y'] < 100))
# Fix: the second term originally read `next_camera_x >= 0`, which together with
# `next_camera_x <= 0` is true for every non-null value and made this check
# meaningless. The missing boundary is `next_camera_y <= 0`.
next_on_boundary = ((df['next_camera_x'] <= 0) | (df['next_camera_y'] <= 0)
                    | (df['next_camera_x'] >= 100) | (df['next_camera_y'] >= 100))
mask_change = df['out'] & end_inside_pitch & next_on_boundary

# In both cases take the next event's start location as this event's end location.
mask_replace = mask_missing | mask_change
df.loc[mask_replace, 'camera_end_x'] = df.loc[mask_replace, 'next_camera_x']
df.loc[mask_replace, 'camera_end_y'] = df.loc[mask_replace, 'next_camera_y']

Work out if an event was a carry

In [None]:
period_keys = ['match_id', 'period_id']

# New column -> source column. Each new column holds the source value from the
# previous row within the same match and period (missing on a period's first row).
previous_event_cols = {
    'previous_team_name': 'team_name',
    'previous_type_name': 'type_name',
    'previous_player_id': 'player_id',
    'previous_camera_end_x': 'camera_end_x',
    'previous_camera_end_y': 'camera_end_y',
    'previous_camera_x': 'camera_x',
    'previous_camera_y': 'camera_y',
    'previous_outcome': 'outcome',
}
for new_col, source_col in previous_event_cols.items():
    df[new_col] = df.groupby(period_keys)[source_col].shift(1)

# A missing previous team (first event of a period) counts as the same team.
df['same_team'] = df['previous_team_name'].eq(df['team_name']) | df['previous_team_name'].isnull()
df['same_player'] = df['previous_player_id'].eq(df['player_id'])

more_previous_cols = {
    'previous_defensive_touch_type_control': 'defensive_touch_type_control',
    'previous_timestamp_utc': 'timestamp_utc',
}
for new_col, source_col in more_previous_cols.items():
    df[new_col] = df.groupby(period_keys)[source_col].shift(1)

In [None]:
previous_type = df['previous_type_name']

# Rule 1: same team as the previous event, and the previous event is one of these types.
carry_previous_types = ['pass', 'ball_recovery', 'keeper_pick_up', 'drop_of_ball', 'miss',
                        'attempt_saved', 'smother', 'punch', 'post']
mask1 = df['same_team'] & previous_type.isin(carry_previous_types)
# Rule 2: the same player passes straight after a 50/50 or tackle, and this event
# has no offensive-duel value.
mask2 = (previous_type.isin(['50_50', 'tackle'])
         & (df['same_player'] == 1)
         & (df['type_name'] == 'pass')
         & df['duel_events_offensive'].isnull())
# Rule 3: the same player acts after a claim whose outcome was 1.
mask3 = (previous_type == 'claim') & df['same_player'] & (df['previous_outcome'] == 1)
# Time limit: under one minute since the previous event.
elapsed = df['timestamp_utc'] - df['previous_timestamp_utc']
mask4 = elapsed < pd.Timedelta(1, 'minute')

mask_carry = (mask1 | mask2 | mask3) & mask4  # rules out around 200 dribbles for being 1+ minutes
# The current event must have none of these qualifiers set.
# NOTE(review): set_piece_taken is built with `|` in an earlier cell; isnull() only
# matches if that column holds missing values rather than False - confirm its dtype.
must_be_null = ['set_piece_taken', 'shot_first_touch', 'shot_volley',
                'body_part_head', 'body_part_other']
for qualifier_col in must_be_null:
    mask_carry = mask_carry & df[qualifier_col].isnull()
df['carry_between'] = mask_carry

# Shift each index label up by the number of carries so far. This leaves an unused
# label directly before every flagged event.
df.index = df.index + df['carry_between'].cumsum()

Create carry events and add them to the other actions

In [None]:
# Build one carry row for every event flagged in 'carry_between'. The flagged
# event's start location becomes the carry's end location.
carry_source_cols = ['match_id', 'period_id', 'team_id', 'team_name', 'player_id', 'type_name',
                     'previous_camera_x', 'previous_camera_y',
                     'previous_camera_end_x', 'previous_camera_end_y',
                     'camera_x', 'camera_y']
df_carry = (df.loc[df['carry_between'], carry_source_cols]
              .rename(columns={'camera_x': 'camera_end_x', 'camera_y': 'camera_end_y'})
              .copy())
df_carry['type_name'] = 'carry'

# The carry starts where the previous event ended, falling back to where the
# previous event started when it has no end location.
df_carry['camera_x'] = df_carry['previous_camera_end_x'].fillna(df_carry['previous_camera_x'])
df_carry['camera_y'] = df_carry['previous_camera_end_y'].fillna(df_carry['previous_camera_y'])
helper_cols = ['previous_camera_x', 'previous_camera_y', 'previous_camera_end_x', 'previous_camera_end_y']
df_carry = df_carry.drop(columns=helper_cols).copy()

# Angle and length are computed on standardized coordinates.
xstart, ystart = p.standardizer.transform(df_carry['camera_x'], df_carry['camera_y'])
xend, yend = p.standardizer.transform(df_carry['camera_end_x'], df_carry['camera_end_y'])
df_carry['angle'], df_carry['length'] = p.calculate_angle_and_distance(xstart, ystart, xend, yend, standardized=True)

# Move each carry to the index label directly before the event it precedes.
df_carry.index = df_carry.index - 1
is_long_enough = df_carry['length'] >= 2
df_carry = df_carry[is_long_enough].copy()  # rules out 51.1k for being less than 2 meters
df_carry = df_carry.assign(**{'pass': False, 'shot': False, 'goal': False, 'outcome': 1})

# add to the other actions
# Sorting by index interleaves the carries with the events.
df_actions = pd.concat([df, df_carry]).sort_index().reset_index(drop=True)
df_actions['carry'] = df_actions['type_name'].eq('carry')

Clean up, add the non-camera coordinates, and save the final actions file

In [None]:
# Intermediate columns that are only needed while building the actions.
helper_columns = [
    # direction flag and coordinate intermediates
    'camera_right_to_left', 'shot_goal_mouth_x_coordinate',
    'camera_pass_end_x', 'camera_pass_end_y', 'end_x', 'end_y',
    'camera_shot_blocked_x_coordinate', 'camera_shot_blocked_y_coordinate',
    'camera_shot_goal_mouth_x_coordinate', 'camera_shot_goal_mouth_y_coordinate',
    # out-of-play helpers
    'out', 'previous_out', 'next_camera_x', 'next_camera_y',
    # previous-event helpers used for carry detection
    'previous_team_name', 'previous_type_name', 'previous_player_id',
    'previous_camera_end_x', 'previous_camera_end_y',
    'previous_camera_x', 'previous_camera_y', 'previous_outcome',
    'same_team', 'same_player',
    'previous_defensive_touch_type_control', 'previous_timestamp_utc',
    'carry_between',
]
df_actions = df_actions.drop(columns=helper_columns)

In [None]:
# Re-attach the direction lookup so the inserted carry rows also get a direction flag.
# NOTE(review): df_actions still appears to hold 'direction_of_play' from the first
# merge, so this merge would suffix the overlapping column as _x/_y - confirm
# whether that is wanted in the saved file.
df_actions = df_actions.merge(df_direction, on=['match_id', 'team_id', 'period_id'], how='left', validate='m:1')
right_to_left = df_actions['camera_right_to_left']

# Flip the camera coordinates back and use them only where x / y are missing.
new_x, new_y = p.flip_side(df_actions['camera_x'], df_actions['camera_y'], right_to_left)
missing_x = df_actions['x'].isnull()
df_actions.loc[missing_x, 'x'] = new_x[missing_x].round(1)
missing_y = df_actions['y'].isnull()
df_actions.loc[missing_y, 'y'] = new_y[missing_y].round(1)

# End coordinates are recomputed for every row from the camera end coordinates.
df_actions['end_x'], df_actions['end_y'] = p.flip_side(df_actions['camera_end_x'], df_actions['camera_end_y'], right_to_left)
for end_col in ('end_x', 'end_y'):
    df_actions[end_col] = df_actions[end_col].round(1)

In [None]:
# Write the final actions table (events plus the derived carry rows) to parquet.
df_actions.to_parquet('opta_actions_112_2022.parquet')