In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
pd.set_option('display.max_columns', 500)

# Load dataset

In [3]:
DATA_PATH = os.path.join(os.getcwd(),'data')
EVENTS_PATH = os.path.join(DATA_PATH,'events')
df_events = pd.read_feather(EVENTS_PATH)

  labels, = index.labels


In [4]:
RELATED_PATH = os.path.join(DATA_PATH,'related_events')
df_related_events = pd.read_feather(RELATED_PATH)

In [5]:
MATCH_PATH = os.path.join(DATA_PATH,'match')
df_match = pd.read_feather(MATCH_PATH)

# Create definition of on ball event types and seperate the data

In [6]:
n_events = len(df_events)

In [7]:
# Passive/ defensive event types
off_ball = ['Ball Receipt','Carry','Duel','Pressure','Foul Committed','Dribbled Past',
            'Bad Behaviour','Player Off','Player On',
            'Offside','Own Goal For','Substitution',
            'Camera On','Half Start','Half End','Injury Stoppage','Starting XI','Tactical Shift',
            'Camera off',]
# note Referee Ball-Drop/ Shield added in temporarily so we can easily set a flag on the next event
on_ball = ['Ball Recovery','Block','Clearance','Dispossessed','Dribble','Referee Ball-Drop','Shield',
           'Error','Foul Won','Goal Keeper','Interception','Miscontrol','Own Goal Against','Pass','Shot']
print('Check only 50/50 left out:',
      set(df_events.type_name.unique()) - set(off_ball) - set(on_ball))

Check only 50/50 left out: {'50/50'}


In [8]:
# won 50/50s
mask_50_50_success = ((df_events.type_name=='50/50')&
                      (df_events['50_50_outcome_name'].isin(['Won','Success To Team','Success To Opposition'])))
# lost 50/50s
mask_50_50_lost = (df_events.type_name=='50/50')&(df_events['50_50_outcome_name']=='Lost')
# off ball
mask_goalkeeper_off_ball = df_events.goalkeeper_type_name.isin(['Shot Faced','Goal Conceded',
                                                                'Penalty Conceded','Smother'])
mask_off_ball_all = (df_events.type_name.isin(off_ball))|(mask_goalkeeper_off_ball)|(mask_50_50_success)
# on ball
mask_on_ball = ((df_events.type_name.isin(on_ball))&(~mask_goalkeeper_off_ball))|(mask_50_50_lost)

In [9]:
# seperate off_ball events and remove null columns
df_events_other = df_events[mask_off_ball_all].copy()
df_events_other.dropna(how='all',axis=1,inplace=True)
# seperate on_ball events and remove null columns
df_events = df_events[mask_on_ball].copy()
df_events.dropna(how='all',axis=1,inplace=True)

In [10]:
n_events == (len(df_events_other)+len(df_events))

True

# Function to shift columns easily

In [11]:
def shift_cols(df_events,cols_next=None,cols_previous=None):
    if cols_next is not None:
        for col in cols_next:
            df_events[col+'_next'] = df_events.groupby(['match_id','period'])[col].shift(-1)
    if cols_previous is not None:
        for col in cols_previous:
            df_events[col+'_previous'] = df_events.groupby(['match_id','period'])[col].shift(1)

# Function fo fill na easily

In [12]:
def fill_na(col1,col2,to_remove):
    df_events[col1] = df_events[col1].fillna(df_events[col2])
    to_remove.extend([col2])

# Simplify outcome columns

In [13]:
df_events['outcome_id'] = np.nan
df_events['outcome_id'] = (df_events['pass_outcome_id']
                             .fillna(df_events['dribble_outcome_id'])
                             .fillna(df_events['shot_outcome_id'])
                             .fillna(df_events['interception_outcome_id'])
                             .fillna(df_events['goalkeeper_outcome_id']))
df_events['outcome_name'] = None
df_events['outcome_name'] = (df_events['pass_outcome_name']
                             .fillna(df_events['dribble_outcome_name'])
                             .fillna(df_events['shot_outcome_name'])
                             .fillna(df_events['interception_outcome_name'])
                             .fillna(df_events['goalkeeper_outcome_name']))
cols = ['pass_outcome_id','dribble_outcome_id','shot_outcome_id',
        'interception_outcome_id','goalkeeper_outcome_id','pass_outcome_name','dribble_outcome_name',
        'shot_outcome_name','interception_outcome_name','goalkeeper_outcome_name']
df_events.drop(cols,axis=1,inplace=True)

# Simplify body part columns

In [14]:
df_events['body_part_id'] = np.nan
df_events['body_part_id'] = (df_events['body_part_id']
                             .fillna(df_events['clearance_body_part_id'])
                             .fillna(df_events['goalkeeper_body_part_id'])
                             .fillna(df_events['pass_body_part_id'])
                             .fillna(df_events['shot_body_part_id']))
df_events['body_part_name'] = None
df_events['body_part_name'] = (df_events['body_part_name']
                             .fillna(df_events['clearance_body_part_name'])
                             .fillna(df_events['goalkeeper_body_part_name'])
                             .fillna(df_events['pass_body_part_name'])
                             .fillna(df_events['shot_body_part_name']))
cols = ['clearance_right_foot','clearance_body_part_id','clearance_body_part_name','clearance_left_foot',
       'clearance_head','goalkeeper_body_part_id','goalkeeper_body_part_name','pass_body_part_id',
       'pass_body_part_name','shot_body_part_id','shot_body_part_name']
df_events.drop(cols,axis=1,inplace=True)

# Add shielded ball

First remove shielded events that do not lead to the ball going out

In [15]:
# remove shield events that do not lead to the ball going outn - i.e. unsuccesful shields?
df_events.loc[df_events.pass_type_name.isin(['Throw-in','Corner','Goal Kick']),'set_piece_out'] = True
cols_next = ['type_name','set_piece_out']
shift_cols(df_events,cols_next=cols_next)
mask_shield_remove = ((df_events.type_name=='Shield')&
                      (df_events.set_piece_out_next.isnull())&(df_events.type_name_next.notnull()))
cols_to_remove = ['type_name_next','set_piece_out_next','set_piece_out']
df_events.drop(cols_to_remove,axis=1,inplace=True)
df_shield = df_events[mask_shield_remove].copy()
df_events = df_events[~mask_shield_remove].copy()
df_events_other = pd.concat([df_events_other,df_shield],axis=0,sort=False)

Then shift up the shield event to the previous event and delete the seperate shield event

In [16]:
# shift the columns so we can see if the next event is a shield event
cols_next = ['timestamp','index','id','type_name','player_id','player_name','team_id','team_name']
shift_cols(df_events,cols_next=cols_next)
# set the shifted columns to null if the next event isn't a shield event
mask_shield = df_events.type_name_next == 'Shield'
to_change = [col+'_next' for col in cols_next]
df_events.loc[~mask_shield,to_change] = np.nan
# rename the shifted columns to shield_...
rename_dict = dict(zip(to_change,['shield_'+col for col in cols_next]))
df_events.rename(rename_dict,axis=1,inplace=True)
df_events.loc[mask_shield,'shield'] = True
df_events.drop('shield_type_name',axis=1,inplace=True)
# remove shielded events, as now they are next to the previous event
df_events = df_events[~(df_events.type_name=='Shield')].copy()

# Add ball recovery flag, either when a player attempts a ball recovery or from a recovery pass

In [17]:
mask_recovery = (df_events.pass_type_name=='Recovery')|(df_events.type_name=='Ball Recovery')
df_events.loc[mask_recovery,'recovery'] = mask_recovery

# Add ball drop info to previous event

In [18]:
df_events.loc[df_events.type_name=='Referee Ball-Drop','ball_drop'] = True
df_events['ball_drop'] = df_events.groupby(['match_id','period']).ball_drop.shift(1)
df_events = df_events[df_events.type_name!='Referee Ball-Drop'].copy()

# Add ball receipt to pass

In [19]:
# drop pass_recipient_id/ pass_recipient_name as we will get it from the ball receipt, with additional info
df_events.drop(['pass_recipient_id','pass_recipient_name'],axis=1,inplace=True)

In [20]:
df_pass_receipt_related = df_related_events.loc[(df_related_events.type_name_related.isin(['Ball Receipt']))&
                                                (df_related_events.type_name=='Pass'),['id','id_related']]
to_keep = ['id','index','timestamp','minute','second','under_pressure','duration',
          'ball_receipt_outcome_id','ball_receipt_outcome_name','player_id','player_name',
          'position_id','position_name','team_id','team_name','type_id','type_name','x','y']
df_pass_receipt = (df_events_other.loc[df_events_other.type_name=='Ball Receipt',to_keep]
                   .rename({'id':'id_related'},axis=1))
df_pass_receipt = df_pass_receipt.merge(df_pass_receipt_related,on='id_related',validate='1:1')
df_events = df_events.merge(df_pass_receipt,on='id',how='left',validate='1:1',suffixes=['','_related'])

In [21]:
df_events.rename({'id_related':'id_2',
                  'index_related':'index_2',
                  'timestamp_related':'timestamp_2',
                  'minute_related':'minute_2',
                  'second_related':'second_2',
                  'duration_related':'duration_2',
                  'player_id_related':'player_2_id',
                  'player_name_related':'player_2_name',
                  'position_id_related':'player_2_position_id',
                  'position_name_related':'player_2_position_name',
                  'team_id_related':'player_2_team_id',
                  'team_name_related':'player_2_team_name',
                  'type_id_related':'player_2_action_id',
                  'type_name_related':'player_2_action_name',
                  'type_id_related':'player_2_type_id',
                  'type_name_related':'player_2_type_name',
                  'counterpress_related':'player_2_counterpress',
                  'under_pressure_related':'player_2_under_pressure',
                  'ball_receipt_outcome_id':'player_2_outcome_id',
                  'ball_receipt_outcome_name':'player_2_outcome_name',
                  'x_related':'player_2_x',
                  'y_related':'player_2_y'},axis=1,inplace=True)

# Add Dribbled past

In [22]:
to_keep = ['id','index','timestamp','minute','second','duration',
           'off_camera','under_pressure','counterpress','player_id','player_name',
          'position_id','position_name','team_id','team_name','type_id','type_name','x','y']
df_dribble_related = df_related_events.loc[(df_related_events.type_name_related.isin(['Dribbled Past']))&
                                           (df_related_events.type_name=='Dribble'),['id','id_related']]
df_dribble = (df_events_other.loc[df_events_other.type_name=='Dribbled Past',to_keep]
              .rename({'id':'id_related'},axis=1))
df_dribble = df_dribble.merge(df_dribble_related,on='id_related',validate='1:1')
df_events = df_events.merge(df_dribble,on='id',how='left',validate='1:1',suffixes=['','_related'])

In [23]:
to_remove = []
fill_na('id_2','id_related',to_remove)
fill_na('index_2','index_related',to_remove)
fill_na('timestamp_2','timestamp_related',to_remove)
fill_na('minute_2','minute_related',to_remove)
fill_na('second_2','second_related',to_remove)
fill_na('duration_2','duration_related',to_remove)
fill_na('off_camera','off_camera_related',to_remove)
fill_na('player_2_under_pressure','under_pressure_related',to_remove)
fill_na('counterpress','counterpress_related',to_remove)
fill_na('player_2_id','player_id_related',to_remove)
fill_na('player_2_name','player_name_related',to_remove)
fill_na('player_2_position_id','position_id_related',to_remove)
fill_na('player_2_position_name','position_name_related',to_remove)
fill_na('player_2_team_id','team_id_related',to_remove)
fill_na('player_2_team_name','team_name_related',to_remove)
fill_na('player_2_type_id','type_id_related',to_remove)
fill_na('player_2_type_name','type_name_related',to_remove)
fill_na('player_2_x','x_related',to_remove)
fill_na('player_2_y','y_related',to_remove)
df_events.drop(to_remove,axis=1,inplace=True)

# Add 50/50 info

In [24]:
to_keep = ['id','index','timestamp','minute','second','duration',
           'off_camera','out','under_pressure','counterpress','player_id','player_name',
          'position_id','position_name','team_id','team_name','type_id','type_name','x','y',
          '50_50_outcome_id','50_50_outcome_name']
df_50_related = df_related_events.loc[(df_related_events.type_name_related=='50/50')&
                                      (df_related_events.type_name=='50/50'),['id','id_related']]
df_50 = (df_events_other.loc[df_events_other.type_name=='50/50',to_keep]
         .rename({'id':'id_related'},axis=1))
df_50 = df_50.merge(df_50_related,on='id_related',validate='1:1')
df_events = df_events.merge(df_50,on='id',how='left',validate='1:1',suffixes=['','_related'])

In [25]:
to_remove = []
fill_na('id_2','id_related',to_remove)
fill_na('index_2','index_related',to_remove)
fill_na('timestamp_2','timestamp_related',to_remove)
fill_na('minute_2','minute_related',to_remove)
fill_na('second_2','second_related',to_remove)
fill_na('duration_2','duration_related',to_remove)
fill_na('off_camera','off_camera_related',to_remove)
fill_na('out','out_related',to_remove)
fill_na('counterpress','counterpress_related',to_remove)
fill_na('player_2_under_pressure','under_pressure_related',to_remove)
fill_na('counterpress','counterpress_related',to_remove)
fill_na('player_2_id','player_id_related',to_remove)
fill_na('player_2_name','player_name_related',to_remove)
fill_na('player_2_position_id','position_id_related',to_remove)
fill_na('player_2_position_name','position_name_related',to_remove)
fill_na('player_2_team_id','team_id_related',to_remove)
fill_na('player_2_team_name','team_name_related',to_remove)
fill_na('player_2_type_id','type_id_related',to_remove)
fill_na('player_2_type_name','type_name_related',to_remove)
fill_na('player_2_x','x_related',to_remove)
fill_na('player_2_y','y_related',to_remove)
fill_na('player_2_x','50_50_outcome_id_related',to_remove)
fill_na('player_2_y','50_50_outcome_name_related',to_remove)
df_events.drop(to_remove,axis=1,inplace=True)

# Foul Committed

In [26]:
cols_to_keep = ['id','index','timestamp','minute','second','duration','off_camera','under_pressure',
                'counterpress','player_id','player_name','position_id','position_name','team_id','team_name',
                'foul_committed_offensive','foul_committed_type_id','foul_committed_type_name',
                'foul_committed_card_id','foul_committed_card_name','foul_committed_penalty',
                'foul_committed_advantage','type_id','type_name','x','y']
df_foul_related = df_related_events.loc[(df_related_events.type_name_related=='Foul Committed')&
                                        (df_related_events.type_name=='Foul Won'),['id','id_related']]
df_foul = (df_events_other.loc[df_events_other.type_name=='Foul Committed',cols_to_keep]
           .rename({'id':'id_related'},axis=1))
df_foul = df_foul.merge(df_foul_related,on='id_related',validate='1:1')
df_events = df_events.merge(df_foul,on='id',how='left',validate='1:1',suffixes=['','_related'])

In [27]:
to_remove = []
fill_na('id_2','id_related',to_remove)
fill_na('index_2','index_related',to_remove)
fill_na('timestamp_2','timestamp_related',to_remove)
fill_na('minute_2','minute_related',to_remove)
fill_na('second_2','second_related',to_remove)
fill_na('duration_2','duration_related',to_remove)
fill_na('off_camera','off_camera_related',to_remove)
fill_na('player_2_under_pressure','under_pressure_related',to_remove)
fill_na('counterpress','counterpress_related',to_remove)
fill_na('player_2_id','player_id_related',to_remove)
fill_na('player_2_name','player_name_related',to_remove)
fill_na('player_2_position_id','position_id_related',to_remove)
fill_na('player_2_position_name','position_name_related',to_remove)
fill_na('player_2_team_id','team_id_related',to_remove)
fill_na('player_2_team_name','team_name_related',to_remove)
fill_na('player_2_type_id','type_id_related',to_remove)
fill_na('player_2_type_name','type_name_related',to_remove)
fill_na('player_2_x','x_related',to_remove)
fill_na('player_2_y','y_related',to_remove)
df_events.drop(to_remove,axis=1,inplace=True)

# Add duel to dispossed

In [28]:
cols_to_keep = ['id','index','timestamp','minute','second','duration',
                'off_camera','counterpress','duel_type_id',
                'duel_type_name','duel_outcome_id','duel_outcome_name',
                'player_id','player_name','position_id','position_name','team_id','team_name',
                'type_id','type_name','x','y']
df_duel_related = df_related_events.loc[(df_related_events.type_name_related.isin(['Duel','Goal Keeper']))&
                                        (df_related_events.type_name=='Dispossessed'),['id','id_related']]
df_duel = (df_events_other.loc[df_events_other.type_name=='Duel',cols_to_keep]
           .rename({'id':'id_related'},axis=1))
df_duel = df_duel.merge(df_duel_related,on='id_related',validate='1:1')
df_events = df_events.merge(df_duel,on='id',how='left',validate='1:1',suffixes=['','_related'])

In [29]:
to_remove = []
fill_na('id_2','id_related',to_remove)
fill_na('index_2','index_related',to_remove)
fill_na('timestamp_2','timestamp_related',to_remove)
fill_na('minute_2','minute_related',to_remove)
fill_na('second_2','second_related',to_remove)
fill_na('duration_2','duration_related',to_remove)
fill_na('off_camera','off_camera_related',to_remove)
fill_na('counterpress','counterpress_related',to_remove)
fill_na('player_2_id','player_id_related',to_remove)
fill_na('player_2_name','player_name_related',to_remove)
fill_na('player_2_position_id','position_id_related',to_remove)
fill_na('player_2_position_name','position_name_related',to_remove)
fill_na('player_2_team_id','team_id_related',to_remove)
fill_na('player_2_team_name','team_name_related',to_remove)
fill_na('player_2_type_id','type_id_related',to_remove)
fill_na('player_2_type_name','type_name_related',to_remove)
fill_na('player_2_x','x_related',to_remove)
fill_na('player_2_y','y_related',to_remove)
fill_na('player_2_outcome_id','duel_outcome_id',to_remove)
fill_na('player_2_outcome_name','duel_outcome_name',to_remove)
df_events.drop(to_remove,axis=1,inplace=True)
mask_duel = df_events.duel_type_name.notnull()
df_events.loc[mask_duel,'player_2_type_id'] = df_events.loc[mask_duel,'duel_type_id']
df_events.loc[mask_duel,'player_2_type_name'] = df_events.loc[mask_duel,'duel_type_name']
df_events.drop(['duel_type_id','duel_type_name'],axis=1,inplace=True)

# Add goalkeeper smother

In [30]:
cols_to_keep = ['id','index','timestamp','minute','second','duration',
                'goalkeeper_type_id','goalkeeper_type_name','goalkeeper_outcome_id',
                'goalkeeper_outcome_name',
                'player_id','player_name','position_id','position_name','team_id','team_name',
                'type_id','type_name','x','y']
df_smother_related = df_related_events.loc[(df_related_events.type_name.isin(['Dribble','Dispossessed']))
                                           & (df_related_events.type_name_related=='Goal Keeper'),['id','id_related']]
df_smother = (df_events_other.loc[df_events_other.goalkeeper_type_name=='Smother',cols_to_keep]
              .rename({'id':'id_related'},axis=1))
df_smother = df_smother.merge(df_smother_related,on='id_related',validate='1:1')
df_events = df_events.merge(df_smother,on='id',how='left',validate='1:1',suffixes=['','_related'])

In [31]:
to_remove = []
fill_na('id_2','id_related',to_remove)
fill_na('index_2','index_related',to_remove)
fill_na('timestamp_2','timestamp_related',to_remove)
fill_na('minute_2','minute_related',to_remove)
fill_na('second_2','second_related',to_remove)
fill_na('duration_2','duration_related',to_remove)
fill_na('goalkeeper_type_id','goalkeeper_type_id_related',to_remove)
fill_na('goalkeeper_type_name','goalkeeper_type_name_related',to_remove)
fill_na('player_2_outcome_id','goalkeeper_outcome_id',to_remove)
fill_na('player_2_outcome_name','goalkeeper_outcome_name',to_remove)
fill_na('player_2_id','player_id_related',to_remove)
fill_na('player_2_name','player_name_related',to_remove)
fill_na('player_2_position_id','position_id_related',to_remove)
fill_na('player_2_position_name','position_name_related',to_remove)
fill_na('player_2_team_id','team_id_related',to_remove)
fill_na('player_2_team_name','team_name_related',to_remove)
fill_na('player_2_type_id','type_id_related',to_remove)
fill_na('player_2_type_name','type_name_related',to_remove)
fill_na('player_2_x','x_related',to_remove)
fill_na('player_2_y','y_related',to_remove)
df_events.drop(to_remove,axis=1,inplace=True)

# Change goal keeper types to the goal keeper types (goalkeeper_type_id/name)

In [32]:
# player 1
mask_goal_keeper = df_events.type_name == 'Goal Keeper'
df_events.loc[mask_goal_keeper,'type_id'] = df_events.loc[mask_goal_keeper,'goalkeeper_type_id']
df_events.loc[mask_goal_keeper,'type_name'] = df_events.loc[mask_goal_keeper,'goalkeeper_type_name']
# player 2
mask_goal_keeper = df_events.player_2_type_name == 'Goal Keeper'
df_events.loc[mask_goal_keeper,'player_2_type_id'] = df_events.loc[mask_goal_keeper,'goalkeeper_type_id']
df_events.loc[mask_goal_keeper,'player_2_type_name'] = df_events.loc[mask_goal_keeper,'goalkeeper_type_name']
df_events.drop(['goalkeeper_type_id','goalkeeper_type_name'],axis=1,inplace=True)

In [33]:
df_events.info(verbose=True,null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053279 entries, 0 to 1053278
Data columns (total 136 columns):
id                                  1053279 non-null object
index                               1053279 non-null int64
period                              1053279 non-null int64
timestamp                           1053279 non-null object
minute                              1053279 non-null int64
second                              1053279 non-null int64
possession                          1053279 non-null int64
duration                            1031424 non-null float64
off_camera                          26771 non-null float64
out                                 16326 non-null float64
under_pressure                      243968 non-null float64
counterpress                        29273 non-null float64
ball_recovery_offensive             296 non-null object
ball_recovery_recovery_failure      6309 non-null object
block_save_block                    176 non-null object
bloc

In [40]:
df_events[(df_events.player_2_outcome_name.notnull())&(df_events.player_2_name.notnull())].player_2_type_name.value_counts()

Ball Receipt    110094
Tackle           19682
Smother            104
Name: player_2_type_name, dtype: int64

In [41]:
df_events[df_events.id_2.isnull()].type_name.value_counts()

Ball Recovery            83377
Pass                     53311
Clearance                34384
Block                    27336
Miscontrol               21002
Shot                     19917
Interception             15193
Dribble                  10704
Shot Saved                4922
Collected                 1731
Keeper Sweeper             888
Punch                      853
Error                      463
Own Goal Against            87
Shot Saved Off Target       72
Penalty Saved               55
Shot Saved to Post          54
Save                        44
50/50                        6
Saved to Post                2
Penalty Saved to Post        1
Name: type_name, dtype: int64

# To do add other event outcomes to event dataframe

- refactor to make the table joining in one
- add on carry/ pressure

- <del> Ball Receipt         715516
- Carry                637463
- Pressure             256187
- <del> Duel                  48365
- <del> Foul Committed        22324
- <del> Dribbled Past         21769
- <del> Goal Keeper           14915
- <del> Substitution           4289
- <del> Half End               3150
- <del> Half Start             3150
- <del> Injury Stoppage        2430
- <del> Shield
- <del> Referee Ball-Drop       738
- <del> 50/50                   613
- <del> Camera On              3822
- <del> Starting XI            1554
- <del> Tactical Shift         1371
- <del> Camera off             1027
- <del> Bad Behaviour           543
- <del> Player Off              535
- <del> Player On               529
- <del> Offside                 298   
- <del> Own Goal Against         87

# Old code

reverse events

In [None]:
#it looks like sometimes there is a reversed event so the set piece out doesn't follow the shiedlded ball
# here we reverse the events
df_events.loc[df_events.pass_type_name.isin(['Throw-in','Corner','Goal Kick']),'set_piece_out'] = True
cols_next = ['type_name','set_piece_out']
cols_previous = ['set_piece_out','type_name','id','player_name','index']
shift_cols(df_events,cols_next,cols_previous)
to_reverse = df_events.loc[(df_events['type_name_previous'] == 'Shield') &
                           (df_events['player_name_previous']!=df_events['player_name']) &
                           (df_events.set_piece_out.isnull()) & df_events['set_piece_out_next']==True,
                           ['id','index','id_previous','index_previous']]
df_events.loc[df_events.id.isin(to_reverse.id),'index'] = to_reverse.index_previous
df_events.loc[df_events.id.isin(to_reverse.id_previous),'index'] = to_reverse.index

Mark passes as Incomplete (instead of null) if the pass is a recovery and the possession changes team

In [None]:
df_events.loc[(df_events.pass_type_name=='Recovery')&
              (df_events.next_change_team==True)&
              (df_events.outcome_name.isnull()),'outcome_name'] = 'Incomplete'

Get the player_name, id and index in two events time

In [None]:
type_cols = ['player_name','type_name','id','index','timestamp']
type_cols_new = ['next2_'+col for col in type_cols]
df_events[type_cols_new] = df_events.groupby(['match_id','period'])[type_cols].shift(-2)

Get the player_name in three events time

In [None]:
df_events['next3_player_name'] = df_events.groupby(['match_id','period'])['player_name'].shift(-3)

Swapable events

In [None]:
first_cols = ['match_id','id','index','period','timestamp','minute','second','possession',
              'duration','team_id','team_name','player_name','pass_recipient_name',
              'type_id','type_name','outcome_id',
              'outcome_name','ball_recovery_recovery_failure','miscontrol_aerial_won','pass_type_name',
              'dispossessed_won_out','ball_receipt_incomplete']
other_cols = [col for col in df_events.columns if col not in first_cols]
first_cols.extend(other_cols)

In [None]:
mask_possible_swaps = ((df_events.outcome_name.isnull())&(df_events.type_name=='Pass')&
                       (df_events.next_change_team==True)&
                       (df_events.pass_recipient_name==df_events.next2_player_name)&
                       (df_events.next2_player_name!=df_events.next3_player_name) &
                       ((df_events.next_player_name==df_events.next3_player_name)|
                        (df_events.next_pass_recipient_name==df_events.next3_player_name)))

Show possible errors

In [None]:
mask_potential_problems = ((df_events['next_change_team']==True)&
                           (df_events.outcome_name!='Incomplete')&
                           (~df_events.type_name.isin(['Miscontrol','Dispossessed','Clearance','Block','Error']))&
                           (~((df_events.outcome_name=='Injury Clearance')&(df_events.out==True)))&
                           (df_events.next_type_name!='Foul Won')&
                           (~(df_events.outcome_name.isin(['Saved','In Play Safe','Wayward','Blocked','Off T','Pass Offside','Injury Clearance'])))&
                           (~((df_events.out==True)&(df_events.type_name.isin(['Goal Keeper','Shot']))))&
                           (~df_events.outcome_name.isin(['Goal']))&
                           (~((df_events.outcome_name.isin(['Out','Unknown']))&(df_events.type_name.isin(['Pass']))))&
                           (~((df_events.type_name=='Interception')&(df_events.outcome_name.isin(['Lost In Play','Lost Out']))))&                           
                           (~((df_events.type_name=='Goal Keeper')&(df_events.outcome_name.isin(['Touched Out','In Play Danger','Clear','Punched out']))))&
                           (~((df_events.goalkeeper_type_name=='Collected')&(df_events.outcome_name.isin(['Fail']))))&
                           (~(df_events.ball_receipt_incomplete==True))&
                           (~(df_events.ball_recovery_recovery_failure==True)))

In [None]:
match_id = 7430
index_id = 2089

In [None]:
df_events.loc[(df_events['index']>=(index_id-4))&(df_events['index']<=(index_id+7))&(df_events.match_id==match_id),first_cols].dropna(how='all',axis=1)

In [None]:
df_events_other[(df_events_other['index']>=index_id)&(df_events_other['index']<=(index_id+4))&(df_events_other.match_id==match_id)].dropna(how='all',axis=1)

output a game with coloured bars when the possession changes

In [None]:
random_game = np.random.choice(df_events.match_id.unique())
print(random_game)

In [None]:
df_one_game = df_events[df_events.match_id==random_game].copy()

In [None]:
len(df_one_game)

In [None]:
# reorder
first_cols = ['match_id','id','index','period','timestamp','minute','second','possession',
              'duration','team_id','team_name','player_name','pass_recipient_name',
              'type_id','type_name','outcome_id',
              'outcome_name','ball_recovery_recovery_failure','miscontrol_aerial_won','pass_type_name',
              'dispossessed_won_out']
other_cols = [col for col in df_one_game.columns if col not in first_cols]
first_cols.extend(other_cols)
df_one_game = df_one_game[first_cols].copy()

In [None]:
def highlight_home_team(row):
    color = 'white'
    if row.home == True:
        color = 'yellow'
    return ['background-color: %s' % color]*len(row.values)

In [None]:
df_one_game = df_one_game.merge(df_match[['match_id','home_team_name']],
                                on='match_id',validate='m:1',how='left')

df_one_game['home'] = df_one_game.home_team_name == df_one_game.team_name
df_one_game = df_one_game.style.apply(highlight_home_team,axis=1)

In [None]:
df_one_game.to_excel("game_to_check.xlsx")

Create a type column For passes

In [None]:
mask_pass_success = (df_events.pass_outcome_name.isnull())&(df_events.type_name=='Pass')
mask_pass_not_success = df_events.pass_outcome_name == 'Incomplete'
mask_pass_out = df_events.pass_outcome_name.isin(['Out','Injury Clearance'])
mask_pass_offside = df_events.pass_outcome_name == 'Pass Offside'
mask_pass_unknown =  df_events.pass_outcome_name == 'Unknown'

In [None]:
df_events.loc[mask_pass_success,'type'] = 'pass_success'
df_events.loc[mask_pass_not_success,'type'] = 'pass_not_success'
df_events.loc[mask_pass_out,'type'] = 'pass_out_of_play'
df_events.loc[mask_pass_offside,'type'] = 'pass_offside'
df_events.loc[mask_pass_unknown,'type'] = 'pass_unknown'

Create a type column For shots

In [None]:
mask_shot_miss = df_events.shot_outcome_name.isin(['Off T','Wayward','Post','Saved Off Target'])
mask_saved = df_events.shot_outcome_name.isin(['Saved','Saved to Post'])
mask_blocked = df_events.shot_outcome_name.isin(['Blocked'])
mask_goal = df_events.shot_outcome_name.isin(['Goal'])

In [None]:
df_events.loc[mask_shot_miss,'type'] = 'shot_miss'
df_events.loc[mask_saved,'type'] = 'shot_saved'
df_events.loc[mask_blocked,'type'] = 'shot_blocked'
df_events.loc[mask_goal,'type'] = 'shot_goal'

Create a type column For goalkeeper

In [None]:
mask_save = df_events.goalkeeper_type_name.isin(['Save', 'Shot Saved', 'Shot Saved Off T',
                                                 'Shot Saved To Post', 'Saved To Post', 'Penalty Saved To Post',
                                                 'Penalty Saved','Shot Saved Off Target',
                                                 'Shot Saved to Post','Saved to Post','Penalty Saved to Post'])
mask_claim_punch = df_events.goalkeeper_type_name.isin(['Collected', 'Punch','Smother'])
mask_clearance_gk = df_events.goalkeeper_type_name.isin(['Keeper Sweeper'])

In [None]:
df_events.loc[mask_save,'type'] = 'goalkeeper_save'
df_events.loc[mask_claim_punch,'type'] = 'goalkeeper_claim_or_punch'
df_events.loc[mask_clearance_gk,'type'] = 'goalkeeper_clearance'

Create a type column For dribbles

In [None]:
mask_dribble_success = df_events.dribble_outcome_name=='Complete'
mask_dribble_overrun = (df_events.dribble_outcome_name=='Incomplete')&(df_events.dribble_overrun==True)
mask_dribble_dispossessed = (df_events.dribble_outcome_name=='Incomplete')&(df_events.dribble_overrun!=True)

In [None]:
df_events.loc[mask_dribble_success,'type'] = 'dribble_success'
df_events.loc[mask_dribble_overrun,'type'] = 'dribble_overrun'
df_events.loc[mask_dribble_dispossessed,'type'] = 'dribble_dispossessed'

Create a type column For others

In [None]:
df_events.loc[df_events.type_name.isin(['Interception','Block']),'type'] = 'interception_or_block'
df_events.loc[df_events.type_name.isin(['Ball Recovery']),'type'] = 'ball_recovery'
df_events.loc[df_events.type_name.isin(['Miscontrol','Error']),'type'] = 'lose_ball'
df_events.loc[df_events.type_name.isin(['Duel','50/50']),'type'] = 'duel_50_50'
df_events.loc[df_events.type_name.isin(['Own Goal For']),'type'] = 'own_goal'
df_events.loc[(df_events.type_name.isin(['Clearance'])),'type'] = 'clearance'
df_events.loc[df_events.type_name.isin(['Foul Won']),'type'] = 'fouled'
df_events.loc[df_events.type_name.isin(['Dispossessed']),'type'] = 'dispossessed'

Set out for events that go out

In [None]:
df_events.loc[((df_events.type=='pass_out_of_play')|
               (df_events.out==1)|
               (df_events.duel_outcome_name.isin(['Success Out','Lost Out']))|
               (df_events.interception_outcome_name.isin(['Success Out','Lost Out']))|
               (df_events.goalkeeper_outcome_name.isin(['Touched Out','Punched out','Lost Out','Success Out']))|
               (df_events.goalkeeper_punched_out==True)|
               (df_events.goalkeeper_success_out==True)|
               (df_events.shot_outcome_name=='Off T')|
               (df_events.goalkeeper_lost_out==True)),'out'] = True

In [None]:
df_events.loc[df_events.pass_type_name.isin(['Throw-in','Goal Kick','Corner']),'deadball'] = True
df_events[['out_next','team_next']] = df_events.groupby(['match_id','period'])['deadball','team_name'].shift(-1)
mask_to_change = ((df_events['out_next'] == True)&
                  (df_events.out.isnull())&
                  (df_events.team_name!=df_events.team_next)&
                  (~df_events.type.isin(['pass_offside','fouled','shot_goal','pass_success','pass_unknown'])))
df_events.loc[mask_to_change,'out'] = True