# Imports

In [None]:
#@title
import sys
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import numpy as np
!git clone https://github.com/bosemessi/StatsbombOpenData --q
!pip install mplsoccer --q
from mplsoccer import Pitch, VerticalPitch
import ipywidgets as widgets
from ipywidgets import interact,interactive
import matplotlib.pyplot as plt
import matplotlib as mpl 
import matplotlib.font_manager as fm
!pip install highlight-text --q
from highlight_text import ax_text, fig_text
!pip install adjustText --q
from adjustText import adjust_text
!pip install gdown --quiet
import gdown
import matplotlib.patheffects as path_effects
from matplotlib.transforms import Affine2D
import mpl_toolkits.axisartist.floating_axes as floating_axes
import mpl_toolkits.axisartist.angle_helper as angle_helper
from matplotlib.projections import PolarAxes
from mpl_toolkits.axisartist.grid_finder import (FixedLocator, MaxNLocator,
                                                 DictFormatter)
import matplotlib.patches as patches
from PIL import Image
from io import StringIO, BytesIO
from tqdm import tqdm
import requests 
import warnings
warnings.filterwarnings('ignore')
from matplotlib.colors import LinearSegmentedColormap
from scipy.interpolate import RegularGridInterpolator

[?25l[K     |█████▊                          | 10kB 19.0MB/s eta 0:00:01[K     |███████████▍                    | 20kB 9.4MB/s eta 0:00:01[K     |█████████████████               | 30kB 7.7MB/s eta 0:00:01[K     |██████████████████████▊         | 40kB 7.3MB/s eta 0:00:01[K     |████████████████████████████▍   | 51kB 3.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.6MB/s 
[?25h  Building wheel for mplsoccer (setup.py) ... [?25l[?25hdone
  Building wheel for adjustText (setup.py) ... [?25l[?25hdone


# Read the data file

In [None]:
#@title

# Load the full World Cup 2018 event table from the cloned data repository.
df = pd.read_parquet('/content/StatsbombOpenData/WC2018.parquet').reset_index(drop=True)

# Df keeps only the events that carry a location, re-indexed from 0.
has_location = df.location.notna()
Df = df[has_location].reset_index(drop=True)

# Split the location column into scalar columns.
# NOTE(review): assumes every location is an [x, y] pair - confirm.
Df[['x','y']] = np.array(Df.location.tolist())

# End location: the pass end for passes, the carry end for carries, and the
# event's own location for everything else.
carry_or_own_location = np.where(Df.type_name=='Carry', Df.carry_end_location, Df.location)
Df['endloc'] = np.where(Df.type_name=='Pass', Df.pass_end_location, carry_or_own_location)
Df[['endX','endY']] = np.array(Df.endloc.tolist())

# Every player who is recorded in a centre-back position at least once.
defenders = Df[Df.position_name.isin(['Left Center Back','Right Center Back',
                                      'Center Back'])].player_name.unique()
print("Initial number : "+str(len(defenders)))

# Players flagged as suspicious cases; they are removed from the defender list.
flagnames = ['Francisco Javier Calvo Quesada','Joshua Kimmich',
             'Luis Carlos Tejada Hansell',
             'Michael Lang','Nicolás Alejandro Tagliafico',
             'Gabriel Iván Mercado','Hörður Björgvin Magnússon','Birkir Már Sævarsson',
             'Fedor Kudryashov','Éver Maximiliano David Banega','Edson Omar Álvarez Velázquez',
             'Marcus Rashford', 'İlkay Gündoğan', 'Dylan Bronn']
print("Suspicious cases : "+str(len(flagnames)))

# Filter with a list comprehension rather than a set difference: a set of
# strings has no stable order across kernel restarts, so the original
# list(set(...) - set(...)) produced a differently ordered list on each run.
# This keeps the order of first appearance in the data.
defenders = [name for name in defenders if name not in flagnames]
print("Final number :"+str(len(defenders)))

Initial number : 116
Suspicious cases : 14
Final number :102


# Define a function to extract all defensive metrics from the event data of a particular game

In [None]:
#@title

def _flag_successful_pressures(gamedf, window_seconds=5):
    """Return a boolean array marking pressures followed by a possession change.

    A pressure event counts as successful when the events of the SAME period
    whose time lies in [t, t + window_seconds] (t being the pressure's own
    time) carry exactly two distinct ``possession_team_name`` values.

    Reads the columns ``period``, ``time_seconds``, ``type_name`` and
    ``possession_team_name``; the result is positional (one entry per row).
    """
    periods = gamedf['period'].to_numpy()
    times = gamedf['time_seconds'].to_numpy()
    possession = gamedf['possession_team_name'].to_numpy()
    successful = np.zeros(len(gamedf), dtype=bool)

    pressure_rows = np.flatnonzero((gamedf['type_name'] == 'Pressure').to_numpy())
    for row in pressure_rows:
        # The period check matters: minute*60+second is not unique across
        # periods (first-half stoppage time overlaps the start of the second
        # half), so a match-wide window would mix unrelated events.
        in_window = ((periods == periods[row])
                     & (times >= times[row])
                     & (times <= times[row] + window_seconds))
        if len(pd.unique(possession[in_window])) == 2:
            successful[row] = True
    return successful


def game_poss(match_id, events=None, pressure_window=5):
    """Aggregate per-player defensive metrics for one match.

    Parameters
    ----------
    match_id
        Value matched against the ``match_id`` column.
    events : pandas.DataFrame, optional
        Event table to read from. Defaults to the notebook-level ``df``.
    pressure_window : int or float, default 5
        Length in seconds of the window after a pressure in which a change of
        possession makes that pressure "successful".

    Returns
    -------
    pandas.DataFrame
        One row per (player, team) with columns ``name``, ``team``, the summed
        event counters, ``Possession %`` (share of the match's passes made by
        the player's team, NaN when the match has no passes) and the derived
        ``True Tackles``, ``True Tackle Win%``, ``True Interceptions`` and
        ``Defensive Acts``. Rows are sorted by team and successful pressures,
        both descending. Events without a player name are not aggregated.

    Raises
    ------
    ValueError
        If fewer than two team names are found for ``match_id``.
    """
    source = df if events is None else events
    # Only periods 1-4 are kept.
    # NOTE(review): presumably this drops the penalty shoot-out period - confirm.
    gamedf = source[(source.match_id == match_id) & (source.period <= 4)].reset_index(drop=True)

    # Take the first two distinct team names instead of blindly reading rows
    # 0 and 1, which silently breaks when both rows belong to the same team.
    teams = gamedf.team_name.dropna().unique()
    if len(teams) < 2:
        raise ValueError("match %r: expected events from two teams, found %d"
                         % (match_id, len(teams)))
    team1, team2 = teams[0], teams[1]

    gamedf['time_seconds'] = gamedf['minute']*60 + gamedf['second']

    # Possession is approximated by each team's share of the passes.
    is_pass = gamedf.type_name == 'Pass'
    passes1 = int((is_pass & (gamedf.team_name == team1)).sum())
    passes2 = int((is_pass & (gamedf.team_name == team2)).sum())
    if passes1 + passes2:
        poss1 = round(passes1*100/(passes1+passes2))
        poss2 = 100 - poss1
    else:
        # No passes at all: the split is undefined rather than a crash.
        poss1 = poss2 = np.nan

    won_outcomes = ['Success In Play', 'Won', 'Success Out']
    tackles = gamedf.duel_type_name == 'Tackle'
    interceptions = gamedf.type_name == 'Interception'
    blocks = gamedf.type_name == 'Block'
    # A block with none of the offensive/deflection/save flags set is counted
    # as a blocked pass.
    plain_block = gamedf.block_offensive.isna() & gamedf.block_deflection.isna() & \
                  gamedf.block_save_block.isna()
    # NOTE(review): relies on the *_aerial_won columns being null unless the
    # aerial was won - confirm against the data export.
    aerial_won = gamedf.pass_aerial_won.notna() | gamedf.shot_aerial_won.notna() | \
                 gamedf.clearance_aerial_won.notna() | gamedf.miscontrol_aerial_won.notna()

    # Column name -> boolean mask, in the column order of the result.
    indicator_masks = {
        'Tackles': tackles,
        # Restricted to tackle / interception rows so that an outcome attached
        # to any other event type can never inflate the "won" counters.
        'Tackles Won': tackles & gamedf.duel_outcome_name.isin(won_outcomes),
        'Interceptions': interceptions,
        'Interceptions Won': interceptions & gamedf.interception_outcome_name.isin(won_outcomes),
        'Dribbled Past': gamedf.type_name == 'Dribbled Past',
        'Fouls': gamedf.type_name == 'Foul Committed',
        'Aerial Challenges Lost': gamedf.duel_type_name == 'Aerial Lost',
        'Aerial Challenges Won': aerial_won,
        'Blocks': blocks,
        'Blocked Passes': blocks & plain_block,
        'Pressures': gamedf.type_name == 'Pressure',
        'Successful Pressures': _flag_successful_pressures(gamedf, pressure_window),
        'Successful Dribbles': gamedf.dribble_outcome_name == 'Complete',
        'Failed Dribbles': gamedf.dribble_outcome_name == 'Incomplete',
        'Miscontrols': gamedf.type_name == 'Miscontrol',
        'Dispossessions': gamedf.type_name == 'Dispossessed',
        'Ball Recovery': gamedf.type_name == 'Ball Recovery',
        'Clearances': gamedf.type_name == 'Clearance',
    }
    for column, mask in indicator_masks.items():
        gamedf[column] = np.where(mask, 1, 0)

    groupedstats = (
        gamedf.groupby(['player_name','team_name'])[list(indicator_masks)]
        .sum()
        .reset_index()
        .sort_values(by=['team_name','Successful Pressures'], ascending=False)
        .reset_index(drop=True)
        .rename(columns={"player_name": "name","team_name":'team'}, errors="raise")
    )

    groupedstats['Possession %'] = np.where(groupedstats.team==team1,poss1,poss2)
    groupedstats['True Tackles'] = groupedstats['Tackles'] + groupedstats['Fouls'] + \
                                    groupedstats['Dribbled Past']
    # NOTE(review): the numerator is 'Tackles' (all tackle duels), not
    # 'Tackles Won'; kept as originally written - confirm the intended definition.
    # Players with no true tackles get NaN (0/0).
    groupedstats['True Tackle Win%'] = groupedstats['Tackles']*100/groupedstats['True Tackles']
    groupedstats['True Interceptions'] = groupedstats['Interceptions'] + \
                                groupedstats['Blocked Passes']
    groupedstats['Defensive Acts'] = groupedstats['Tackles'] + groupedstats['Interceptions'] + \
                                     groupedstats['Clearances'] + groupedstats['Ball Recovery'] + \
                                     groupedstats['Blocks']
    return groupedstats


# Loop over all games, extract the defensive metrics from the event data, then save

In [None]:
# Compute the per-player metrics for each match, then stack the per-match
# tables into one frame with a fresh 0..n-1 index.
per_game_stats = []
for game in tqdm(df.match_id.unique(), desc='Reading all games'):
    per_game_stats.append(game_poss(game))
groupgamedfs = pd.concat(per_game_stats, ignore_index=True)

Reading all games: 100%|██████████| 64/64 [00:38<00:00,  1.66it/s]


In [None]:
# Mount Google Drive at /content/drive so files can be written there.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Write the combined per-player, per-match metrics to Google Drive as parquet,
# without the row index. Requires Drive to be mounted at /content/drive.
groupgamedfs.to_parquet('/content/drive/MyDrive/Norwich/defensivemetrics.parquet',index=False)