In [1]:
# Import Pandas and Numpy
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Reading and creating directories
import os

# Ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")

# Progress bar
from tqdm import tqdm

# Reload modules
%load_ext autoreload
%autoreload 2

# Import Socceraction modules to manipulate StatsBomb open data
import socceraction.spadl as spadl

# Import jenkspy to perform 1-D clustering
from jenkspy import JenksNaturalBreaks

# Library for computing if two line segments intersect
from shapely.geometry import LineString

# Data Visualization libraries
from mplsoccer import Pitch
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def line_breaking_df(df, frames):
    """
    Function that returns a dataframe in the extended SPADL format with a new column that specifies if the action is 
    a line breaking pass. 
    It is recommended to ensure a good execution time that the dataframe df in input contains only passes. 
    
    
    Parameters
    ----------
    df: pd.DataFrame
        dataframe of a given match in the SPADL format
    
    frames: pd.DataFrame
        dataframe that cointains StatsBomb360 data
        
        
    Returns
    -------
    df: pd.DataFrame
        uptated version of the dataframe df given as input with a new column that specifies if the pass is line breaking.
        
    """
    
    # Define a function that identifies line breaking passes
    def row_line_breaking(row):
        """
        It returns if the pass is classified as line breaking or not. If the row given in input does 
        not describe a pass the function returns Nan. It must be applied to a dataframe 
        that contains match event data in the SPADL format, which has also to be associated with 
        a StatsBomb360 frames datataframe. 

        Match-event dataframe: df
        StatsBomb360 frames datataframe: frames

        Parameters
        ----------
        row: pd.Series 
                row that describes a single event/action in the SPADL format. 
                Preferably, the event should be a pass.  


        Returns
        -------
        boolean or NaN
            True if the pass is identified as a line breaking one. False is the pass is not line breaking. 
            NaN if the event is not a pass.
        """
        
        # Import jenkspy to perform 1-D clustering
        from jenkspy import JenksNaturalBreaks
        
        # Library for computing if two line segments intersect
        from shapely.geometry import LineString

        # Check if the action is a pass
        if row['type_name'] != 'pass':

            # If the actions is not a pass, return NaN
            return np.nan

        # If the action is a pass, start the algorithm
        else:

            # Create a 1-row dataframe from df that contains only the pass 
            df_pass = df.loc[row.name: row.name].copy()
            
            # Check first condition: the pass must advance the ball of at least 10 m
            if float(df_pass.end_x) >= float(df_pass.start_x) + 10:
                
                # Merge df_pass with the corresponding event in the SB360 frames
                df_360 = df_pass.merge(frames, how = 'left', left_on = 'original_event_id', right_on = 'id' ).copy()
                
                # Exclude passer's teammates
                df_360_opp = df_360[df_360.teammate == False].copy()
                
                # Exclude the goalkeeper
                df_360_opp = df_360_opp[df_360_opp.keeper == False]
                
                # Check if there are at least 2 defenders 
                if len(df_360_opp) >=2:

                    # The defensive lines, the ones that we want to check if they're broken by the pass,
                    # will be identified with a 1-D clustering on the x-dimension of the pitch. 
                    # 1-D Clustering = Fisher-Jenks algorithm

                    # Find the x-values
                    x = list(df_360_opp.loc_x)

                    # If the number of values (i.e. the number of defenders in the 360-frames) are more than 4
                    # find 3 defensive lines
                    if len(x)>4:

                        # Create the model for 3 clusters
                        jnb = JenksNaturalBreaks(nb_class = 3)

                        # Fit the model
                        jnb.fit(x)

                        # Predict the cluster (i.e. line)
                        df_360_opp['group'] = jnb.labels_
                    
                    # If the number of defenders is 4, consider 2 clusters.
                    elif len(x)>3:
                        
                        # Create the model for 2 clusters, fit and label the values
                        jnb = JenksNaturalBreaks(nb_class = 2)
                        jnb.fit(x)
                        df_360_opp['group'] = jnb.labels_
                   
                    # If there are 2 or 3 defenders in the frames, label all the defenders in the same group
                    else:
                        df_360_opp['group'] = 0
                    
                        # If there are 2 or 3 defenders we want to be sure that they are close
                        # to each other in the x-direction since we label them in the same
                        # Create a function that computes the distance between players
                        def dx(row):
                            k = list(df_360_opp.loc_x)
                            ind = k.index(float(row['loc_x']))
                            z = k[:ind] + k[ind+1:]
                            return min([abs(w-k[ind]) for w in z ])
                        df_360_opp['dx'] = df_360_opp.apply(dx, axis = 1)
                        
                        # Exclude defenders that are more than 8 m (arbitrary) distant to the closest teammate
                        df_360_opp = df_360_opp[df_360_opp.dx <= 8].copy()
                    
                    # Create dataframes for each group and order them in the y-direction 
                    # This order ensures that we have "vertical" lines. 
                    df_0 = df_360_opp[(df_360_opp.group == 0)].sort_values('loc_y', ascending = False).copy()
                    df_0.reset_index(inplace = True)
                    df_1 = df_360_opp[(df_360_opp.group == 1)].sort_values('loc_y', ascending = False).copy()
                    df_1.reset_index(inplace = True)
                    df_2 = df_360_opp[(df_360_opp.group == 2)].sort_values('loc_y', ascending = False).copy()
                    df_2.reset_index(inplace = True)

                    # Create the pass segment with LineString  
                    line = LineString([(df_pass.start_x,df_pass.start_y),(df_pass.end_x,df_pass.end_y)])
                    
                    # Compute if the pass segment intersect with the defender lines segment 
                    # If yes, then the pass is identifies as line breaking
                    # For each group: 
                    # - iterate over the defenders
                    # - check if the defenders are at most 15 m (arbitrary) distant in the y-direction
                    # - Create the defender segment and check the intersection, return True if so
                    i = 0
                    while i < len(df_0)-1:
                        if abs(df_0.iloc[i].loc_y-df_0.iloc[i+1].loc_y) <= 15:
                            other = LineString([(df_0.iloc[i].loc_x,df_0.iloc[i].loc_y),
                                                (df_0.iloc[i+1].loc_x,df_0.iloc[i+1].loc_y)])
                            if line.intersects(other) == True:
                                return True
                        i+=1

                    i = 0
                    while i < len(df_1)-1:
                        if abs(df_1.iloc[i].loc_y-df_1.iloc[i+1].loc_y) <= 15:
                            other = LineString([(df_1.iloc[i].loc_x,df_1.iloc[i].loc_y),
                                                (df_1.iloc[i+1].loc_x,df_1.iloc[i+1].loc_y)])
                            if line.intersects(other) == True:
                                return True
                        i+=1

                    i=0
                    while i < len(df_2)-1:
                        if abs(df_2.iloc[i].loc_y-df_2.iloc[i+1].loc_y) <= 15:
                            other = LineString([(df_2.iloc[i].loc_x,df_2.iloc[i].loc_y),
                                                (df_2.iloc[i+1].loc_x,df_2.iloc[i+1].loc_y)])
                            if line.intersects(other) == True:
                                return True
                        i+=1
                    
                    # Return False if the pass does not break any line 
                    return False
                
                # This actives if there are 0 or 1 defenders in the 360-frame.
                return False
                
            # This activates if the pass does not advance the ball of at least 10 m. 
            else:
                
                # If the pass does not move the ball 10 m forward, return False
                return False
    
    
    
    # Apply row_line_breaking to create a new column that specifies if the pass is line breaking
    df['line_breaking'] = df.apply(row_line_breaking, axis = 1)
    
    # Return the updated dataframe
    return df
    
    
    

In [None]:
# Load data
datafolder = r"C:\Users\alber\Statsbomb"
spadl_h5 = os.path.join(datafolder, "euro2020.h5")

# Import time to count the execution time
import time
st = time.time()

# Open data
with pd.HDFStore(spadl_h5) as spadlstore:
    
    # Get all games of Euro2020
    games = (
        spadlstore["games"]
        .merge(spadlstore["teams"].add_prefix('home_'), how='left')
        .merge(spadlstore["teams"].add_prefix('away_'), how='left'))
    
    # Create a dataframe to store the data about all passes of Euro2020
    all_passes = pd.DataFrame()
    
    # Iterate over each national team
    for squad in set(games.home_team_name.unique()):
        
        # Select all games of squad
        squad_games = games[(games.home_team_name == squad) | (games.away_team_name == squad)]
        
        # Iterate over all games of squad
        for a, game in  tqdm(list(squad_games.iterrows())):
            
            # Get game_id 
            game_id = game.game_id
            
            # Get the event-data dataframe in the extended SPADL format for the game
            actions = (
                spadlstore[f"actions/game_{game_id}"]
                .merge(spadl.actiontypes_df(), how="left")
                .merge(spadl.results_df(), how="left")
                .merge(spadl.bodyparts_df(), how="left")
                .merge(spadlstore["players"], how="left")
               .merge(spadlstore["teams"], how="left")
            )
            
            # Add information about which player is in the next action.
            # This could be useful for analyses about which players receive more line breaking passes
            actions['next_player'] = actions.player_name.shift(-1)
            
            # Make both teams attack from left to right
            actions = spadl.play_left_to_right(actions, game.squeeze()['home_team_id'])
            
            # Get StatsBomb360 frames dataframe of the game
            frames = spadlstore[f"frames/game_{game_id}"]
            
            # Select only passes
            passes = actions[actions.type_name == 'pass'].copy()
            
            # Apply line_breaking_df to understand which passes are breaking line passes
            passes = line_breaking_df(passes, frames)
            
            # Update the dataframe that contain only passes 
            all_passes = pd.concat([all_passes, passes])
            

# Estimate execution time       
et = time.time()
elapsed_time = round((et - st)/60,2)
print('Execution time:', elapsed_time, 'minutes')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:17<00:00,  3.53s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.42s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:44<00:00,  7.49s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:20<00:00,  5.14s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:24<00:00,  6.20s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:49<00:00,  8.18s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:20<00:00,  5.02s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:42<00:00, 10.74s/it]
100%|███████████████████████████████████

In [None]:
# Save dataframe as a csv
all_passes.to_csv('euro2020_passes.csv', index = False)