# Table of Contents
0. [Others](#others)
    1. [Title Page](#title)
1. [Introduction](#introduction)
2. [GKP Models](#gkp)
    1. [Geometric GKP Models](#geometric)
    2. [Data-Driven GKP Models](#data-driven)
    3. [All Modeled GK Positions](#modeled)
3. [Evaluation Methods](#evaluation)
4. [Results](#results)
    1. [Proximity to Actual GK Position for All Outcomes](#41)
    2. [Proximity to Actual GK Position for Specific Outcomes](#42)
    3. [Harrison Comparison](#43)
    4. [Correlation Between Two Variables](#44)
    5. [Messi Test](#45)
5. [Discussion](#discussion)
6. [Summary and Outlook](#conclusion)
7. [Appendix](#appendix)
    1. [GKP Models](#gkp-appendix)
    2. [Datasets](#datasets)

### Used Code Snippets

Intersection of line and circle: https://stackoverflow.com/questions/30844482/what-is-most-efficient-way-to-find-the-intersection-of-a-line-and-a-circle-in-py

Semicircle: https://stackoverflow.com/questions/69185636/how-to-plot-the-equation-for-a-semicircle

Coordinate system: https://pygmalion.nitri.org/cartesian-coordinates-with-matplotlib-1263.html

Scatter plot to heatmap: https://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set/59920744#59920744

### Import

In [None]:
# Import packages
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib.patches import Arc
from statistics import mean
from mplsoccer import Pitch
from mplsoccer import VerticalPitch
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

# Import sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

### Configuration

Define settings:

In [None]:
# Display all columns
# (None removes the limit on how many columns pandas prints for a dataframe)
pd.set_option('display.max_columns', None)

Define constant variables in upper case:

In [None]:
# Directories with json files
# NOTE(review): absolute paths on the author's machine - adjust them before running elsewhere
DIRECTORY_EVENTS = '/Users/gian-andrea/Documents/Masterarbeit (offline)/open-data-master/data/events/'
DIRECTORY_360 = '/Users/gian-andrea/Documents/Masterarbeit (offline)/open-data-master/data/three-sixty/'

# Filepath to one event file
EVENTPATH = DIRECTORY_EVENTS + '3788745.json'

# Filepath to one match file
# MATCHPATH = '/Users/gian-andrea/Documents/Masterarbeit (offline)/statsbomb360-euro2020/matches.json'

# List with all filenames of the men's and women's data
# (file names only; create_dataframe joins them with DIRECTORY_EVENTS)
LS_FILENAME_MEN = ['3788742.json', '3788754.json', '3794690.json', '3794686.json', '3788758.json', '3788774.json', '3788762.json', '3795107.json', '3788763.json', '3788775.json', '3788759.json', '3794687.json', '3794691.json', '3788755.json', '3788743.json', '3788748.json', '3788772.json', '3788764.json', '3795506.json', '3788744.json', '3788752.json', '3788768.json', '3788769.json', '3788753.json', '3788745.json', '3788765.json', '3788773.json','3788749.json', '3788766.json', '3788770.json', '3788750.json', '3788746.json', '3788747.json', '3788751.json', '3788771.json', '3788767.json', '3795109.json', '3795220.json', '3788756.json','3794688.json', '3788760.json', '3788776.json', '3795187.json', '3794692.json', '3794685.json', '3788761.json', '3788741.json', '3794689.json', '3788757.json', '3795221.json', '3795108.json']
LS_FILENAME_WOMEN = ['3835338.json', '3844386.json', '3835322.json', '3835334.json', '3835335.json', '3835323.json', '3835319.json', '3844387.json', '3835339.json', '3835342.json', '3835324.json', '3835332.json', '3835328.json', '3835329.json', '3835333.json', '3835325.json', '3835330.json', '3835326.json', '3845507.json', '3845506.json', '3835327.json', '3835331.json', '3844384.json', '3835341.json', '3835336.json', '3835320.json', '3847567.json', '3835321.json', '3835337.json', '3835340.json', '3844385.json']
# List with all TOGKP model names
# (presumably the suffixes appended to 'x_gk' / 'y_gk' to form the column names of each model; '' being the actual gk position - TODO confirm)
LS_TOGKP_MODEL_NAMES = ['', '_zonal', '_arc', '_line', '_bisector_wingspan', '_bisector_dive', '_inscribed', '_knn', '_ml']

# Horizontal pitch positions
# (goal line at x = 120, goal spanning y = 36 to y = 44 with its middle at y = 40)
x_LH, y_LH = 120, 44   # Left goalpost
x_RH , y_RH = 120, 36  # Right goalpost
x_MH, y_MH = 120, 40   # Goal middle
x_OH, y_OH = 117.6, 40 # Orange point
x_EX, y_EX = 0, 0      # Dummy TOGKP values for shots in undefined region
x_PENALTY, y_PENALTY = 108, 40
radius_PENALTY = 10 

# Vertical pitch positions
# NOTE(review): transform_horizontal_to_vertical_coordinates maps the horizontal left goalpost (120, 44)
# to (44, 0), which is labelled as the right goalpost here - confirm that the left/right naming is intended
x_LV, y_LV = 36, 0      # Left goalpost
x_RV , y_RV = 44, 0     # Right goalpost
x_MV, y_MV = 40, 0      # Goal middle
x_SV, y_SV = 50, 20     # Random shot position
x_GKV, y_GKV = 41.75, 3 # Random gk position
# Pitch values
# (styling constants; presumably passed to the plotting calls further down in the file - TODO confirm)
linewidth_PITCH = 4
s_PITCH = 150
s_ZONES = 1000
alpha_PITCH = 0.4
alpha_PITCH_LIGHT = 0.2
fontsize_PITCH = 30
fontsize_PITCH_SMALL = 25

# Zones of the pitch as shapely polygons, given as (x, y) corner points
# (the colour names presumably refer to the colours of the zones in a figure - TODO confirm)

# Define orange polygons
polygon_orange_1 = Polygon([(114, 62), (120, 62), (120, 80), (114, 80)])
polygon_orange_2 = Polygon([(114, 0), (120, 0), (120, 18), (114, 18)])
polygon_orange_3 = Polygon([(60, 62), (98, 62), (98, 80), (60, 80)])
polygon_orange_4 = Polygon([(60, 0), (98, 0), (98, 18), (60, 18)])
# Define yellow polygons
polygon_yellow_1 = Polygon([(117.6, 44), (120, 44), (120, 62), (106.8, 62)])
polygon_yellow_2 = Polygon([(106.8, 18), (120, 18), (120, 36), (117.6, 36)])
# Define red polygon
polygon_red = Polygon([(60, 18), (98, 18), (98, 0), (114, 0), (114, 18), (102, 18), (102, 62), (114, 62), (114, 80), (98, 80), (98, 62), (60, 62)])
# Define blue polygon
polygon_blue = Polygon([(98, 36.5), (114, 39), (114, 41), (98, 43.5)])
# Define light blue polygons
polygon_light_1 = Polygon([(98, 43.5), (114, 41), (117.6, 44), (106.8, 62), (102, 62), (102, 48), (98, 48)])
polygon_light_2 = Polygon([(102, 18), (106.8, 18), (117.6, 36), (114, 39), (98, 36.5), (98, 32), (102, 32)])

### Support Methods

In [None]:
def transform_horizontal_to_vertical_coordinates(x_horizontal, y_horizontal):
    """
    Return a point of the horizontal pitch in vertical pitch coordinates

    The horizontal y becomes the vertical x, and the vertical y is the
    horizontal x subtracted from 120.
    The result is the tuple (x_vertical, y_vertical).
    """

    return (y_horizontal, 120 - x_horizontal)

In [None]:
def calculate_circle_line_intersection(pt1, full_line=True, tangent_tol=1e-9):
    """
    Return intersections of the line through pt1 and (120, 40) with the circle of radius 4 around (120, 40)

    pt1: x-/y-coordinates of first point of segment (the second point is fixed to (120, 40))
    full_line: True to find intersections along full line, False to keep only those within the segment
    tangent_tol: Numerical tolerance at which intersections are close enough to be tangent

    The result is a list of (x, y) tuples with 0, 1, or 2 entries, ordered along
    the direction from pt1 to (120, 40), so the relevant intersection is the first one.
    """

    # Fixed geometry of this helper
    center_x, center_y = 120, 40
    radius = 4
    end_x, end_y = 120, 40
    start_x, start_y = pt1

    # Express both line points relative to circle center
    x1, y1 = start_x - center_x, start_y - center_y
    x2, y2 = end_x - center_x, end_y - center_y
    dx = x2 - x1
    dy = y2 - y1
    dr = (dx ** 2 + dy ** 2)**.5
    big_d = x1 * y2 - x2 * y1
    discriminant = radius ** 2 * dr ** 2 - big_d ** 2

    # Line misses circle
    if discriminant < 0:
        return []

    # Both candidate points, visited in the order in which they lie along the segment
    root = discriminant**.5
    dy_sign = -1 if dy < 0 else 1
    sign_order = (1, -1) if dy < 0 else (-1, 1)
    hits = []
    for sign in sign_order:
        x_hit = center_x + (big_d * dy + sign * dy_sign * dx * root) / dr ** 2
        y_hit = center_y + (-big_d * dx + sign * abs(dy) * root) / dr ** 2
        hits.append((x_hit, y_hit))

    # Keep only points between pt1 and (120, 40) if requested
    if not full_line:
        measure_along_x = abs(dx) > abs(dy)
        within_segment = []
        for x_hit, y_hit in hits:
            fraction = (x_hit - start_x) / dx if measure_along_x else (y_hit - start_y) / dy
            if 0 <= fraction <= 1:
                within_segment.append((x_hit, y_hit))
        hits = within_segment

    # Tangent line: both points coincide, so report only one of them
    if len(hits) == 2 and abs(discriminant) <= tangent_tol:
        return hits[:1]
    return hits

In [None]:
def add_distance_between_gk_and_gk_alt(df, gk_alt):
    """
    Return dataframe with distance between gk position and alternative gk position

    df: dataframe with columns 'x_gk', 'y_gk', 'x_gk' + gk_alt and 'y_gk' + gk_alt
    gk_alt: suffix of the alternative gk position columns (e.g. '_arc')

    The column 'distance_between_gk_and_gk' + gk_alt (rounded to 2 decimals)
    is added to df in place, and the same df is returned.
    """

    # Walk the rows by position instead of looking them up by the labels 0..n-1:
    # the result list is written back by position, so a label lookup pairs up the
    # wrong rows (or raises KeyError) as soon as the index is not 0..n-1,
    # e.g. after filtering or sorting without reset_index
    ls_distance = [
        round(math.dist([x_gk, y_gk], [x_gk_alt, y_gk_alt]), 2)
        for x_gk, y_gk, x_gk_alt, y_gk_alt in zip(
            df['x_gk'], df['y_gk'], df['x_gk' + gk_alt], df['y_gk' + gk_alt])
    ]

    # Define column name
    col_name = 'distance_between_gk_and_gk' + gk_alt
    # Add list to dataframe
    df[col_name] = ls_distance

    # Return dataframe
    return df

In [None]:
def add_distance_between_gk_knn_and_gk_alt(df, gk_alt):
    """
    Return dataframe with distance between knn gk position and alternative gk position

    df: dataframe with columns 'x_gk_knn', 'y_gk_knn', 'x_gk' + gk_alt and 'y_gk' + gk_alt
    gk_alt: suffix of the alternative gk position columns (e.g. '_arc')

    The column 'distance_between_gk_knn_and_gk' + gk_alt (rounded to 2 decimals)
    is added to df in place, and the same df is returned.
    """

    # Walk the rows by position instead of looking them up by the labels 0..n-1:
    # the result list is written back by position, so a label lookup pairs up the
    # wrong rows (or raises KeyError) as soon as the index is not 0..n-1
    ls_distance = [
        round(math.dist([x_gk_knn, y_gk_knn], [x_gk_alt, y_gk_alt]), 2)
        for x_gk_knn, y_gk_knn, x_gk_alt, y_gk_alt in zip(
            df['x_gk_knn'], df['y_gk_knn'], df['x_gk' + gk_alt], df['y_gk' + gk_alt])
    ]

    # Define column name
    col_name = 'distance_between_gk_knn_and_gk' + gk_alt
    # Add list to dataframe
    df[col_name] = ls_distance

    # Return dataframe
    return df

In [None]:
def add_distance_between_goalline_and_gk_alt(df, gk_name):
    """
    Return dataframe with distance between goal line and any gk position

    df: dataframe with column 'x_gk' + gk_name
    gk_name: suffix of the gk position columns ('' for the plain 'x_gk' column)

    The column 'distance_between_goalline_and_gk' + gk_name is added to df
    in place, and the same df is returned.
    """

    # Define column name
    col_name = 'distance_between_goalline_and_gk' + gk_name
    # Goal line lies at x = 120; the whole column is computed at once and written
    # back by position, so the result does not depend on the dataframe's index labels
    df[col_name] = (120 - df['x_gk' + gk_name]).to_numpy()

    # Return dataframe
    return df

In [None]:
def add_distance_between_bisector_and_gk_alt(df, gk_name):
    """
    Return dataframe with perpendicular distance between bisector and any gk position

    df: dataframe with columns 'x_shot', 'y_shot', 'bisector_slope', 'x_gk' + gk_name and 'y_gk' + gk_name
    gk_name: suffix of the gk position columns

    The bisector is the line through the shot position with slope 'bisector_slope'.
    The column 'distance_between_bisector_and_gk' + gk_name (rounded to 2 decimals)
    is added to df in place, and the same df is returned.
    """

    x_col = 'x_gk' + gk_name
    y_col = 'y_gk' + gk_name
    distances = []

    for idx in range(len(df)):

        # Shot position, gk position and bisector of current row
        shot_x = df.at[idx, 'x_shot']
        shot_y = df.at[idx, 'y_shot']
        gk_x = df.at[idx, x_col]
        gk_y = df.at[idx, y_col]
        slope = df.at[idx, 'bisector_slope']
        intercept = shot_y - slope *shot_x

        if slope == 0:
            # Central shot: bisector is taken as the line y = 40
            foot_x, foot_y = gk_x, 40
        else:
            # Decentral shot: intersect bisector with its perpendicular through gk position
            perp_slope = -(1/slope)
            perp_intercept = gk_y - perp_slope *gk_x
            foot_y = (perp_intercept - intercept*perp_slope/slope)/(1-(perp_slope/slope))
            foot_x = (foot_y-intercept)/slope

        distances.append(round(math.dist([gk_x, gk_y], [foot_x, foot_y]), 2))

    df['distance_between_bisector_and_gk' + gk_name] = distances
    return df

In [None]:
def add_distance_between_ballline_and_gk_alt(df, gk_name):
    """
    Return dataframe with perpendicular distance between ball line and any gk position

    df: dataframe with columns 'x_shot', 'y_shot', 'x_gk' + gk_name and 'y_gk' + gk_name
    gk_name: suffix of the gk position columns

    The ball line is the line through the shot position and the goal middle (x_MH, y_MH).
    The column 'distance_between_ballline_and_gk' + gk_name (rounded to 2 decimals)
    is added to df in place, and the same df is returned.
    """

    # Create empty list
    ls_distance = []

    # Iterate over all rows by position (the list is written back by position,
    # so looking rows up by the labels 0..n-1 would misalign on a non-default index)
    for x_shot, y_shot, x_gk, y_gk in zip(
            df['x_shot'], df['y_shot'], df['x_gk' + gk_name], df['y_gk' + gk_name]):

        # If shot from the goal line: ball line is vertical and has no slope,
        # so its closest point to the gk position is at the same height as the gk position
        if x_shot == x_MH:
            x_intersection = x_MH
            y_intersection = y_gk
        else:
            # Define linear equation of ballline
            m_ballline = (y_shot-y_MH)/(x_shot-x_MH)
            q_ballline = y_shot - m_ballline *x_shot

            # If central shot
            if m_ballline == 0:
                # Calculate intersection of two linear equations
                x_intersection = x_gk
                y_intersection = 40
            # If decentral shot
            else:
                # Define linear equation through gk position and perpendicular to ballline
                m_perpendicular = -(1/m_ballline)
                q_perpendicular = y_gk - m_perpendicular *x_gk
                # Calculate intersection of two linear equations
                y_intersection = (q_perpendicular - q_ballline*m_perpendicular/m_ballline)/(1-(m_perpendicular/m_ballline))
                x_intersection = (y_intersection-q_ballline)/m_ballline

        # Add value to list
        ls_distance.append(round(math.dist([x_gk, y_gk], [x_intersection, y_intersection]), 2))

    # Define column name
    col_name = 'distance_between_ballline_and_gk' + gk_name
    # Add list to dataframe
    df[col_name] = ls_distance

    # Return dataframe
    return df

In [None]:
def add_reaction_distance(df):
    """
    Return dataframe with distance between shot position and gk position projected onto bisector

    df: dataframe with columns 'x_shot', 'y_shot', 'x_gk', 'y_gk' and 'bisector_slope'

    The column 'reaction_distance' (rounded to 2 decimals) is added to df
    in place, and the same df is returned.
    """

    def _project_onto_bisector(x_shot, y_shot, x_gk, y_gk, m_bisector):
        # Central shot: bisector is taken as the line y = 40
        if m_bisector == 0:
            return x_gk, 40
        # Decentral shot: intersect bisector with its perpendicular through gk position
        q_bisector = y_shot - m_bisector *x_shot
        m_perpendicular = -(1/m_bisector)
        q_perpendicular = y_gk - m_perpendicular *x_gk
        y_foot = (q_perpendicular - q_bisector*m_perpendicular/m_bisector)/(1-(m_perpendicular/m_bisector))
        x_foot = (y_foot-q_bisector)/m_bisector
        return x_foot, y_foot

    reaction_distances = []
    for idx in range(len(df)):
        shot = (df.at[idx, 'x_shot'], df.at[idx, 'y_shot'])
        keeper = (df.at[idx, 'x_gk'], df.at[idx, 'y_gk'])
        foot = _project_onto_bisector(*shot, *keeper, df.at[idx, 'bisector_slope'])
        reaction_distances.append(round(math.dist(shot, foot), 2))

    df['reaction_distance'] = reaction_distances
    return df

In [None]:
def add_max_open_goal(df):
    """
    Return dataframe with maximal open goal width of virtual goal

    df: dataframe with columns 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'bisector_slope'
        and 'shooting_cone_without_gk'

    For every row, the line through the gk position and perpendicular to the bisector
    is intersected with the two lines from the shot position to the goalposts; the larger
    of the two distances between gk position and these intersections is stored.
    If 'shooting_cone_without_gk' is True, twice the distance between the goalpost on the
    side of the shot and the bisector is stored instead.
    The column 'max_open_goal' (rounded to 2 decimals) is added to df in place,
    and the same df is returned.
    """

    # Create empty list
    ls_max_open_goal = []

    # Iterate over all rows by position (the list is written back by position)
    columns = ('x_shot', 'y_shot', 'x_gk', 'y_gk', 'bisector_slope', 'shooting_cone_without_gk')
    for x_shot, y_shot, x_gk, y_gk, m_bisector, cone_without_gk in zip(*(df[col] for col in columns)):

        # Define linear equation of bisector
        q_bisector = y_shot - m_bisector*x_shot
        # Define linear equation from shot position to left goalpost
        m_shot_l = (y_shot - y_LH)/(x_shot - x_LH)
        q_shot_l = y_LH - m_shot_l*x_LH
        # Define linear equation from shot position to right goalpost
        m_shot_r = (y_shot - y_RH)/(x_shot - x_RH)
        q_shot_r = y_RH - m_shot_r*x_RH

        # If central shot
        if m_bisector == 0:
            # Perpendicular to a horizontal bisector is the vertical line x = x_gk,
            # so both intersections share the x-coordinate of the gk position
            x_intersection_l = x_gk
            x_intersection_r = x_gk
            y_intersection_l = m_shot_l*x_gk + q_shot_l
            y_intersection_r = m_shot_r*x_gk + q_shot_r
        # If decentral shot
        else:
            # Define linear equation through gk position and perpendicular to bisector
            m_perpendicular = -(1/m_bisector)
            q_perpendicular = y_gk - m_perpendicular*x_gk
            # Calculate intersection of two linear equations
            if m_shot_l == 0:
                y_intersection_l = y_shot
                x_intersection_l = (y_intersection_l - q_perpendicular)/m_perpendicular
            else:
                y_intersection_l = (q_perpendicular - q_shot_l*m_perpendicular/m_shot_l)/(1 - (m_perpendicular/m_shot_l))
                x_intersection_l = (y_intersection_l - q_shot_l)/m_shot_l
            if m_shot_r == 0:
                y_intersection_r = y_shot
                x_intersection_r = (y_intersection_r - q_perpendicular)/m_perpendicular
            else:
                y_intersection_r = (q_perpendicular - q_shot_r*m_perpendicular/m_shot_r)/(1 - (m_perpendicular/m_shot_r))
                x_intersection_r = (y_intersection_r - q_shot_r)/m_shot_r

        # Save widths
        d_l = math.dist([x_gk, y_gk], [x_intersection_l, y_intersection_l])
        d_r = math.dist([x_gk, y_gk], [x_intersection_r, y_intersection_r])
        # Add largest width to list
        if cone_without_gk == True:
            # Goalpost on the side of the shot
            y_p = 44 if y_shot >= 40 else 36
            if m_bisector == 0:
                # Perpendicular through the goalpost is the goal line x = 120,
                # which meets the horizontal bisector at height q_bisector
                x_intersection = 120
                y_intersection = q_bisector
            else:
                # Intersect bisector with its perpendicular through the goalpost
                q_perpendicular = y_p - m_perpendicular*120
                y_intersection = (q_perpendicular - q_bisector*m_perpendicular/m_bisector)/(1 - (m_perpendicular/m_bisector))
                x_intersection = (y_intersection - q_bisector)/m_bisector
            ls_max_open_goal.append(round(2*math.dist([120, y_p], [x_intersection, y_intersection]), 2))
        elif d_l > d_r:
            ls_max_open_goal.append(round(d_l, 2))
        else:
            ls_max_open_goal.append(round(d_r, 2))

    # Add list to dataframe
    df['max_open_goal'] = ls_max_open_goal

    # Return dataframe
    return df

In [None]:
def add_ratio_between_shot_gk_and_gk_bisector(df, gk_name):
    """
    Return dataframe with ratio of shot-to-gk distance to gk-to-bisector distance

    df: dataframe with columns 'x_shot', 'y_shot', 'x_gk' + gk_name, 'y_gk' + gk_name
        and 'distance_between_bisector_and_gk' + gk_name
    gk_name: suffix of the gk position columns

    Rows whose gk-to-bisector distance is 0 get the fixed value 4030 instead of a ratio.
    The column 'ratio_shotgk' + gk_name + '_bisectorgk' + gk_name (rounded to 2 decimals)
    is added to df in place, and the same df is returned.
    """

    ratios = []

    for idx in range(len(df)):
        shot = [df.at[idx, 'x_shot'], df.at[idx, 'y_shot']]
        keeper = [df.at[idx, 'x_gk' + gk_name], df.at[idx, 'y_gk' + gk_name]]
        bisector_offset = df.at[idx, 'distance_between_bisector_and_' + 'gk' + gk_name]

        if bisector_offset == 0:
            # No division possible: store placeholder value
            ratios.append(4030)
        else:
            ratios.append(round(math.dist(shot, keeper) / bisector_offset, 2))

    df['ratio_shotgk' + gk_name + '_bisectorgk' + gk_name] = ratios
    return df

In [None]:
def mirror_shots(df):
    """
    Return copy of dataframe in which all shots with y_shot < 40 are mirrored to the other side of the pitch

    For mirrored rows, 'y_shot' and 'y_gk' are replaced by 80 minus their value.
    The new column 'mirror' tells whether a row was mirrored.
    The dataframe passed in is left untouched.
    """

    mirrored_df = df.copy()
    rows = range(len(mirrored_df))

    # Decide per row whether shot has to be mirrored
    needs_mirror = [bool(mirrored_df.at[idx, 'y_shot'] < 40) for idx in rows]

    # Flip y-coordinates of shot and gk position where needed
    y_shot_new = [
        80 - mirrored_df.at[idx, 'y_shot'] if needs_mirror[idx] else mirrored_df.at[idx, 'y_shot']
        for idx in rows
    ]
    y_gk_new = [
        80 - mirrored_df.at[idx, 'y_gk'] if needs_mirror[idx] else mirrored_df.at[idx, 'y_gk']
        for idx in rows
    ]

    mirrored_df['y_shot'] = y_shot_new
    mirrored_df['y_gk'] = y_gk_new
    mirrored_df['mirror'] = needs_mirror

    return mirrored_df

In [None]:
def get_angle(a, b, c):
    """
    Return angle at b from direction b->a to direction b->c in degrees

    a, b, c: points given as (x, y)
    A negative difference is shifted by 360, so the result is never negative.
    """
    direction_to_c = math.atan2(c[1]-b[1], c[0]-b[0])
    direction_to_a = math.atan2(a[1]-b[1], a[0]-b[0])
    ang = math.degrees(direction_to_c - direction_to_a)
    if ang < 0:
        return ang + 360
    return ang

### Dataframe Methods

In [None]:
def create_dataframe(ls_filename):
    """
    Return one dataframe with the 'shot', 'possession_team' and 'location' columns of all json files in list

    ls_filename: file names that are looked up inside DIRECTORY_EVENTS
    The rows of all files are stacked in the order of the list.
    """

    column_names = ('shot', 'possession_team', 'location')
    # One growing list per column
    collected = {name: [] for name in column_names}

    for filename in ls_filename:
        # Read current json file into a dataframe
        df_file = pd.read_json(os.path.join(DIRECTORY_EVENTS, filename))
        # Append its columns to the collected values
        for name in column_names:
            collected[name].extend(df_file[name].tolist())

    # Build resulting dataframe column by column
    df = pd.DataFrame()
    for name in column_names:
        df[name] = collected[name]

    return df

In [None]:
def filter_for_relevant_shots(df):
    """
    Return dataframe filtered for undeflected shots from open plays

    df: dataframe with columns 'shot' (dict per shot event) and 'location' ([x, y] per event)

    Rows with missing values are dropped first, which removes all events without shot data.
    A shot is kept if its type name is 'Open Play' and it has no 'deflected' key.
    The 'location' column is replaced by the columns 'x_shot' and 'y_shot'.
    The result has a fresh 0..n-1 index; the dataframe passed in is not modified.
    """

    # Filter dataframe for shots
    df = df.dropna().reset_index(drop = True)

    # Collect row number of all deflected or non open play shots
    ls_rows_to_delete = [
        row for row, shot in enumerate(df['shot'])
        if shot['type']['name'] != 'Open Play' or 'deflected' in shot
    ]
    # Delete deflected or non open play shots from dataframe
    df = df.drop(ls_rows_to_delete).reset_index(drop = True)

    # Transform shot location to shot coordinates
    # (built per row, so that a dataframe without any relevant shot still
    # gets both columns instead of failing while unpacking an empty zip)
    df['x_shot'] = [location[0] for location in df['location']]
    df['y_shot'] = [location[1] for location in df['location']]
    # Delete shot location column
    del df['location']

    # Return dataframe
    return df

In [None]:
def add_data(df):
    """
    Return dataframe with additional data

    df is first passed through filter_for_relevant_shots. For every remaining shot
    that has a freeze frame, the shot details (outcome, xG, team, body part, technique,
    shot position), the opposing goalkeeper (name, id, position) and the locations of
    all other players in the freeze frame are extracted into separate columns.
    Shots without freeze frame and shots without opposing goalkeeper are left out.
    """
    
    # Create previous dataframe
    df = filter_for_relevant_shots(df)
    # Create empty lists
    ls_shot = []
    ls_outcome = []
    ls_statsbomb_xg = []
    ls_possession_team = []
    ls_shot_body_part = []
    ls_shot_technique = []
    ls_goalkeeper = []
    ls_goalkeeper_id = []
    ls_x_shot = []
    ls_y_shot = []
    ls_x_gk = []
    ls_y_gk = []
    ls_player_locations = []

    # Iterate over all rows in dataframe
    for row in range (len(df)):
        
        # If current row has freeze frame
        # (rows without freeze frame add nothing to any list and are thereby dropped)
        if 'freeze_frame' in df.iloc[row]['shot']:
            
            # Create empty list
            ls_player_locations_nested = []
            # Add values to lists
            ls_shot.append(df.iloc[row]['shot'])
            ls_outcome.append(df.iloc[row]['shot']['outcome']['name'])
            ls_statsbomb_xg.append(round(df.iloc[row]['shot']['statsbomb_xg'], 4))
            ls_possession_team.append(df.iloc[row]['possession_team']['name'])
            ls_shot_body_part.append(df.iloc[row]['shot']['body_part']['name'])
            ls_shot_technique.append(df.iloc[row]['shot']['technique']['name'])
            ls_x_shot.append(df.iloc[row]['x_shot'])
            ls_y_shot.append(df.iloc[row]['y_shot'])
            
            # Create bool that tests whether a goalkeeper was added in the next for loop
            gk_check = False
            
            # Save shot information from current freeze frame
            shot = df.iloc[row]['shot']['freeze_frame']
            # Iterate over all players in freeze frame
            for i in range (len(shot)):
                 
                # Add gk data to lists
                # (only the goalkeeper who is not a teammate of the shooter)
                # NOTE(review): a freeze frame with more than one opposing goalkeeper would add
                # several entries here and make the gk lists longer than the shot lists - confirm this cannot occur
                if shot[i]['position']['name'] == 'Goalkeeper' and shot[i]['teammate'] == False:
                    ls_goalkeeper.append(shot[i]['player']['name'])
                    ls_goalkeeper_id.append(int(shot[i]['player']['id']))
                    ls_x_gk.append(shot[i]['location'][0])
                    ls_y_gk.append(shot[i]['location'][1])
                    gk_check = True
                # Add remaining player locations to list
                else:
                    ls_player_locations_nested.append(shot[i]['location'])
            
            # Add empty gk data to lists to delete later
            # (keeps all lists at the same length)
            if gk_check == False:
                ls_goalkeeper.append('')
                ls_goalkeeper_id.append('')
                ls_x_gk.append('')
                ls_y_gk.append('')
                    
            # Add remaining player locations or empty string to list
            if len(ls_player_locations_nested) > 0:
                ls_player_locations.append(ls_player_locations_nested)
            else:
                ls_player_locations.append('')
    
    # Create empty dataframe and add complete lists
    df = pd.DataFrame()
    df['shot'] = ls_shot
    df['outcome'] = ls_outcome
    df['statsbomb_xg'] = ls_statsbomb_xg
    df['possession_team'] = ls_possession_team
    df['shot_body_part'] = ls_shot_body_part
    df['shot_technique'] = ls_shot_technique
    df['goalkeeper'] = ls_goalkeeper
    df['goalkeeper_id'] = ls_goalkeeper_id
    df['x_shot'] = ls_x_shot
    df['y_shot'] = ls_y_shot
    df['x_gk'] = ls_x_gk
    df['y_gk'] = ls_y_gk
    df['player_locations'] = ls_player_locations
    
    # Create list with row number of shots without gk position
    # (where() blanks out all rows that do have a goalkeeper, dropna() then leaves the rows without one)
    ls_rows_to_delete = list(df.where(df['goalkeeper'] == '').dropna().index.values)
    # Delete shots without gk position from dataframe
    # NOTE(review): the index is not reset after this drop, so the returned dataframe can have gaps in its index
    df.drop(ls_rows_to_delete, inplace = True)

    # Return dataframe
    return df

In [None]:
def add_locations_of_players_in_shooting_cone(df):
    """
    Return dataframe with locations and count of players in shooting cone

    df is first passed through add_data, and shots without locations of other players are removed.
    The shooting cone is the triangle between shot position and both goalposts.
    Column 'shooting_cone_players' holds the locations inside the cone (or '' if there are none),
    column 'shooting_cone_players_count' holds their number.
    """

    # Create previous dataframe
    df = add_data(df)
    # Keep only shots with locations from other players
    df = df.where(df['player_locations'] != '').dropna().reset_index(drop = True)

    # Collect locations inside shooting cone for every shot
    cone_players_per_shot = []
    for idx in range(len(df)):
        # Triangle between shot position and both goalposts
        cone = Polygon([(df.at[idx, 'x_shot'], df.at[idx, 'y_shot']), (x_RH, y_RH), (x_LH, y_LH)])
        players_inside = [
            location
            for location in df.at[idx, 'player_locations']
            if cone.contains(Point(location[0], location[1])) == True
        ]
        cone_players_per_shot.append(players_inside)

    # Empty cones are stored as empty string, their count as 0
    df['shooting_cone_players'] = [
        players if len(players) > 0 else '' for players in cone_players_per_shot
    ]
    df['shooting_cone_players_count'] = [len(players) for players in cone_players_per_shot]

    return df

In [None]:
def add_shooting_angle(df):
    """
    Return dataframe with shooting angle

    df is first passed through add_locations_of_players_in_shooting_cone.
    The shooting angle is the angle in degrees (rounded to 4 decimals) between the
    two lines from shot position to the goalposts, stored in column 'shooting_angle'.
    """

    # Create previous dataframe
    df = add_locations_of_players_in_shooting_cone(df)

    angles = []
    for idx in range(len(df)):
        shot_x = df.at[idx, 'x_shot']
        shot_y = df.at[idx, 'y_shot']
        # Slopes of lines from shot position to right and left goalpost
        slope_right = (y_RH - shot_y)/(x_RH - shot_x)
        slope_left = (y_LH - shot_y)/(x_LH - shot_x)
        # Angle between both lines
        opening = abs(math.atan(slope_right) - math.atan(slope_left))
        angles.append(round(math.degrees(opening), 4))

    df['shooting_angle'] = angles
    return df

In [None]:
def add_bisector_slope(df):
    """
    Return dataframe with slope of the bisector of the shooting angle

    df is first passed through add_shooting_angle.
    The right goalpost (x_RH, y_RH) is rotated counterclockwise around the shot position
    by half the shooting angle; the slope of the line from shot position to this rotated
    point (rounded to 4 decimals) is stored in column 'bisector_slope'.
    """

    # Create previous dataframe
    df = add_shooting_angle(df)

    slopes = []
    for idx in range(len(df)):
        # Shot position acts as origin of rotation
        origin_x = df.at[idx, 'x_shot']
        origin_y = df.at[idx, 'y_shot']
        # Half shooting angle in radians
        half_angle = math.radians(df.at[idx, 'shooting_angle'])/2

        # Rotate right goalpost counterclockwise around origin
        rotated_x = origin_x + math.cos(half_angle) * (x_RH - origin_x) - math.sin(half_angle) * (y_RH - origin_y)
        rotated_y = origin_y + math.sin(half_angle) * (x_RH - origin_x) + math.cos(half_angle) * (y_RH - origin_y)

        # Slope of line from origin to rotated point
        slopes.append(round((rotated_y-origin_y)/(rotated_x-origin_x), 4))

    df['bisector_slope'] = slopes
    return df

In [None]:
def implement_bisector_dive_model(df):
    """
    Return dataframe with TOGKP from bisector dive model

    df is first passed through add_bisector_slope.
    For decentral shots, the TOGKP is the point on the bisector closest to the nearer
    goalpost; if this point is further than the gk reach (3.41) away from that goalpost,
    the TOGKP is moved along the bisector to the distance from the shot position at which
    the gk reach spans half the shooting angle.
    For central shots (bisector slope 0), the TOGKP is always placed at that distance, at y_MH.
    The result (rounded to 2 decimals) is stored in columns 'x_gk_bisector_dive' and 'y_gk_bisector_dive'.
    """

    # Create previous dataframe
    df = add_bisector_slope(df)
    # Create empty lists
    ls_x_gk_bisector = []
    ls_y_gk_bisector = []
    # Define gk reach
    GOALKEEPER_REACH = 3.41

    # Iterate over all rows in dataframe
    for row in range(len(df)):

        # Define shot position
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        # Define linear equation of bisector
        m_bisector = df.at[row, 'bisector_slope']
        q_bisector = y_shot - m_bisector*x_shot
        # Define half shooting angle in radians
        half_shooting_angle = math.radians(df.at[row, 'shooting_angle'])/2

        # If central shot
        if m_bisector == 0:
            # The reach check below needs a goalpost, which is only chosen for decentral
            # shots; central shots are therefore placed directly at the reach distance
            x_shift = (math.cos(half_shooting_angle)*GOALKEEPER_REACH)/math.sin(half_shooting_angle)
            x_gk_bisector = x_shot + x_shift
            y_gk_bisector = y_MH
        # If decentral shot
        else:
            # Define linear equation through closer goalpost and perpendicular to bisector
            distance_right_goalpost = math.dist([x_shot, y_shot], [x_RH, y_RH])
            distance_left_goalpost = math.dist([x_shot, y_shot], [x_LH, y_LH])
            if distance_right_goalpost < distance_left_goalpost:
                y_goalpost = y_RH
            else:
                y_goalpost = y_LH
            x_goalpost = x_RH
            m_goalpost = -(1/m_bisector)
            q_goalpost = y_goalpost - m_goalpost*x_goalpost

            # Calculate intersection of two linear equations
            y_gk_bisector = (q_bisector - q_goalpost*m_bisector/m_goalpost)/(1 - (m_bisector/m_goalpost))
            x_gk_bisector = (y_gk_bisector - q_goalpost)/m_goalpost

            # If shooting angle is not covered by goalkeeper reach
            if math.dist([x_goalpost, y_goalpost], [x_gk_bisector, y_gk_bisector]) > GOALKEEPER_REACH:

                # Distance between shot position and TOGKP to cover gk reach
                distance = (math.cos(half_shooting_angle)*GOALKEEPER_REACH)/math.sin(half_shooting_angle)
                # Shift from shot position to TOGKP
                x_shift = distance/(math.sqrt(1+m_bisector*m_bisector))
                y_shift = x_shift*m_bisector
                # Calculate TOGKP
                x_gk_bisector = x_shot + x_shift
                y_gk_bisector = y_shot + y_shift

        # Add bisector gk coordinates to lists
        ls_x_gk_bisector.append(round(x_gk_bisector, 2))
        ls_y_gk_bisector.append(round(y_gk_bisector, 2))

    # Add lists to dataframe
    df['x_gk_bisector_dive'] = ls_x_gk_bisector
    df['y_gk_bisector_dive'] = ls_y_gk_bisector

    # Return dataframe
    return df

In [None]:
def implement_bisector_wingspan_model(df):
    """
    Return dataframe with TOGKP from bisector model based on the wingspan reach

    Runs implement_bisector_dive_model first and then adds the columns
    'x_gk_bisector_wingspan' and 'y_gk_bisector_wingspan' (rounded to two decimals).
    Rows are read with df.at using the labels 0..len(df)-1.
    """
    
    # Create previous dataframe
    df = implement_bisector_dive_model(df)
    # Create empty lists
    ls_x_gk_bisector = []
    ls_y_gk_bisector = []
    # Define gk reach
    GOALKEEPER_REACH = 2.19
    
    # Iterate over all rows in dataframe
    for row in range(len(df)):

        # Define shot position
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        # Define linear equation of bisector
        m_bisector = df.at[row, 'bisector_slope']
        q_bisector = y_shot - m_bisector*x_shot
        # Define half shooting angle in radians
        half_shooting_angle = math.radians(df.at[row, 'shooting_angle'])/2

        # If central shot
        if m_bisector == 0:
            # Place gk on the horizontal bisector at the distance where the reach covers the half shooting angle
            x_shift = (math.cos(half_shooting_angle)*GOALKEEPER_REACH)/math.sin(half_shooting_angle)
            x_gk_bisector = x_shot + x_shift
            y_gk_bisector = y_MH
        # If decentral shot
        else:
            # Define linear equation through closer goalpost and perpendicular to bisector
            distance_right_goalpost = math.dist([x_shot, y_shot], [x_RH, y_RH])
            distance_left_goalpost = math.dist([x_shot, y_shot], [x_LH, y_LH])
            if distance_right_goalpost < distance_left_goalpost:
                y_goalpost = y_RH
            else:
                y_goalpost = y_LH
            x_goalpost = x_RH
            m_goalpost = -(1/m_bisector)
            q_goalpost = y_goalpost - m_goalpost*x_goalpost

            # Calculate intersection of two linear equations
            y_gk_bisector = (q_bisector - q_goalpost*m_bisector/m_goalpost)/(1 - (m_bisector/m_goalpost))
            x_gk_bisector = (y_gk_bisector - q_goalpost)/m_goalpost

            # If shooting angle is not covered by goalkeeper reach
            # (checked for decentral shots only: the goalpost is defined in this branch
            # and the central branch already places the gk at the reach distance)
            if math.dist([x_goalpost, y_goalpost], [x_gk_bisector, y_gk_bisector]) > GOALKEEPER_REACH:

                # Distance between shot position and TOGKP to cover gk reach
                distance = (math.cos(half_shooting_angle)*GOALKEEPER_REACH)/math.sin(half_shooting_angle)
                # Shift from shot position to TOGKP
                x_shift = distance/(math.sqrt(1+m_bisector*m_bisector))
                y_shift = x_shift*m_bisector
                # Calculate TOGKP
                x_gk_bisector = x_shot + x_shift
                y_gk_bisector = y_shot + y_shift

        # Add bisector gk coordinates to lists
        ls_x_gk_bisector.append(round(x_gk_bisector, 2))
        ls_y_gk_bisector.append(round(y_gk_bisector, 2))

    # Add lists to dataframe
    df['x_gk_bisector_wingspan'] = ls_x_gk_bisector
    df['y_gk_bisector_wingspan'] = ls_y_gk_bisector

    # Return dataframe
    return df

In [None]:
def implement_inscribed_circle_model(df):
    """
    Return dataframe with TOGKP from inscribed circle model

    Runs implement_bisector_wingspan_model first and then adds the columns
    'x_gk_inscribed' and 'y_gk_inscribed' (rounded to two decimals): the intersection of the
    shot bisector with a second line through the left goalpost (the "cone bisector" below).
    """

    # Run previous model in the chain
    df = implement_bisector_wingspan_model(df)
    # Collect coordinates per row
    xs_inscribed, ys_inscribed = [], []

    for idx in range(len(df)):

        # Shot position
        shot_x = df.at[idx, 'x_shot']
        shot_y = df.at[idx, 'y_shot']
        # Bisector through shot position: y = slope*x + intercept
        slope_bisector = df.at[idx, 'bisector_slope']
        intercept_bisector = shot_y - slope_bisector*shot_x

        # Half of the angle returned by get_angle for shot position and both goalposts, in radians
        half_cone_angle = math.radians(get_angle((shot_x, shot_y), (x_LH, y_LH), (x_RH, y_RH))/2)
        cos_half = math.cos(half_cone_angle)
        sin_half = math.sin(half_cone_angle)

        # Rotate right goalpost counterclockwise by half cone angle around left goalpost
        dx_posts = x_RH - x_LH
        dy_posts = y_RH - y_LH
        x_rotated = x_LH + cos_half * dx_posts - sin_half * dy_posts
        y_rotated = y_LH + sin_half * dx_posts + cos_half * dy_posts

        # Cone bisector through left goalpost (note the negated slope)
        slope_cone = -(y_rotated-y_LH)/(x_rotated-x_LH)
        intercept_cone = y_LH - slope_cone*x_LH

        # Intersection of bisector and cone bisector
        slope_ratio = slope_bisector/slope_cone
        y_inscribed = (intercept_bisector - intercept_cone*slope_bisector/slope_cone)/(1 - (slope_ratio))
        x_inscribed = (y_inscribed - intercept_cone)/slope_cone

        xs_inscribed.append(round(x_inscribed, 2))
        ys_inscribed.append(round(y_inscribed, 2))

    # Add coordinates to dataframe
    df['x_gk_inscribed'] = xs_inscribed
    df['y_gk_inscribed'] = ys_inscribed

    return df

In [None]:
def implement_zonal_model(df):
    """
    Return dataframe with TOGKP from zonal intersection model

    Runs implement_inscribed_circle_model first and then adds the columns 'x_gk_zonal' and
    'y_gk_zonal' (rounded to two decimals). The TOGKP depends on the zone (module-level shapely
    polygons plus the circle around (x_PENALTY, y_PENALTY)) that contains the shot position.
    Shots outside all zones receive the placeholder position (x_EX, y_EX).
    Rows are read with df.at using the labels 0..len(df)-1.
    """
    
    # Create previous dataframe
    df = implement_inscribed_circle_model(df)
    # Create empty lists
    ls_x_gk_zonal = []
    ls_y_gk_zonal = []
        
    # Iterate over all rows in dataframe
    for row in range(len(df)):
        
        # Define shot position
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        # Define shot location
        shot_location = Point(x_shot, y_shot)

        # If shot lies in orange polygon
        if polygon_orange_1.contains(shot_location) == True or polygon_orange_2.contains(shot_location) == True or polygon_orange_3.contains(shot_location) == True or polygon_orange_4.contains(shot_location) == True:
            # Define TOGKP (fixed position)
            x_gk_zonal = x_OH
            y_gk_zonal = y_OH

        # If shot lies in red polygon (and on or outside the circle around the penalty point)
        elif polygon_red.contains(shot_location) == True and (x_shot - x_PENALTY)**2 + (y_shot - y_PENALTY)**2 >= radius_PENALTY**2:
            # Define linear equation of ballline (through shot position and (x_MH, y_MH))
            m_ballline = (y_shot-y_MH)/(x_shot-x_MH)
            q_ballline = y_shot - m_ballline*x_shot
            # If central shot
            if m_ballline == 0:
                # Define TOGKP
                x_gk_zonal = x_OH
                y_gk_zonal = y_OH
            # If decentral shot
            else:
                # Define linear equation of zonal line (from (x_OH, y_OH) to the goalpost selected by the sign of the ballline slope)
                if m_ballline > 0:
                    m_zonal = (y_RH-y_OH)/(x_RH-x_OH)
                    q_zonal = y_RH - m_zonal*x_RH
                else:
                    m_zonal = (y_LH-y_OH)/(x_LH-x_OH)
                    q_zonal = y_LH - m_zonal*x_LH
                # Define TOGKP by calculating intersection of two linear equations
                y_gk_zonal = (q_zonal - q_ballline*m_zonal/m_ballline)/(1-(m_zonal/m_ballline))
                x_gk_zonal = (y_gk_zonal-q_ballline)/m_ballline

        # If shot lies in blue polygon (and inside the circle around the penalty point)
        elif polygon_blue.contains(shot_location) == True and (x_shot - x_PENALTY)**2 + (y_shot - y_PENALTY)**2 < radius_PENALTY**2:
            # Define linear equation of ballline
            m_ballline = (y_shot-y_MH)/(x_shot-x_MH)
            q_ballline = y_shot - m_ballline*x_shot
            # Define TOGKP (point of the ballline at the fixed x coordinate 114)
            x_gk_zonal = 114
            y_gk_zonal = q_ballline + m_ballline*x_gk_zonal

        # If shot lies in light blue polygon
        # (inside light polygon 1 or 2, and either outside the red polygon or inside the circle around the penalty point)
        elif (polygon_light_1.contains(shot_location) == True and polygon_red.contains(shot_location) == False) or (polygon_light_1.contains(shot_location) == True and (x_shot - x_PENALTY)**2 + (y_shot - y_PENALTY)**2 < radius_PENALTY**2) or (polygon_light_2.contains(shot_location) == True and polygon_red.contains(shot_location) == False) or (polygon_light_2.contains(shot_location) == True and (x_shot - x_PENALTY)**2 + (y_shot - y_PENALTY)**2 < radius_PENALTY**2):
            # Define linear equation of ballline
            # NOTE(review): uses the literals (120, 40) instead of (x_MH, y_MH) - presumably the same point, confirm
            m_ballline = (y_shot-40)/(x_shot-120)
            q_ballline = y_shot - m_ballline*x_shot
            # Define linear equation of zonal line (hard-coded end points, selected by the sign of the ballline slope)
            if m_ballline > 0:
                m_zonal = (36-39.085)/(117.6-114)
                q_zonal = 36 - m_zonal*117.6
            else:
                m_zonal = (44-40.915)/(117.6-114)
                q_zonal = 44 - m_zonal*117.6
            # Define TOGKP by calculating intersection of two linear equations
            # NOTE(review): a ballline slope of exactly 0 makes the divisions below undefined
            y_gk_zonal = (q_zonal - q_ballline*m_zonal/m_ballline)/(1-(m_zonal/m_ballline))
            x_gk_zonal = (y_gk_zonal-q_ballline)/m_ballline

        # If shot lies in yellow polygon
        elif polygon_yellow_1.contains(shot_location) == True or polygon_yellow_2.contains(shot_location) == True:
            # Define linear equation of ballline
            m_ballline = (y_shot-y_MH)/(x_shot-x_MH)
            q_ballline = y_shot - m_ballline*x_shot
            # Define y coordinate of TOGKP (y coordinate of the goalpost selected by the sign of the ballline slope)
            if m_ballline > 0:
                y_gk_zonal = y_RH
            else:
                y_gk_zonal = y_LH
            # Define x coordinate of TOGKP by calculating intersection of two linear equations
            x_gk_zonal = (y_gk_zonal-q_ballline)/m_ballline

        # If shot lies in none of the zones (placeholder position)
        else:
            x_gk_zonal = x_EX
            y_gk_zonal = y_EX

        # Add zonal gk coordinates to lists
        ls_x_gk_zonal.append(round(x_gk_zonal, 2))
        ls_y_gk_zonal.append(round(y_gk_zonal, 2))

    # Add lists to dataframe
    df['x_gk_zonal'] = ls_x_gk_zonal
    df['y_gk_zonal'] = ls_y_gk_zonal

    # Return dataframe
    return df

In [None]:
def implement_line_positioning_model(df):
    """
    Return dataframe with TOGKP from line positioning model

    Runs implement_zonal_model first and then adds the columns 'x_gk_line' and 'y_gk_line'
    (rounded to two decimals): the point of the ballline at x = 118, or the placeholder
    position (x_EX, y_EX) when the shot is not further away than that line.
    """

    # Run previous model in the chain
    df = implement_zonal_model(df)
    # Collect coordinates per row
    line_xs, line_ys = [], []

    for idx in range(len(df)):

        # Shot position
        shot_x = df.at[idx, 'x_shot']
        shot_y = df.at[idx, 'y_shot']

        if shot_x < 118:
            # Shot is further away than the line: follow the ballline
            # (through shot position and (x_MH, y_MH)) up to x = 118
            slope = (shot_y - y_MH)/(shot_x - x_MH)
            intercept = shot_y - slope*shot_x
            gk_point = (118, intercept + slope*118)
        else:
            # Shot is closer than the line: no TOGKP defined
            gk_point = (x_EX, y_EX)

        line_xs.append(round(gk_point[0], 2))
        line_ys.append(round(gk_point[1], 2))

    # Add coordinates to dataframe
    df['x_gk_line'] = line_xs
    df['y_gk_line'] = line_ys

    return df

In [None]:
def implement_gk_arc_model(df):
    """
    Return dataframe with TOGKP from goalkeeper's arc model

    Runs implement_line_positioning_model first and then adds the columns 'x_gk_arc' and
    'y_gk_arc' (rounded to two decimals): the first intersection returned by
    calculate_circle_line_intersection, or the placeholder position (x_EX, y_EX) when the
    shot's x coordinate is larger than that of the intersection.
    """

    # Run previous model in the chain
    df = implement_line_positioning_model(df)
    # Collect coordinates per row
    arc_xs, arc_ys = [], []

    for idx in range(len(df)):

        # Shot position
        shot_x = df.at[idx, 'x_shot']
        shot_y = df.at[idx, 'y_shot']

        # First intersection of circle and line for this shot position
        first_hit = calculate_circle_line_intersection((shot_x, shot_y))[0]
        arc_x, arc_y = first_hit[0], first_hit[1]

        # Shot is closer than the intersection: no TOGKP defined
        if shot_x > arc_x:
            arc_x, arc_y = x_EX, y_EX

        arc_xs.append(round(arc_x, 2))
        arc_ys.append(round(arc_y, 2))

    # Add coordinates to dataframe
    df['x_gk_arc'] = arc_xs
    df['y_gk_arc'] = arc_ys

    return df

In [None]:
def filter_for_1v1s(df):
    """
    Return dataframe filtered for 1v1 situations

    Runs implement_gk_arc_model first, keeps the rows with 'shooting_cone_players_count' == 0
    and removes the rows in which the zonal, line or arc model stored an x coordinate of 0.
    The returned dataframe has a fresh 0..n-1 index.
    """
    
    # Create previous dataframe
    df = implement_gk_arc_model(df)
    # Exclude non 1v1 situations
    # (where() blanks the non-matching rows and dropna() removes every row with a missing value,
    # so rows with a missing value in any column are removed as well)
    df = df.where(df['shooting_cone_players_count'] == 0).dropna().reset_index(drop = True)
    
    # Delete rows where some models could not define a TOGKP (i.e., shot lies in an undefined region)
    # NOTE(review): 0 is presumably the placeholder coordinate x_EX - confirm
    # One boolean mask instead of dropping row by row (each drop copies the dataframe)
    undefined = (df['x_gk_zonal'] == 0) | (df['x_gk_line'] == 0) | (df['x_gk_arc'] == 0)
    df = df[~undefined]
            
    # Return dataframe
    return df.reset_index(drop = True)

In [None]:
def add_alternative_data(df):
    """
    Return dataframe with alternative data

    Runs filter_for_1v1s first and then adds the columns 'goal_angle', 'gk_angle',
    'distance_between_shot_and_goalline', 'distance_between_shot_and_goalcenter',
    'angular_deviation' and 'shooting_cone_without_gk'.
    Angles are derived from line slopes with atan and stored in degrees, rounded to two decimals.
    """
    
    # Create previous dataframe
    df = filter_for_1v1s(df)
    # Create empty lists
    ls_goal_angle = []
    ls_gk_angle = []
    ls_distance_between_shot_and_goalline = []
    ls_distance_between_shot_and_goalcenter = []
    ls_angular_deviation = []
    ls_shooting_cone_without_gk = []
    
    # Iterate over all rows in dataframe
    for row in range(len(df)):
        
        # Define variables
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        x_gk = df.at[row, 'x_gk']
        y_gk = df.at[row, 'y_gk']
        # Slope of line between shot position and goal centre
        m_ballline = (y_MH-y_shot)/(x_MH-x_shot)
        # Slope of line between shot position and gk position
        m_shot_gk = (y_gk-y_shot)/(x_gk-x_shot)
        m_bisector = df.at[row, 'bisector_slope']
        # Angle of line between gk position and goal centre (0 if gk stands at x = 120, where the slope is undefined)
        if x_gk == 120:
            ls_gk_angle.append(0)
        else:
            m_gk_goalcenter = (y_MH-y_gk)/(x_MH-x_gk)
            ls_gk_angle.append(round(math.degrees(abs(math.atan(m_gk_goalcenter))), 2))
            
        # Add values to lists
        ls_goal_angle.append(round(math.degrees(abs(math.atan(m_ballline))), 2))
        ls_distance_between_shot_and_goalcenter.append(round(math.dist([x_shot, y_shot], [120, 40]), 2))
        ls_distance_between_shot_and_goalline.append(120 - x_shot)
        ls_angular_deviation.append(round(math.degrees(abs(math.atan(m_shot_gk) - math.atan(m_bisector))), 2))
        # Add bool whether shooting cone includes no gk
        gk_location = Point(x_gk, y_gk)
        shooting_cone = Polygon([(x_shot, y_shot), (x_RH, y_RH), (x_LH, y_LH)])
        ls_shooting_cone_without_gk.append(not shooting_cone.contains(gk_location))
        
    # Add lists to dataframe
    df['goal_angle'] = ls_goal_angle
    df['gk_angle'] = ls_gk_angle
    df['distance_between_shot_and_goalline'] = ls_distance_between_shot_and_goalline
    df['distance_between_shot_and_goalcenter'] = ls_distance_between_shot_and_goalcenter
    df['angular_deviation'] = ls_angular_deviation
    df['shooting_cone_without_gk'] = ls_shooting_cone_without_gk
    
    # Return dataframe
    return df

In [None]:
def implement_knn_model(df):
    """
    Return dataframe with TOGKP from knn model

    Runs add_alternative_data first and then adds the columns 'x_gk_knn' and 'y_gk_knn':
    the mean actual gk position ('x_gk', 'y_gk') of the up to 20 other shots whose shot
    position is closest to the shot of the row (rounded to two decimals).
    The shot itself is never part of its own neighbours. If the dataframe holds fewer than
    21 rows, the mean is taken over the neighbours that exist; a single-row dataframe yields NaN.
    """
    
    # Create previous dataframe
    df = add_alternative_data(df)
    # Define number of similar shots
    N_NEIGHBOURS = 20
    # Create empty lists
    ls_x_gk_knn = []
    ls_y_gk_knn = []
    # Extract shot positions and gk positions once (rows are used by position from here on)
    shot_xy = df[['x_shot', 'y_shot']].to_numpy(dtype=float)
    gk_xy = df[['x_gk', 'y_gk']].to_numpy(dtype=float)
    
    # Iterate over all rows in dataframe
    for row in range(len(df)):
        
        # Calculate distance between shot position and all shot positions
        distances = np.hypot(shot_xy[:, 0] - shot_xy[row, 0], shot_xy[:, 1] - shot_xy[row, 1])
        # Sort rows by ascending distance (stable, so ties keep dataframe order)
        order = np.argsort(distances, kind='stable')
        # Exclude the shot itself explicitly (it is not guaranteed to sort first if another shot has the same position)
        neighbours = order[order != row][:N_NEIGHBOURS]

        # No other shot available
        if len(neighbours) == 0:
            ls_x_gk_knn.append(float('nan'))
            ls_y_gk_knn.append(float('nan'))
            continue

        # Add mean gk coordinates from most similar shots to list
        x_gk_mean, y_gk_mean = gk_xy[neighbours].mean(axis=0)
        ls_x_gk_knn.append(round(x_gk_mean, 2))
        ls_y_gk_knn.append(round(y_gk_mean, 2))
        
    # Add lists to dataframe
    df['x_gk_knn'] = ls_x_gk_knn
    df['y_gk_knn'] = ls_y_gk_knn
    
    # Return dataframe
    return df

In [None]:
def implement_ml_model(df):
    """
    Return dataframe with TOGKP from ml model

    Runs implement_knn_model and mirror_shots, fits a LinearRegression from ('x_shot', 'y_shot')
    to ('x_gk', 'y_gk') and predicts on the same rows. Rows flagged in the 'mirror' column get
    their y values mapped back via 80 - y. Adds 'x_gk_ml' and 'y_gk_ml' (rounded to two decimals),
    rewrites 'y_shot' and 'y_gk' and removes the 'mirror' column.
    """

    # Run previous model in the chain and mirror shot and gk positions
    df = mirror_shots(implement_knn_model(df))

    # Fit on features and targets and predict on the same features
    features = df[['x_shot', 'y_shot']]
    targets = df[['x_gk', 'y_gk']]
    predictions = LinearRegression().fit(features, targets).predict(features)

    # Collect unmirrored values per row
    unmirrored_y_shot, unmirrored_y_gk = [], []
    predicted_xs, predicted_ys = [], []

    for idx in range(len(predictions)):
        was_mirrored = df.at[idx, 'mirror'] == True
        shot_y = df.iloc[idx]['y_shot']
        gk_y = df.iloc[idx]['y_gk']
        predicted_y = predictions[idx][1]

        # Map mirrored y values back
        if was_mirrored:
            shot_y = 80 - shot_y
            gk_y = 80 - gk_y
            predicted_y = 80 - predicted_y

        unmirrored_y_shot.append(shot_y)
        unmirrored_y_gk.append(gk_y)
        predicted_ys.append(round(predicted_y, 2))
        # x coordinate doesn't need to be unmirrored
        predicted_xs.append(round(predictions[idx][0], 2))

    # Helper column is no longer needed
    del df['mirror']

    # Add values to dataframe
    df['y_shot'] = unmirrored_y_shot
    df['y_gk'] = unmirrored_y_gk
    df['x_gk_ml'] = predicted_xs
    df['y_gk_ml'] = predicted_ys

    return df

In [None]:
def add_model_metrics(df):
    """
    Return dataframe with model metrics

    Runs implement_ml_model, add_reaction_distance and add_max_open_goal and then calls each
    per-model metric function once for every name in LS_TOGKP_MODEL_NAMES.
    """

    # Run previous steps of the chain
    df = add_max_open_goal(add_reaction_distance(implement_ml_model(df)))

    # Metric functions in the order in which they are applied
    metric_functions = (
        add_distance_between_gk_and_gk_alt,
        add_distance_between_gk_knn_and_gk_alt,
        add_distance_between_goalline_and_gk_alt,
        add_distance_between_bisector_and_gk_alt,
        add_distance_between_ballline_and_gk_alt,
        add_ratio_between_shot_gk_and_gk_bisector,
    )

    # Apply every metric function to every TOGKP model
    # NOTE(review): the return values are not used, so this presumably relies on the
    # functions adding their columns to df in place - confirm
    for metric_function in metric_functions:
        for gk_name in LS_TOGKP_MODEL_NAMES:
            metric_function(df, gk_name)

    return df

### Other Methods

In [None]:
def plot_vertical_pitch(label: bool, tick: bool):
    """
    Draw vertical StatsBomb pitch with predefined arguments

    label and tick are passed on to VerticalPitch. The view is limited to
    x in [13, 67] and y in [-5, 25].
    """

    # Collect pitch arguments
    pitch_settings = {
        'pitch_type': 'statsbomb',  # Data provider
        'pitch_color': 'white',     # Pitch color
        'line_color': 'grey',       # Line color
        'goal_type': 'box',         # Goal type
        'linewidth': 5,             # Linewidth
        'half': False,              # Don't restrict to half pitch
        'axis': True,               # Show axis
        'label': label,             # Show labels
        'tick': tick,               # Show ticks
    }

    # Draw figure with size (width, height)
    VerticalPitch(**pitch_settings).draw(figsize=(20, 10))

    # Scale plot
    plt.axis('scaled')
    plt.xlim([13, 67])
    plt.ylim([-5, 25])

In [None]:
def plot_one_shot(df, row, gk_name, radius):
    """
    Plot one shot of dataframe

    Draws shot position, gk position from the columns 'x_' + gk_name and 'y_' + gk_name
    (orange for goals, blue otherwise), the other players, shooting cone, ball line, bisector
    and a translucent disc with marker size radius around the gk position, then shows the plot.
    """

    # Draw pitch without labels and ticks
    plot_vertical_pitch(False, False)

    # Shot position: horizontal to vertical coordinates
    x_shot_v, y_shot_v = transform_horizontal_to_vertical_coordinates(df.at[row, 'x_shot'], df.at[row, 'y_shot'])
    plt.scatter(x_shot_v, y_shot_v, color='red', s=s_PITCH, zorder=2)

    # GK position: horizontal to vertical coordinates, color depends on outcome
    x_gk_v, y_gk_v = transform_horizontal_to_vertical_coordinates(df.at[row, 'x_' + gk_name], df.at[row, 'y_' + gk_name])
    gk_color = 'orange' if df.at[row, 'outcome'] == 'Goal' else 'blue'
    plt.scatter(x_gk_v, y_gk_v, color=gk_color, s=s_PITCH, zorder=2)

    # Other player positions
    for location in df.at[row, 'player_locations']:
        x_player_v, y_player_v = transform_horizontal_to_vertical_coordinates(location[0], location[1])
        plt.scatter(x_player_v, y_player_v, color='brown', s=s_PITCH, zorder=2)

    # Legend entries (drawn far outside the visible area)
    legend_entries = (
        ('red', 'Shooting Position'),
        ('orange', 'Goalkeeper Position (Goal)'),
        ('blue', 'Goalkeeper Position (No-Goal)'),
        ('brown', 'Field Player\'s Position'),
    )
    for entry_color, entry_label in legend_entries:
        plt.scatter(1000, 1000, color=entry_color, s=s_PITCH, label=entry_label)

    # Shooting cone: lines from shot position to left and right goalpost
    cone_style = dict(color='red', linewidth=linewidth_PITCH, alpha=0.4, linestyle='dashed', zorder=1)
    plt.plot([x_shot_v, x_LV], [y_shot_v, y_LV], label='Shooting Cone', **cone_style)
    plt.plot([x_shot_v, x_RV], [y_shot_v, y_RV], **cone_style)

    # x values shared by ball line and bisector
    xs = np.linspace(-5, 100, 100)

    # Ball line through shot position and goal centre
    slope_ballline = (y_MV-y_shot_v)/(x_MV-x_shot_v)
    intercept_ballline = y_shot_v - slope_ballline*x_shot_v
    plt.plot(xs, slope_ballline*xs + intercept_ballline, color='blue', linewidth=linewidth_PITCH, alpha=0.4, linestyle='dashed', label='Ball Line', zorder=1)

    # Bisector: negative reciprocal of the stored (horizontal pitch) slope
    # NOTE(review): a stored bisector slope of 0 makes this division undefined
    slope_bisector = -(1/df.at[row, 'bisector_slope'])
    intercept_bisector = y_shot_v - slope_bisector*x_shot_v
    plt.plot(xs, slope_bisector*xs + intercept_bisector, color='green', linewidth=linewidth_PITCH, alpha=0.4, linestyle='dashed', label='Bisector', zorder=1)

    # Dive shadow around gk position
    plt.gca().plot(x_gk_v, y_gk_v, 'o', ms=radius, mec='none', mfc='blue', alpha=0.2, mew=0)

    # Legend and scaling (extend the view for shots further away than y = 20)
    plt.legend(loc='upper left', prop={'size': 20})
    plt.axis('scaled')
    plt.xlim([13, 67])
    plt.ylim([-5, y_shot_v+5] if y_shot_v > 20 else [-5, 25])
    # Show plot
    plt.show()

In [None]:
def plot_all_shots(df):
    """
    Plot all shots of dataframe

    Goals are drawn as red circles, all other outcomes as blue squares.
    """

    # Draw pitch without labels and ticks
    plot_vertical_pitch(False, False)

    for idx in range(len(df)):

        # Shot position transformed from horizontal to vertical pitch
        x_vertical = 80 - df.at[idx, 'y_shot']
        y_vertical = 120 - df.at[idx, 'x_shot']
        # Marker style depends on outcome
        if df.at[idx, 'outcome'] == 'Goal':
            edge_color, shape = 'red', 'o'
        else:
            edge_color, shape = 'blue', 's'
        plt.scatter(x_vertical, y_vertical, marker=shape, facecolors='none', edgecolors=edge_color, s=50, zorder=2)

    # Legend entries (drawn far outside the visible area)
    for shape, edge_color, text in (('o', 'red', 'Shot Position (Goal)'), ('s', 'blue', 'Shot Position (No-Goal)')):
        plt.scatter(1000, 1000, marker=shape, facecolors='none', edgecolors=edge_color, s=s_PITCH, label=text)

    # Legend and scaling
    plt.legend(loc='upper left', prop={'size': 20})
    plt.axis('scaled')
    plt.xlim([-5, 85])
    plt.ylim([-5, 65])

In [None]:
def plot_all_gks(df, gk_name):
    """
    Plot all gk positions

    Positions are read from the columns 'x_' + gk_name and 'y_' + gk_name. For gk_name == 'gk'
    goals are drawn as red circles; everything else is drawn as blue squares.
    """

    # Draw pitch without labels and ticks
    plot_vertical_pitch(False, False)

    # Only actual gk positions are split by outcome
    split_by_outcome = gk_name == 'gk'

    for idx in range(len(df)):

        # GK position transformed from horizontal to vertical pitch
        x_vertical = 80 - df.at[idx, 'y_' + gk_name]
        y_vertical = 120 - df.at[idx, 'x_' + gk_name]
        # Marker style (different for goals)
        edge_color, shape = 'blue', 's'
        if split_by_outcome and df.at[idx, 'outcome'] == 'Goal':
            edge_color, shape = 'red', 'o'
        plt.scatter(x_vertical, y_vertical, marker=shape, facecolors='none', edgecolors=edge_color, s=50, zorder=2)

    # Legend entries (drawn far outside the visible area)
    if split_by_outcome:
        legend_entries = (('o', 'red', 'GK Position (Goal)'), ('s', 'blue', 'GK Position (No-Goal)'))
    else:
        legend_entries = (('s', 'blue', 'Modeled GK Position'),)
    for shape, edge_color, text in legend_entries:
        plt.scatter(1000, 1000, marker = shape, facecolors='none', edgecolors=edge_color, s=s_PITCH, label=text)

    # Legend and scaling
    plt.legend(loc='upper left', prop={'size': 30})
    plt.axis('scaled')
    plt.xlim([17, 63])
    plt.ylim([-3, 19])

In [None]:
def plot_coordinate_system(x, y):
    """
    Plot coordinate system

    Creates a 10x10 figure whose axes cross at the origin and range from -x to x and
    from -y to y (plus a margin of 1), with major ticks every 2 units and minor ticks every unit.
    """

    # Axis ranges and distance between major ticks
    tick_step = 2
    x_low, x_high = -x, x
    y_low, y_high = -y, y

    # Create figure and axes object with white face color
    fig, ax = plt.subplots(figsize=(10, 10))
    fig.patch.set_facecolor('#ffffff')
    # Apply ranges to axes
    ax.set(xlim=(x_low-1, x_high+1), ylim=(y_low-1, y_high+1), aspect='equal')

    # Move bottom and left spines to zero position, hide top and right spines
    for side in ('bottom', 'left'):
        ax.spines[side].set_position('zero')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Major ticks without the origin
    major_x = np.arange(x_low, x_high+1, tick_step)
    major_y = np.arange(y_low, y_high+1, tick_step)
    ax.set_xticks(major_x[major_x != 0])
    ax.set_yticks(major_y[major_y != 0])
    # Minor ticks
    ax.set_xticks(np.arange(x_low, x_high+1), minor=True)
    ax.set_yticks(np.arange(y_low, y_high+1), minor=True)

    # Add a grid
    ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)

In [None]:
def calculate_distance_between_gk_and_gk_alt(df, gk_alt):
    """
    Print mean deviation between gk position and alternative gk position

    df: dataframe with the columns 'x_gk', 'y_gk', 'x_' + gk_alt and 'y_' + gk_alt
    gk_alt: name of the alternative gk position (column suffix)

    Prints the mean signed and the mean absolute deviation in x and y direction
    (rounded to two decimals). Nothing is returned.
    """
    
    # Nothing to average for an empty dataframe
    n_rows = len(df)
    if n_rows == 0:
        print('Deviation from gk position to', gk_alt)
        print('No rows in dataframe')
        return

    # Define gk position relative to alternative gk position
    # (taken from the passed dataframe, not from a module-level dataframe)
    x_relative = df['x_gk'] - df['x_' + gk_alt]
    y_relative = df['y_gk'] - df['y_' + gk_alt]

    # Calculate mean deviation and mean absolute deviation
    # (skipna=False: a missing value makes the result NaN instead of being ignored)
    x_dev = round(x_relative.sum(skipna=False) / n_rows, 2)
    y_dev = round(y_relative.sum(skipna=False) / n_rows, 2)
    x_dev_abs = round(x_relative.abs().sum(skipna=False) / n_rows, 2)
    y_dev_abs = round(y_relative.abs().sum(skipna=False) / n_rows, 2)

    # Display results
    print('Deviation from gk position to', gk_alt)
    print('Deviation in x direction:', x_dev)
    print('Deviation in y direction:', y_dev)
    print('Absolute deviation in x direction:', x_dev_abs)
    print('Absolute deviation in y direction:', y_dev_abs)

In [None]:
def plot_relation(df, column_on_x_axis, column_on_y_axis, y_lim):
    """
    Plot one column on x axis and another one on y axis

    df: dataframe with both columns and an 'outcome' column
    column_on_x_axis, column_on_y_axis: column names, also used as axis labels
    y_lim: upper limit of the y axis (the lower limit is -1)

    Goals are drawn in red, all other outcomes in blue.
    """
    
    # Define color per row (red for goals, blue for no-goals)
    colors = ['red' if outcome == 'Goal' else 'blue' for outcome in df['outcome']]
    # Plot all points with a single call
    if colors:
        plt.scatter(df[column_on_x_axis], df[column_on_y_axis], c=colors)

    # Add axis labels (once, and also for an empty dataframe)
    plt.xlabel(column_on_x_axis)
    plt.ylabel(column_on_y_axis)
    
    plt.ylim([-1, y_lim])

    # Display plot
    plt.show()

In [None]:
def plot_shot_position(plot_text):
    """
    Plot shot position taken from the module-level variables x_shot and y_shot

    plot_text: if true, the letter 'S' is written next to the marker
    """
    plt.scatter(x_shot, y_shot, marker='v', color='red', s=s_PITCH, zorder=2, label='Shot Position (S)')
    if plot_text:
        plt.text(x_shot+0.4, y_shot, 'S', fontsize=fontsize_PITCH)
    
def plot_gk_position(plot_text):
    """
    Plot gk position taken from the module-level variables x_gk and y_gk

    plot_text: if true, the letter 'G' is written next to the marker
    """
    plt.scatter(x_gk, y_gk, marker='^', color='blue', s=s_PITCH, zorder=2, label='GK Position (G)')
    if plot_text:
        plt.text(x_gk+0.4, y_gk, 'G', fontsize=fontsize_PITCH)
    
def plot_goalposts(plot_text):
    """
    Plot both goalposts taken from the module-level variables (x_LV, y_LV) and (x_RV, y_RV)

    plot_text: if true, the letters 'L' and 'R' are written next to the markers
    """
    # Plot left goalpost (carries the legend label for both goalposts)
    plt.scatter(x_LV, y_LV, marker='s', color='black', s=s_PITCH, zorder=2, label='Goalposts (L/R)')
    if plot_text:
        plt.text(x_LV-1, y_LV-1.3, 'L', fontsize=fontsize_PITCH)
    # Plot right goalpost
    plt.scatter(x_RV, y_RV, marker='s', color='black', s=s_PITCH, zorder=2)
    if plot_text:
        plt.text(x_RV+0.4, y_RV-1.3, 'R', fontsize=fontsize_PITCH)
    
def plot_goalcenter(plot_text):
    """
    Plot goal centre taken from the module-level variables x_MV and y_MV

    plot_text: if true, the letter 'C' is written below the marker
    """
    plt.scatter(x_MV, y_MV, marker='D', color='black', s=s_PITCH, zorder=2, label='Goal Centre (C)')
    if plot_text:
        plt.text(x_MV, y_MV-1.3, 'C', fontsize=fontsize_PITCH)
    
def plot_goalline():
    """
    Plot goal line between the module-level goalposts (x_LV, y_LV) and (x_RV, y_RV)
    """
    # Solid red line from left to right goalpost
    plt.plot([x_LV, x_RV], [y_LV, y_RV], linestyle='solid', color='red', linewidth=5, zorder=1, label='Goal Line (LR)')
    
def plot_shot_cone():
    """
    Plot shot cone from the module-level shot position (x_shot, y_shot) to both goalposts
    """
    # Style shared by both lines
    cone_style = dict(linestyle='dashed', color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder=1)
    # Line to left goalpost (carries the legend label)
    plt.plot([x_shot, x_LV], [y_shot, y_LV], label='Shot Cone', **cone_style)
    # Line to right goalpost
    plt.plot([x_shot, x_RV], [y_shot, y_RV], **cone_style)

def plot_ballline():
    """
    Plot ball line through the module-level shot position (x_shot, y_shot) and goal centre (x_MV, y_MV)
    """
    # Linear equation through shot position and goal centre
    slope = (y_shot - y_MV)/(x_shot - x_MV)
    intercept = y_shot - slope*x_shot
    # Evaluate line on a fixed x range and plot it
    xs = np.linspace(-5, 100, 100)
    plt.plot(xs, slope*xs + intercept, linestyle=(0, (3, 1, 1, 1, 1, 1)), color='blue', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder=1, label='Ball Line')

def plot_bisector():
    """
    Plot bisector with slope and intercept from get_bisector_variables
    """
    slope, intercept = get_bisector_variables()
    # Evaluate line on a fixed x range and plot it
    xs = np.linspace(-5, 100, 100)
    plt.plot(xs, slope*xs + intercept, linestyle='dashdot', color='green', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder=1, label='Bisector')

def get_bisector_variables():
    """
    Return slope and intercept (m_bisector, q_bisector) of the shot-cone bisector, y = m*x + q

    x_shot, y_shot = origin = shot position
    x_LV, y_LV = point = left goalpost position
    x_RV, y_RV = right goalpost position
    All six values are read from the global state

    The opening angle is measured between the two rays shot -> left post and shot -> right post
    with atan2. Comparing atan() of the two slopes only works while both rays lie on the same side
    of the vertical; for a shot between the posts it yields the supplementary angle and a wrong
    bisector. Working on direction vectors also avoids a division by zero when the shot is exactly
    in line with a post.

    An exactly vertical bisector has no slope/intercept form and raises ZeroDivisionError;
    a numerically almost vertical one yields a very large slope.
    """

    # Direction vectors from the shot position to both goalposts
    dx_left, dy_left = x_LV - x_shot, y_LV - y_shot
    dx_right, dy_right = x_RV - x_shot, y_RV - y_shot

    # Signed angle (radians) from the left ray to the right ray, positive = counterclockwise
    cross = dx_left*dy_right - dy_left*dx_right
    dot = dx_left*dx_right + dy_left*dy_right
    # Half of it rotates the left ray onto the bisector
    ang = math.atan2(cross, dot)/2

    # Rotate the left ray by the half angle (towards the right post) to get the bisector direction
    dx_bisector = math.cos(ang)*dx_left - math.sin(ang)*dy_left
    dy_bisector = math.sin(ang)*dx_left + math.cos(ang)*dy_left

    # Calculate linear equation of bisector through the shot position
    m_bisector = dy_bisector/dx_bisector
    q_bisector = y_shot - m_bisector*x_shot

    # Return results
    return m_bisector, q_bisector

def plot_zone(color, add_label, label):
    """
    Fill the polygon given by the global lists x and y with a light version of color
    add_label = True additionally creates a legend entry named label
    """
    plt.fill(x, y, color=color, alpha=alpha_PITCH_LIGHT)
    if add_label != True:
        return
    # Dummy marker at (1000, 1000) that only exists to carry the legend entry
    plt.scatter(1000, 1000, marker='v', color=color, s=s_ZONES, alpha=alpha_PITCH_LIGHT, label=label)

### Datasets

In [None]:
# Build the men's dataframe from the event files, then pass it through implement_gk_arc_model
df_final_men = create_dataframe(LS_FILENAME_MEN)
df_final_men = implement_gk_arc_model(df_final_men)
# Report the size and preview the first row
print('Number of rows:', len(df_final_men))
df_final_men.head(1)

In [None]:
# Build the women's dataframe from the event files, then pass it through implement_gk_arc_model
df_final_women = create_dataframe(LS_FILENAME_WOMEN)
df_final_women = implement_gk_arc_model(df_final_women)
# Report the size and preview the first row
print('Number of rows:', len(df_final_women))
df_final_women.head(1)

In [None]:
# Build the men's 1v1 dataframe: event data passed through add_model_metrics
df_1v1_men = create_dataframe(LS_FILENAME_MEN)
df_1v1_men = add_model_metrics(df_1v1_men)
# Keep and reorder the columns below. Excluded on purpose: 'possession_team', 'shot_body_part',
# 'shot_technique', 'ratio_shotgk_bisectorgk', 'ratio_shotgk_knn_bisectorgk_knn',
# 'ratio_shotgk_zonal_bisectorgk_zonal', 'ratio_shotgk_line_bisectorgk_line',
# 'ratio_shotgk_arc_bisectorgk_arc', 'ratio_shotgk_bisector_bisectorgk_bisector',
# 'ratio_shotgk_ml_bisectorgk_ml'
df_1v1_men = df_1v1_men[[
    # Shot description
    'shot', 'outcome', 'statsbomb_xg', 'goalkeeper', 'goalkeeper_id',
    # Shot position, actual GK position and modeled GK positions
    'x_shot', 'y_shot',
    'x_gk', 'y_gk',
    'x_gk_zonal', 'y_gk_zonal',
    'x_gk_arc', 'y_gk_arc',
    'x_gk_line', 'y_gk_line',
    'x_gk_bisector_dive', 'y_gk_bisector_dive',
    'x_gk_bisector_wingspan', 'y_gk_bisector_wingspan',
    'x_gk_inscribed', 'y_gk_inscribed',
    'x_gk_knn', 'y_gk_knn',
    'x_gk_ml', 'y_gk_ml',
    # Distances to the actual GK position
    'distance_between_gk_and_gk',
    'distance_between_gk_and_gk_knn',
    'distance_between_gk_and_gk_zonal',
    'distance_between_gk_and_gk_line',
    'distance_between_gk_and_gk_arc',
    'distance_between_gk_and_gk_bisector_wingspan',
    'distance_between_gk_and_gk_bisector_dive',
    'distance_between_gk_and_gk_inscribed',
    'distance_between_gk_and_gk_ml',
    # Distances to the k-NN GK position
    'distance_between_gk_knn_and_gk',
    'distance_between_gk_knn_and_gk_knn',
    'distance_between_gk_knn_and_gk_zonal',
    'distance_between_gk_knn_and_gk_line',
    'distance_between_gk_knn_and_gk_arc',
    'distance_between_gk_knn_and_gk_bisector_wingspan',
    'distance_between_gk_knn_and_gk_bisector_dive',
    'distance_between_gk_knn_and_gk_inscribed',
    'distance_between_gk_knn_and_gk_ml',
    # Distances to the goal line
    'distance_between_goalline_and_gk',
    'distance_between_goalline_and_gk_zonal',
    'distance_between_goalline_and_gk_arc',
    'distance_between_goalline_and_gk_line',
    'distance_between_goalline_and_gk_bisector_dive',
    'distance_between_goalline_and_gk_bisector_wingspan',
    'distance_between_goalline_and_gk_inscribed',
    'distance_between_goalline_and_gk_knn',
    'distance_between_goalline_and_gk_ml',
    # Distances to the bisector
    'distance_between_bisector_and_gk',
    'distance_between_bisector_and_gk_zonal',
    'distance_between_bisector_and_gk_arc',
    'distance_between_bisector_and_gk_line',
    'distance_between_bisector_and_gk_bisector_wingspan',
    'distance_between_bisector_and_gk_bisector_dive',
    'distance_between_bisector_and_gk_inscribed',
    'distance_between_bisector_and_gk_knn',
    'distance_between_bisector_and_gk_ml',
    # Distances to the ball line
    'distance_between_ballline_and_gk',
    'distance_between_ballline_and_gk_zonal',
    'distance_between_ballline_and_gk_arc',
    'distance_between_ballline_and_gk_line',
    'distance_between_ballline_and_gk_bisector_wingspan',
    'distance_between_ballline_and_gk_bisector_dive',
    'distance_between_ballline_and_gk_inscribed',
    'distance_between_ballline_and_gk_knn',
    'distance_between_ballline_and_gk_ml',
    # Angles, shooting cone and remaining metrics
    'bisector_slope', 'shooting_angle', 'goal_angle', 'gk_angle', 'angular_deviation',
    'player_locations', 'shooting_cone_players', 'shooting_cone_players_count', 'shooting_cone_without_gk',
    'distance_between_shot_and_goalline', 'distance_between_shot_and_goalcenter',
    'reaction_distance', 'max_open_goal',
]]
# Report the size and preview the first row
print('Number of rows:', len(df_1v1_men))
df_1v1_men.head(1)

In [None]:
# Build the women's 1v1 dataframe: event data passed through add_model_metrics
df_1v1_women = create_dataframe(LS_FILENAME_WOMEN)
df_1v1_women = add_model_metrics(df_1v1_women)
# Keep and reorder the columns below. Excluded on purpose: 'possession_team', 'shot_body_part',
# 'shot_technique', 'ratio_shotgk_bisectorgk', 'ratio_shotgk_knn_bisectorgk_knn',
# 'ratio_shotgk_zonal_bisectorgk_zonal', 'ratio_shotgk_line_bisectorgk_line',
# 'ratio_shotgk_arc_bisectorgk_arc', 'ratio_shotgk_bisector_bisectorgk_bisector',
# 'ratio_shotgk_ml_bisectorgk_ml'
df_1v1_women = df_1v1_women[[
    # Shot description
    'shot', 'outcome', 'statsbomb_xg', 'goalkeeper', 'goalkeeper_id',
    # Shot position, actual GK position and modeled GK positions
    'x_shot', 'y_shot',
    'x_gk', 'y_gk',
    'x_gk_zonal', 'y_gk_zonal',
    'x_gk_arc', 'y_gk_arc',
    'x_gk_line', 'y_gk_line',
    'x_gk_bisector_dive', 'y_gk_bisector_dive',
    'x_gk_bisector_wingspan', 'y_gk_bisector_wingspan',
    'x_gk_inscribed', 'y_gk_inscribed',
    'x_gk_knn', 'y_gk_knn',
    'x_gk_ml', 'y_gk_ml',
    # Distances to the actual GK position
    'distance_between_gk_and_gk',
    'distance_between_gk_and_gk_knn',
    'distance_between_gk_and_gk_zonal',
    'distance_between_gk_and_gk_line',
    'distance_between_gk_and_gk_arc',
    'distance_between_gk_and_gk_bisector_wingspan',
    'distance_between_gk_and_gk_bisector_dive',
    'distance_between_gk_and_gk_inscribed',
    'distance_between_gk_and_gk_ml',
    # Distances to the k-NN GK position
    'distance_between_gk_knn_and_gk',
    'distance_between_gk_knn_and_gk_knn',
    'distance_between_gk_knn_and_gk_zonal',
    'distance_between_gk_knn_and_gk_line',
    'distance_between_gk_knn_and_gk_arc',
    'distance_between_gk_knn_and_gk_bisector_wingspan',
    'distance_between_gk_knn_and_gk_bisector_dive',
    'distance_between_gk_knn_and_gk_inscribed',
    'distance_between_gk_knn_and_gk_ml',
    # Distances to the goal line
    'distance_between_goalline_and_gk',
    'distance_between_goalline_and_gk_zonal',
    'distance_between_goalline_and_gk_arc',
    'distance_between_goalline_and_gk_line',
    'distance_between_goalline_and_gk_bisector_dive',
    'distance_between_goalline_and_gk_bisector_wingspan',
    'distance_between_goalline_and_gk_inscribed',
    'distance_between_goalline_and_gk_knn',
    'distance_between_goalline_and_gk_ml',
    # Distances to the bisector
    'distance_between_bisector_and_gk',
    'distance_between_bisector_and_gk_zonal',
    'distance_between_bisector_and_gk_arc',
    'distance_between_bisector_and_gk_line',
    'distance_between_bisector_and_gk_bisector_wingspan',
    'distance_between_bisector_and_gk_bisector_dive',
    'distance_between_bisector_and_gk_inscribed',
    'distance_between_bisector_and_gk_knn',
    'distance_between_bisector_and_gk_ml',
    # Distances to the ball line
    'distance_between_ballline_and_gk',
    'distance_between_ballline_and_gk_zonal',
    'distance_between_ballline_and_gk_arc',
    'distance_between_ballline_and_gk_line',
    'distance_between_ballline_and_gk_bisector_wingspan',
    'distance_between_ballline_and_gk_bisector_dive',
    'distance_between_ballline_and_gk_inscribed',
    'distance_between_ballline_and_gk_knn',
    'distance_between_ballline_and_gk_ml',
    # Angles, shooting cone and remaining metrics
    'bisector_slope', 'shooting_angle', 'goal_angle', 'gk_angle', 'angular_deviation',
    'player_locations', 'shooting_cone_players', 'shooting_cone_players_count', 'shooting_cone_without_gk',
    'distance_between_shot_and_goalline', 'distance_between_shot_and_goalcenter',
    'reaction_distance', 'max_open_goal',
]]
# Report the size and preview the first row
print('Number of rows:', len(df_1v1_women))
df_1v1_women.head(1)

### Testing

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = x_SV, y_SV
# Define gk position (global; not drawn in this cell)
x_gk, y_gk = 41.78, 3.58

# Plot gk arc as orange points
# NOTE(review): this call passes only r, h, k - confirm that the semicircle definition in scope
# when this cell runs accepts three arguments
x, y = semicircle(13.5, 40, 0)
plt.scatter(x, y, s=5, c='orange')
# NOTE(review): x and y are overwritten here but not used again in this cell;
# linspace(100, 100, 100) is a constant array - confirm whether these two lines are still needed
x = np.linspace(100, 100, 100)
y = 0.5*x + 100

# Save figure (legend intentionally disabled)
# plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/harrison.png', bbox_inches='tight')

### Test Pitches

In [None]:
# Plot a single shot (row 16) of the men's 1v1 data
# NOTE(review): df_1v1_men keeps 'x_gk_inscribed' / 'y_gk_inscribed' but no '*_gk_bisector_inscribed' columns -
# confirm that plot_one_shot still accepts the key 'gk_bisector_inscribed'
plot_one_shot(df_1v1_men, 16, 'gk_bisector_inscribed', 120)

In [None]:
# Plot all shot positions of the men's 1v1 data
plot_all_shots(df_1v1_men)

In [None]:
# Plot all actual GK positions ('gk') of the men's 1v1 data
plot_all_gks(df_1v1_men, 'gk')

### Test Relations

In [None]:
# Men: shooting_angle against distance_between_shot_and_goalline
plot_relation(df_1v1_men, 'shooting_angle', 'distance_between_shot_and_goalline', 60)

In [None]:
# Men: distance_between_shot_and_goalline against distance_between_goalline_and_gk
plot_relation(df_1v1_men, 'distance_between_shot_and_goalline', 'distance_between_goalline_and_gk', 14)

In [None]:
# Women: distance_between_shot_and_goalline against distance_between_goalline_and_gk
plot_relation(df_1v1_women, 'distance_between_shot_and_goalline', 'distance_between_goalline_and_gk', 22)

In [None]:
# Men: goal_angle against gk_angle
plot_relation(df_1v1_men, 'goal_angle', 'gk_angle', 90)

In [None]:
# Women: goal_angle against gk_angle
plot_relation(df_1v1_women, 'goal_angle', 'gk_angle', 90)

In [None]:
# Men: distance_between_goalline_and_gk against distance_between_bisector_and_gk
plot_relation(df_1v1_men, 'distance_between_goalline_and_gk', 'distance_between_bisector_and_gk', 7)

In [None]:
# Women: distance_between_goalline_and_gk against distance_between_bisector_and_gk
plot_relation(df_1v1_women, 'distance_between_goalline_and_gk', 'distance_between_bisector_and_gk', 7)

### xG vs Indicators

Plots:

In [None]:
# xG against the GK's distance to the bisector, men first, then women
plot_relation(df_1v1_men, 'statsbomb_xg', 'distance_between_bisector_and_gk', 7)
plot_relation(df_1v1_women, 'statsbomb_xg', 'distance_between_bisector_and_gk', 8)

In [None]:
# xG against the GK's distance to the ball line, men first, then women
plot_relation(df_1v1_men, 'statsbomb_xg', 'distance_between_ballline_and_gk', 7)
plot_relation(df_1v1_women, 'statsbomb_xg', 'distance_between_ballline_and_gk', 8)

In [None]:
# xG against the column distance_between_gk_knn_and_gk, men first, then women
plot_relation(df_1v1_men, 'statsbomb_xg', 'distance_between_gk_knn_and_gk', 12)
plot_relation(df_1v1_women, 'statsbomb_xg', 'distance_between_gk_knn_and_gk', 12)

In [None]:
# xG against the distance between actual GK position and inscribed-model GK position, men first, then women
# The column selection of df_1v1_men / df_1v1_women keeps this distance as
# 'distance_between_gk_and_gk_inscribed'; there is no '..._gk_bisector_inscribed' column in those dataframes
plot_relation(df_1v1_men, 'statsbomb_xg', 'distance_between_gk_and_gk_inscribed', 15)
plot_relation(df_1v1_women, 'statsbomb_xg', 'distance_between_gk_and_gk_inscribed', 20)

In [None]:
# xG against the column distance_between_gk_and_gk_line, men first, then women
plot_relation(df_1v1_men, 'statsbomb_xg', 'distance_between_gk_and_gk_line', 15)
plot_relation(df_1v1_women, 'statsbomb_xg', 'distance_between_gk_and_gk_line', 15)

Correlation:

In [None]:
# Men: xG and the three indicator columns as plain Python lists
ls_xg_men = df_1v1_men['statsbomb_xg'].tolist()
ls_distance_between_bisector_and_gk_men = df_1v1_men['distance_between_bisector_and_gk'].tolist()
ls_distance_between_goalline_and_gk_men = df_1v1_men['distance_between_goalline_and_gk'].tolist()
ls_distance_between_gk_and_gk_knn_men = df_1v1_men['distance_between_gk_and_gk_knn'].tolist()

# Women: same four columns
ls_xg_women = df_1v1_women['statsbomb_xg'].tolist()
ls_distance_between_bisector_and_gk_women = df_1v1_women['distance_between_bisector_and_gk'].tolist()
ls_distance_between_goalline_and_gk_women = df_1v1_women['distance_between_goalline_and_gk'].tolist()
ls_distance_between_gk_and_gk_knn_women = df_1v1_women['distance_between_gk_and_gk_knn'].tolist()

In [None]:
# Pearson correlation between xG and each indicator, printed per dataset
# NOTE(review): the second line is labelled 'distance_ballline' but is computed from the
# goal-line distance list - confirm which of the two is intended
correlation_inputs = (
    ('Men:', ls_xg_men, ls_distance_between_bisector_and_gk_men, ls_distance_between_goalline_and_gk_men, ls_distance_between_gk_and_gk_knn_men),
    ('Women:', ls_xg_women, ls_distance_between_bisector_and_gk_women, ls_distance_between_goalline_and_gk_women, ls_distance_between_gk_and_gk_knn_women),
)
for position, (heading, ls_xg, ls_bisector, ls_goalline, ls_knn) in enumerate(correlation_inputs):
    # Blank line between the two datasets
    if position > 0:
        print()
    print(heading)
    print('xg and distance_bisector:', round(np.corrcoef(ls_xg, ls_bisector)[0, 1], 4))
    print('xg and distance_ballline:', round(np.corrcoef(ls_xg, ls_goalline)[0, 1], 4))
    print('xg and distance_gk_knn:', round(np.corrcoef(ls_xg, ls_knn)[0, 1], 4))

### Test Relative Positions

Plots:

In [None]:
# Scatter two value lists against each other and add the least-squares line
def plot_xy_linreg(ls_x, ls_y, x_axis, y_axis):
    """
    ls_x, ls_y = values for the x and y axis
    x_axis, y_axis = axis labels
    Draws on a new 10x10 figure; both axes are fixed to the range 0 to 26
    """
    x_values = np.array(ls_x)
    y_values = np.array(ls_y)

    # New square figure that the following pyplot calls draw into
    plt.subplots(figsize=(10, 10))

    # Data points with axis labels
    plt.plot(x_values, y_values, 'o')
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)

    # Line of best fit (first-degree polynomial)
    slope, intercept = np.polyfit(x_values, y_values, 1)
    plt.plot(x_values, slope*x_values + intercept)

    # Equal scaling and fixed limits on both axes
    plt.axis('scaled')
    plt.xlim([0, 26])
    plt.ylim([0, 26])

In [None]:
# GSAA ranking against the bisector-distance ranking, with regression line
# NOTE(review): ls_gsaa_ranking and ls_distance_between_bisector_and_gk_gsaa are not defined in the
# visible part of the notebook - confirm they exist before this cell runs
plot_xy_linreg(ls_gsaa_ranking, ls_distance_between_bisector_and_gk_gsaa, 'GSAA (Ranking)', 'Distance Between GK and Bisector (Ranking)')

# Save plot
plt.savefig('images/correlation_bisector.png')

In [None]:
# GSAA ranking against a second distance ranking, with regression line
# NOTE(review): the axis label says 'Ball Line' but the list is named after the goal-line distance -
# confirm which of the two is intended
plot_xy_linreg(ls_gsaa_ranking, ls_distance_between_goalline_and_gk_gsaa, 'GSAA (Ranking)', 'Distance Between GK and Ball Line (Ranking)')

# Save plot
plt.savefig('images/correlation_ballline.png')

In [None]:
# GSAA ranking against the k-NN-distance ranking, with regression line
plot_xy_linreg(ls_gsaa_ranking, ls_distance_between_gk_and_gk_knn_gsaa, 'GSAA (Ranking)', 'Distance Between GK and k-NN (Ranking)')

# Save plot
plt.savefig('images/correlation_knn.png')

### Plot Relative GK Positions

In [None]:
def plot_relative_gk(df, gk_centered, gk_relative, x, y):
    """
    Plot GK position relative to centered optimal GK position

    df = dataframe with an 'outcome' column and 'x_<name>' / 'y_<name>' columns for both positions
    gk_centered = column suffix of the position placed at the origin (e.g. 'gk_knn')
    gk_relative = column suffix of the position drawn relative to it (e.g. 'gk')
    x, y = arguments passed on to plot_coordinate_system

    Non-goals are drawn first as blue squares, goals on top as red circles
    Prints the percentage of rows with a negative vertical offset (0.0 for an empty dataframe)
    """

    # Apply function
    plot_coordinate_system(x, y)

    # Offsets for all rows at once: the difference of the 'x_' columns goes on the vertical axis,
    # the difference of the 'y_' columns on the horizontal axis
    offset_vertical = df['x_' + gk_centered] - df['x_' + gk_relative]
    offset_horizontal = df['y_' + gk_centered] - df['y_' + gk_relative]
    is_goal = df['outcome'] == 'Goal'

    # One scatter call per group instead of one per row; works for any dataframe index
    plt.scatter(offset_horizontal[~is_goal], offset_vertical[~is_goal], marker='s', facecolors='none', color='blue')
    plt.scatter(offset_horizontal[is_goal], offset_vertical[is_goal], marker='o', facecolors='none', color='red')

    # Share of rows (goals and non-goals) with a negative vertical offset
    vertical_negatives = int((offset_vertical < 0).sum())
    share_negative = round(vertical_negatives/len(df)*100, 2) if len(df) > 0 else 0.0
    print(share_negative, '% negative y-value')

    # Dummy markers at (1000, 1000) that only exist to carry the legend entries
    plt.scatter(1000, 1000, marker = 'o', facecolors='none', color='red', label='Relative GK Position (Goal)')
    plt.scatter(1000, 1000, marker = 's', facecolors='none', color='blue', label='Relative GK Position (No-Goal)')
    # plt.legend(loc='upper left', prop={'size': 15})

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_inscribed' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_inscribed', 'gk', 8, 8)
plt.savefig('images/relative_gk_inscribed_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_inscribed' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_inscribed', 'gk', 8, 8)
plt.savefig('images/relative_gk_inscribed_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_knn' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_knn', 'gk', 8, 8)
plt.savefig('images/relative_gk_knn_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_knn' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_knn', 'gk', 8, 8)
plt.savefig('images/relative_gk_knn_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_ml' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_ml', 'gk', 8, 8)
plt.savefig('images/relative_gk_ml_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_ml' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_ml', 'gk', 8, 8)
plt.savefig('images/relative_gk_ml_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_arc' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_arc', 'gk', 7, 7)
plt.savefig('images/relative_gk_arc_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_arc' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_arc', 'gk', 7, 7)
plt.savefig('images/relative_gk_arc_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_line' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_line', 'gk', 9, 9)
plt.savefig('images/relative_gk_line_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_line' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_line', 'gk', 9, 9)
plt.savefig('images/relative_gk_line_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_bisector_dive' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_bisector_dive', 'gk', 9, 9)
plt.savefig('images/relative_gk_bisector_dive_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_bisector_dive' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_bisector_dive', 'gk', 9, 9)
plt.savefig('images/relative_gk_bisector_dive_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_zonal' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_zonal', 'gk', 7, 7)
plt.savefig('images/relative_gk_zonal_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_zonal' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_zonal', 'gk', 7, 7)
plt.savefig('images/relative_gk_zonal_women.png', bbox_inches='tight')

In [None]:
# Men: actual GK position ('gk') relative to the 'gk_bisector_wingspan' position, then save the figure
plot_relative_gk(df_1v1_men, 'gk_bisector_wingspan', 'gk', 12, 12)
plt.savefig('images/relative_gk_bisector_wingspan_men.png', bbox_inches='tight')

In [None]:
# Women: actual GK position ('gk') relative to the 'gk_bisector_wingspan' position, then save the figure
plot_relative_gk(df_1v1_women, 'gk_bisector_wingspan', 'gk', 12, 12)
plt.savefig('images/relative_gk_bisector_wingspan_women.png', bbox_inches='tight')

# 0. OTHERS <a name='others'></a>

### 0.1 Title Page <a name='title'></a>

In [None]:
# Show the rows where Switzerland had possession, Hugo Lloris was the goalkeeper and the outcome was a goal
# (where() blanks all other rows, dropna() then removes every row that contains a missing value)
df_final_men.where((df_final_men['possession_team'] == 'Switzerland') & (df_final_men['goalkeeper'] == 'Hugo Lloris') & (df_final_men['outcome'] == 'Goal')).dropna()

In [None]:
# Draw the empty vertical pitch
plot_vertical_pitch(False, False)

# Shot shown on the title page
df = df_final_men.copy()
row = 286

# Shot and GK position, transformed from horizontal to vertical pitch
x_shot, y_shot = 80 - df.at[row, 'y_shot'], 120 - df.at[row, 'x_shot']
x_gk, y_gk = 80 - df.at[row, 'y_gk'], 120 - df.at[row, 'x_gk']

# Shot marker without its built-in text, GK marker with text
plot_shot_position(False)
plot_gk_position(True)

# Shot letter at a custom offset
plt.text(x_shot+0.5, y_shot-0.5, 'S', fontsize=fontsize_PITCH)

# Other players, transformed the same way as shot and GK
for player in df.at[row, 'player_locations']:
    x_player = 80 - player[1]
    y_player = 120 - player[0]
    plt.scatter(x_player, y_player, facecolors='none', edgecolors='green', s=s_PITCH, zorder=2)
# Dummy marker at (1000, 1000) that only carries the legend entry
plt.scatter(1000, 1000, facecolors='none', edgecolors='green', s=s_PITCH, label='Field Player')

# Goal and shot cone
plot_goalposts(True)
plot_goalline()
plot_shot_cone()

# Dive shadow: translucent blue disc around the GK, plus its legend entry
plt.gca().plot(x_gk, y_gk, 'o', ms=130, mfc='blue', alpha=0.3)
plt.scatter(1000, 1000, color='blue', s=s_ZONES, alpha=0.3, zorder=1, label='Dive Shadow')

# Goal probability: translucent red disc with the percentage written into it
x_xg, y_xG = 63, 22
plt.gca().plot(x_xg+1.4, y_xG+0.45, 'o', ms=90, mfc='red', alpha=0.3)
plt.scatter(1000, 1000, color='red', s=s_ZONES, alpha=0.3, zorder=1, label='Goal Probability')
plt.text(x_xg, y_xG, '18%', fontsize=fontsize_PITCH)

# Legend and export
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/title_page.png', bbox_inches='tight')

# 1. INTRODUCTION <a name='introduction'></a>

Total number of games and average number of events per game:

In [None]:
# Count the men's event files and the events they contain
filecount = 0
eventcount = 0

# Iterate over all men's filenames
for filename in LS_FILENAME_MEN:

    # Only JSON files are event files; endswith() also copes with names without a dot
    # or with several dots, where split('.')[1] fails or looks at the wrong part
    if not filename.endswith('.json'):
        continue

    # Create dataframe from current json file
    filepath = os.path.join(DIRECTORY_EVENTS, filename)
    df_file = pd.read_json(filepath)
    # Update counters
    eventcount += len(df_file)
    filecount += 1

# Calculate average number of events per game (0 if no JSON file was found, instead of dividing by zero)
avg_events = round(eventcount / filecount, 2) if filecount else 0

# Display results
print('Men:')
print('Total number of games:', filecount)
print('Average number of events per game:', avg_events)

In [None]:
# Count the women's event files and the events they contain
filecount = 0
eventcount = 0

# Iterate over all women's filenames
for filename in LS_FILENAME_WOMEN:

    # Only JSON files are event files; endswith() also copes with names without a dot
    # or with several dots, where split('.')[1] fails or looks at the wrong part
    if not filename.endswith('.json'):
        continue

    # Create dataframe from current json file
    filepath = os.path.join(DIRECTORY_EVENTS, filename)
    df_file = pd.read_json(filepath)
    # Update counters
    eventcount += len(df_file)
    filecount += 1

# Calculate average number of events per game (0 if no JSON file was found, instead of dividing by zero)
avg_events = round(eventcount / filecount, 2) if filecount else 0

# Display results
print('Women:')
print('Total number of games:', filecount)
print('Average number of events per game:', avg_events)

Total number and percentage of games with a goal difference of zero or one:

In [None]:
# Create dataframe with one row per match
df_matches = pd.read_json(MATCHPATH)

# Calculate absolute goal difference per match
df_matches['goal_difference'] = (df_matches['home_score'] - df_matches['away_score']).abs()

# Number of matches decided by at most one goal (rows with a missing score do not count)
goal_difference = int((df_matches['goal_difference'] <= 1).sum())

# The percentage refers to the matches loaded here, not to the file counter left over from
# whichever counting cell ran last
match_count = len(df_matches)
percentage = round(100*goal_difference/match_count, 2) if match_count else 0

# Display results
print('Total number of games with one or zero goals difference:', goal_difference)
print('Percentage of games with one or zero goals difference:', percentage, '%')

Total number of shots:

In [None]:
# Dataframes with all events of both datasets
df_all_men = create_dataframe(LS_FILENAME_MEN)
df_all_women = create_dataframe(LS_FILENAME_WOMEN)

# Number of rows with a non-missing 'shot' entry (count() already skips missing values)
print('Men:')
print(df_all_men['shot'].count())
print()
print('Women:')
print(df_all_women['shot'].count())

Total number of deflections and set pieces:

In [None]:
# Keep only rows without any missing value and renumber them
df_all_shots_men = df_all_men.dropna().reset_index(drop=True)

# A shot counts as deflected when its shot entry has a 'deflected' key,
# and as a set piece when its type name is anything but 'Open Play'
deflections = sum(1 for shot in df_all_shots_men['shot'] if 'deflected' in shot)
set_pieces = sum(1 for shot in df_all_shots_men['shot'] if shot['type']['name'] != 'Open Play')

# Display results
print('Men:')
print('Total number of deflections:', deflections)
print('Total number of set pieces:', set_pieces)

In [None]:
# Keep only rows without any missing value and renumber them
df_all_shots_women = df_all_women.dropna().reset_index(drop=True)

# A shot counts as deflected when its shot entry has a 'deflected' key,
# and as a set piece when its type name is anything but 'Open Play'
deflections = sum(1 for shot in df_all_shots_women['shot'] if 'deflected' in shot)
set_pieces = sum(1 for shot in df_all_shots_women['shot'] if shot['type']['name'] != 'Open Play')

# Display results
print('Women:')
print('Total number of deflections:', deflections)
print('Total number of set pieces:', set_pieces)

Total number of open-shots:

In [None]:
# Shots with no player inside the shooting cone; rows containing missing values are dropped before counting
print('Men:')
print(df_final_men[df_final_men['shooting_cone_players_count'] == 0].dropna()['shot'].count())
print()
print('Women:')
print(df_final_women[df_final_women['shooting_cone_players_count'] == 0].dropna()['shot'].count())

Total number of undefined situations:

In [None]:
# All open-goal-situations: no player in the shooting cone and no missing value in the row
df_open_men = df_final_men.where(df_final_men['shooting_cone_players_count'] == 0).dropna()

# A situation counts as undefined when the zonal, line or arc model has x-coordinate 0
count_undefined = int((
    (df_open_men['x_gk_zonal'] == 0)
    | (df_open_men['x_gk_line'] == 0)
    | (df_open_men['x_gk_arc'] == 0)
).sum())

print('Men:', count_undefined)

In [None]:
# All open-goal-situations: no player in the shooting cone and no missing value in the row
df_open_women = df_final_women.where(df_final_women['shooting_cone_players_count'] == 0).dropna()

# A situation counts as undefined when the zonal, line or arc model has x-coordinate 0
count_undefined = int((
    (df_open_women['x_gk_zonal'] == 0)
    | (df_open_women['x_gk_line'] == 0)
    | (df_open_women['x_gk_arc'] == 0)
).sum())

print('Women:', count_undefined)

Percentage of off target shots:

In [None]:

# NOTE(review): df_1v1 is not defined in the visible part of the notebook (only df_1v1_men and
# df_1v1_women are) - confirm which dataframe is meant
# Outcomes that are counted as off target here
off_target_outcomes = ['Off T', 'Blocked', 'Wayward', 'Saved Off Target', 'Post']
shots_off_target = df_1v1['outcome'].isin(off_target_outcomes).sum()
shots_total = df_1v1['outcome'].count()

print('Percentage of off target shots:', round(100*shots_off_target/shots_total, 2), '%')

All shot positions from open-goal-situations:

In [None]:
# Plot all shot positions of the men's 1v1 data
plot_all_shots(df_1v1_men)
# Save plot
plt.savefig('images/all_shots_men.png', bbox_inches='tight')

In [None]:
# Plot all shot positions of the women's 1v1 data
plot_all_shots(df_1v1_women)
# Save plot
plt.savefig('images/all_shots_women.png', bbox_inches='tight')

All actual gk positions from open-goal-situations:

In [None]:
# Plot all actual GK positions ('gk') of the men's 1v1 data
plot_all_gks(df_1v1_men, 'gk')
# Save plot
plt.savefig('images/all_gks_men.png', bbox_inches='tight')

In [None]:
# Plot all actual GK positions ('gk') of the women's 1v1 data
plot_all_gks(df_1v1_women, 'gk')
# Save plot
plt.savefig('images/all_gks_women.png', bbox_inches='tight')

Open-goal-situation:

In [None]:
# Draw the empty vertical pitch
plot_vertical_pitch(False, False)

# Example shot from the men's 1v1 data
df = df_1v1_men.copy()
row = 6

# Shot and GK position on the vertical pitch (y becomes x, x is measured from the other end)
x_shot, y_shot = df.at[row, 'y_shot'], 120 - df.at[row, 'x_shot']
x_gk, y_gk = df.at[row, 'y_gk'], 120 - df.at[row, 'x_gk']

# Shot and GK markers with their text
plot_shot_position(True)
plot_gk_position(True)

# Other players, transformed the same way as shot and GK
for player in df.at[row, 'player_locations']:
    x_player = player[1]
    y_player = 120 - player[0]
    plt.scatter(x_player, y_player, facecolors='none', edgecolors='green', s=s_PITCH, zorder=2)
# Dummy marker at (1000, 1000) that only carries the legend entry
plt.scatter(1000, 1000, facecolors='none', edgecolors='green', s=s_PITCH, label='Field Player')

# Goal and shot cone
plot_goalposts(True)
plot_goalline()
plot_shot_cone()

# Legend and export
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/open_goal_situation.png', bbox_inches='tight')

Non-open-goal-situation:

In [None]:
# Apply function
plot_vertical_pitch(False, False)

# Define dataframe and row
df = df_final_men.copy()
row = 16

# Define and plot shot position transformed from horizontal to vertical pitch
x_shot = 80 - df.at[row, 'y_shot']
y_shot = 120 - df.at[row, 'x_shot']
# Define gk position transformed from horizontal to vertical pitch
x_gk = 80 - df.at[row, 'y_gk']
y_gk = 120 - df.at[row, 'x_gk']

# Apply method
plot_shot_position(True)
plot_gk_position(True)

# Plot other player positions
# They get the same transformation as shot and GK (including the 80 - y mirroring);
# otherwise the players appear on the wrong side of the shot cone
for player in df.at[row, 'player_locations']:
    x_player = 80 - player[1]
    y_player = 120 - player[0]
    plt.scatter(x_player, y_player, facecolors='none', edgecolors='green', s=s_PITCH, zorder=2)
# Dummy marker at (1000, 1000) that only carries the legend entry
plt.scatter(1000, 1000, facecolors='none', edgecolors='green', s=s_PITCH, label='Field Player')

# Apply method
plot_goalposts(True)
plot_goalline()
plot_shot_cone()

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/non_open_goal_situation.png', bbox_inches='tight')

# 2. GKP MODELS <a name='gkp'></a>

### 2.1 Geometric GKP Models <a name='geometric'></a>

Shot angle:

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = x_SV, y_SV

# Draw shot, goalposts, goal line and shot cone
plot_shot_position(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()

# Anchor point of the angle marker (stored in the gk variables, no GK is drawn in this figure)
x_gk, y_gk = x_GKV+28, y_GKV
# Calculate linear equation perpendicular to bisector through the anchor point
m_bisector, q_bisector = get_bisector_variables()
m_perpendicular = -(1/m_bisector)
q_perpendicular = y_gk - m_perpendicular*(x_gk)
# Plot a short dotted segment of that perpendicular as the shot angle marker
x = np.linspace(46.05, 48.2, 100)
y = m_perpendicular*x + q_perpendicular
plt.plot(x, y, linestyle=(0, (1, 1)), color='black', linewidth=linewidth_PITCH, zorder=1, label= 'Shot Angle (\u03C6)')
# Label the marker with the Greek letter phi
plt.text(47.8, 16, '\u03C6', fontsize=fontsize_PITCH)

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/shooting_angle.png', bbox_inches='tight')

Bisector and ball line (central shot):

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = 45, 14

# Draw shot, goal centre, goalposts, goal line, shot cone, bisector and ball line
plot_shot_position(True)
plot_goalcenter(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_bisector()
plot_ballline()

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/lines_central.png', bbox_inches='tight')

Bisector and ball line (close shot):

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = 45, 3

# Draw shot (without its built-in text), goal centre, goalposts, goal line, shot cone, bisector and ball line
plot_shot_position(False)
plot_goalcenter(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_bisector()
plot_ballline()

# Plot shot text at a custom offset
plt.text(x_shot+0.4, y_shot-1, 'S', fontsize=fontsize_PITCH)

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/lines_close.png', bbox_inches='tight')

Dive shadow:

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = x_SV, y_SV
# Define gk position (global, read by the plot helpers)
x_gk, y_gk = x_GKV, y_GKV+1.8

# Draw shot, GK, goalposts, goal line and shot cone
plot_shot_position(True)
plot_gk_position(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()

# Plot dive shadow: translucent blue disc around the GK
plt.gca().plot(x_gk, y_gk, 'o', ms=130, mfc='blue', alpha=0.3)
# Dummy marker at (1000, 1000) that only carries the legend entry
plt.scatter(1000, 1000, color='blue', s=s_ZONES, alpha=0.3, zorder=1, label='Dive Shadow')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/dive_shadow.png', bbox_inches='tight')

Goalkeeper reach:

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = x_SV, y_SV
# Define gk position (global, read by the plot helpers)
x_gk, y_gk = x_GKV, y_GKV

# Draw shot, GK, goalposts, goal line, shot cone and bisector
plot_shot_position(True)
plot_gk_position(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_bisector()

# Calculate linear equation perpendicular to bisector through the GK position (i.e., gk reach)
m_bisector, q_bisector = get_bisector_variables()
m_perpendicular = -(1/m_bisector)
q_perpendicular = y_gk - m_perpendicular*(x_gk)
# Plot gk reach as a dotted segment of that perpendicular between x = 39 and x = 44.5
x = np.linspace(39, 44.5, 100)
y = m_perpendicular*x + q_perpendicular
plt.plot(x, y, linestyle=(0, (1, 1)), color='blue', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder=1, label= 'GK Reach')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/gk_reach.png', bbox_inches='tight')

Zonal penalty area model (zones):

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Each zone is a pair of polygons; plot_zone fills the polygon stored in the global lists x and y,
# and only the first polygon of a zone creates the legend entry
# Plot zone 1
x, y = [40, 18, 18], [0, 13, 0]
plot_zone('green', True, 'Zone 1')
x, y = [40, 62, 62], [0, 13, 0]
plot_zone('green', False, '')
# Plot zone 2
x, y = [40, 32, 18, 18], [0, 18, 18, 13]
plot_zone('orange', True, 'Zone 2')
x, y = [40, 48, 62, 62], [0, 18, 18, 13]
plot_zone('orange', False, '')
# Plot zone 3 (a single polygon)
x, y = [40, 32, 48], [0, 18, 18]
plot_zone('red', True, 'Zone 3')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/zones_3.png', bbox_inches='tight')

Zonal penalty area model (shot example for each zone):

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = 56, 6
# Define gk position (global, read by the plot helpers)
x_gk, y_gk = 44.5, 1.7

# Draw shot and GK (without their built-in text), goal centre, goalposts, goal line, shot cone and ball line
plot_shot_position(False)
plot_gk_position(False)
plot_goalcenter(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_ballline()

# Plot shot text at a custom offset
plt.text(x_shot+0.4, y_shot-1, 'S', fontsize=fontsize_PITCH)
# Plot gk text at a custom offset
plt.text(x_gk+0.4, y_gk-0.8, 'G', fontsize=fontsize_PITCH)

# Plot zone 1 (plot_zone fills the polygon stored in the global lists x and y)
x, y = [40, 18, 18], [0, 13, 0]
plot_zone('green', True, 'Zone 1')
x, y = [40, 62, 62], [0, 13, 0]
plot_zone('green', False, '')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/zone_1.png', bbox_inches='tight')

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = 52, 13
# Define gk position (global, read by the plot helpers)
x_gk, y_gk = 43.2, 3.5

# Draw shot and GK (without their built-in text), goal centre, goalposts, goal line, shot cone and ball line
plot_shot_position(False)
plot_gk_position(False)
plot_goalcenter(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_ballline()

# Plot shot text at a custom offset
plt.text(x_shot+0.4, y_shot-1, 'S', fontsize=fontsize_PITCH)
# Plot gk text at a custom offset
plt.text(x_gk+0.4, y_gk-0.8, 'G', fontsize=fontsize_PITCH)

# Plot zone 2 (plot_zone fills the polygon stored in the global lists x and y)
x, y = [40, 32, 18, 18], [0, 18, 18, 13]
plot_zone('orange', True, 'Zone 2')
x, y = [40, 48, 62, 62], [0, 18, 18, 13]
plot_zone('orange', False, '')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/zone_2.png', bbox_inches='tight')

In [None]:
# Draw the empty vertical pitch (no axis labels, no ticks)
plot_vertical_pitch(label=False, tick=False)

# Define shot position (global, read by the plot helpers)
x_shot, y_shot = 42.1, 14
# Define gk position (global, read by the plot helpers)
x_gk, y_gk = 40.6, 4.5

# Draw shot, GK, goal centre, goalposts, goal line, shot cone and ball line
plot_shot_position(True)
plot_gk_position(True)
plot_goalcenter(True)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_ballline()

# Plot zone 3 (plot_zone fills the polygon stored in the global lists x and y)
x, y = [40, 32, 48], [0, 18, 18]
plot_zone('red', True, 'Zone 3')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/zone_3.png', bbox_inches='tight')

Zonal intersection model (original zones):

In [None]:
def semicircle(r, h, k, width, quantity):
    """
    Sample the upper half of a circle and return the points as two lists

    r: radius of the circle
    h, k: x and y coordinate of the circle center
    width: divisor of r, the sampled x range is [h - r/width, h + r/width]
    quantity: number of evenly spaced x samples
    Returns the x values and the matching y values, each as a list of floats

    Width/quantity for red zone = 1.3/50
    Width/quantity for blue zone = 2.9/20
    The returned points were used for the round parts of the zone polygons
    """
    # Half of the sampled x range around the center
    half_span = r/width
    xs = np.linspace(h - half_span, h + half_span, quantity)

    # Upper branch of the circle equation (x - h)^2 + (y - k)^2 = r^2
    ys = np.sqrt(r**2 - (xs - h)**2) + k
    return xs.tolist(), ys.tolist()

In [None]:
# Figure 'images/zones_colors_origin.png': all colored zones of the zonal
# intersection model (original zones) on the vertical pitch.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Plot orange point
x_orange, y_orange = 40, 2.4
plt.scatter(x_orange, y_orange, color='orange', s=s_PITCH, label='Orange Point (O)', zorder = 2)
plt.text(x_orange, y_orange+0.8, 'O', fontsize=fontsize_PITCH_SMALL)

# Plot left red line
x_values = [x_LV, x_orange]
y_values = [y_LV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1, label='Red Line')
# Plot right red line
x_values = [x_RV, x_orange]
y_values = [y_RV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1)

# Round zone edges: points on the circle with radius 10 around (40, 12).
# They are computed here with the parameters documented for the zones
# (width/quantity 1.3/50 for the red zone, 2.9/20 for the blue zone) instead of
# pasting the resulting floats into every polygon.
x_arc_red = np.linspace(40 - 10/1.3, 40 + 10/1.3, 50)
y_arc_red = (12 + np.sqrt(10**2 - (x_arc_red - 40)**2)).tolist()
x_arc_red = x_arc_red.tolist()
x_arc_blue = np.linspace(40 - 10/2.9, 40 + 10/2.9, 20)
y_arc_blue = (12 + np.sqrt(10**2 - (x_arc_blue - 40)**2)).tolist()
x_arc_blue = x_arc_blue.tolist()

# Plot red zone
# (plot_zone is called without coordinates, so it presumably reads the
# module-level x and y assigned right before each call)
x = [18, 18, 0, 0, 18, 18, 32] + x_arc_red + [48, 62, 62, 80, 80, 62, 62]
y = [50, 120-98, 120-98, 120-114, 120-114, 120-102, 120-102] + y_arc_red + [120-102, 120-102, 120-114, 120-114, 120-98, 120-98, 50]
plot_zone('red', True, 'Red Zone')

# Plot orange zones
x, y = [62, 62, 80, 80], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', True, 'Orange Zone')
x, y = [18, 18, 0, 0], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', False, '')
x, y = [62, 62, 80, 80], [50, 120-98, 120-98, 50]
plot_zone('orange', False, '')
x, y = [18, 18, 0, 0], [50, 120-98, 120-98, 50]
plot_zone('orange', False, '')

# Plot blue zone
x = [41, 39] + x_arc_blue
y = [120-114, 120-114] + y_arc_blue
plt.fill(x, y, color='blue', alpha=0.4)
# Off-canvas marker (1000, 1000) that only provides the legend entry
plt.scatter(1000, 1000, color='blue', marker='v', s=s_ZONES, alpha=alpha_PITCH, label='Blue Zone')

# Plot light blue zones
# Left zone: first 14 red-arc points, closed by the first blue-arc point
x = [32] + x_arc_red[:14] + [x_arc_blue[0], 39, 36, 18, 18]
y = [120-102] + y_arc_red[:14] + [y_arc_blue[0], 120-114, 120-117.6, 120-106.8, 120-102]
plot_zone('blue', True, 'Light Blue Zone')
# Right zone: last blue-arc point followed by the last 14 red-arc points
x = [48, 62, 62, 44, 41, x_arc_blue[-1]] + x_arc_red[36:]
y = [120-102, 120-102, 120-106.8, 120-117.6, 120-114, y_arc_blue[-1]] + y_arc_red[36:]
plot_zone('blue', False, '')

# Plot yellow zones
x, y = [18, 18, 36, 36], [120-106.8, 120-120, 120-120, 120-117.6]
plot_zone('yellow', True, 'Yellow Zone')
x, y = [44, 44, 62, 62], [120-117.6, 120-120, 120-120, 120-106.8]
plot_zone('yellow', False, '')

# Scale plot
plt.axis('scaled')
plt.xlim([-5, 85])
plt.ylim([-5, 65])

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH_SMALL})
plt.savefig('images/zones_colors_origin.png', bbox_inches='tight')

Zonal intersection model (implemented zones):

In [None]:
# Figure 'images/zones_colors_implementation.png': all colored zones of the
# zonal intersection model (implemented zones). Compared with the original
# zones, the red and the upper orange zones reach up to y = 120-60 instead of 50.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Plot orange point
x_orange, y_orange = 40, 2.4
plt.scatter(x_orange, y_orange, color='orange', s=s_PITCH, label='Orange Point (O)', zorder = 2)
plt.text(x_orange, y_orange+0.8, 'O', fontsize=fontsize_PITCH_SMALL)

# Plot left red line
x_values = [x_LV, x_orange]
y_values = [y_LV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1, label='Red Line')
# Plot right red line
x_values = [x_RV, x_orange]
y_values = [y_RV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1)

# Round zone edges: points on the circle with radius 10 around (40, 12).
# They are computed here with the parameters documented for the zones
# (width/quantity 1.3/50 for the red zone, 2.9/20 for the blue zone) instead of
# pasting the resulting floats into every polygon.
x_arc_red = np.linspace(40 - 10/1.3, 40 + 10/1.3, 50)
y_arc_red = (12 + np.sqrt(10**2 - (x_arc_red - 40)**2)).tolist()
x_arc_red = x_arc_red.tolist()
x_arc_blue = np.linspace(40 - 10/2.9, 40 + 10/2.9, 20)
y_arc_blue = (12 + np.sqrt(10**2 - (x_arc_blue - 40)**2)).tolist()
x_arc_blue = x_arc_blue.tolist()

# Plot red zone
# (plot_zone is called without coordinates, so it presumably reads the
# module-level x and y assigned right before each call)
x = [18, 18, 0, 0, 18, 18, 32] + x_arc_red + [48, 62, 62, 80, 80, 62, 62]
y = [120-60, 120-98, 120-98, 120-114, 120-114, 120-102, 120-102] + y_arc_red + [120-102, 120-102, 120-114, 120-114, 120-98, 120-98, 120-60]
plot_zone('red', True, 'Red Zone')

# Plot orange zones
x, y = [62, 62, 80, 80], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', True, 'Orange Zone')
x, y = [18, 18, 0, 0], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', False, '')
x, y = [62, 62, 80, 80], [120-60, 120-98, 120-98, 120-60]
plot_zone('orange', False, '')
x, y = [18, 18, 0, 0], [120-60, 120-98, 120-98, 120-60]
plot_zone('orange', False, '')

# Plot blue zone
x = [41, 39] + x_arc_blue
y = [120-114, 120-114] + y_arc_blue
plt.fill(x, y, color='blue', alpha=0.4)
# Off-canvas marker (1000, 1000) that only provides the legend entry
plt.scatter(1000, 1000, color='blue', marker='v', s=s_ZONES, alpha=alpha_PITCH, label='Blue Zone')

# Plot light blue zones
# Left zone: first 14 red-arc points, closed by the first blue-arc point
x = [32] + x_arc_red[:14] + [x_arc_blue[0], 39, 36, 18, 18]
y = [120-102] + y_arc_red[:14] + [y_arc_blue[0], 120-114, 120-117.6, 120-106.8, 120-102]
plot_zone('blue', True, 'Light Blue Zone')
# Right zone: last blue-arc point followed by the last 14 red-arc points
x = [48, 62, 62, 44, 41, x_arc_blue[-1]] + x_arc_red[36:]
y = [120-102, 120-102, 120-106.8, 120-117.6, 120-114, y_arc_blue[-1]] + y_arc_red[36:]
plot_zone('blue', False, '')

# Plot yellow zones
x, y = [18, 18, 36, 36], [120-106.8, 120-120, 120-120, 120-117.6]
plot_zone('yellow', True, 'Yellow Zone')
x, y = [44, 44, 62, 62], [120-117.6, 120-120, 120-120, 120-106.8]
plot_zone('yellow', False, '')

# Scale plot
plt.axis('scaled')
plt.xlim([-5, 85])
plt.ylim([-5, 65])

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH_SMALL})
plt.savefig('images/zones_colors_implementation.png', bbox_inches='tight')

Zonal intersection model (shot example for each zone):

In [None]:
# Figure 'images/zones_orange.png': example shot (S) inside an orange zone with
# the goalkeeper (G) at (40, 2.4), plus the four orange zone rectangles.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Define shot position
x_shot, y_shot = 70, 25
# Define gk position (same coordinates as the orange point used in the zone figures)
x_gk, y_gk = 40, 2.4

# Apply method
# (the helpers are called without coordinates, so they presumably read
# x_shot/y_shot and x_gk/y_gk as module-level names)
plot_shot_position(False)
plot_gk_position(False)
plot_goalcenter(False)
plot_ballline()

# Plot shot text
plt.text(x_shot+0.4, y_shot-2, 'S', fontsize=fontsize_PITCH_SMALL)
# Plot gk text
plt.text(x_gk+0.4, y_gk, 'G', fontsize=fontsize_PITCH_SMALL)
# Plot goalcenter text
plt.text(x_MV+0.4, y_MV-2.09, 'C', fontsize=fontsize_PITCH_SMALL)

# Plot orange zones
# (plot_zone presumably reads the module-level x and y assigned before each
# call; only the first rectangle carries the legend label)
x, y = [62, 62, 80, 80], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', True, 'Orange Zone')
x, y = [18, 18, 0, 0], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', False, '')
x, y = [62, 62, 80, 80], [50, 120-98, 120-98, 50]
plot_zone('orange', False, '')
x, y = [18, 18, 0, 0], [50, 120-98, 120-98, 50]
plot_zone('orange', False, '')

# Scale plot
plt.axis('scaled')
plt.xlim([-5, 85])
plt.ylim([-5, 65])

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH_SMALL})
plt.savefig('images/zones_orange.png', bbox_inches='tight')

In [None]:
# Figure 'images/zones_red.png': example shot (S) inside the red zone with its
# goalkeeper position (G), the orange point, the red lines and the red zone.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Define shot position
x_shot, y_shot = 58, 22
# Define gk position
x_gk, y_gk = 41.3, 1.7

# Apply method
plot_shot_position(False)
plot_gk_position(False)
plot_goalcenter(False)

# Plot shot text
plt.text(x_shot+0.4, y_shot-2, 'S', fontsize=fontsize_PITCH_SMALL)
# Plot gk text
plt.text(x_gk+1, y_gk-0.5, 'G', fontsize=fontsize_PITCH_SMALL)
# Plot goalcenter text
plt.text(x_MV+0.4, y_MV-2.09, 'C', fontsize=fontsize_PITCH_SMALL)

# Plot orange point
x_orange, y_orange = 40, 2.4
plt.scatter(x_orange, y_orange, color='orange', s=s_PITCH, label='Orange Point (O)', zorder = 2)
plt.text(x_orange, y_orange+0.8, 'O', fontsize=fontsize_PITCH_SMALL)

# Plot left red line
x_values = [x_LV, x_orange]
y_values = [y_LV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1, label='Red Line')
# Plot right red line
x_values = [x_RV, x_orange]
y_values = [y_RV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1)

# Apply method
plot_ballline()

# Round lower edge of the red zone: 50 points on the circle with radius 10
# around (40, 12), computed with the documented red-zone parameters
# (width/quantity 1.3/50) instead of pasting the resulting floats.
x_arc_red = np.linspace(40 - 10/1.3, 40 + 10/1.3, 50)
y_arc_red = (12 + np.sqrt(10**2 - (x_arc_red - 40)**2)).tolist()
x_arc_red = x_arc_red.tolist()

# Plot red zone
# (plot_zone is called without coordinates, so it presumably reads the
# module-level x and y assigned right before the call)
x = [18, 18, 0, 0, 18, 18, 32] + x_arc_red + [48, 62, 62, 80, 80, 62, 62]
y = [50, 120-98, 120-98, 120-114, 120-114, 120-102, 120-102] + y_arc_red + [120-102, 120-102, 120-114, 120-114, 120-98, 120-98, 50]
plot_zone('red', True, 'Red Zone')

# Scale plot
plt.axis('scaled')
plt.xlim([-5, 85])
plt.ylim([-5, 65])

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH_SMALL})
plt.savefig('images/zones_red.png', bbox_inches='tight')

In [None]:
# Figure 'images/zones_blue.png': example shot (S) inside a light blue zone with
# its goalkeeper position (G) and the two light blue zones.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Define shot position
x_shot, y_shot = 52, 14
# Define gk position
x_gk, y_gk = 43, 3.7

# Apply method
plot_shot_position(False)
plot_gk_position(False)
plot_goalcenter(False)
plot_ballline()

# Plot shot text
plt.text(x_shot+0.4, y_shot-2, 'S', fontsize=fontsize_PITCH_SMALL)
# Plot gk text
plt.text(x_gk+1, y_gk-0.8, 'G', fontsize=fontsize_PITCH_SMALL)
# Plot goalcenter text
plt.text(x_MV+0.4, y_MV-2.09, 'C', fontsize=fontsize_PITCH_SMALL)

# Round zone edges: points on the circle with radius 10 around (40, 12),
# computed with the documented zone parameters (width/quantity 1.3/50 for the
# red zone, 2.9/20 for the blue zone) instead of pasting the resulting floats.
x_arc_red = np.linspace(40 - 10/1.3, 40 + 10/1.3, 50)
y_arc_red = (12 + np.sqrt(10**2 - (x_arc_red - 40)**2)).tolist()
x_arc_red = x_arc_red.tolist()
x_arc_blue = np.linspace(40 - 10/2.9, 40 + 10/2.9, 20)
y_arc_blue = (12 + np.sqrt(10**2 - (x_arc_blue - 40)**2)).tolist()
x_arc_blue = x_arc_blue.tolist()

# Plot light blue zones
# (plot_zone is called without coordinates, so it presumably reads the
# module-level x and y assigned right before each call)
# Left zone: first 14 red-arc points, closed by the first blue-arc point
x = [32] + x_arc_red[:14] + [x_arc_blue[0], 39, 36, 18, 18]
y = [120-102] + y_arc_red[:14] + [y_arc_blue[0], 120-114, 120-117.6, 120-106.8, 120-102]
plot_zone('blue', True, 'Light Blue Zone')
# Right zone: last blue-arc point followed by the last 14 red-arc points
x = [48, 62, 62, 44, 41, x_arc_blue[-1]] + x_arc_red[36:]
y = [120-102, 120-102, 120-106.8, 120-117.6, 120-114, y_arc_blue[-1]] + y_arc_red[36:]
plot_zone('blue', False, '')

# Scale plot
plt.axis('scaled')
plt.xlim([-5, 85])
plt.ylim([-5, 65])

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH_SMALL})
plt.savefig('images/zones_blue.png', bbox_inches='tight')

Arc model:

In [None]:
def semicircle(r, h, k):
    """
    Return 100000 points on the upper half of a circle as numpy arrays

    r: radius of the circle
    h, k: x and y coordinate of the circle center
    Returns the x values from h - r to h + r and the matching y values

    Note: this replaces the five-argument semicircle defined for the zonal
    intersection model
    """
    xs = np.linspace(h - r, h + r, 100000)  # dense sampling over the full diameter

    # Upper branch of the circle equation (x - h)^2 + (y - k)^2 = r^2
    ys = np.sqrt(r**2 - (xs - h)**2) + k
    return xs, ys

In [None]:
# Figure 'images/gk_arc.png': example shot, goalkeeper position of the arc model
# and the arc itself (upper semicircle with radius 4 around (40, 0)).
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Define shot position
x_shot, y_shot = x_SV, y_SV
# Define gk position
x_gk, y_gk = 41.78, 3.58

# Apply method
# (the helpers are called without coordinates, so they presumably read
# x_shot/y_shot and x_gk/y_gk as module-level names)
plot_shot_position(True)
plot_gk_position(True)
plot_goalcenter(True)
plot_goalposts(True)

# Plot gk arc
# The arc is drawn as a dense scatter of the semicircle points (no legend label)
x, y = semicircle(4, 40, 0)
plt.scatter(x, y, s=5, c='orange')
# Legend proxy: all 100 x values are 100, so this "line" collapses to the single
# point (100, 150); it only exists to create the 'Arc' legend entry
# NOTE(review): presumably outside the visible pitch area - confirm axis limits
x = np.linspace(100, 100, 100)
y = 0.5*x + 100
plt.plot(x, y, color='orange', linewidth=5, label='Arc')

# Apply method
plot_ballline()

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/gk_arc.png', bbox_inches='tight')

Line model:

In [None]:
# Figure 'images/line_positioning.png': example shot, goalkeeper position of the
# line model and the horizontal goalkeeper line at y = 2.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Define shot position
x_shot, y_shot = x_SV, y_SV
# Define gk position (lies on the goalkeeper line y = 2 plotted below)
x_gk, y_gk = 40.97, 2

# Apply method
# (the helpers are called without coordinates, so they presumably read
# x_shot/y_shot and x_gk/y_gk as module-level names)
plot_shot_position(True)
plot_gk_position(True)
plot_goalcenter(True)

# Plot goalkeeper line
# Horizontal line from x = -5 to x = 85 at y = 2
x_values = [-5, 85]
y_values = [2, 2]
plt.plot(x_values, y_values, color='blue', linewidth=linewidth_PITCH, alpha=0.4, label='GK Line', zorder=1)

# Apply method
plot_ballline()

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/line_positioning.png', bbox_inches='tight')

Inscribed circle model:

In [None]:
# Figure 'images/inscribed_circle.png': example shot with shot cone and bisector,
# and a translucent blue circle around the goalkeeper position.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Define shot position
x_shot, y_shot = x_SV, y_SV
# Define gk position
x_gk, y_gk = x_GKV, y_GKV

# Apply method
# (the helpers are called without coordinates, so they presumably read
# x_shot/y_shot and x_gk/y_gk as module-level names)
plot_shot_position(True)
plot_gk_position(False)
plot_goalposts(True)
plot_goalline()
plot_shot_cone()
plot_bisector()

# Plot dive shadow
# The circle is a single 'o' marker of size ms=134 centered on the gk position
# NOTE(review): marker sizes are given in points, not data units, so the circle
# presumably only fits the shot cone for the current figure size - confirm
plt.gca().plot(x_gk, y_gk, 'o', ms=134, mfc='blue', alpha=0.3)
# Off-canvas marker (1000, 1000) that only provides the legend entry
plt.scatter(1000, 1000, color='blue', s=s_ZONES, alpha=0.3, zorder=1, label='Inscribed Circle')

# Plot gk text
plt.text(x_gk+0.4, y_gk-1, 'G', fontsize=fontsize_PITCH)

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/inscribed_circle.png', bbox_inches='tight')

### 2.2 Data-Driven GKP Models <a name='data-driven'></a>

Minimal PSxG model (alternative goalkeeper positions):

In [None]:
# Figure 'images/alternative_positions.png': example shot, the goalkeeper position
# and eight alternative goalkeeper positions placed around it.
plot_vertical_pitch(label=False, tick=False)

# Shot and goalkeeper of the example
x_shot, y_shot = x_SV, y_SV
x_gk, y_gk = x_GKV, y_GKV

# Shot, goalkeeper (triangle marker) and goalposts
plot_shot_position(True)
plt.scatter(x_GKV, y_GKV, marker='^', color='blue', s=s_PITCH, label='GK Position')
plot_goalposts(True)

# Alternative gk positions share one marker style; only the first one is labeled
alt_style = dict(color='blue', s=s_PITCH, alpha=alpha_PITCH)
plt.scatter(x_GKV, y_GKV+1, label='Alternative GK Position', **alt_style)
# Remaining seven offsets around the gk position, in the original drawing order
for dx, dy in [(0.7, 0.7), (1, 0), (0.7, -0.7), (0, -1), (-0.7, -0.7), (-1, 0), (-0.7, 0.7)]:
    plt.scatter(x_GKV+dx, y_GKV+dy, **alt_style)
# Unlabeled marker at (1000, 1000), far away from the plotted positions
plt.scatter(1000, 1000, **alt_style)

# Goal line and shot cone
plot_goalline()
plot_shot_cone()

# Legend and export
plt.legend(loc='upper left', prop={'size': fontsize_PITCH})
plt.savefig('images/alternative_positions.png', bbox_inches='tight')

k-NN model:

In [None]:
# k-NN example: average the goalkeeper positions of the 20 shots taken closest
# to the example shot in row i.
# Copy 1v1 dataframe
df = df_1v1_men.copy()

# Define row
i = 33
# Define shot position
x_shot = df.at[i, 'x_shot']
y_shot = df.at[i, 'y_shot']

# Number of similar shots to average over
k_neighbours = 20

# Distance between the example shot and every shot in the dataframe
# (positions and row labels are treated as the same thing below, i.e. the
# dataframe is expected to have the default 0..n-1 index)
ls_distance = [math.dist([x_shot, y_shot], [x_other, y_other]) for x_other, y_other in zip(df['x_shot'], df['y_shot'])]

# Rank all shots by ascending distance and keep the closest ones.
# The example shot is removed by its row number instead of by "drop the first
# entry": another shot taken from exactly the same spot also has distance 0 and
# may be ranked first, in which case the example shot would count as its own
# neighbour.
ranked_indices = np.argsort(ls_distance, kind='stable').tolist()
sorted_indices = [row for row in ranked_indices if row != i][:k_neighbours]

# Sum up gk coordinates from the most similar shots
x_gk_tot = sum(df.at[row, 'x_gk'] for row in sorted_indices)
y_gk_tot = sum(df.at[row, 'y_gk'] for row in sorted_indices)
# Calculate average gk position (divide by the number of shots actually used,
# which is smaller than k_neighbours for very small dataframes)
x_gk_avg = x_gk_tot/len(sorted_indices)
y_gk_avg = y_gk_tot/len(sorted_indices)

In [None]:
# Figure 'images/knn.png': the similar shots in sorted_indices with their
# goalkeeper positions, the original shot and the averaged goalkeeper position.
# Apply function
plot_vertical_pitch(label=False, tick=False)

# Iterate over all 20 similar shot positions
# (note: the loop variable rebinds the module-level i)
for i in sorted_indices:
    # Plot similar shooting positions
    x_shot_similar = df.at[i, 'y_shot']       # Transformation from horizontal y to vertical x
    y_shot_similar = 120 - df.at[i, 'x_shot'] # Transformation from horizontal x to vertical y
    plt.scatter(x_shot_similar, y_shot_similar, marker='v', color='red', s=s_PITCH)
    # Plot similar gk positions (same transformation as for the shots)
    x_gk_similar = df.at[i, 'y_gk']
    y_gk_similar = 120 - df.at[i, 'x_gk']
    plt.scatter(x_gk_similar, y_gk_similar, marker='^', color='blue', s=s_PITCH)

# Create labels for legend
# The two markers at (1000, 1000) only provide the legend entries for the
# unlabeled markers drawn in the loop
plt.scatter(1000, 1000, marker='v', color='red', s=s_PITCH, label='20 Most Similar Shots')
plt.scatter(1000, 1000, marker='^', color='blue', s=s_PITCH, label='20 GK Positions')
# Original shot and averaged gk position, transformed to vertical coordinates
plt.scatter(y_shot, 120-x_shot, marker='X', color='orange', s=300, label='Original Shot')
plt.scatter(y_gk_avg, 120-x_gk_avg, marker='D', color='deeppink', s=300, label='TOGKP')

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': 30})
plt.savefig('images/knn.png', bbox_inches='tight')

### 2.3 All Modeled GK Positions <a name='modeled'></a>

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_zonal' key
plot_all_gks(df_1v1_men, 'gk_zonal')
# Save plot
plt.savefig('images/all_gks_zonal_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_zonal' key
plot_all_gks(df_1v1_women, 'gk_zonal')
# Save plot
plt.savefig('images/all_gks_zonal_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_arc' key
plot_all_gks(df_1v1_men, 'gk_arc')
# Save plot
plt.savefig('images/all_gks_arc_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_arc' key
plot_all_gks(df_1v1_women, 'gk_arc')
# Save plot
plt.savefig('images/all_gks_arc_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_line' key
plot_all_gks(df_1v1_men, 'gk_line')
# Save plot
plt.savefig('images/all_gks_line_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_line' key
plot_all_gks(df_1v1_women, 'gk_line')
# Save plot
plt.savefig('images/all_gks_line_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_bisector_wingspan' key
plot_all_gks(df_1v1_men, 'gk_bisector_wingspan')
# Save plot
plt.savefig('images/all_gks_bisector_wingspan_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_bisector_wingspan' key
plot_all_gks(df_1v1_women, 'gk_bisector_wingspan')
# Save plot
plt.savefig('images/all_gks_bisector_wingspan_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_bisector_dive' key
plot_all_gks(df_1v1_men, 'gk_bisector_dive')
# Save plot
plt.savefig('images/all_gks_bisector_dive_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_bisector_dive' key
plot_all_gks(df_1v1_women, 'gk_bisector_dive')
# Save plot
plt.savefig('images/all_gks_bisector_dive_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_inscribed' key
plot_all_gks(df_1v1_men, 'gk_inscribed')
# Save plot
plt.savefig('images/all_gks_inscribed_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_inscribed' key
plot_all_gks(df_1v1_women, 'gk_inscribed')
# Save plot
plt.savefig('images/all_gks_inscribed_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_knn' key
plot_all_gks(df_1v1_men, 'gk_knn')
# Save plot
plt.savefig('images/all_gks_knn_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_knn' key
plot_all_gks(df_1v1_women, 'gk_knn')
# Save plot
plt.savefig('images/all_gks_knn_women.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the men's 1v1 dataframe with the 'gk_ml' key
plot_all_gks(df_1v1_men, 'gk_ml')
# Save plot
plt.savefig('images/all_gks_ml_men.png', bbox_inches='tight')

In [None]:
# Call plot_all_gks for the women's 1v1 dataframe with the 'gk_ml' key
plot_all_gks(df_1v1_women, 'gk_ml')
# Save plot
plt.savefig('images/all_gks_ml_women.png', bbox_inches='tight')

# 3. EVALUATION METHODS <a name='evaluation'></a>

# 4. RESULTS <a name='results'></a>

### 4.1 Proximity to Actual GK Position for All Outcomes <a name='41'></a>

In [None]:
def calculate_mean_column_value(df, col):
    """
    Print the mean of one dataframe column, rounded to two decimals

    df: dataframe that contains the column
    col: name of the column
    Prints 'Mean <col> <value>' and returns None
    """

    column_mean = df[col].mean()
    print('Mean ' + col, round(column_mean, 2))

In [None]:
# Define lists of similar columns
# Column suffixes of the actual gk position ('') and of every GKP model. All lists
# below are built from this one list, so they share the same model order
# (the goalline list previously had bisector_dive and bisector_wingspan swapped).
ls_gk_suffix = ['', '_zonal', '_arc', '_line', '_bisector_wingspan', '_bisector_dive', '_inscribed', '_knn', '_ml']

# Distance between the actual gk position and each gk position
ls_distance_between_gk_and_gk_alt = ['distance_between_gk_and_gk' + suffix for suffix in ls_gk_suffix]
# Distance between the k-NN gk position and each gk position
ls_distance_between_gk_knn_and_gk_alt = ['distance_between_gk_knn_and_gk' + suffix for suffix in ls_gk_suffix]
# Distance between the bisector and each gk position
ls_distance_between_gk_alt_and_bisector = ['distance_between_bisector_and_gk' + suffix for suffix in ls_gk_suffix]
# Distance between the ball line and each gk position
ls_distance_between_gk_alt_and_ballline = ['distance_between_ballline_and_gk' + suffix for suffix in ls_gk_suffix]
# Distance between the goal line and each gk position
ls_distance_between_goalline_and_gk_alt = ['distance_between_goalline_and_gk' + suffix for suffix in ls_gk_suffix]
# x and y coordinate of each gk position
ls_x_gk = ['x_gk' + suffix for suffix in ls_gk_suffix]
ls_y_gk = ['y_gk' + suffix for suffix in ls_gk_suffix]

In [None]:
# Print the mean of every gk distance column, first for men and then for women
for position, (title, frame) in enumerate((('Men:', df_1v1_men), ('Women:', df_1v1_women))):
    # Blank line between the two groups
    if position > 0:
        print()
    print(title)
    print()
    for col_name in ls_distance_between_gk_and_gk_alt:
        calculate_mean_column_value(frame, col_name)

### 4.2 Proximity to Actual GK Position for Specific Outcomes <a name='42'></a>

Means:

In [None]:
def calculate_mean_column_value_by_outcome(df, col):
    """
    Print mean value of dataframe column for different outcomes

    df: dataframe with an 'outcome' column and the column col
    col: name of the column to average
    Prints the column name, then the mean of col (rounded to two decimals) for
    saved shots, for shots with any outcome other than 'Goal', and for goals,
    followed by a blank line. Returns None.

    Rows are selected with boolean masks instead of where(...).dropna(), because
    dropna() would also discard shots that have a missing value in any
    unrelated column and thereby shrink the sample the mean is based on.
    """

    # Print gk_name
    print(col)

    outcome = df['outcome']
    is_goal = outcome == 'Goal'

    # Mean value for saves
    print('Mean value for saves:', round(df.loc[outcome == 'Saved', col].mean(), 2))
    # Mean value for nogoals (shots without a recorded outcome are not counted)
    print('Mean value for no-goals:', round(df.loc[outcome.notna() & ~is_goal, col].mean(), 2))
    # Mean value for goals
    print('Mean value goals:', round(df.loc[is_goal, col].mean(), 2))
    print()

In [None]:
# Print the outcome-specific means of every gk distance column, first for men
# and then for women
for position, (title, frame) in enumerate((('Men:', df_1v1_men), ('Women:', df_1v1_women))):
    # Blank line between the two groups
    if position > 0:
        print()
    print(title)
    print()
    for col_name in ls_distance_between_gk_and_gk_alt:
        calculate_mean_column_value_by_outcome(frame, col_name)

Non-normal distribution:

In [None]:
# Histogram (bin width 0.2) of the distance to the inscribed gk position for men,
# restricted to distances below 5; where() blanks the other rows with NaN
is_below_5 = df_1v1_men['distance_between_gk_and_gk_inscribed'] < 5
sns.displot(df_1v1_men.where(is_below_5), x = 'distance_between_gk_and_gk_inscribed', binwidth=0.2)

In [None]:
# Same histogram for saved shots only (distance below 5); dropna() then removes
# every row that contains a missing value
is_saved_below_5 = (df_1v1_men['distance_between_gk_and_gk_inscribed'] < 5) & (df_1v1_men['outcome'] == 'Saved')
sns.displot(df_1v1_men.where(is_saved_below_5).dropna(), x = 'distance_between_gk_and_gk_inscribed', binwidth=0.2)

In [None]:
# Same histogram for goals only, here with distances below 10; dropna() then
# removes every row that contains a missing value
is_goal_below_10 = (df_1v1_men['distance_between_gk_and_gk_inscribed'] < 10) & (df_1v1_men['outcome'] == 'Goal')
sns.displot(df_1v1_men.where(is_goal_below_10).dropna(), x = 'distance_between_gk_and_gk_inscribed', binwidth=0.2)

P-values:

In [None]:
def print_significance_for_different_outcomes(df, col):
    """
    Perform Non-Parametric Test (Mann-Whitney U Test)

    Compares the values of `col` between goals ('Goal') and saves ('Saved')
    and prints the p-value rounded to 4 decimals, followed by an empty line

    Parameters:
        df: Dataframe with an 'outcome' column and the column `col`
        col: Name of the numeric column to compare
    """
    print('P-value for', col)
    # Select by outcome with boolean masks and drop only missing values of the tested column
    # (rows with missing values in unrelated columns stay in the samples)
    ls_goal = df.loc[df['outcome'] == 'Goal', col].dropna().to_list()
    ls_saved = df.loc[df['outcome'] == 'Saved', col].dropna().to_list()
    # Second element of the test result is the p-value
    p_value = stats.mannwhitneyu(ls_goal, ls_saved)[1]
    print(round(p_value, 4))
    print()

In [None]:
# Print the goal-vs-save p-value of every distance column, first for men and then for women
ls_sections = [('Men:', df_1v1_men), ('Women:', df_1v1_women)]
for section_no, (section_title, df_section) in enumerate(ls_sections):
    # Two empty lines between the sections
    if section_no > 0:
        print()
        print()
    print(section_title)
    for col in ls_distance_between_gk_and_gk_alt:
        print_significance_for_different_outcomes(df_section, col)

### 4.3 Harrison Comparison <a name='43'></a>

In [None]:
# Split the 1v1 shots by their distance to the goal center: above 13.5 is 'far', at most 13.5 is 'close'
# (where + dropna also removes every row that contains a missing value)
men_shot_distance = df_1v1_men['distance_between_shot_and_goalcenter']
df_harrison_far_men = df_1v1_men.where(men_shot_distance > 13.5).dropna().reset_index(drop=True)
df_harrison_close_men = df_1v1_men.where(men_shot_distance <= 13.5).dropna().reset_index(drop=True)

women_shot_distance = df_1v1_women['distance_between_shot_and_goalcenter']
df_harrison_far_women = df_1v1_women.where(women_shot_distance > 13.5).dropna().reset_index(drop=True)
df_harrison_close_women = df_1v1_women.where(women_shot_distance <= 13.5).dropna().reset_index(drop=True)

Means:

In [None]:
# Print the mean of (120 - column) for every column in ls_x_gk, per gender and per shot distance group
ls_harrison_sections = [
    ('Men:', df_harrison_close_men, df_harrison_far_men),
    ('Women:', df_harrison_close_women, df_harrison_far_women),
]
for section_no, (section_title, df_close_shots, df_far_shots) in enumerate(ls_harrison_sections):
    # Empty line between the two genders
    if section_no > 0:
        print()
    print(section_title)

    print('Close shots:')
    for col_name in ls_x_gk:
        mean_value = (120-df_close_shots[col_name]).mean()
        print('Mean y value from ' + col_name, round(mean_value, 2))

    print()
    print('Far shots:')
    for col_name in ls_x_gk:
        mean_value = (120-df_far_shots[col_name]).mean()
        print('Mean y value from ' + col_name, round(mean_value, 2))

P-values:

In [None]:
def print_significance_for_different_distances(df_close, df_far, col):
    """
    Print the p-value (rounded to 4 decimals) of a Mann-Whitney U test (non-parametric)
    that compares the values of one column between the close and the far shots
    """
    print('P-value for', col)
    ls_close_values = df_close[col].to_list()
    ls_far_values = df_far[col].to_list()
    test_result = stats.mannwhitneyu(ls_close_values, ls_far_values)
    print(round(test_result.pvalue, 4))
    print()

In [None]:
# Print the close-vs-far p-value of every goal line distance column, first for men and then for women
ls_harrison_sections = [
    ('Men:', df_harrison_close_men, df_harrison_far_men),
    ('Women:', df_harrison_close_women, df_harrison_far_women),
]
for section_no, (section_title, df_close_shots, df_far_shots) in enumerate(ls_harrison_sections):
    # Two empty lines between the sections
    if section_no > 0:
        print()
        print()
    print(section_title)
    for col in ls_distance_between_goalline_and_gk_alt:
        print_significance_for_different_distances(df_close_shots, df_far_shots, col)

### 4.4 Correlation Between Two Variables <a name='44'></a>

In [None]:
def print_correlations(df, col_1, col_2):
    """
    Print the Pearson and the Spearman correlation of two dataframe columns (rounded to 4 decimals)
    """
    # Pearson: off-diagonal entry of the 2x2 correlation matrix
    print('Pearson correlation of', col_1, 'and', col_2)
    ls_values_1 = df[col_1].to_list()
    ls_values_2 = df[col_2].to_list()
    pearson_r = np.corrcoef(ls_values_1, ls_values_2)[0, 1]
    print(round(pearson_r, 4))

    # Spearman: first element of the result is the correlation coefficient
    print('Spearman correlation of', col_1, 'and', col_2)
    spearman_rho = stats.spearmanr(ls_values_1, ls_values_2)[0]
    print(round(spearman_rho, 4))

In [None]:
# Print the correlation between statsbomb_xg and every distance column, first for men and then for women
ls_sections = [('Men:', df_1v1_men), ('Women:', df_1v1_women)]
for section_no, (section_title, df_section) in enumerate(ls_sections):
    # Two empty lines between the sections
    if section_no > 0:
        print()
        print()
    print(section_title)
    for col in ls_distance_between_gk_and_gk_alt:
        # Skip the column 'distance_between_gk_and_gk'
        if col == 'distance_between_gk_and_gk':
            continue
        print_correlations(df_section, 'statsbomb_xg', col)
        print()

Correlations: https://towardsdatascience.com/rbo-v-s-kendall-tau-to-compare-ranked-lists-of-items-8776c5182899

FUT: https://www.futhead.com/22/players/?group=gk&level=all_nif&bin_platform=ps

12Jul: https://www.fifacm.com/

Women: https://www.ea.com/en-gb/games/fifa/fifa-23/womens-ratings

Create individual goalkeeper dataframe:

In [None]:
# Copy dataframe
df = df_1v1_men.copy()

# Per-shot columns that are averaged per gk (key) and their column name in the individual gk dataframe (value)
dict_mean_columns = {
    'distance_between_bisector_and_gk': 'mean_distance_between_bisector_and_gk',
    'distance_between_ballline_and_gk': 'mean_distance_between_ballline_and_gk',
    'distance_between_gk_and_gk_zonal': 'distance_between_gk_and_gk_zonal',
    'distance_between_gk_and_gk_arc': 'distance_between_gk_and_gk_arc',
    'distance_between_gk_and_gk_line': 'distance_between_gk_and_gk_line',
    'distance_between_gk_and_gk_bisector_wingspan': 'distance_between_gk_and_gk_bisector_wingspan',
    'distance_between_gk_and_gk_bisector_dive': 'distance_between_gk_and_gk_bisector_dive',
    'distance_between_gk_and_gk_inscribed': 'distance_between_gk_and_gk_inscribed',
    'distance_between_gk_and_gk_knn': 'distance_between_gk_and_gk_knn',
    'distance_between_gk_and_gk_ml': 'distance_between_gk_and_gk_ml',
}

# Define dataframe with all gk names and ids (convert gk id from float to int)
df_gk = df[['goalkeeper', 'goalkeeper_id']].sort_values(by = ['goalkeeper'], ascending = True).drop_duplicates().reset_index(drop = True)
df_gk = df_gk.astype({'goalkeeper_id': int})
ls_gk_name = df_gk['goalkeeper'].tolist()
ls_gk_id = df_gk['goalkeeper_id'].tolist()

# Calculate stats grouped by gk and align them with the gk names (by name, not by position)
gk_groups = df.groupby(by = ['goalkeeper'], dropna = False)
sr_shots_per_gk = gk_groups['outcome'].count().reindex(ls_gk_name)
sr_xg_per_gk = gk_groups['statsbomb_xg'].sum().reindex(ls_gk_name)
# Summing a 0/1 goal flag keeps gks without any received goal (count 0) and counts every goal row,
# independent of missing values in other columns
sr_goals_per_gk = (df['outcome'] == 'Goal').astype(int).groupby(df['goalkeeper'], dropna = False).sum().reindex(ls_gk_name)
df_means_per_gk = gk_groups[list(dict_mean_columns)].mean().round(4).reindex(ls_gk_name)

# Create empty dataframe and add complete lists 
df_gk_individual_men = pd.DataFrame()
df_gk_individual_men['gk_name'] = ls_gk_name
df_gk_individual_men['gk_id'] = ls_gk_id
# Ratings are hard-coded in the order of ls_gk_name (sorted by goalkeeper name)
# df_gk_individual_men['fifa22_fut_rating'] = [76, 70, 74, 70, 82, 89, 75, 87, 83, 85, 83, 76, 90, 81, 75, 85, 76, 82, 75, 89, 81, 82, 79, 87, 85] # fifa 22 fut
df_gk_individual_men['fifa_rating'] = [76, 70, 74, 72, 81, 86, 74, 87, 80, 85, 83, 74, 90, 82, 72, 85, 77, 83, 73, 89, 81, 80, 78, 87, 84] # fifa 21 12. jul
df_gk_individual_men['shots_received'] = sr_shots_per_gk.tolist()
df_gk_individual_men['goals_received'] = sr_goals_per_gk.tolist()
df_gk_individual_men['summed_xg'] = sr_xg_per_gk.tolist()
df_gk_individual_men['summed_gsaa'] = df_gk_individual_men['summed_xg'] - df_gk_individual_men['goals_received']
df_gk_individual_men['gsaa_per_shot'] = round(df_gk_individual_men['summed_gsaa'] / df_gk_individual_men['shots_received'], 4)
# Add mean values per gk (rounded to 4 decimals)
for shot_col, gk_col in dict_mean_columns.items():
    df_gk_individual_men[gk_col] = df_means_per_gk[shot_col].tolist()

# Add gsaa ranking
df_gk_individual_men = df_gk_individual_men.sort_values(by = ['gsaa_per_shot'], ascending = False)
df_gk_individual_men['gsaa_ranking'] = list(range(1, len(df_gk_individual_men)+1))
# Add rating ranking
df_gk_individual_men = df_gk_individual_men.sort_values(by = ['fifa_rating'], ascending = False)
df_gk_individual_men['rating_ranking'] = list(range(1, len(df_gk_individual_men)+1))

In [None]:
# Copy dataframe
df = df_1v1_women.copy()

# Per-shot columns that are averaged per gk (key) and their column name in the individual gk dataframe (value)
dict_mean_columns = {
    'distance_between_bisector_and_gk': 'mean_distance_between_bisector_and_gk',
    'distance_between_ballline_and_gk': 'mean_distance_between_ballline_and_gk',
    'distance_between_gk_and_gk_zonal': 'distance_between_gk_and_gk_zonal',
    'distance_between_gk_and_gk_arc': 'distance_between_gk_and_gk_arc',
    'distance_between_gk_and_gk_line': 'distance_between_gk_and_gk_line',
    'distance_between_gk_and_gk_bisector_wingspan': 'distance_between_gk_and_gk_bisector_wingspan',
    'distance_between_gk_and_gk_bisector_dive': 'distance_between_gk_and_gk_bisector_dive',
    'distance_between_gk_and_gk_inscribed': 'distance_between_gk_and_gk_inscribed',
    'distance_between_gk_and_gk_knn': 'distance_between_gk_and_gk_knn',
    'distance_between_gk_and_gk_ml': 'distance_between_gk_and_gk_ml',
}

# Define dataframe with all gk names and ids (convert gk id from float to int)
df_gk = df[['goalkeeper', 'goalkeeper_id']].sort_values(by = ['goalkeeper'], ascending = True).drop_duplicates().reset_index(drop = True)
df_gk = df_gk.astype({'goalkeeper_id': int})
ls_gk_name = df_gk['goalkeeper'].tolist()
ls_gk_id = df_gk['goalkeeper_id'].tolist()

# Calculate stats grouped by gk and align them with the gk names (by name, not by position)
gk_groups = df.groupby(by = ['goalkeeper'], dropna = False)
sr_shots_per_gk = gk_groups['outcome'].count().reindex(ls_gk_name)
sr_xg_per_gk = gk_groups['statsbomb_xg'].sum().reindex(ls_gk_name)
# Summing a 0/1 goal flag keeps gks without any received goal (count 0) and counts every goal row,
# independent of missing values in other columns
sr_goals_per_gk = (df['outcome'] == 'Goal').astype(int).groupby(df['goalkeeper'], dropna = False).sum().reindex(ls_gk_name)
df_means_per_gk = gk_groups[list(dict_mean_columns)].mean().round(4).reindex(ls_gk_name)

# Create empty dataframe and add complete lists 
df_gk_individual_women = pd.DataFrame()
df_gk_individual_women['gk_name'] = ls_gk_name
df_gk_individual_women['gk_id'] = ls_gk_id
# Ratings are hard-coded in the order of ls_gk_name (sorted by goalkeeper name)
# NOTE(review): a rating of 0 presumably marks a gk without a rating in the source - confirm
df_gk_individual_women['fifa_rating'] = [73, 0, 0, 74, 0, 78, 0, 0, 86, 82, 84, 79, 73, 83, 84, 88, 76, 78] # fifa 23
df_gk_individual_women['shots_received'] = sr_shots_per_gk.tolist()
df_gk_individual_women['goals_received'] = sr_goals_per_gk.tolist()
df_gk_individual_women['summed_xg'] = sr_xg_per_gk.tolist()
df_gk_individual_women['summed_gsaa'] = df_gk_individual_women['summed_xg'] - df_gk_individual_women['goals_received']
df_gk_individual_women['gsaa_per_shot'] = round(df_gk_individual_women['summed_gsaa'] / df_gk_individual_women['shots_received'], 4)
# Add mean values per gk (rounded to 4 decimals)
for shot_col, gk_col in dict_mean_columns.items():
    df_gk_individual_women[gk_col] = df_means_per_gk[shot_col].tolist()

# Add gsaa ranking
df_gk_individual_women = df_gk_individual_women.sort_values(by = ['gsaa_per_shot'], ascending = False)
df_gk_individual_women['gsaa_ranking'] = list(range(1, len(df_gk_individual_women)+1))
# Add rating ranking
df_gk_individual_women = df_gk_individual_women.sort_values(by = ['fifa_rating'], ascending = False)
df_gk_individual_women['rating_ranking'] = list(range(1, len(df_gk_individual_women)+1))

Pearson and Spearman correlation (with unsorted value lists):

In [None]:
# Print the correlation between gsaa_per_shot and every distance column of the individual gk dataframes
ls_sections = [('Men:', df_gk_individual_men), ('Women:', df_gk_individual_women)]
for section_no, (section_title, df_section) in enumerate(ls_sections):
    # Two empty lines between the sections
    if section_no > 0:
        print()
        print()
    print(section_title)
    for col in ls_distance_between_gk_and_gk_alt:
        # Skip the column 'distance_between_gk_and_gk'
        if col == 'distance_between_gk_and_gk':
            continue
        print_correlations(df_section, 'gsaa_per_shot', col)
        print()

GK model assumptions:

In [None]:
# NOTE(review): df_gk_individual is not created in the cells directly above (those create
# df_gk_individual_men and df_gk_individual_women) - confirm which dataframe is meant here
# Correlate both performance measures with the mean distances to the bisector and to the ball line
for performance_col in ['gsaa_per_shot', 'fifa_rating']:
    print_correlations(df_gk_individual, performance_col, 'mean_distance_between_bisector_and_gk')
    print()
    print_correlations(df_gk_individual, performance_col, 'mean_distance_between_ballline_and_gk')
    # Three empty lines after each group
    print()
    print()
    print()
# Correlate the two performance measures with each other
print_correlations(df_gk_individual, 'fifa_rating', 'gsaa_per_shot')

### 4.5 Messi Test <a name='45'></a>

Men:

In [None]:
# Display the number of non-missing values per column of the men's individual gk dataframe
df_gk_individual_men.count()

In [None]:
# Rating threshold: the following cells keep only gks with a fifa_rating strictly above this value
top_rating = 87

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_zonal' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_zonal')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_arc' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_arc')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_line' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_line')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_bisector_wingspan' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_bisector_wingspan')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_bisector_dive' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_bisector_dive')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_inscribed' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_inscribed')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_knn' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_knn')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_ml' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_men
    .sort_values(by='distance_between_gk_and_gk_ml')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

Women:

In [None]:
# Display the number of non-missing values per column of the women's individual gk dataframe
df_gk_individual_women.count()

In [None]:
# Rating threshold: the following cells keep only gks with a fifa_rating strictly above this value
top_rating = 85

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_zonal' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_zonal')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_arc' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_arc')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_line' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_line')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_bisector_wingspan' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_bisector_wingspan')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_bisector_dive' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_bisector_dive')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_inscribed' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_inscribed')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_knn' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_knn')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

In [None]:
# Sort all gks by 'distance_between_gk_and_gk_ml' (ascending) and keep only the top-rated ones
df_test = (
    df_gk_individual_women
    .sort_values(by='distance_between_gk_and_gk_ml')
    .reset_index(drop=True)
    .pipe(lambda df_ranked: df_ranked.where(df_ranked['fifa_rating'] > top_rating))
    .dropna()
)

# The index is the zero-based position in the sorted list, so index + 1 is the rank; print the mean rank
print(mean(rank_position + 1 for rank_position in df_test.index.tolist()))

df_test

# 5. DISCUSSION <a name='discussion'></a>

# 6. SUMMARY AND OUTLOOK <a name='conclusion'></a>

# 7. APPENDIX <a name='appendix'></a>

### 7.1 Datasets <a name='datasets'></a>

Exemplary JSON file:

In [None]:
# Create dataframe from the JSON file at EVENTPATH (constant defined earlier in the notebook)
df_test = pd.read_json(EVENTPATH)

# Display dataframe
df_test

Columns in datasets:

In [None]:
# Display column names of the exemplary dataframe as a list
df_test.columns.to_list()

### 7.2 GKP Models <a name='gkp-appendix'></a>

Zonal intersection model (with coordinate system):

In [None]:
# Draw the pitch with labels and ticks (plot_vertical_pitch is defined earlier in the notebook)
plot_vertical_pitch(label=True, tick=True)

# Plot orange point
x_orange, y_orange = 40, 2.4
plt.scatter(x_orange, y_orange, color='orange', s=s_PITCH, label='Orange Point (O)', zorder = 2)
plt.text(x_orange-2, y_orange+0.8, 'O', fontsize=20)
# Plot right goalpost
plt.scatter(x_RV, y_RV, marker='s', color='black', s=s_PITCH, label='Border Point', zorder=2)
plt.text(x_RV+0.6, y_RV-2.5, '(44, 0)', fontsize=20)

# Plot right red line (dotted line from the right goalpost to the orange point)
x_values = [x_RV, x_orange]
y_values = [y_RV, y_orange]
plt.plot(x_values, y_values, linestyle=(0, (1, 1)), color='red', linewidth=linewidth_PITCH, alpha=alpha_PITCH, zorder = 1)

# Plot red zone
# NOTE(review): plot_zone receives no coordinates, so it presumably reads the module-level x and y
# that are assigned right before each call - confirm against its definition
x = [18, 18, 0, 0, 18, 18, 32, 32.30769230769231, 32.62166405023548, 32.93563579277865, 33.24960753532182, 33.56357927786499, 33.87755102040816, 34.19152276295134, 34.505494505494504, 34.81946624803768, 35.133437990580845, 35.44740973312402, 35.76138147566719, 36.07535321821036, 36.389324960753534, 36.7032967032967, 37.017268445839875, 37.33124018838305, 37.645211930926216, 37.95918367346939, 38.27315541601256, 38.58712715855573, 38.901098901098905, 39.21507064364207, 39.529042386185246, 39.84301412872841, 40.15698587127159, 40.47095761381476, 40.78492935635793, 41.098901098901095, 41.41287284144427, 41.72684458398744, 42.04081632653062, 42.354788069073784, 42.66875981161695, 42.982731554160125, 43.2967032967033, 43.61067503924647, 43.92464678178964, 44.23861852433281, 44.55259026687598, 44.866562009419155, 45.18053375196232, 45.494505494505496, 45.80847723704866, 46.12244897959184, 46.43642072213501, 46.75039246467818, 47.06436420722135, 47.37833594976452, 47.69230769230769, 48, 62, 62, 80, 80, 62, 62]
y = [120-60, 120-98, 120-98, 120-114, 120-114, 120-102, 120-102, 18.389710663783134, 18.749826561654196, 19.07776506728853, 19.377818212237006, 19.65326651095276, 19.906681888902252, 20.140122369269857, 20.355262375943013, 20.55348291895057, 20.735935794663227, 20.903590391631127, 21.057268517891185, 21.197670772439523, 21.325396815201085, 21.440961146700353, 21.544805523205675, 21.637308808370634, 21.71879484039805, 21.789538738948128, 21.849771966028108, 21.899686375532774, 21.939437427482197, 21.969146698967144, 21.988903790005686, 21.998767695882385, 21.998767695882385, 21.988903790005686, 21.969146698967144, 21.939437427482197, 21.899686375532774, 21.849771966028108, 21.789538738948124, 21.71879484039805, 21.637308808370634, 21.544805523205675, 21.440961146700353, 21.32539681520108, 21.197670772439523, 21.057268517891185, 20.903590391631127, 20.735935794663227, 20.55348291895057, 20.355262375943013, 20.140122369269857, 19.906681888902252, 19.65326651095276, 19.377818212237006, 19.07776506728853, 18.749826561654196, 18.389710663783134, 120-102, 120-102, 120-114, 120-114, 120-98, 120-98, 120-60]
plot_zone('red', True, 'Red Zone')

# Plot orange zones (only the first call passes a legend label)
x, y = [62, 62, 80, 80], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', True, 'Orange Zone')
x, y = [18, 18, 0, 0], [120-114, 120-120, 120-120, 120-114]
plot_zone('orange', False, '')
x, y = [62, 62, 80, 80], [120-60, 120-98, 120-98, 120-60]
plot_zone('orange', False, '')
x, y = [18, 18, 0, 0], [120-60, 120-98, 120-98, 120-60]
plot_zone('orange', False, '')

# Plot blue zone
x = [41, 39, 36.55172413793103, 36.91470054446461, 37.277676950998185, 37.64065335753176, 38.00362976406534, 38.366606170598914, 38.72958257713248, 39.09255898366606, 39.455535390199636, 39.81851179673321, 40.18148820326679, 40.544464609800364, 40.90744101633394, 41.27041742286752, 41.63339382940109, 41.99637023593466, 42.35934664246824, 42.722323049001815, 43.08529945553539, 43.44827586206897]
y = [120-114, 120-114, 21.38666040608014, 21.512146301948526, 21.622315584976075, 21.717689201691616, 21.79869919331511, 21.86569939730957, 21.918973715645922, 21.95874243074269, 21.985166913410858, 21.998352965967694, 21.998352965967694, 21.985166913410858, 21.95874243074269, 21.918973715645922, 21.86569939730957, 21.79869919331511, 21.717689201691616, 21.622315584976075, 21.512146301948526, 21.38666040608014]
plt.fill(x, y, color='blue', alpha=0.4)
# Point at (1000, 1000), far outside the axis limits set at the end of this cell, used only to create the legend entry
plt.scatter(1000, 1000, color='blue', marker='v', s=s_ZONES, alpha=alpha_PITCH, label='Blue Zone')

# Plot light blue zones
x = [32, 32.30769230769231, 32.62166405023548, 32.93563579277865, 33.24960753532182, 33.56357927786499, 33.87755102040816, 34.19152276295134, 34.505494505494504, 34.81946624803768, 35.133437990580845, 35.44740973312402, 35.76138147566719, 36.07535321821036, 36.389324960753534, 36.55172413793103, 39, 36, 18, 18]
y = [120-102, 18.389710663783134, 18.749826561654196, 19.07776506728853, 19.377818212237006, 19.65326651095276, 19.906681888902252, 20.140122369269857, 20.355262375943013, 20.55348291895057, 20.735935794663227, 20.903590391631127, 21.057268517891185, 21.197670772439523, 21.325396815201085, 21.38666040608014, 120-114, 120-117.6, 120-106.8, 120-102]
plot_zone('blue', True, 'Light Blue Zone')
x = [48, 62, 62, 44, 41, 43.44827586206897, 43.61067503924647, 43.92464678178964, 44.23861852433281, 44.55259026687598, 44.866562009419155, 45.18053375196232, 45.494505494505496, 45.80847723704866, 46.12244897959184, 46.43642072213501, 46.75039246467818, 47.06436420722135, 47.37833594976452, 47.69230769230769]
y = [120-102, 120-102, 120-106.8, 120-117.6, 120-114, 21.38666040608014, 21.32539681520108, 21.197670772439523, 21.057268517891185, 20.903590391631127, 20.735935794663227, 20.55348291895057, 20.355262375943013, 20.140122369269857, 19.906681888902252, 19.65326651095276, 19.377818212237006, 19.07776506728853, 18.749826561654196, 18.389710663783134]
plot_zone('blue', False, '')

# Plot yellow zones
x, y = [18, 18, 36, 36], [120-106.8, 120-120, 120-120, 120-117.6]
plot_zone('yellow', True, 'Yellow Zone')
x, y = [44, 44, 62, 62], [120-117.6, 120-120, 120-120, 120-106.8]
plot_zone('yellow', False, '')

# Plot geo points: black square at (x, y) with its coordinate label shifted by (label dx, label dy)
ls_geo_points = [
    # x, y, label dx, label dy, label
    (62, 120-120, -4, -2.5, '(62, 0)'),
    (80, 120-120, -3.5, -2.5, '(80, 0)'),
    (44, 120-117.6, 0.6, -0.25, '(44, 2.4)'),
    (40.915, 120-114, 0.6, 0.5, '(40.915, 6)'),
    (62, 120-114, -4, -2.5, '(62, 6)'),
    (80, 120-114, -3.5, -2.5, '(80, 6)'),
    (62, 13.2, -4, -2.5, '(62, 13.2)'),
    (48, 120-102, -4, -2.5, '(48, 18)'),
    (62, 120-102, -4, -2.5, '(62, 18)'),
    (43.44827586206897, 21.38666040608014, -7, 1.2, '(43.45, 21.39)'),
    (62, 120-98, -4, -2.5, '(62, 22)'),
    (80, 120-98, -4, -2.5, '(80, 22)'),
    (62, 60, -4, -2.5, '(62, 60)'),
    (80, 60, -4, -2.5, '(80, 60)'),
]
for x_geo, y_geo, dx_label, dy_label, geo_label in ls_geo_points:
    plt.scatter(x_geo, y_geo, marker='s', color='black', s=s_PITCH)
    plt.text(x_geo+dx_label, y_geo+dy_label, geo_label, fontsize=20)

# Scale plot (equal axis scaling, then crop to the displayed part of the pitch)
plt.axis('scaled')
plt.xlim(-5, 85)
plt.ylim(-5, 65)

# Plot legend and save figure
plt.legend(loc='upper left', prop={'size': fontsize_PITCH_SMALL})
plt.savefig('images/zones_coordinates.png', bbox_inches='tight')

ML model:

In [None]:
# List with all ml models to compare (unfitted sklearn regressors, refitted for every dataframe)
LS_ML_MODEL_NAMES = [
    DecisionTreeRegressor(random_state = 0), 
    RandomForestRegressor(random_state = 0), 
    LinearRegression()
]

# Create dataframe
df_1v1_men_mirrored = mirror_shots(df_1v1_men)
# Lists with all dataframes and dataframe names to test
ls_dataframes = [df_1v1_men, df_1v1_men_mirrored]
ls_dataframe_names = ['1v1_men', '1v1_men_mirrored']

# Iterate over all dataframes together with their names
for dataframe_name, df in zip(ls_dataframe_names, ls_dataframes):
    # Define features and targets (uncomment one of the three feature vectors X)
    #X = df[['x_shot', 'y_shot']]
    #X = df[['shooting_angle', 'goal_angle', 'bisector_slope', 'distance_between_shot_and_goalline', 'distance_between_shot_and_goalcenter']]
    X = df[['x_shot', 'y_shot', 'shooting_angle', 'goal_angle', 'bisector_slope', 'distance_between_shot_and_goalline', 'distance_between_shot_and_goalcenter']]
    y = df[['x_gk', 'y_gk']]
    # Define train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=0)
    # Actual gk positions of the test rows as array (column 0: x_gk, column 1: y_gk)
    y_true = y_test.to_numpy()

    # Iterate over all models
    for model in LS_ML_MODEL_NAMES:
        # Fit model
        model.fit(X_train, y_train)
        # Test model
        y_pred = np.asarray(model.predict(X_test))

        # Prediction error per test row and coordinate
        errors = y_pred - y_true
        # Calculate mean absolute error ('mae') of x, of y and of the Euclidean distance d
        x_mae = round(np.abs(errors[:, 0]).mean(), 2)
        y_mae = round(np.abs(errors[:, 1]).mean(), 2)
        d_mae = round(np.hypot(errors[:, 0], errors[:, 1]).mean(), 2)

        # Print results
        print('DataFrame:', dataframe_name)
        print('ML model:', model)
        # print('x_mae:', x_mae)
        # print('y_mae:', y_mae)
        print('d_mae:', d_mae)
        print()