### Import

In [None]:
# Import standard packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os
from numpy.linalg import norm
from statistics import mean

# Import sklearn packages
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_absolute_error

# Import specific packages
from mplsoccer import Pitch
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

### Configuration

In [None]:
# Display all columns of a dataframe (None removes pandas' limit on displayed columns)
pd.set_option("display.max_columns", None)

In [None]:
# Directory including all event files (machine-specific absolute path, adjust before running)
directory = '/Users/gian-andrea/Documents/Masterarbeit (offline)/statsbomb360-euro2020/events/'

# Filepath to one event file
filepath = directory + '3788745.json'

# Filepath to match file
matchpath = '/Users/gian-andrea/Documents/Masterarbeit (offline)/statsbomb360-euro2020/matches.json'

# Goalkeeper reach, compared against distances between pitch coordinates in gk_bisector
# NOTE(review): origin of the value 3.12 is not documented here - confirm
goalkeeper_reach = 3.12

# Suffixes of the goalkeeper position columns ('x_gk' + suffix, 'y_gk' + suffix);
# the empty suffix refers to the position taken from the freeze frame
ls_gk_names = ['', '_avg', '_geo', '_line', '_arc', '_bisector']

### Dataframe Methods

In [None]:
# Create dataframe including data from all json files in directory
def create_dataframe(directory):
    """Stack the 'shot', 'possession_team' and 'location' columns of all event files.

    Every entry of `directory` whose extension is '.json' is read with
    `pd.read_json` and its three columns are appended to the result.

    Parameters
    ----------
    directory : str
        Path of the folder that holds the event files.

    Returns
    -------
    pd.DataFrame
        One row per event with the columns 'shot', 'possession_team' and
        'location'. A column that is missing in a file is filled with NaN.
    """
    columns = ['shot', 'possession_team', 'location']
    collected = {column: [] for column in columns}

    # os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible
    for filename in sorted(os.listdir(directory)):
        # Look at the real extension: a name without a dot must not raise and a
        # name with several dots must not be skipped
        if os.path.splitext(filename)[1] != '.json':
            continue
        # Transform current json file to dataframe
        df_file = pd.read_json(os.path.join(directory, filename))
        # reindex adds an all-NaN column instead of raising KeyError when a file lacks one
        df_file = df_file.reindex(columns = columns)
        # Transform columns of current dataframe to lists and add them to existing lists
        for column in columns:
            collected[column].extend(df_file[column].tolist())

    # Return dataframe
    return pd.DataFrame(collected, columns = columns)

# Load the events of all files, print the number of rows and preview the first row
df_all = create_dataframe(directory)
print('Number of rows:', len(df_all))
df_all.head(1)

In [None]:
# Create dataframe including all undeflected shots from open plays
def filter_shots(df):
    """Keep the undeflected open-play shots and split their location into coordinates.

    Rows containing NaN are dropped first, then every row is removed whose
    shot type is not 'Open Play' or whose shot dict has a 'deflected' key.
    The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Events with the columns 'shot' (dict), 'possession_team' and
        'location' (sequence starting with x and y).

    Returns
    -------
    pd.DataFrame
        The remaining rows with 'location' replaced by 'x_shot' and 'y_shot'
        and a fresh 0..n-1 index. The frame is empty, but still has both
        coordinate columns, when no shot passes the filter.
    """
    # Filter input dataframe for shots
    df = df.dropna().reset_index(drop = True)

    # Mark undeflected open play shots
    keep = [
        shot['type']['name'] == 'Open Play' and 'deflected' not in shot
        for shot in df['shot']
    ]
    # An explicit boolean Series keeps this a row filter even when the list is empty
    df = df[pd.Series(keep, index = df.index, dtype = bool)].copy()

    # Transform shot location to shot coordinates; indexing instead of zip(*...) also
    # works for an empty frame and for locations with more than two entries
    df['x_shot'] = [location[0] for location in df['location']]
    df['y_shot'] = [location[1] for location in df['location']]
    # Delete shot location column
    del df['location']

    # Return dataframe
    return df.reset_index(drop = True)

# Filter the events for undeflected open play shots, print the number of rows and preview the first row
df_shot = filter_shots(df_all)
print('Number of rows:', len(df_shot))
df_shot.head(1)

In [None]:
# Create previous dataframe including additional data
def complete_shots(df):
    """Flatten every shot with a freeze frame into one row of plain values.

    Runs `filter_shots` on the raw events and keeps the shots whose shot dict
    has a 'freeze_frame' that contains an opposing goalkeeper.

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `filter_shots`.

    Returns
    -------
    pd.DataFrame
        Columns 'outcome', 'statsbomb_xg' (rounded to 4 decimals),
        'possession_team', 'shot_body_part', 'shot_technique', 'goalkeeper',
        'x_shot', 'y_shot', 'x_gk', 'y_gk' and 'player_locations'.
        'player_locations' holds the locations of all other players of the
        freeze frame, or '' when there are none.
    """
    # Create previous dataframe
    df = filter_shots(df)
    columns = ['outcome', 'statsbomb_xg', 'possession_team', 'shot_body_part', 'shot_technique',
               'goalkeeper', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'player_locations']
    records = []

    # Iterate over all rows in dataframe
    for shot, team, x_shot, y_shot in zip(df['shot'], df['possession_team'], df['x_shot'], df['y_shot']):

        # Skip shots without freeze_frame
        if 'freeze_frame' not in shot:
            continue

        # Split the freeze frame into the opposing goalkeeper and all remaining players
        keeper = None
        ls_player_locations_nested = []
        for player in shot['freeze_frame']:
            is_keeper = player['position']['name'] == 'Goalkeeper' and not player['teammate']
            if is_keeper and keeper is None:
                keeper = player
            else:
                ls_player_locations_nested.append(player['location'])

        # A shot without opposing goalkeeper is skipped as a whole. Appending the
        # goalkeeper values only when one exists would shift them against the
        # other columns (or make the column lengths differ).
        if keeper is None:
            continue

        records.append({
            'outcome': shot['outcome']['name'],
            'statsbomb_xg': round(shot['statsbomb_xg'], 4),
            'possession_team': team['name'],
            'shot_body_part': shot['body_part']['name'],
            'shot_technique': shot['technique']['name'],
            'goalkeeper': keeper['player']['name'],
            'x_shot': x_shot,
            'y_shot': y_shot,
            'x_gk': keeper['location'][0],
            'y_gk': keeper['location'][1],
            # Remaining player locations or empty string
            'player_locations': ls_player_locations_nested if ls_player_locations_nested else '',
        })

    # Return dataframe (passing the columns keeps them present even without any record)
    return pd.DataFrame(records, columns = columns)

# Build the flattened shot table from the raw events and preview the first row
df_complete_shots = complete_shots(df_all)
df_complete_shots.head(1)

In [None]:
# Create previous dataframe with additional data on players in shot polygon
def polygon_players(df):
    """Add the players standing inside the triangle between shot and goalposts.

    The triangle has the corners (x_shot, y_shot), (120, 36) and (120, 44).
    Shots without any other player location are dropped beforehand.

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `complete_shots`.

    Returns
    -------
    pd.DataFrame
        The `complete_shots` table with the extra columns 'polygon_players'
        (list of locations strictly inside the triangle, or '' when there is
        none) and 'polygon_players_count'.
    """
    # Create previous dataframe and exclude shots without coordinates from other players.
    # The mask is taken from the frame computed here, not from a notebook global,
    # so the result only depends on the argument.
    df = complete_shots(df)
    df = df[df['player_locations'] != ''].dropna().reset_index(drop = True)
    # Create empty lists
    ls_polygon_players = []
    ls_polygon_players_count = []

    # Iterate over all rows in dataframe
    for x_shot, y_shot, locations in zip(df['x_shot'], df['y_shot'], df['player_locations']):

        # The polygon only depends on the shot, so it is built once per shot
        polygon = Polygon([(x_shot, y_shot), (120, 36), (120, 44)])
        # Collect the players inside the polygon (contains excludes the boundary)
        ls_polygon_players_nested = [
            location for location in locations
            if polygon.contains(Point(location[0], location[1]))
        ]

        # Add nested polygon players list or empty string to list
        ls_polygon_players.append(ls_polygon_players_nested if ls_polygon_players_nested else '')
        # Add polygon players count values to list
        ls_polygon_players_count.append(len(ls_polygon_players_nested))

    # Add lists to dataframe
    df['polygon_players'] = ls_polygon_players
    df['polygon_players_count'] = ls_polygon_players_count

    # Return dataframe
    return df

# Add the players inside the shot polygon and preview the first two rows
df_polygon_players = polygon_players(df_all)
df_polygon_players.head(2)

In [None]:
# Create previous dataframe including shooting angle
def shooting_angle(df):
    """Add the angle between the lines from the shot to the two goalposts.

    The goalposts are (120, 36) and (120, 44).

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `polygon_players`.

    Returns
    -------
    pd.DataFrame
        The `polygon_players` table with the extra column 'shooting_angle'
        in degrees, rounded to 4 decimals.
    """
    # Create previous dataframe
    df = polygon_players(df)
    # Create empty list
    ls_shooting_angle = []

    # Iterate over all rows in dataframe
    for x_shot, y_shot in zip(df['x_shot'], df['y_shot']):
        # Directions from shot position to goalposts. atan2 gives the same angle as
        # atan(dy/dx) for shots in front of the goal line and stays defined for
        # x_shot == 120, where the slope would divide by zero.
        dx = 120 - x_shot
        angle_lower_post = math.atan2(36 - y_shot, dx)
        angle_upper_post = math.atan2(44 - y_shot, dx)
        # Add value to list
        ls_shooting_angle.append(round(math.degrees(abs(angle_lower_post - angle_upper_post)), 4))

    # Add list to dataframe
    df['shooting_angle'] = ls_shooting_angle

    # Return dataframe
    return df

# Add the shooting angle (this recomputes all previous steps from the raw events) and preview the first row
df_shooting_angle = shooting_angle(df_all)
df_shooting_angle.head(1)

In [None]:
# Create previous dataframe including bisector slope
def bisector_slope(df):
    """Add the slope of the bisector of the shooting angle.

    The goalpost (120, 36) is rotated counterclockwise around the shot
    position by half the shooting angle; the slope of the line from the shot
    to the rotated point is stored in 'bisector_slope', rounded to 4 decimals.

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `shooting_angle`.

    Returns
    -------
    pd.DataFrame
        The `shooting_angle` table with the extra column 'bisector_slope'.
    """
    # Create previous dataframe
    df = shooting_angle(df)

    # Goalpost that gets rotated onto the bisector
    post_x, post_y = 120, 36
    slopes = []

    # Iterate over all rows in dataframe
    for row in range(len(df)):

        # Rotation origin = shot coordinates
        origin_x = df.at[row, 'x_shot']
        origin_y = df.at[row, 'y_shot']
        # Half shooting angle in radians
        half_angle = math.radians(df.at[row, 'shooting_angle'])/2
        cos_half = math.cos(half_angle)
        sin_half = math.sin(half_angle)

        # Vector from shot to goalpost
        to_post_x = post_x - origin_x
        to_post_y = post_y - origin_y

        # Rotate goalpost counterclockwise by half the shooting angle around the shot
        rotated_x = origin_x + cos_half * to_post_x - sin_half * to_post_y
        rotated_y = origin_y + sin_half * to_post_x + cos_half * to_post_y

        # Slope of the line from shot to rotated point
        slopes.append(round((rotated_y-origin_y)/(rotated_x-origin_x), 4))

    # Add list to dataframe
    df['bisector_slope'] = slopes

    # Return dataframe
    return df

# Add the bisector slope (this recomputes all previous steps from the raw events) and preview the first row
df_bisector_slope = bisector_slope(df_all)
df_bisector_slope.head(1)

In [None]:
# Create previous dataframe including bisector goalkeeper position
def gk_bisector(df):
    """Add the goalkeeper position on the bisector of the shooting angle.

    The position is the foot of the perpendicular from the closer goalpost on
    the bisector. If that point is further than `goalkeeper_reach` away from
    the goalpost, the keeper is instead placed on the bisector at the distance
    reach / tan(half shooting angle) from the shot.

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `bisector_slope`.

    Returns
    -------
    pd.DataFrame
        The `bisector_slope` table with the extra columns 'x_gk_bisector' and
        'y_gk_bisector', rounded to 2 decimals.

    Notes
    -----
    A shooting angle of 0 still divides by zero in the reach formula.
    """
    # Create previous dataframe
    df = bisector_slope(df)
    # Create empty lists
    ls_x_gk_bisector = []
    ls_y_gk_bisector = []

    # Iterate over all rows in dataframe
    for j in range(0, len(df)):

        # Define linear equation of bisector
        x_shot = df.at[j, 'x_shot']
        y_shot = df.at[j, 'y_shot']
        m_bisector = df.at[j, 'bisector_slope']
        q_bisector = y_shot - m_bisector*x_shot
        # Define half shooting angle
        ang = math.radians(df.at[j, 'shooting_angle'])/2

        # Define closer goalpost. This happens before the branch so that the reach
        # check below also has a goalpost for central shots; otherwise the check reads
        # names that are undefined on the first row or left over from the previous row.
        x_post = 120
        d1 = math.dist([x_shot, y_shot], [120, 36])
        d2 = math.dist([x_shot, y_shot], [120, 44])
        y_post = 36 if d1 < d2 else 44

        # If shot is central
        if m_bisector == 0:
            x_shift = (math.cos(ang)*goalkeeper_reach)/math.sin(ang)
            x_gk_bisector = x_shot + x_shift
            y_gk_bisector = 40
        # If shot is not central
        else:
            # Define linear equation going through closer goalpost and perpendicular to bisector
            m_post = -(1/m_bisector)
            q_post = y_post - m_post*x_post

            # Calculate intersection of two linear equations
            y_gk_bisector = (q_bisector - q_post*m_bisector/m_post)/(1-(m_bisector/m_post))
            x_gk_bisector = (y_gk_bisector-q_post)/m_post

        # If covered length is not enough
        if math.dist([x_post, y_post], [x_gk_bisector, y_gk_bisector]) > goalkeeper_reach:

            # Distance from shot position to optimal position to cover goalkeeper reach
            distance = (math.cos(ang)*goalkeeper_reach)/math.sin(ang)
            # Shift from shot position to optimal position
            x_shift = distance/(math.sqrt(1+m_bisector*m_bisector))
            y_shift = x_shift*m_bisector
            # Calculate optimal position
            x_gk_bisector = x_shot + x_shift
            y_gk_bisector = y_shot + y_shift

        # Add values to lists
        ls_x_gk_bisector.append(round(x_gk_bisector, 2))
        ls_y_gk_bisector.append(round(y_gk_bisector, 2))

    # Add lists to dataframe
    df['x_gk_bisector'] = ls_x_gk_bisector
    df['y_gk_bisector'] = ls_y_gk_bisector

    # Return dataframe
    return df

# Add the bisector goalkeeper position (this recomputes all previous steps from the raw events) and preview the first row
df_gk_bisector = gk_bisector(df_all)
df_gk_bisector.head(1)

In [None]:
# Create previous dataframe including average goalkeeper position
def gk_avg(df):
    """Add the mean goalkeeper position of the most similar shots.

    For every shot the up to 20 shots with the smallest distance between the
    shot positions are selected (the shot itself is one of them) and their
    'x_gk' / 'y_gk' values are averaged.

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `gk_bisector`.

    Returns
    -------
    pd.DataFrame
        The `gk_bisector` table with the extra columns 'x_gk_avg' and
        'y_gk_avg', rounded to 2 decimals.
    """
    # Create previous dataframe
    df = gk_bisector(df)
    # Number of most similar shots to average over
    n_neighbours = 20

    # Read the needed columns once instead of once per pair of shots
    shots = list(zip(df['x_shot'].tolist(), df['y_shot'].tolist()))
    x_gk = df['x_gk'].to_numpy()
    y_gk = df['y_gk'].to_numpy()
    # Create empty lists
    ls_x_gk_avg = []
    ls_y_gk_avg = []

    # Iterate over all shots
    for shot in shots:

        # Distance between shot and every shot of the dataframe
        ls_distance = [math.dist(shot, similar_shot) for similar_shot in shots]
        # Sort indices by ascending distance values and save the first (i.e. smallest) ones
        sorted_indices = np.argsort(ls_distance).tolist()[:n_neighbours]

        # Sum up goalkeeper coordinates of the most similar shots
        x_gk_tot = sum(x_gk[row] for row in sorted_indices)
        y_gk_tot = sum(y_gk[row] for row in sorted_indices)
        # Divide by the number of shots actually found: with fewer than 20 shots in
        # the dataframe a fixed divisor of 20 would shrink the mean
        ls_x_gk_avg.append(round(x_gk_tot/len(sorted_indices), 2))
        ls_y_gk_avg.append(round(y_gk_tot/len(sorted_indices), 2))

    # Add lists to dataframe
    df['x_gk_avg'] = ls_x_gk_avg
    df['y_gk_avg'] = ls_y_gk_avg

    # Return dataframe
    return df

# Add the average goalkeeper position (this recomputes all previous steps from the raw events) and preview the first row
df_gk_avg = gk_avg(df_all)
df_gk_avg.head(1)

In [None]:
# Create previous dataframe including geometric goalkeeper position
def gk_geo(df):
    """Add the geometric goalkeeper position, which depends on the zone of the shot.

    The pitch is split into coloured zones. The zone that strictly contains
    the shot (zones are tested in the order orange, red, blue, light blue,
    yellow) selects how the keeper is placed relative to the line from the
    shot to the goal centre (120, 40). A shot inside no zone gets (120, 40).

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `gk_avg`.

    Returns
    -------
    pd.DataFrame
        The `gk_avg` table with the extra columns 'x_gk_geo' and 'y_gk_geo',
        rounded to 2 decimals.
    """
    # Create previous dataframe
    df = gk_avg(df)

    # Zones of the pitch
    orange_zones = (
        Polygon([(114, 62), (120, 62), (120, 80), (114, 80)]),
        Polygon([(114, 0), (120, 0), (120, 18), (114, 18)]),
        Polygon([(60, 62), (98.85, 62), (98.85, 80), (60, 80)]),
        Polygon([(60, 0), (98.85, 0), (98.85, 18), (60, 18)]),
    )
    yellow_zones = (
        Polygon([(117.6, 44), (120, 44), (120, 62), (106.8, 62)]),
        Polygon([(106.8, 18), (120, 18), (120, 36), (117.6, 36)]),
    )
    red_zone = Polygon([(60, 18), (98.85, 18), (98.85, 0), (114, 0), (114, 18), (102, 18), (102, 33.1), (98.85, 33.1), (98.85, 46.9), (102, 46.9), (102, 62), (114, 62), (114, 80), (98.85, 80), (98.85, 62), (60, 62)])
    blue_zone = Polygon([(98.85, 36.774625), (114, 39.085), (114, 40.915), (98.85, 43.225375)])
    light_zones = (
        Polygon([(98.85, 43.225375), (114, 40.915), (117.6, 44), (106.8, 62), (102, 62), (102, 46.9), (98.85, 46.9)]),
        Polygon([(102, 18), (106.8, 18), (117.6, 36), (114, 39.085), (98.85, 36.774625), (98.85, 33.1), (102, 33.1)]),
    )

    # Goalkeeper lines as (slope, intercept); "lower" ends at y = 36, "upper" at y = 44
    red_lower_slope = (36-40)/(120-117.6)
    red_lower = (red_lower_slope, 36 - red_lower_slope*120)
    red_upper_slope = (44-40)/(120-117.6)
    red_upper = (red_upper_slope, 44 - red_upper_slope*120)
    light_lower_slope = (36-39.085)/(117.6-114)
    light_lower = (light_lower_slope, 36 - light_lower_slope*117.6)
    light_upper_slope = (44-40.915)/(117.6-114)
    light_upper = (light_upper_slope, 44 - light_upper_slope*117.6)

    def ball_line(x_shot, y_shot):
        # Slope and intercept of the line from the shot to the goal centre
        slope = (y_shot-40)/(x_shot-120)
        return slope, y_shot - slope*x_shot

    def crossing(gk_line, m_ball, q_ball):
        # Intersection of a goalkeeper line with the ball line
        m_gk, q_gk = gk_line
        y_cross = (q_gk - q_ball*m_gk/m_ball)/(1-(m_gk/m_ball))
        return (y_cross-q_ball)/m_ball, y_cross

    xs = []
    ys = []

    # Iterate over all rows in dataframe
    for row in range(len(df)):

        # Define shot coordinates and shot point
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        shot_point = Point(x_shot, y_shot)

        # Orange zones: fixed position
        if any(zone.contains(shot_point) for zone in orange_zones):
            x_geo, y_geo = 117.6, 40

        # Red zone: on one of the two lines towards a goalpost, fixed position for a central shot
        elif red_zone.contains(shot_point):
            m_ball, q_ball = ball_line(x_shot, y_shot)
            if m_ball == 0:
                x_geo, y_geo = 117.6, 40
            elif m_ball > 0:
                x_geo, y_geo = crossing(red_lower, m_ball, q_ball)
            else:
                x_geo, y_geo = crossing(red_upper, m_ball, q_ball)

        # Blue zone: on the ball line at x = 114
        elif blue_zone.contains(shot_point):
            m_ball, q_ball = ball_line(x_shot, y_shot)
            x_geo = 114
            y_geo = q_ball + m_ball*x_geo

        # Light blue zones: on one of the two light blue goalkeeper lines
        elif any(zone.contains(shot_point) for zone in light_zones):
            m_ball, q_ball = ball_line(x_shot, y_shot)
            if m_ball > 0:
                x_geo, y_geo = crossing(light_lower, m_ball, q_ball)
            else:
                x_geo, y_geo = crossing(light_upper, m_ball, q_ball)

        # Yellow zones: on the ball line at the height of a goalpost
        elif any(zone.contains(shot_point) for zone in yellow_zones):
            m_ball, q_ball = ball_line(x_shot, y_shot)
            y_geo = 36 if m_ball > 0 else 44
            x_geo = (y_geo-q_ball)/m_ball

        # No zone contains the shot (this includes shots exactly on a zone boundary)
        else:
            x_geo, y_geo = 120, 40

        # Add values to lists
        xs.append(round(x_geo, 2))
        ys.append(round(y_geo, 2))

    # Add lists to dataframe
    df['x_gk_geo'] = xs
    df['y_gk_geo'] = ys

    # Return dataframe
    return df

# Add the geometric goalkeeper position (this recomputes all previous steps from the raw events) and preview the first row
df_gk_geo = gk_geo(df_all)
df_gk_geo.head(1)

In [None]:
# Create previous dataframe including line goalkeeper position
def gk_line(df):
    """Add the goalkeeper position on the line x = 118.

    For a shot with x_shot < 118 the keeper stands where the line from the
    shot to the goal centre (120, 40) crosses x = 118. Every other shot gets
    the placeholder position (0, 0).

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `gk_geo`.

    Returns
    -------
    pd.DataFrame
        The `gk_geo` table with the extra columns 'x_gk_line' and
        'y_gk_line', rounded to 2 decimals.
    """
    # Create previous dataframe
    df = gk_geo(df)

    # x-coordinate of the goalkeeper line
    line_x = 118
    xs, ys = [], []

    # Iterate over all rows in dataframe
    for row in range(len(df)):
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']

        if x_shot < line_x:
            # Shot is further away than the line: evaluate the ball line at the line
            slope = (y_shot-40)/(x_shot-120)
            intercept = y_shot - slope*x_shot
            position = (line_x, intercept + slope*line_x)
        else:
            # Shot is not further away than the line: placeholder position
            position = (0, 0)

        # Add values to lists
        xs.append(round(position[0], 2))
        ys.append(round(position[1], 2))

    # Add lists to dataframe
    df['x_gk_line'] = xs
    df['y_gk_line'] = ys

    # Return dataframe
    return df

# Add the line goalkeeper position (this recomputes all previous steps from the raw events) and preview the first row
df_gk_line = gk_line(df_all)
df_gk_line.head(1)

In [None]:
# https://stackoverflow.com/questions/30844482/what-is-most-efficient-way-to-find-the-intersection-of-a-line-and-a-circle-in-py

# Find the points at which a circle intersects a line-segment (this can happen at 0, 1, or 2 points)
def circle_line_segment_intersection(pt1, full_line=True, tangent_tol=1e-9,
                                     circle_center=(120, 40), circle_radius=4, pt2=(120, 40)):
    """Intersect the line through `pt1` and `pt2` with a circle.

    With the defaults the circle has radius 4 around (120, 40) and the line
    runs from `pt1` to that centre.

    Parameters
    ----------
    pt1 : tuple
        The (x, y) location of the first point of the segment.
    full_line : bool
        True to find intersections along the full line, False to return only
        the intersections within the segment.
    tangent_tol : float
        Numerical tolerance at which the intersections are close enough to be
        treated as a tangent.
    circle_center : tuple
        The (x, y) location of the circle center.
    circle_radius : float
        The radius of the circle.
    pt2 : tuple
        The (x, y) location of the second point of the segment.

    Returns
    -------
    list of tuple
        0, 1 or 2 intersection points, ordered along the direction from `pt1`
        to `pt2`. The list is empty when `pt1` and `pt2` coincide, because
        they do not define a line.
    """
    (p1x, p1y), (p2x, p2y), (cx, cy) = pt1, pt2, circle_center
    (x1, y1), (x2, y2) = (p1x - cx, p1y - cy), (p2x - cx, p2y - cy)
    dx, dy = (x2 - x1), (y2 - y1)
    dr = (dx ** 2 + dy ** 2)**.5

    # Identical points span no line; returning early avoids the division by dr ** 2 = 0
    if dr == 0:
        return []

    big_d = x1 * y2 - x2 * y1
    discriminant = circle_radius ** 2 * dr ** 2 - big_d ** 2

    if discriminant < 0:  # No intersection between circle and line
        return []

    # There may be 0, 1, or 2 intersections with the segment
    intersections = [
        (cx + (big_d * dy + sign * (-1 if dy < 0 else 1) * dx * discriminant**.5) / dr ** 2,
         cy + (-big_d * dx + sign * abs(dy) * discriminant**.5) / dr ** 2)
        for sign in ((1, -1) if dy < 0 else (-1, 1))]  # This makes sure the order along the segment is correct
    if not full_line:  # If only considering the segment, filter out intersections that do not fall within the segment
        fraction_along_segment = [(xi - p1x) / dx if abs(dx) > abs(dy) else (yi - p1y) / dy for xi, yi in intersections]
        intersections = [pt for pt, frac in zip(intersections, fraction_along_segment) if 0 <= frac <= 1]
    if len(intersections) == 2 and abs(discriminant) <= tangent_tol:  # If line is tangent to circle, return just one point (as both intersections have same location)
        return [intersections[0]]
    return intersections

In [None]:
# Create previous dataframe including arc goalkeeper position
def gk_arc(df):
    """Add the goalkeeper position on the circle used by `circle_line_segment_intersection`.

    The keeper is placed at the first intersection that
    `circle_line_segment_intersection` returns for the shot position. When no
    intersection is returned, or when x_shot is greater than the x-coordinate
    of that intersection, the position is (120, 40).

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `gk_line`.

    Returns
    -------
    pd.DataFrame
        The `gk_line` table with the extra columns 'x_gk_arc' and 'y_gk_arc',
        rounded to 2 decimals.
    """
    # Create previous dataframe
    df = gk_line(df)
    # Create empty lists
    ls_x_gk_arc = []
    ls_y_gk_arc = []

    # Iterate over all rows in dataframe
    for row in range(0, len(df)):

        # Define shot coordinates
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']

        # Calculate intersection of linear equation with circle
        intersections = circle_line_segment_intersection((x_shot, y_shot))

        # The helper may return an empty list; indexing it blindly would raise IndexError
        if intersections:
            x_gk_arc, y_gk_arc = intersections[0]
        else:
            x_gk_arc, y_gk_arc = 120, 40

        # If shot is not further away than intersection
        if x_shot > x_gk_arc:
            # Define gk_arc coordinates
            x_gk_arc = 120
            y_gk_arc = 40

        # Add values to lists
        ls_x_gk_arc.append(round(x_gk_arc, 2))
        ls_y_gk_arc.append(round(y_gk_arc, 2))

    # Add lists to dataframe
    df['x_gk_arc'] = ls_x_gk_arc
    df['y_gk_arc'] = ls_y_gk_arc

    # Return dataframe
    return df

# Add the arc goalkeeper position (this recomputes all previous steps from the raw events) and preview the first row
df_gk_arc = gk_arc(df_all)
df_gk_arc.head(1)

In [None]:
# Create previous dataframe including dimensionality reduced data
def reduce(df):
    """Add angle and distance features and return the final column selection.

    Added columns: 'goal_angle' (angle of the line shot - goal centre in
    degrees), 'distance_shot_goalcenter', 'distance_shot_goalline',
    'distance_gk_goalline' and 'angular_deviation' (difference in degrees
    between the line shot - goalkeeper and the bisector).

    Parameters
    ----------
    df : pd.DataFrame
        Raw events as accepted by `gk_arc`.

    Returns
    -------
    pd.DataFrame
        The columns listed in `selected_columns` below, in that order.
    """
    # Create previous dataframe
    df = gk_arc(df)

    # One list per new column
    features = {
        'goal_angle': [],
        'distance_shot_goalcenter': [],
        'distance_shot_goalline': [],
        'distance_gk_goalline': [],
        'angular_deviation': [],
    }

    # Iterate over all rows in dataframe
    for row in range(len(df)):

        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        x_gk = df.at[row, 'x_gk']
        y_gk = df.at[row, 'y_gk']

        # Slopes of the lines shot - goal center and shot - goalkeeper
        slope_goalcenter = (40-y_shot)/(120-x_shot)
        slope_gk = (y_gk-y_shot)/(x_gk-x_shot)
        slope_bisector = df.at[row, 'bisector_slope']

        goal_angle = math.degrees(abs(math.atan(slope_goalcenter)))
        deviation = math.degrees(abs(math.atan(slope_gk) - math.atan(slope_bisector)))

        features['goal_angle'].append(round(goal_angle, 2))
        features['distance_shot_goalcenter'].append(round(math.dist([x_shot, y_shot], [120, 40]), 2))
        features['distance_shot_goalline'].append(120 - x_shot)
        features['distance_gk_goalline'].append(120 - x_gk)
        features['angular_deviation'].append(round(deviation, 2))

    # Add lists to dataframe
    for column, values in features.items():
        df[column] = values

    # Columns of the returned dataframe
    selected_columns = [
        'outcome', 'possession_team', 'shot_body_part', 'shot_technique', 'goalkeeper',
        'x_shot', 'y_shot', 'x_gk', 'y_gk',
        'x_gk_avg', 'y_gk_avg', 'x_gk_geo', 'y_gk_geo', 'x_gk_line', 'y_gk_line',
        'x_gk_arc', 'y_gk_arc', 'x_gk_bisector', 'y_gk_bisector',
        'player_locations', 'polygon_players', 'polygon_players_count',
        'shooting_angle', 'bisector_slope', 'goal_angle',
        'distance_shot_goalcenter', 'distance_shot_goalline', 'distance_gk_goalline',
        'angular_deviation',
    ]

    # Return dataframe
    return df[selected_columns]


# Build the final feature table (this recomputes all previous steps from the raw events) and preview the first row
df_reduce = reduce(df_all)
df_reduce.head(1)

### Goalkeeper Methods

In [None]:
# Add column with distance between two positions to dataframe
def distance_gk_gk_alt(df, gk_alt):
    """Add the distance between the goalkeeper and an alternative goalkeeper position.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'x_gk', 'y_gk', 'x_gk' + gk_alt and 'y_gk' + gk_alt.
        It is modified in place.
    gk_alt : str
        Suffix of the alternative position columns, e.g. '_avg'.

    Returns
    -------
    pd.DataFrame
        `df` with the extra column 'distance_gk_gk' + gk_alt, rounded to 2 decimals.
    """
    columns = ['x_gk', 'y_gk', 'x_gk' + gk_alt, 'y_gk' + gk_alt]

    # Walk through the columns in row order instead of looking rows up by the labels
    # 0..n-1, so a dataframe with any index (e.g. a filtered one) works as well
    ls_distance = [
        round(math.dist([x_gk, y_gk], [x_gk_alt, y_gk_alt]), 2)
        for x_gk, y_gk, x_gk_alt, y_gk_alt in zip(*(df[column] for column in columns))
    ]

    # Define column name
    name = 'distance_gk_gk' + gk_alt
    # Add list to dataframe
    df[name] = ls_distance

    # Return dataframe
    return df

In [None]:
# Add column with distance between goalkeeper and goalline to dataframe
def distance_gk_goalline(df, gk_name):
    """Add the distance between a goalkeeper position and the goal line x = 120.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the column 'x_gk' + gk_name. It is modified in place.
    gk_name : str
        Suffix of the goalkeeper position column, e.g. '' or '_geo'.

    Returns
    -------
    pd.DataFrame
        `df` with the extra column 'distance_gk' + gk_name + '_goalline'.
    """
    # Define column name
    name = 'distance_gk' + gk_name + '_goalline'
    # Column arithmetic does not look rows up by the labels 0..n-1, so a dataframe
    # with any index works; to_numpy() assigns the values by position
    df[name] = (120 - df['x_gk' + gk_name]).to_numpy()

    # Return dataframe
    return df

In [None]:
# Add column to df with distance between bisector and gk_name
def distance_gk_bisector(df, gk_name):
    """Add the distance between a goalkeeper position and the bisector.

    The bisector is the line through the shot position with the slope
    'bisector_slope'. The distance is measured to the foot of the
    perpendicular from the goalkeeper position; for a slope of 0 the foot is
    taken as (x_gk, 40).

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'x_shot', 'y_shot', 'bisector_slope',
        'x_gk' + gk_name and 'y_gk' + gk_name. It is modified in place.
    gk_name : str
        Suffix of the goalkeeper position columns, e.g. '' or '_geo'.

    Returns
    -------
    pd.DataFrame
        `df` with the extra column 'distance_gk' + gk_name + '_bisector',
        rounded to 2 decimals.
    """
    # Create empty list
    ls_distance_gk_bisector = []

    # Walk through the columns in row order instead of looking rows up by the labels
    # 0..n-1, so a dataframe with any index (e.g. a filtered one) works as well
    rows = zip(df['x_shot'], df['y_shot'], df['x_gk' + gk_name], df['y_gk' + gk_name], df['bisector_slope'])
    for x_shot, y_shot, x_gk, y_gk, m_bisector in rows:

        # Define linear equation of bisector
        q_bisector = y_shot - m_bisector *x_shot

        # If central shot
        if m_bisector == 0:
            # Define intersection of two linear equations
            x_intersection = x_gk
            y_intersection = 40
        # If no central shot
        else:
            # Define linear equation going through goalkeeper position and perpendicular to bisector
            m_perpendicular = -(1/m_bisector)
            q_perpendicular = y_gk - m_perpendicular *x_gk
            # Calculate intersection of two linear equations
            y_intersection = (q_perpendicular - q_bisector*m_perpendicular/m_bisector)/(1-(m_perpendicular/m_bisector))
            x_intersection = (y_intersection-q_bisector)/m_bisector

        # Add value to list
        ls_distance_gk_bisector.append(round(math.dist([x_gk, y_gk], [x_intersection, y_intersection]), 2))

    # Define column name
    col_name = 'distance_gk' + gk_name +'_bisector'
    # Add list to dataframe
    df[col_name] = ls_distance_gk_bisector

    # Return dataframe
    return df

In [None]:
# Add column with ratio between two distances to dataframe
def ratio_shotgk_gkbisector(df, gk_name):
    """Add the ratio distance(shot, goalkeeper) / distance(goalkeeper, bisector).

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'x_shot', 'y_shot', 'x_gk' + gk_name,
        'y_gk' + gk_name and 'distance_gk' + gk_name + '_bisector'.
        It is modified in place.
    gk_name : str
        Suffix of the goalkeeper position columns, e.g. '' or '_geo'.

    Returns
    -------
    pd.DataFrame
        `df` with the extra column
        'ratio_shotgk' + gk_name + '_bisectorgk' + gk_name, rounded to 2
        decimals. Rows whose distance to the bisector is 0 get the fixed
        value 4030 instead of a ratio.
    """
    # Create empty list
    ls_ratio_shotgk_gkbisector = []

    # Walk through the columns in row order instead of looking rows up by the labels
    # 0..n-1, so a dataframe with any index (e.g. a filtered one) works as well.
    # The divisor is read from the numpy array so that it keeps its numpy scalar type.
    rows = zip(
        df['x_shot'], df['y_shot'],
        df['x_gk' + gk_name], df['y_gk' + gk_name],
        df['distance_gk' + gk_name + '_bisector'].to_numpy(),
    )
    for x_shot, y_shot, x_gk, y_gk, distance_gk_bisector in rows:

        # Calculate value
        if distance_gk_bisector != 0:
            ratio = round(math.dist([x_shot, y_shot], [x_gk, y_gk]) / distance_gk_bisector, 2)
        else:
            # Fixed value for a goalkeeper standing exactly on the bisector
            ratio = 4030
        # Add value to list
        ls_ratio_shotgk_gkbisector.append(ratio)

    # Define column name
    col_name = 'ratio_shotgk' + gk_name + '_bisectorgk' + gk_name
    # Add list to dataframe
    df[col_name] = ls_ratio_shotgk_gkbisector

    # Return dataframe
    return df

### Other Methods

In [None]:
# Filter dataframe for 1v1 situations
def filter_1v1s(df):
    """Return only the rows whose 'polygon_players_count' equals 0.

    Rows are selected with a boolean mask, so the result depends solely on
    'polygon_players_count': a matching row is kept even when some other
    column holds NaN, and column dtypes are left untouched.

    Args:
        df: Dataframe with a 'polygon_players_count' column.

    Returns:
        A new dataframe with the matching rows, re-indexed from 0.
    """
    is_1v1 = df['polygon_players_count'] == 0

    return df[is_1v1].reset_index(drop=True)

In [None]:
# Plot one shot
def plot_shot(df, row, gk_alt):
    """Plot one shot with the goalkeeper, the other players and an alternative goalkeeper position.

    Draws the shot location (red for a goal, blue otherwise), the recorded
    goalkeeper position (brown), every entry of 'polygon_players' (yellow),
    the alternative goalkeeper position (orange) and two black lines from the
    shot location to the points (120, 36) and (120, 44). The plotted
    coordinates are printed below the figure.

    Args:
        df: Dataframe with the columns 'x_shot', 'y_shot', 'outcome', 'x_gk',
            'y_gk', 'polygon_players', 'x_<gk_alt>' and 'y_<gk_alt>'.
        row: Index label of the shot to plot.
        gk_alt: Name of the alternative goalkeeper position, e.g. 'gk_geo'.
    """
    # Soccer pitch (becomes the current axes used by the plt calls below)
    pitch = Pitch(pitch_color='grass', line_color='white', stripe=True)
    pitch.draw()

    # Player location: red dot for a goal, blue dot for every other outcome
    x_shot = df.at[row, 'x_shot']
    y_shot = df.at[row, 'y_shot']
    color = 'red' if df.at[row, 'outcome'] == 'Goal' else 'blue'
    plt.scatter(x_shot, y_shot, color=color)

    # Goalkeeper location
    x_gk = df.at[row, 'x_gk']
    y_gk = df.at[row, 'y_gk']
    plt.scatter(x_gk, y_gk, color='brown')

    # Other players location
    for player in df.at[row, 'polygon_players']:
        plt.scatter(player[0], player[1], color='yellow')

    # Alternative goalkeeper location
    x_gk_alt = df.at[row, 'x_' + gk_alt]
    y_gk_alt = df.at[row, 'y_' + gk_alt]
    plt.scatter(x_gk_alt, y_gk_alt, color='orange')

    # Lines between player location and the two goalposts (gp1 and gp2)
    for y_goalpost in (36, 44):
        plt.plot([x_shot, 120], [y_shot, y_goalpost], color='black')

    # Scale plot
    plt.axis('scaled')
    plt.xlim([-5, 125])
    plt.ylim([-5, 85])

    # Display plot
    plt.show()

    # Display shot location, goalkeeper location and alternative goalkeeper
    # location; the shot label names the colour that was actually drawn
    print('x_shot:', x_shot, 'y_shot:', y_shot, '(' + color + ')')
    print('x_gk:', x_gk, 'y_gk:', y_gk, '(brown)')
    print('x_' + gk_alt + ':', x_gk_alt, 'y_' + gk_alt + ':', y_gk_alt, '(orange)')

In [None]:
# Plot all shots
def plot_all(df):
    """Plot the location of every shot in the dataframe on one pitch.

    Goals are drawn in red, all other outcomes in blue. All points are drawn
    with a single scatter call, so the dataframe may carry any index.

    Args:
        df: Dataframe with the columns 'x_shot', 'y_shot' and 'outcome'.
    """
    # Soccer pitch (becomes the current axes used by the plt calls below)
    pitch = Pitch(pitch_color='grass', line_color='white', stripe=True)
    pitch.draw()

    # Red dots for goals and blue dots for the rest
    colors = np.where(df['outcome'] == 'Goal', 'red', 'blue').tolist()

    # Plot all points at once
    plt.scatter(df['x_shot'], df['y_shot'], c=colors)

    # Scale plot
    plt.axis('scaled')
    plt.xlim([-5, 125])
    plt.ylim([-5, 85])

    # Display plot
    plt.show()

In [None]:
# Create coordinate system
def coordinates(x, y):
    """Create an empty Cartesian coordinate system centred on the origin.

    The axes span -x..x and -y..y (plus a margin of 1), cross at zero, carry
    a tick at every integer except 0 and show a light grid. The figure stays
    the current matplotlib figure so that later plt calls draw into it.

    Args:
        x: Extent of the x axis in both directions.
        y: Extent of the y axis in both directions.
    """
    # Get inline charts when running inside IPython/Jupyter (same effect as
    # the `%matplotlib inline` magic); plain Python has no get_ipython()
    try:
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass
    # Define x range, y range, and tick interval for both axes
    xmin, xmax, ymin, ymax = -x, x, -y, y
    ticks_frequency = 1
    # Create figure and axes object
    fig, ax = plt.subplots(figsize=(10, 10))
    # Set face color
    fig.patch.set_facecolor('#ffffff')
    # Apply ranges to axes
    ax.set(xlim=(xmin-1, xmax+1), ylim=(ymin-1, ymax+1), aspect='equal')
    # Set both axes to zero position
    ax.spines['bottom'].set_position('zero')
    ax.spines['left'].set_position('zero')
    # Hide the top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Set x label, y label, and add an origin label
    ax.set_xlabel('$x$', size=14, labelpad=-24, x=1.02)
    ax.set_ylabel('$y$', size=14, labelpad=-21, y=1.02, rotation=0)
    # 'ha' and 'horizontalalignment' name the same property, so only one
    # horizontal alignment is passed
    plt.text(0.49, 0.49, r"$O$", ha='right', va='top',
             transform=ax.transAxes, fontsize=14)
    # Create x tick, y tick, and apply them to both axes
    x_ticks = np.arange(xmin, xmax+1, ticks_frequency)
    y_ticks = np.arange(ymin, ymax+1, ticks_frequency)
    ax.set_xticks(x_ticks[x_ticks != 0])
    ax.set_yticks(y_ticks[y_ticks != 0])
    ax.set_xticks(np.arange(xmin, xmax+1), minor=True)
    ax.set_yticks(np.arange(ymin, ymax+1), minor=True)
    # Add a grid
    ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)

### Data

In [None]:
# Apply every goalkeeper method to every goalkeeper position.
# The methods run one after another (all positions per method), in the order listed.
gk_methods = (
    distance_gk_gk_alt,
    distance_gk_goalline,
    distance_gk_bisector,
    ratio_shotgk_gkbisector,
)
for gk_method in gk_methods:
    for gk_name in ls_gk_names:
        df_reduce = gk_method(df_reduce, gk_name)

# Display dataframe
print('Number of rows:', len(df_reduce))
df_reduce.head(1)

In [None]:
# Keep only the 1v1 situations and show how many rows remain
df_1v1 = filter_1v1s(df_reduce)
n_rows_1v1 = len(df_1v1)
print('Number of rows:', n_rows_1v1)
df_1v1.head(1)

### Plot

In [None]:
# Plot one shot: row 0 of the 1v1 dataframe, with 'gk_geo' as the alternative goalkeeper location
plot_shot(df_1v1, 0, 'gk_geo')

In [None]:
# Plot all shots contained in df_final
plot_all(df_final)

In [None]:
# Plot only the shots from 1v1 situations (df_1v1)
plot_all(df_1v1)

### Deviations

In [None]:
# Define lists of similar columns, one entry per goalkeeper position in ls_gk_names
ls_distance_gk = ['distance_gk_gk' + gk_name for gk_name in ls_gk_names]
ls_distance_bisector = ['distance_gk' + gk_name + '_bisector' for gk_name in ls_gk_names]

In [None]:
# Check a column's mean value for different outcomes
def check_deviation(df, col):
    """Print the mean of one column separately for saves, non-goals and goals.

    Rows are grouped with boolean masks on 'outcome', so a row contributes to
    its group even when an unrelated column holds NaN; only NaN values in
    ``col`` itself are skipped by the mean.

    Args:
        df: Dataframe with an 'outcome' column and the column ``col``.
        col: Name of the column whose mean is reported.
    """
    # Print column name
    print(col)

    outcome = df['outcome']
    groups = (
        ('Deviation for saves:', outcome == 'Saved'),
        ('Deviation for nogoals:', outcome != 'Goal'),
        ('Deviation for goals:', outcome == 'Goal'),
    )

    # Mean value of the column within each outcome group
    for label, mask in groups:
        print(label, round(df.loc[mask, col].mean(), 2))
    print()

# Apply method to each goalkeeper-to-bisector distance column of the 1v1 shots
for distance_name in ls_distance_bisector:
    check_deviation(df_1v1, distance_name)

In [None]:
# Calculate mean value of column
def mean_value(df, col):
    """Print 'Mean <col>' followed by the column mean rounded to two decimals."""
    label = 'Mean ' + col
    column_mean = round(df[col].mean(), 2)
    print(label, column_mean)

# Apply method to each goalkeeper-to-bisector distance column of the 1v1 shots
for distance_name in ls_distance_bisector:
    mean_value(df_1v1, distance_name)

Plot GKP relative to centered TOGKP:
- https://pygmalion.nitri.org/cartesian-coordinates-with-matplotlib-1263.html

In [None]:
# Create coordinate system
coordinates(12, 12)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Marker color per outcome; every outcome not listed here is drawn in orange
outcome_colors = {'Goal': 'red', 'Saved': 'green'}

# Plot one point per shot
for row in range(len(df)):

    # Goalkeeper position relative to centered TOGKP
    x = df.at[row, 'x_gk'] - df.at[row, 'x_gk_bisector']
    y = df.at[row, 'y_gk'] - df.at[row, 'y_gk_bisector']
    color = outcome_colors.get(df.at[row, 'outcome'], 'orange')
    plt.scatter(x, y, color = color)

Plot the distance from the goalkeeper to the goal center on x axis and the perpendicular distance on y axis:

In [None]:
# Create coordinate system
coordinates(24, 12)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Marker color per outcome; every outcome not listed here is drawn in orange
outcome_colors = {'Goal': 'red', 'Saved': 'green'}

# Plot one point per shot
for row in range(len(df)):

    # x axis: 'distance_gk_bisector', y axis: 'distance_gk_goalline'
    x = df.at[row, 'distance_gk_bisector']
    y = df.at[row, 'distance_gk_goalline']
    color = outcome_colors.get(df.at[row, 'outcome'], 'orange')
    plt.scatter(x, y, color = color)

Separately check average deviations from TOGKP in x and y direction:

In [None]:
# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Goalkeeper position relative to centered TOGKP, one value per shot.
# Offsets, totals and averages are all taken from the same dataframe (df).
x_offsets = (df['x_gk'] - df['x_gk_bisector']).astype(float)
y_offsets = (df['y_gk'] - df['y_gk_bisector']).astype(float)

# Total deviation
x_tot = x_offsets.sum()
y_tot = y_offsets.sum()
# Absolute total deviation
x_tot_abs = x_offsets.abs().sum()
y_tot_abs = y_offsets.abs().sum()

# Average deviation (NaN when df has no rows)
x_dev = round(x_offsets.mean(), 2)
y_dev = round(y_offsets.mean(), 2)
x_dev_abs = round(x_offsets.abs().mean(), 2)
y_dev_abs = round(y_offsets.abs().mean(), 2)

# Display results
print('Deviation in x direction:', x_dev)
print('Deviation in y direction:', y_dev)
print('Absolute deviation in x direction:', x_dev_abs)
print('Absolute deviation in y direction:', y_dev_abs)

### ML-Algorithm to predict Goal Probability

In [None]:
# Initialize settings
plt.rc("font", size=14)
# A single seaborn call: each sns.set() call resets the theme, so an earlier
# style="white" call would have no lasting effect
sns.set(style="whitegrid", color_codes=True)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Select all features
# df = df[['outcome', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'x_gk_bisector', 'y_gk_bisector', 'shooting_angle', 'goal_angle', 'distance_shot_goalcenter', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector']]

# Select reduced features (copied so the encoding below writes to an independent dataframe)
df = df[['outcome', 'shooting_angle', 'goal_angle', 'distance_shot_goalcenter', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector']].copy()

# Select reduced features without bisector features
# df = df[['outcome', 'shooting_angle', 'goal_angle', 'distance_shot_goalcenter']]

# Select coordinate features
# df = df[['outcome', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'x_gk_bisector', 'y_gk_bisector']]

# Select coordinate features without bisector features
# df = df[['outcome', 'x_shot', 'y_shot', 'x_gk', 'y_gk']]

# Select experimental features
# df = df[['outcome', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector']]


# Encode outcome to binary: 1 for 'Goal', 0 for every other outcome
df['outcome'] = (df['outcome'] == 'Goal').astype('int')

# Display dataframe
df.head(1)

In [None]:
# Import package
from sklearn.metrics import confusion_matrix

# Initialize input vector and label (label as a 1-d Series, the shape fit() expects)
X = df.drop(columns='outcome')
y = df['outcome']

# Implement model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Save predictions
y_pred = logreg.predict(X_test)

# Calculate confusion matrix and values.
# The result is stored as `cm` so the imported confusion_matrix function stays callable;
# labels=[0, 1] keeps the matrix 2x2 even if one class is missing from the test data.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
accuracy =  (TP+TN) /(TP+FP+TN+FN)

# Display results
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
print('')
print('Number of correct predictions:', cm[0][0] + cm[1][1])
print('Number of incorrect predictions:', cm[0][1] + cm[1][0])
print('')
print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))
print('')
print(classification_report(y_test, y_pred))

In [None]:
# Predicted probability of the positive class (goal) for every test shot
y_score = logreg.predict_proba(X_test)[:,1]

# The area in the legend and the plotted curve are both computed from the
# same probability scores, so the label describes the curve that is drawn
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Plot the ROC curve together with the diagonal of a random classifier
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Show class imbalance: share of shots encoded as 0 (no goal) and as 1 (goal)
is_no_goal = df['outcome'] == 0
is_goal = df['outcome'] == 1
count_no_goal = int(is_no_goal.sum())
count_goal = int(is_goal.sum())
count_total = count_no_goal + count_goal
pct_of_no_goal = count_no_goal/count_total
pct_of_goal = count_goal/count_total
print("percentage of no goal is", pct_of_no_goal*100)
print("percentage of goal", pct_of_goal*100)

### ML-Algorithm to predict Goalkeeper Position

In [None]:
# Display the current dataframe
df

In [None]:
# Define dataframe
df = filter_1v1s(df_final)

# Shots with y_shot below 40 are mirrored (y -> 80 - y) for both the shot and
# the goalkeeper; all other rows keep their coordinates
mirror_mask = df['y_shot'] < 40

# Build the mirrored values before any column is overwritten
ls_mirror = mirror_mask.tolist()
ls_y_shot_mirror = df['y_shot'].where(~mirror_mask, 80 - df['y_shot']).tolist()
ls_y_gk_mirror = df['y_gk'].where(~mirror_mask, 80 - df['y_gk']).tolist()

# Add lists to dataframe
df['y_shot'] = ls_y_shot_mirror
df['y_gk'] = ls_y_gk_mirror
df['mirror'] = ls_mirror

In [None]:
# Display the current dataframe
df

In [None]:
# Candidate features: 'x_shot', 'y_shot', 'shooting_angle', 'bisector_slope', 'goal_angle',
# 'distance_shot_goalcenter', 'x_gk_avg', 'y_gk_avg'
# Alternative feature set: ['x_shot', 'y_shot']
feature_columns = ['shooting_angle', 'goal_angle', 'distance_shot_goalcenter']
# Target: the goalkeeper position
target_columns = ['x_gk', 'y_gk']

X = df[feature_columns]
y = df[target_columns]

# 70% of the rows for training, 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Total number of test rows
n_test = len(y_test)

In [None]:
# Define model
model = RandomForestRegressor(random_state=1)
# Fit model
model.fit(X_train, y_train)
# Test model
y_pred = model.predict(X_test)

# Prediction error per test row; columns are (x_gk, y_gk), matching the target order
errors = y_pred - y_test[['x_gk', 'y_gk']].to_numpy()

# Mean absolute error per coordinate
x_mae, y_mae = np.abs(errors).mean(axis=0)
# Mean Euclidean distance between predicted and actual position
# (averaged over the unrounded distances)
d_mae = np.hypot(errors[:, 0], errors[:, 1]).mean()

x_mae, y_mae, d_mae

In [None]:
# Define model
model = LinearRegression()
# Fit model
model.fit(X_train, y_train)
# Test model
y_pred = model.predict(X_test)

# Prediction error per test row; columns are (x_gk, y_gk), matching the target order
errors = y_pred - y_test[['x_gk', 'y_gk']].to_numpy()

# Mean absolute error per coordinate
x_mae, y_mae = np.abs(errors).mean(axis=0)
# Mean Euclidean distance between predicted and actual position
# (averaged over the unrounded distances)
d_mae = np.hypot(errors[:, 0], errors[:, 1]).mean()

x_mae, y_mae, d_mae

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Define model
model = KNeighborsRegressor()
# Fit model
model.fit(X_train, y_train)
# Test model
y_pred = model.predict(X_test)

# Prediction error per test row; columns are (x_gk, y_gk), matching the target order
errors = y_pred - y_test[['x_gk', 'y_gk']].to_numpy()

# Mean absolute error per coordinate
x_mae, y_mae = np.abs(errors).mean(axis=0)
# Mean Euclidean distance between predicted and actual position
# (averaged over the unrounded distances)
d_mae = np.hypot(errors[:, 0], errors[:, 1]).mean()

x_mae, y_mae, d_mae

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Define model (seeded so that repeated runs give the same tree)
model = DecisionTreeRegressor(random_state=1)
# Fit model
model.fit(X_train, y_train)
# Test model
y_pred = model.predict(X_test)

# Prediction error per test row; columns are (x_gk, y_gk), matching the target order
errors = y_pred - y_test[['x_gk', 'y_gk']].to_numpy()

# Mean absolute error per coordinate
x_mae, y_mae = np.abs(errors).mean(axis=0)
# Mean Euclidean distance between predicted and actual position
# (averaged over the unrounded distances)
d_mae = np.hypot(errors[:, 0], errors[:, 1]).mean()

x_mae, y_mae, d_mae

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# Define model (seeded, like the splitter below, so repeated runs give the same scores)
model = DecisionTreeRegressor(random_state=1)
# Define evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate model and collect scores
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# Force scores to be positive
n_scores = np.abs(n_scores)
# Summarize performance; numpy is used through the np namespace so that the
# notebook-level name `mean` is not rebound by this cell
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

### Dataset Exploration

Display exemplary json file as dataframe:

In [None]:
# Create dataframe from the json file that `filepath` currently points to
df_test = pd.read_json(filepath)

# Display dataframe
df_test

Display column names of the dataframe:

In [None]:
# Display the first three column names
df_test.columns.to_list()[:3]

Calculate total number of games and average number of events per game:

In [None]:
# Initialize variables
filecount = 0
eventcount = 0

# Iterate over all files in directory
for filename in os.listdir(directory):

    # Skip everything that is not a json file (also names without any dot)
    if not filename.endswith('.json'):
        continue

    # Create dataframe from current json file; the path is kept in its own
    # variable so the configured `filepath` is not overwritten
    event_filepath = os.path.join(directory, filename)
    df_file = pd.read_json(event_filepath)
    # Update counters
    eventcount = eventcount + len(df_file)
    filecount = filecount + 1

# Calculate average number of events per game (NaN if no json file was found)
avg_events = eventcount / filecount if filecount else float('nan')

# Display result
print("Number of games:", filecount)
print("Average number of events per game:", avg_events)

Calculate total number of games with a goal difference of zero or one:

In [None]:
# Create dataframe
df_matches = pd.read_json(matchpath)

# Absolute goal difference per match
score_gap = df_matches['home_score'] - df_matches['away_score']
df_matches['goal_difference'] = score_gap.abs()

# Number of matches with a goal difference of at most one
(df_matches['goal_difference'] <= 1).sum()

Reason for setting ratio_shooter_perp values to 4030 in case perp = 0:

In [None]:
# Display the rows whose ratio exceeds 4030: where() blanks all other rows and
# dropna() then removes them (together with any row that contains a NaN)
df_1v1.where(df_1v1['ratio_shotgk_gkbisector'] > 4030).dropna()

### Experimental

Check which goalkeeper reach value results in smallest distance between GKP and TOGKP:
- Final goalkeeper reach: 3.5
- Final mean distance: 1.7

In [None]:
"""
goalkeeper_reach = 1.8

while goalkeeper_reach < 3.8:
    df_experimental = reduce(df_all)
    print('dr:', goalkeeper_reach)
    print('d:', df_experimental['distance'].where(df_experimental['outcome'] == 'Saved').mean())
    goalkeeper_reach = goalkeeper_reach + 0.1
"""

In [None]:
"""
# Initilaize variables
goalkeeper_reach_copy = goalkeeper_reach
goalkeeper_reach = 3.1
mean_distance = 100
count = 0

# Do as long as result does not get worse three times in a row
while count < 3:
    
    # Create dataframe with current dive radius
    df_experimental = reduce(df_all)
    
    # If mean distance is smaller update parameters and reset count to zero
    if df_experimental['distance'].mean() < mean_distance:
        mean_distance = df_experimental['distance'].mean()
        result = goalkeeper_reach
        count = 0
    # If mean distance is larger add one to count
    else:
        count = count + 1
    
    # Increase dive radius
    goalkeeper_reach = goalkeeper_reach + 0.1

# Recreate original dive radius
goalkeeper_reach = goalkeeper_reach_copy

# Print results
print("Final goalkeeper_reach:", result)
print("Final mean distance:", mean_distance)
"""

### xG
- https://github.com/hadisotudeh/cfg-datascience-task/blob/main/CFG_Data_Scientist_Task_HadiSotudeh.ipynb

In [None]:
# General libraries
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import numpy as np
import itertools
import warnings
import math
import os

# Machine learning libraries
from sklearn.metrics import plot_roc_curve, roc_auc_score, brier_score_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import joblib

# Model interpretation library
from sklearn.inspection import plot_partial_dependence

# Metrica functions
import Metrica_IO as mio
import Metrica_Viz as mviz

In [None]:
# Show plots inside the jupyter notebook
%matplotlib inline

# Pandas settings to show more columns and rows in the jupyter notebook
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50000)

# Increase font size of the plots 
plt.rcParams.update({'font.size': 18})

# Don't show warnings
warnings.filterwarnings('ignore')

# Target variable to predict
dep_var = 'outcome'

# Hyper-parameter tuning variables: number of cross-validation folds,
# random seed and the metric used to rank candidates in the grid search
cv = 5
seed = 42
scoring = 'roc_auc'

In [None]:
# Create and filter dataframe (copied so the encodings below write to an independent dataframe)
df = reduce(df_all)
df = df[['polygon_players_count', 'distance_shot_goalcenter', 'shooting_angle', 'shot_body_part', 'outcome']].copy()

# Encode shot body part to binary: 1 for 'Head', 0 for every other body part
df['shot_body_part'] = (df['shot_body_part'] == 'Head').astype('int')

# Encode outcome to binary: 1 for 'Goal', 0 for every other outcome
df['outcome'] = (df['outcome'] == 'Goal').astype('int')

# Define features: every column except the target (and the position columns, if present)
features = [x for x in df.columns if x not in ["position_x","position_y",dep_var]]

# Display dataframe
df.head()

In [None]:
# Select the feature columns and the target column as arrays
X = df[features].values
y = df[dep_var].values

# Split into training and validation sets: 80% for training and 20% for validation.
# Note that `y` is rebound here and holds only the training labels afterwards.
xs, valid_xs, y, valid_y = train_test_split(X, y, test_size=0.20, random_state=seed, shuffle=True)

In [None]:
def calc_auc_roc(y, prob_pred):
    """Return the ROC AUC of the predicted probabilities against the true labels."""
    auc_roc = roc_auc_score(y, prob_pred)
    return auc_roc

In [None]:
## Hyperparameters
# One search grid per model; every key is '<pipeline step name>__<parameter name>'

lr_hyperparameters = {
    'lr__C': [0.01, 0.1, 1.0, 10, 100]
}

rf_hyperparameters = {
    'rf__n_estimators': np.arange(20,100,10),
    'rf__max_features': np.arange(0.5,1.0,0.1),
    'rf__max_depth': np.arange(1,20,5)
}

xgb_hyperparameters = {
    'xgb__max_depth': np.arange(2,12,2),  # the maximum depth of each tree
    'xgb__learning_rate': [0.1,0.3],  # the training step for each iteration
    'xgb__n_estimators': np.arange(1,80,10)
}

lgbm_hyperparameters = {
    'lgbm__n_estimators': np.arange(10,140,20),
    'lgbm__min_data_in_leaf': np.arange(100,1000,100),
    'lgbm__max_depth': np.arange(2,10,2)
}

# Search grids by pipeline key (the baseline 'bl' has no grid)
hyperparameters = {
    'lr': lr_hyperparameters,
    'rf': rf_hyperparameters,
    'lgbm': lgbm_hyperparameters,
    'xgb': xgb_hyperparameters
}

# One single-step pipeline per classifier, keyed like the search grids above
pipelines = {
    'bl': Pipeline([('bl', DummyClassifier(strategy='most_frequent'))]), # base line
    'lr': Pipeline([('lr',LogisticRegression(random_state=seed, n_jobs=-1, penalty='l2'))]),
    'rf': Pipeline([('rf', RandomForestClassifier(random_state=seed, n_jobs=-1, oob_score=True))]),
    'xgb': Pipeline([('xgb', XGBClassifier(random_state=seed, n_jobs=-1))]),
    'lgbm': Pipeline([('lgbm', LGBMClassifier(random_state=seed, n_jobs=-1))])
}

In [None]:
# Train every pipeline, score it and draw its calibration curve

results = []
model_names = {
    "bl": "Baseline",
    "lr": "Logistic Regression",
    "rf": "Random Forest",
    "xgb": "XGBoost",
    "lgbm": "Light Gradient Boosting",
}

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

for key, pipeline in tqdm(pipelines.items()):
    # The baseline is fitted as is; every other pipeline is tuned by grid search
    if key == 'bl':
        model = pipeline
    else:
        model = GridSearchCV(pipeline, hyperparameters[key], cv=cv, scoring=scoring, n_jobs=-1)
    model.fit(xs, y)

    # After a grid search, take the classifier step out of the best pipeline
    best = model.best_estimator_.named_steps[key] if hasattr(model, 'best_estimator_') else model

    # Predicted probability of the positive class on both data sets
    train_prob_pred = best.predict_proba(xs)[:, 1]
    validation_prob_pred = best.predict_proba(valid_xs)[:, 1]

    fraction_of_positives, mean_predicted_value = calibration_curve(valid_y, validation_prob_pred, n_bins=10)

    # Collect the scores of this model
    result = {
        "model": model_names[key],
        "training (auc roc)": calc_auc_roc(y, train_prob_pred),
        "validation (auc roc)": calc_auc_roc(valid_y, validation_prob_pred),
        "Brier score": brier_score_loss(valid_y, validation_prob_pred),
    }
    results.append(result)

    # The baseline only contributes its scores: no calibration curve, no saved model
    if key != 'bl':
        # Plot the calibration plot
        ax.plot(mean_predicted_value, fraction_of_positives, "s-", label="%s" % (model_names[key], ))

        # Save the model
        joblib.dump(best, f'{model_names[key]}.joblib')

results_df = pd.DataFrame(results).round(3)
display(results_df)

ax.set_xlabel("Mean predicted value")
ax.set_ylabel("Fraction of positives")
ax.set_ylim([-0.05, 1.05])
ax.legend(loc="lower right")
ax.set_title('Calibration plot  (reliability curve)')

plt.tight_layout()
plt.savefig('calibartion_plot.png', bbox_inches='tight')
plt.show()

In [None]:
# Load the saved model chosen for the remaining analysis; the file is expected
# in the working directory under the name '<best_model_name>.joblib'
best_model_name = "Logistic Regression"
selected_model = joblib.load(f"{best_model_name}.joblib")
print(f"selected model is {best_model_name}.\n")
print("Its parameters are:")
selected_model.get_params()

In [None]:
# ROC curve of the selected model on the validation set
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it
fig, ax =  plt.subplots(figsize=(7, 7))
ax.set_title('AUC ROC Curve of the Logistic Regression Model')
plot_roc_curve(selected_model, valid_xs, valid_y, ax=ax);

In [None]:
# Intercept of the selected model, rounded to two decimals
intercept = round(selected_model.intercept_[0], 2)
print(f"The model intercept is {intercept}\n")

# One rounded coefficient per feature, shown from largest to smallest
coefficients = [round(coef, 2) for coef in selected_model.coef_[0]]
print("The model coefficients are:")
coef_table = pd.DataFrame({'coef': coefficients}, index=features)
coef_table.sort_values(by='coef', ascending=False)

In [None]:
# Features to inspect one at a time
explore_cols = ['polygon_players_count', 'distance_shot_goalcenter', 'shooting_angle']

# Validation features as a dataframe so columns can be addressed by name
valid_xs_df = pd.DataFrame(valid_xs, columns = features)

# One partial dependence plot per feature, each in its own figure
for col in explore_cols:
    fig, ax = plt.subplots(figsize=(12, 4))
    plot_partial_dependence(selected_model, valid_xs_df, [col], grid_resolution=20, ax=ax);

In [None]:
# Feature pairs to inspect jointly
paired_features = [
    ("shooting_angle", "distance_shot_goalcenter"),
    ("shooting_angle", "polygon_players_count"),
    ("distance_shot_goalcenter", "polygon_players_count"),
]

# One two-feature partial dependence plot per pair, each in its own figure
for pair in paired_features:
    fig, ax = plt.subplots(figsize=(8, 8))
    plot_partial_dependence(selected_model, valid_xs_df, [pair], grid_resolution=20, ax=ax);

In [None]:
# Calculate xG value
def calc_xG(instance):
    '''Predict the goal-scoring probability of a shot by applying the trained model.

    Args:
        instance: One shot (e.g. a dataframe row) holding all columns listed in `features`.

    Returns:
        Probability of the positive class from `selected_model`, rounded to two decimals.
    '''
    x = instance[features].values.reshape(1, -1)
    return round(selected_model.predict_proba(x)[:, 1][0], 2)

# Score all shots with a single batch prediction instead of one model call per row
df['xG'] = np.round(selected_model.predict_proba(df[features].values)[:, 1], 2)

# Display dataframe
df

In [None]:
# Binning xG

def calc_xg_quality(xg):
    """Return the quality label of an xG value.

    Below 0.07 is "poor", below 0.15 "fair", below 0.30 "good" and everything
    from 0.30 upwards "very good". A value for which no comparison holds
    (such as NaN) yields None.
    """
    # Exclusive upper bound of each bin, in ascending order
    upper_bounds = ((0.07, "poor"), (0.15, "fair"), (0.30, "good"))
    for bound, label in upper_bounds:
        if xg < bound:
            return label
    return "very good" if xg >= 0.30 else None

# Label every shot with the quality bin of its xG value
xg_quality = df["xG"].apply(calc_xg_quality)
df["xG_quality"] = xg_quality
df

In [None]:
# Predict a goal (1) when xG is at least 0.5, otherwise no goal (0)
df['pred'] = (df['xG'] >= 0.5).astype(int)
df

In [None]:
# Display the shots ordered by ascending xG (df itself is not modified)
df.sort_values(by=["xG"])

In [None]:
# Import package
from sklearn.metrics import confusion_matrix

# Actual outcomes and thresholded predictions for every row of df
y_test = df['outcome'].to_numpy()
y_pred = df['pred'].to_numpy()

# Calculate confusion matrix and values.
# The result is stored as `cm` so the imported confusion_matrix function stays callable;
# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
accuracy =  (TP+TN) /(TP+FP+TN+FN)

# Display results
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
print('')
print('Number of correct predictions:', cm[0][0] + cm[1][1])
print('Number of incorrect predictions:', cm[0][1] + cm[1][0])
print('')
print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))
print('')
print(classification_report(y_test, y_pred))