### Import

In [None]:
# Import standard packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os
from numpy.linalg import norm
from statistics import mean

# Import sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn import preprocessing
from sklearn import metrics

# Import specific packages
from mplsoccer import Pitch
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

### Configuration

In [None]:
# Setting to display all columns
pd.set_option("display.max_columns", None)

In [None]:
# Directory with all event files
directory = '/Users/gian-andrea/Documents/Masterarbeit (offline)/statsbomb360-euro2020/events/'

# Filepath to one event file
filepath = directory + '3788745.json'

# Filepath to match file
matchpath = '/Users/gian-andrea/Documents/Masterarbeit (offline)/statsbomb360-euro2020/matches.json'

# Goalkeeper reach
goalkeeper_reach = 3.5

### Methods

In [None]:
# Create one dataframe including data from all json files
def create_dataframe(directory):
    """Stack the shot-related columns of every event file in a directory.

    Parameters
    ----------
    directory : str
        Folder containing the event json files.

    Returns
    -------
    pandas.DataFrame
        One row per event with the columns 'shot', 'possession_team' and
        'location'. A column that is absent from a file is filled with NaN.
    """
    columns = ['shot', 'possession_team', 'location']
    collected = {column: [] for column in columns}

    # Sort the file names so the row order is reproducible across machines
    for filename in sorted(os.listdir(directory)):

        # Skip everything that is not a json file (also names without any dot)
        if not filename.endswith('.json'):
            continue

        # Transform current json file to dataframe; reindex adds a missing column as NaN
        df_file = pd.read_json(os.path.join(directory, filename)).reindex(columns=columns)
        # Transform columns of current dataframe to lists and add them to existing lists
        for column in columns:
            collected[column].extend(df_file[column].tolist())

    # Create dataframe from complete lists
    return pd.DataFrame(collected, columns=columns)

# Apply method and display dataframe
df_all = create_dataframe(directory)
print('Number of rows:', len(df_all))
df_all.head(1)

In [None]:
# Create dataframe including all undeflected shots from open plays
def filter_shots(df):
    """Keep only undeflected open-play shots and split their location.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe with the columns 'shot', 'possession_team' and
        'location' (as produced by create_dataframe).

    Returns
    -------
    pandas.DataFrame
        Rows whose shot type is 'Open Play' and whose shot dict has no
        'deflected' key, with 'location' replaced by 'x_shot' and 'y_shot'
        and a fresh 0..n-1 index. An empty result keeps these columns.
    """
    # Filter input dataframe for shots (rows with any missing value are not complete shot events)
    df = df.dropna().reset_index(drop = True)

    # Mark undeflected open play shots; deflected or non open play shots are removed
    keep = pd.Series(
        [shot['type']['name'] == 'Open Play' and 'deflected' not in shot for shot in df['shot']],
        index = df.index,
        dtype = bool,
    )
    df = df[keep].copy()

    # Split shot location into shot coordinates (works for an empty dataframe as well)
    df['x_shot'] = [location[0] for location in df['location']]
    df['y_shot'] = [location[1] for location in df['location']]
    # Delete shot location column
    del df['location']

    # Return dataframe
    return df.reset_index(drop = True)

# Apply method and display dataframe
df_shot = filter_shots(df_all)
print('Number of rows:', len(df_shot))
df_shot

In [None]:
# Create shot dataframe including additional data
def complete_shots(df):
    """Flatten filtered shots into one row per shot with freeze-frame data.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through filter_shots first.

    Returns
    -------
    pandas.DataFrame
        One row per shot that has a 'freeze_frame' containing an opposing
        goalkeeper. Columns: outcome, statsbomb_xg, possession_team,
        shot_body_part, shot_technique, goalkeeper, x_shot, y_shot, x_gk,
        y_gk, player_locations. 'player_locations' is the list of all other
        freeze-frame locations, or '' when there are none.
    """
    columns = ['outcome', 'statsbomb_xg', 'possession_team', 'shot_body_part', 'shot_technique',
               'goalkeeper', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'player_locations']

    # Create shot dataframe
    df = filter_shots(df)
    records = []

    # Iterate over all shots
    for shot, team, x_shot, y_shot in zip(df['shot'], df['possession_team'], df['x_shot'], df['y_shot']):

        # Skip shots without freeze_frame information
        if 'freeze_frame' not in shot:
            continue

        goalkeeper = None
        other_locations = []
        # Iterate over all players in one freeze_frame
        for player in shot['freeze_frame']:
            is_opposing_gk = player['position']['name'] == 'Goalkeeper' and not player['teammate']
            # Only the first opposing goalkeeper is used; any further one counts as a regular player
            if is_opposing_gk and goalkeeper is None:
                goalkeeper = player
            else:
                other_locations.append(player['location'])

        # Skip shots without an opposing goalkeeper so every column keeps one value per shot
        if goalkeeper is None:
            continue

        records.append({
            'outcome': shot['outcome']['name'],
            'statsbomb_xg': round(shot['statsbomb_xg'], 4),
            'possession_team': team['name'],
            'shot_body_part': shot['body_part']['name'],
            'shot_technique': shot['technique']['name'],
            'goalkeeper': goalkeeper['player']['name'],
            'x_shot': x_shot,
            'y_shot': y_shot,
            'x_gk': goalkeeper['location'][0],
            'y_gk': goalkeeper['location'][1],
            # Remaining player locations list or empty string
            'player_locations': other_locations if other_locations else '',
        })

    # Return dataframe
    return pd.DataFrame(records, columns = columns)

# Apply method and display dataframe
df_complete_shots = complete_shots(df_all)
print('Number of rows:', len(df_complete_shots))
df_complete_shots.head(1)

In [None]:
# Create complete shot dataframe with additional data on players in shot polygon
def polygon_players(df):
    """Add the players standing inside the shot triangle to every shot.

    The triangle is spanned by the shot location and the two goalposts
    (120, 36) and (120, 44).

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through complete_shots first.

    Returns
    -------
    pandas.DataFrame
        The complete shot dataframe without shots that have no other player
        locations, extended by 'polygon_players' (list of locations inside
        the triangle, or '' when there are none) and 'polygon_players_count'.
    """
    # Create complete shot dataframe and exclude shots without coordinates from other players.
    # The mask is built from this very dataframe, not from a module-level copy.
    shots = complete_shots(df)
    df = shots.where(shots['player_locations'] != '').dropna().reset_index(drop = True)

    ls_polygon_players = []
    ls_polygon_players_count = []

    # Iterate over all shots
    for x_shot, y_shot, locations in zip(df['x_shot'], df['y_shot'], df['player_locations']):

        # The polygon depends on the shot only, so build it once per shot
        polygon = Polygon([(x_shot, y_shot), (120, 36), (120, 44)])
        # Keep the players whose location lies inside the polygon
        inside = [location for location in locations if polygon.contains(Point(location[0], location[1]))]

        # Add nested polygon players list or empty string to list
        ls_polygon_players.append(inside if inside else '')
        ls_polygon_players_count.append(len(inside))

    # Add lists to dataframe
    df['polygon_players'] = ls_polygon_players
    df['polygon_players_count'] = ls_polygon_players_count

    # Return dataframe
    return df

# Apply method and display dataframe
df_polygon_players = polygon_players(df_all)
print('Number of rows:', len(df_polygon_players))
df_polygon_players.head(2)

In [None]:
# Create complete shot dataframe including shooting angle
def shooting_angle(df):
    """Add the angle under which the shooter sees the goal.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through polygon_players first.

    Returns
    -------
    pandas.DataFrame
        The polygon players dataframe extended by 'shooting_angle', the angle
        in degrees (rounded to 4 decimals) between the lines from the shot
        location to the goalposts (120, 36) and (120, 44).
    """
    # Create polygon players dataframe
    df = polygon_players(df)
    # Create empty shooting angle list
    ls_shooting_angle = []

    # Iterate over all shots
    for x_shot, y_shot in zip(df['x_shot'], df['y_shot']):

        # atan2 equals atan(dy/dx) away from the goal line but stays defined for a
        # shot taken exactly on it (dx == 0), where the slope would be a division by zero
        dx = abs(120 - x_shot)
        angle_lower_post = math.atan2(36 - y_shot, dx)
        angle_upper_post = math.atan2(44 - y_shot, dx)

        # Add shooting angle values to list
        ls_shooting_angle.append(round(math.degrees(abs(angle_lower_post - angle_upper_post)), 4))

    # Add shooting angle list to dataframe
    df['shooting_angle'] = ls_shooting_angle

    # Return dataframe
    return df

# Apply method and display dataframe
df_shooting_angle = shooting_angle(df_all)
print('Number of rows:', len(df_shooting_angle))
df_shooting_angle.head(1)

In [None]:
# Create complete shot dataframe including bisector slope
def bisector_slope(df):
    """Add the slope of the bisector of the shooting angle to every shot.

    The goalpost (120, 36) is rotated counterclockwise around the shot
    location by half the shooting angle; the slope of the line from the shot
    location to the rotated point is stored, rounded to 4 decimals, in the
    new column 'bisector_slope'.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through shooting_angle first.

    Returns
    -------
    pandas.DataFrame
        The shooting angle dataframe extended by 'bisector_slope'.
    """
    # Create shooting angle dataframe
    df = shooting_angle(df)
    # Goalpost that gets rotated onto the bisector
    post_x, post_y = 120, 36

    def slope_of_row(idx):
        # Origin of the rotation = shot coordinates
        origin_x = df.at[idx, 'x_shot']
        origin_y = df.at[idx, 'y_shot']
        # Half shooting angle in radians
        half_angle = math.radians(df.at[idx, 'shooting_angle'])/2
        cos_a = math.cos(half_angle)
        sin_a = math.sin(half_angle)

        # Rotate goalpost counterclockwise by half the shooting angle around the shot location
        rotated_x = origin_x + cos_a * (post_x - origin_x) - sin_a * (post_y - origin_y)
        rotated_y = origin_y + sin_a * (post_x - origin_x) + cos_a * (post_y - origin_y)

        # Slope of the line from the shot location to the rotated point
        return round((rotated_y-origin_y)/(rotated_x-origin_x), 4)

    # Add bisector slope values to dataframe
    df['bisector_slope'] = [slope_of_row(idx) for idx in range(0, len(df))]

    # Return dataframe
    return df

# Apply method and display dataframe
df_bisector_slope = bisector_slope(df_all)
print('Number of rows:', len(df_bisector_slope))
df_bisector_slope.head(1)

In [None]:
# Create complete shot dataframe including TOGKP
def togkp(df):
    """Add the TOGKP coordinates 'x_togkp' and 'y_togkp' to every shot.

    For every shot the candidate position is the foot of the perpendicular
    from the nearest goalpost onto the bisector of the shooting angle (for a
    horizontal bisector: the point on y = 40 whose distance to the shot is
    goalkeeper_reach / tan(half shooting angle)). If the candidate is further
    than goalkeeper_reach away from the nearest goalpost, the position is
    moved along the bisector to that same distance from the shot.
    The module-level goalkeeper_reach is read at call time.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through bisector_slope first.

    Returns
    -------
    pandas.DataFrame
        The bisector slope dataframe extended by 'x_togkp' and 'y_togkp',
        both rounded to 2 decimals.
    """
    # Create bisector slope dataframe
    df = bisector_slope(df)
    # Create empty lists
    ls_x_togkp = []
    ls_y_togkp = []

    # Iterate over all rows in dataframe
    for j in range(0, len(df)):

        # Define linear equation of bisector
        x_shot = df.at[j, 'x_shot']
        y_shot = df.at[j, 'y_shot']
        m_bisector = df.at[j, 'bisector_slope']
        q_bisector = y_shot - m_bisector*x_shot
        # Define half shooting angle
        ang = math.radians(df.at[j, 'shooting_angle'])/2

        # Nearest goalpost; determined for every shot so the reach check below never
        # falls back on the goalpost of a previous row
        d1 = math.dist([x_shot, y_shot], [120, 36])
        d2 = math.dist([x_shot, y_shot], [120, 44])
        x_post = 120
        y_post = 36 if d1 < d2 else 44

        # If shot is central
        if m_bisector == 0:
            x_shift = (math.cos(ang)*goalkeeper_reach)/math.sin(ang)
            x_togkp = x_shot + x_shift
            y_togkp = 40
        # If shot is not central
        else:
            # Define linear equation perpendicular to bisector through the nearest goalpost
            m_post = -(1/m_bisector)
            q_post = y_post - m_post*x_post

            # Calculate intersection of two linear equations
            y_togkp = (q_bisector - q_post*m_bisector/m_post)/(1-(m_bisector/m_post))
            x_togkp = (y_togkp-q_post)/m_post

        # If covered length is not enough
        if math.dist([x_post, y_post], [x_togkp, y_togkp]) > goalkeeper_reach:

            # Distance from shot location to the location where goalkeeper reach covers the angle
            reach_distance = (math.cos(ang)*goalkeeper_reach)/math.sin(ang)
            # Shift from shot location to optimal location along the bisector
            x_shift = reach_distance/(math.sqrt(1+m_bisector*m_bisector))
            y_shift = x_shift*m_bisector
            # Calculate optimal location
            x_togkp = x_shot + x_shift
            y_togkp = y_shot + y_shift

        # Add value to list
        ls_x_togkp.append(round(x_togkp, 2))
        ls_y_togkp.append(round(y_togkp, 2))

    # Add list to dataframe
    df['x_togkp'] = ls_x_togkp
    df['y_togkp'] = ls_y_togkp

    # Return dataframe
    return df

# Apply method and display dataframe
df_togkp = togkp(df_all)
print('Number of rows:', len(df_togkp))
df_togkp.head(1)

In [None]:
# Create complete shot dataframe including distance between GKP and TOGKP
def distance(df):
    """Add the distance between goalkeeper position and TOGKP to every shot.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through togkp first.

    Returns
    -------
    pandas.DataFrame
        The TOGKP dataframe extended by 'distance', the Euclidean distance
        between (x_togkp, y_togkp) and (x_gk, y_gk) rounded to 2 decimals.
    """
    # Create TOGKP dataframe
    df = togkp(df)

    # Pair up TOGKP and goalkeeper coordinates shot by shot
    togkp_points = zip(df['x_togkp'], df['y_togkp'])
    gk_points = zip(df['x_gk'], df['y_gk'])

    # Add distance values to dataframe
    df['distance'] = [
        round(math.dist([tx, ty], [gx, gy]), 2)
        for (tx, ty), (gx, gy) in zip(togkp_points, gk_points)
    ]

    # Return dataframe
    return df

# Apply method and display dataframe
df_distance = distance(df_all)
print('Number of rows:', len(df_distance))
df_distance.head(1)

In [None]:
# Create complete shot dataframe including average goalkeeper position
def gk_avg(df, n_similar = 20):
    """Add the mean goalkeeper position of the most similar shots to every shot.

    Similarity is the Euclidean distance between shot locations; the shot
    itself (distance 0) is part of its own neighbourhood.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through distance first.
    n_similar : int, optional
        Number of nearest shots that are averaged (default 20). When fewer
        shots exist, all of them are averaged.

    Returns
    -------
    pandas.DataFrame
        The distance dataframe extended by 'x_gk_avg' and 'y_gk_avg', both
        rounded to 2 decimals.

    Raises
    ------
    ValueError
        If n_similar is smaller than 1.
    """
    if n_similar < 1:
        raise ValueError('n_similar must be at least 1')

    # Create distance dataframe
    df = distance(df)

    # Read the columns once instead of doing a dataframe lookup per pair of shots
    shot_xs = df['x_shot'].tolist()
    shot_ys = df['y_shot'].tolist()
    gk_xs = df['x_gk'].tolist()
    gk_ys = df['y_gk'].tolist()

    # Create empty lists
    ls_x_gk_avg = []
    ls_y_gk_avg = []

    # Iterate over all shots
    for x_shot, y_shot in zip(shot_xs, shot_ys):

        # Distance from current shot to every shot in the dataframe
        ls_distance = [math.dist([x_shot, y_shot], [x_other, y_other]) for x_other, y_other in zip(shot_xs, shot_ys)]

        # Sort indices by ascending distance values and keep the n_similar smallest
        nearest = np.argsort(ls_distance).tolist()[:n_similar]

        # Divide by the number of shots actually found, which is below n_similar for small dataframes
        ls_x_gk_avg.append(round(sum(gk_xs[i] for i in nearest)/len(nearest), 2))
        ls_y_gk_avg.append(round(sum(gk_ys[i] for i in nearest)/len(nearest), 2))

    # Add lists to dataframe
    df['x_gk_avg'] = ls_x_gk_avg
    df['y_gk_avg'] = ls_y_gk_avg

    # Return dataframe
    return df

# Apply method and display dataframe
df_gk_avg = gk_avg(df_all)
print('Number of rows:', len(df_gk_avg))
df_gk_avg.head()

In [None]:
# Perpendicular distance from a point to the line through the shot location and the TOGKP
def _distance_to_bisector(shot_location, togkp_location, point):
    direction = shot_location - togkp_location
    offset = togkp_location - point
    # Explicit 2-D cross product; np.cross on 2-D vectors is deprecated since NumPy 2.0
    cross = direction[0]*offset[1] - direction[1]*offset[0]
    return abs(cross / norm(direction))


# Create complete shot dataframe including dimensionality reduced data
def reduce(df):
    """Add the derived (dimensionality reduced) features to every shot.

    Added columns: goal_angle, distance_shot_goalcenter, distance_gk_goalline,
    distance_gk_bisector, angular_deviation, ratio_shotgk_gkbisector and
    distance_gkavg_bisector. Angles are in degrees; all values except
    distance_gk_goalline are rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Event dataframe; it is passed through gk_avg first.

    Returns
    -------
    pandas.DataFrame
        The goalkeeper average dataframe with the new features, restricted to
        a fixed column order (the 'statsbomb_xg' column is not part of it).
    """
    # Value used for the ratio when the goalkeeper stands exactly on the bisector (division by zero)
    ratio_on_bisector = 4030

    # Create goalkeeper average dataframe
    df = gk_avg(df)
    # Create empty lists
    ls_goal_angle = []
    ls_distance_shot_goalcenter = []
    ls_distance_gk_goalline = []
    ls_distance_gk_bisector = []
    ls_angular_deviation = []
    ls_ratio_shotgk_gkbisector = []
    ls_distance_gkavg_bisector = []

    # Iterate over all rows in dataframe
    for row in range(0, len(df)):

        # Define variables
        x_shot = df.at[row, 'x_shot']
        y_shot = df.at[row, 'y_shot']
        x_gk = df.at[row, 'x_gk']
        y_gk = df.at[row, 'y_gk']
        shot_location = np.asarray((x_shot, y_shot))
        gk_location = np.asarray((x_gk, y_gk))
        gk_avg_location = np.asarray((df.at[row, 'x_gk_avg'], df.at[row, 'y_gk_avg']))
        togkp_location = np.asarray((df.at[row, 'x_togkp'], df.at[row, 'y_togkp']))
        m_shot_goalcenter = (40-y_shot)/(120-x_shot)
        m_shot_gk = (y_gk-y_shot)/(x_gk-x_shot)
        m_bisector = df.at[row, 'bisector_slope']

        # Calculate values
        distance_gk_bisector = round(_distance_to_bisector(shot_location, togkp_location, gk_location), 2)
        if distance_gk_bisector != 0:
            ratio_shotgk_gkbisector = round(math.dist([x_shot, y_shot], [x_gk, y_gk]) / distance_gk_bisector, 2)
        else:
            ratio_shotgk_gkbisector = ratio_on_bisector

        # Add values to lists
        ls_goal_angle.append(round(math.degrees(abs(math.atan(m_shot_goalcenter))), 2))
        ls_distance_shot_goalcenter.append(round(math.dist([x_shot, y_shot], [120, 40]), 2))
        ls_distance_gk_goalline.append(120 - x_gk)
        ls_distance_gk_bisector.append(distance_gk_bisector)
        ls_angular_deviation.append(round(math.degrees(abs(math.atan(m_shot_gk) - math.atan(m_bisector))), 2))
        ls_ratio_shotgk_gkbisector.append(ratio_shotgk_gkbisector)
        ls_distance_gkavg_bisector.append(round(_distance_to_bisector(shot_location, togkp_location, gk_avg_location), 2))

    # Add lists to dataframe
    df['goal_angle'] = ls_goal_angle
    df['distance_shot_goalcenter'] = ls_distance_shot_goalcenter
    df['distance_gk_goalline'] = ls_distance_gk_goalline
    df['distance_gk_bisector'] = ls_distance_gk_bisector
    df['angular_deviation'] = ls_angular_deviation
    df['ratio_shotgk_gkbisector'] = ls_ratio_shotgk_gkbisector
    df['distance_gkavg_bisector'] = ls_distance_gkavg_bisector

    # Return dataframe
    return df[['outcome', 'possession_team', 'shot_body_part', 'shot_technique', 'goalkeeper', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'x_gk_avg', 'y_gk_avg', 'x_togkp', 'y_togkp', 'distance', 'player_locations', 'polygon_players', 'polygon_players_count', 'shooting_angle', 'bisector_slope', 'goal_angle', 'distance_shot_goalcenter', 'distance_gk_goalline', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector', 'distance_gkavg_bisector']]


# Apply method and display dataframe
df_reduce = reduce(df_all)
print('Number of rows:', len(df_reduce))
df_reduce.head(1)

In [None]:
# Plot one shot
def plot_shot(df, row):
    """Draw one shot on a pitch and print its key coordinates.

    Plotted are the shot location (red for a goal, blue otherwise), the
    goalkeeper (brown), the players inside the shot polygon (yellow), the
    TOGKP (orange) and the two lines from the shot location to the goalposts.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the columns x_shot, y_shot, outcome, x_gk, y_gk,
        polygon_players, x_togkp and y_togkp.
    row : int
        Index label of the shot to plot.
    """
    # Soccer pitch
    pitch = Pitch(pitch_color='grass', line_color='white', stripe=True)
    fig, ax = pitch.draw()

    # Player location: red dot for a goal and blue dot for the rest
    x_shot = df.at[row, 'x_shot']
    y_shot = df.at[row, 'y_shot']
    color = 'red' if df.at[row, 'outcome'] == 'Goal' else 'blue'
    plt.scatter(x_shot, y_shot, color = color)

    # Goalkeeper location
    x_gk = df.at[row, 'x_gk']
    y_gk = df.at[row, 'y_gk']
    plt.scatter(x_gk, y_gk, color = 'brown')

    # Other players location (an empty string means no players and draws nothing)
    for player in df.at[row, 'polygon_players']:
        plt.scatter(player[0], player[1], color = 'yellow')

    # TOGKP
    x_togkp = df.at[row, 'x_togkp']
    y_togkp = df.at[row, 'y_togkp']
    plt.scatter(x_togkp, y_togkp, color = 'orange')

    # Lines between player location and both goalposts
    for y_post in (36, 44):
        plt.plot([x_shot, 120], [y_shot, y_post], color = 'black')

    # Scale plot
    plt.axis('scaled')
    plt.xlim([-5, 125])
    plt.ylim([-5, 85])

    # Display plot
    plt.show()

    # Display shot location, goalkeeper location and TOGKP
    print('x_shot:', x_shot, 'y_shot:', y_shot)
    print('x_gk:', x_gk, 'y_gk:', y_gk)
    print('x_togkp:', x_togkp, 'y_togkp:', y_togkp)

In [None]:
# Plot all shots
def plot_all(df):
    """Draw all shot locations of a dataframe on one pitch.

    Goals are red, every other outcome is blue.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the columns x_shot, y_shot and outcome.
    """
    # Soccer pitch
    pitch = Pitch(pitch_color='grass', line_color='white', stripe=True)
    fig, ax = pitch.draw()

    # Red dots for goals and blue dots for the rest
    colors = ['red' if outcome == 'Goal' else 'blue' for outcome in df['outcome']]

    # One scatter call for all shots instead of one call per shot; independent of the index labels
    if len(df) > 0:
        plt.scatter(df['x_shot'].tolist(), df['y_shot'].tolist(), color = colors)

    # Scale plot
    plt.axis('scaled')
    plt.xlim([-5, 125])
    plt.ylim([-5, 85])

    # Display plot
    plt.show()

In [None]:
# Create coordinate system
def coordinates(x, y):
    """Create an empty Cartesian coordinate system centred on the origin.

    The figure stays the current matplotlib figure, so later plt calls draw
    into it.

    Parameters
    ----------
    x : int
        The x axis spans from -x to x.
    y : int
        The y axis spans from -y to y.
    """
    # Get inline charts. The line magic is only valid inside IPython, so it is called
    # through the shell object and skipped when the code runs as plain Python.
    try:
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass
    # Define x range, y range, and tick interval for both axes
    xmin, xmax, ymin, ymax = -x, x, -y, y
    ticks_frequency = 1
    # Create figure and axes object
    fig, ax = plt.subplots(figsize=(10, 10))
    # Set face color
    fig.patch.set_facecolor('#ffffff')
    # Apply ranges to axes
    ax.set(xlim=(xmin-1, xmax+1), ylim=(ymin-1, ymax+1), aspect='equal')
    # Set both axes to zero position
    ax.spines['bottom'].set_position('zero')
    ax.spines['left'].set_position('zero')
    # Hide the top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Set x label, y label, and add an origin label
    ax.set_xlabel('$x$', size=14, labelpad=-24, x=1.02)
    ax.set_ylabel('$y$', size=14, labelpad=-21, y=1.02, rotation=0)
    # NOTE(review): the alignment was given twice (ha='right' and horizontalalignment='center');
    # 'right' appears to be the one matplotlib applied, so it is kept - confirm the label position
    plt.text(0.49, 0.49, r"$O$", ha='right', va='top',
             transform=ax.transAxes, fontsize=14)
    # Create x tick, y tick, and apply them to both axes
    x_ticks = np.arange(xmin, xmax+1, ticks_frequency)
    y_ticks = np.arange(ymin, ymax+1, ticks_frequency)
    ax.set_xticks(x_ticks[x_ticks != 0])
    ax.set_yticks(y_ticks[y_ticks != 0])
    ax.set_xticks(np.arange(xmin, xmax+1), minor=True)
    ax.set_yticks(np.arange(ymin, ymax+1), minor=True)
    # Add a grid
    ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)

### Data

In [None]:
# Apply method and display dataframe
df_final = reduce(df_all)
print('Number of rows:', len(df_final))
df_final.head(1)

In [None]:
# Filter for 1v1 situations
df_1v1 = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)
print('Number of rows:', len(df_1v1))
df_1v1.head(1)

### Plot

In [None]:
# Plot one shot
plot_shot(df_1v1, 1)

In [None]:
# Plot all shots
plot_all(df_final)

In [None]:
# Plot 1v1 shots
plot_all(df_1v1)

### Deviations

In [None]:
# Assess success of TOGKP by checking deviations from GKP to TOGKP for different outcomes
def check_deviation(df, col):
    """Print the mean of one column for saves, non-goals and goals.

    Parameters
    ----------
    df : pandas.DataFrame
        Shot dataframe with an 'outcome' column.
    col : str
        Column whose mean (rounded to 2 decimals) is printed per group.
    """
    # Label of every group together with the rows that belong to it
    groups = (
        ('Deviation for saves:', df['outcome'] == 'Saved'),
        ('Deviation for nogoals:', df['outcome'] != 'Goal'),
        ('Deviation for goals:', df['outcome'] == 'Goal'),
    )

    # Average value of the column within every group
    for label, selected in groups:
        df_group = df.where(selected).dropna().reset_index(drop = True)
        print(label, round(df_group[col].mean(), 2))
    
# Apply method for latest dataframe
check_deviation(df_1v1, 'distance')

In [None]:
def gk_vs_gkavg(df):
    """Print the mean bisector distance of the actual and the average goalkeeper.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the columns 'distance_gk_bisector' and
        'distance_gkavg_bisector'.
    """
    # Compute both means (rounded to 2 decimals) before anything is printed
    mean_gk, mean_gk_avg = (
        round(df[column].mean(), 2)
        for column in ('distance_gk_bisector', 'distance_gkavg_bisector')
    )
    print('Mean distance from goalkeeper to bisector:', mean_gk)
    print('Mean distance from average goalkeeper to bisector:', mean_gk_avg)

# Apply method for latest dataframe
gk_vs_gkavg(df_final)

Plot GKP relative to centered TOGKP:
- https://pygmalion.nitri.org/cartesian-coordinates-with-matplotlib-1263.html

In [None]:
# Create coordinate system
coordinates(12, 12)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Colors for goals and saves; every other outcome is drawn in orange
outcome_colors = {'Goal': 'red', 'Saved': 'green'}

# Iterate over all rows in dataframe
for row in range(0, len(df)):

    # Save goalkeeper position relative to centered TOGKP
    x = df.at[row, 'x_gk'] - df.at[row, 'x_togkp']
    y = df.at[row, 'y_gk'] - df.at[row, 'y_togkp']
    # Pick color by outcome
    color = outcome_colors.get(df.at[row, 'outcome'], 'orange')
    # Plot point
    plt.scatter(x, y, color = color)

Plot the distance from the goalkeeper to the goal center on x axis and the perpendicular distance on y axis:

In [None]:
# Create coordinate system
coordinates(24, 12)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Colors for goals and saves; every other outcome is drawn in orange
outcome_colors = {'Goal': 'red', 'Saved': 'green'}

# Iterate over all rows in dataframe
for row in range(0, len(df)):

    # x = perpendicular distance from goalkeeper to bisector, y = distance from goalkeeper to goal line
    # NOTE(review): the axis ranges (24 wide, 12 high) suggest the two values may be swapped - confirm
    x = df.at[row, 'distance_gk_bisector']
    y = df.at[row, 'distance_gk_goalline']
    # Pick color by outcome
    color = outcome_colors.get(df.at[row, 'outcome'], 'orange')
    # Plot point
    plt.scatter(x, y, color = color)

Separately check average deviations from TOGKP in x and y direction:

In [None]:
# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Goalkeeper position relative to centered TOGKP, taken from the dataframe created above
# (and not from a second, separately created 1v1 dataframe)
x_offsets = df['x_gk'] - df['x_togkp']
y_offsets = df['y_gk'] - df['y_togkp']

# Calculate average deviation (NaN instead of a division by zero for an empty dataframe)
x_dev = round(x_offsets.mean(), 2)
y_dev = round(y_offsets.mean(), 2)
# Calculate average absolute deviation
x_dev_abs = round(x_offsets.abs().mean(), 2)
y_dev_abs = round(y_offsets.abs().mean(), 2)

# Display results
print('Deviation in x direction:', x_dev)
print('Deviation in y direction:', y_dev)
print('Absolute deviation in x direction:', x_dev_abs)
print('Absolute deviation in y direction:', y_dev_abs)

### ML-Algorithm to predict Goal Probability

In [None]:
# Initialize settings
plt.rc("font", size=14)
# One theme call is enough; a second sns.set call replaces the settings of the first one
# NOTE(review): sns.set presumably also resets the font size set above - confirm the intended order
sns.set(style="whitegrid", color_codes=True)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Select all features
# df = df[['outcome', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'x_togkp', 'y_togkp', 'shooting_angle', 'goal_angle', 'distance_shot_goalcenter', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector']]

# Select reduced features (explicit copy, so the encoding below never writes into a view)
df = df[['outcome', 'shooting_angle', 'goal_angle', 'distance_shot_goalcenter', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector']].copy()

# Select reduced features without bisector features
# df = df[['outcome', 'shooting_angle', 'goal_angle', 'distance_shot_goalcenter']]

# Select coordinate features
# df = df[['outcome', 'x_shot', 'y_shot', 'x_gk', 'y_gk', 'x_togkp', 'y_togkp']]

# Select coordinate features without bisector features
# df = df[['outcome', 'x_shot', 'y_shot', 'x_gk', 'y_gk']]

# Select experimental features
# df = df[['outcome', 'distance_gk_bisector', 'angular_deviation', 'ratio_shotgk_gkbisector']]


# Encode outcome to binary: 1 for goals and 0 for every other outcome
df['outcome'] = (df['outcome'] == 'Goal').astype('int')

# Display dataframe
df.head(1)

In [None]:
# Import package
from sklearn.metrics import confusion_matrix

# Initialize input vector and label (label as 1-D series, the shape scikit-learn estimators expect)
X = df.loc[:, df.columns != 'outcome']
y = df['outcome']

# Implement model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Save predictions
y_pred = logreg.predict(X_test)

# Calculate confusion matrix and values. The result gets its own name so the imported function
# stays callable when the cell is run again; fixed labels keep the matrix 2x2 even if one class
# is missing in the test set.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()
accuracy =  (TP+TN) /(TP+FP+TN+FN)

# Display results
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
print('')
print('Number of correct predictions:', conf_matrix[0][0] + conf_matrix[1][1])
print('Number of incorrect predictions:', conf_matrix[0][1] + conf_matrix[1][0])
print('')
print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))
print('')
print(classification_report(y_test, y_pred))

In [None]:
# Predicted goal probability of every test shot
y_score = logreg.predict_proba(X_test)[:,1]
# Area under the curve from the same probabilities that define the plotted curve
# (hard 0/1 predictions would describe a single threshold only)
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal of a classifier without skill
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Show class imbalance: share of shots per encoded outcome (0 = no goal, 1 = goal)
count_no_goal = len(df.loc[df['outcome'] == 0])
count_goal = len(df.loc[df['outcome'] == 1])
count_total = count_no_goal + count_goal
pct_of_no_goal = count_no_goal / count_total
pct_of_goal = count_goal / count_total
print("percentage of no goal is", pct_of_no_goal*100)
print("percentage of goal", pct_of_goal*100)

### Thesis

Display exemplary json file as dataframe:

In [None]:
# Create dataframe
df_test = pd.read_json(filepath)

# Display dataframe
df_test

Display column names of the dataframe:

In [None]:
# Display columns
df_test.columns.to_list()[:3]

Calculate total number of games and average number of events per game:

In [None]:
# Initialize variables
filecount = 0
eventcount = 0

# Iterate over all files in directory
for filename in os.listdir(directory):

    # Skip everything that is not a json file (also names without any dot)
    if not filename.endswith('.json'):
        continue

    # Path to current file; kept in its own variable so the configured example filepath is not overwritten
    event_path = os.path.join(directory, filename)
    # Update counters with the number of events in the current file
    eventcount = eventcount + len(pd.read_json(event_path))
    filecount = filecount + 1

# Calculate average number of events per game (NaN when the directory holds no json file)
avg_events = eventcount / filecount if filecount else float('nan')

# Display result
print("Number of games:", filecount)
print("Average number of events per game:", avg_events)

Calculate the total number of games with a goal difference of zero or one:

In [None]:
# Create dataframe from the match file
df_matches = pd.read_json(matchpath)

# Calculate absolute goal difference per match
score_gap = df_matches['home_score'] - df_matches['away_score']
df_matches['goal_difference'] = abs(score_gap)

# Display number of matches with a goal difference of at most one
is_close_game = df_matches['goal_difference'] <= 1
df_matches['goal_difference'].where(is_close_game).dropna().count()

Reason for setting `ratio_shotgk_gkbisector` to 4030 when the goalkeeper's distance to the bisector is 0:

In [None]:
# Display all 1v1 shots whose ratio_shotgk_gkbisector exceeds 4030, the value that is
# assigned when the goalkeeper's distance to the bisector is 0
# (presumably expected to be empty, i.e. 4030 is above every computed ratio - verify)
df_1v1.where(df_1v1['ratio_shotgk_gkbisector'] > 4030).dropna()

### Experimental

Check which goalkeeper reach value results in smallest distance between GKP and TOGKP:
- Final goalkeeper reach: 3.5
- Final mean distance: 1.7

In [None]:
"""
goalkeeper_reach = 1.8

while goalkeeper_reach < 3.8:
    df_experimental = reduce(df_all)
    print('dr:', goalkeeper_reach)
    print('d:', df_experimental['distance'].where(df_experimental['outcome'] == 'Saved').mean())
    goalkeeper_reach = goalkeeper_reach + 0.1
"""

In [None]:
"""
# Initilaize variables
goalkeeper_reach_copy = goalkeeper_reach
goalkeeper_reach = 3.1
mean_distance = 100
count = 0

# Do as long as result does not get worse three times in a row
while count < 3:
    
    # Create dataframe with current dive radius
    df_experimental = reduce(df_all)
    
    # If mean distance is smaller update parameters and reset count to zero
    if df_experimental['distance'].mean() < mean_distance:
        mean_distance = df_experimental['distance'].mean()
        result = goalkeeper_reach
        count = 0
    # If mean distance is larger add one to count
    else:
        count = count + 1
    
    # Increase dive radius
    goalkeeper_reach = goalkeeper_reach + 0.1

# Recreate original dive radius
goalkeeper_reach = goalkeeper_reach_copy

# Print results
print("Final goalkeeper_reach:", result)
print("Final mean distance:", mean_distance)
"""

### xG
- https://github.com/hadisotudeh/cfg-datascience-task/blob/main/CFG_Data_Scientist_Task_HadiSotudeh.ipynb

In [None]:
# General libraries
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import numpy as np
import itertools
import warnings
import math
import os

# Machine learning libraries
from sklearn.metrics import plot_roc_curve, roc_auc_score, brier_score_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import joblib

# Model interpretation library
from sklearn.inspection import plot_partial_dependence

# Metrica functions
import Metrica_IO as mio
import Metrica_Viz as mviz

In [None]:
# Show plots inside the jupyter notebook
%matplotlib inline

# Let pandas print up to 500 rows and 50000 columns before truncating output
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50000

# Use a larger font in all plots
plt.rcParams['font.size'] = 18

# Silence every warning for the rest of the session
warnings.simplefilter('ignore')

# Name of the target column to predict
dep_var = 'outcome'

# Hyper-parameter tuning settings: number of CV folds, random seed, selection metric
cv = 5
seed = 42
scoring = 'roc_auc'

In [None]:
# Build the modelling frame: three numeric features, the body part and the target.
# .copy() detaches the column selection from the frame returned by reduce(), so
# the assignments below write to an independent frame rather than to a slice.
df = reduce(df_all)
df = df[['polygon_players_count', 'distance_shot_goalcenter', 'shooting_angle', 'shot_body_part', 'outcome']].copy()

# Encode shot body part to binary: 1 for 'Head', 0 for everything else.
# The comparison runs on the whole column at once, so it does not rely on the
# index labels being exactly 0..n-1 the way a df.at[row, ...] loop does.
df['shot_body_part'] = (df['shot_body_part'] == 'Head').astype('int')

# Encode outcome to binary: 1 for 'Goal', 0 for everything else
df['outcome'] = (df['outcome'] == 'Goal').astype('int')

# Define features: every column except the target (and the position columns,
# should they ever be part of the selection above)
features = [x for x in df.columns if x not in ["position_x","position_y",dep_var]]

# Display dataframe
df.head()

In [None]:
# Feature matrix and target vector as plain numpy arrays
X, y = df[features].values, df[dep_var].values

# Shuffle and hold out 20% of the rows for validation, keeping 80% for training.
# Note that y is rebound here: from this point on it holds the training labels only.
xs, valid_xs, y, valid_y = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=seed,
    test_size=0.20,
)

In [None]:
def calc_auc_roc(y, prob_pred):
    """Return the ROC AUC of the scores `prob_pred` against the true labels `y`.

    Thin wrapper around sklearn.metrics.roc_auc_score.
    """
    return roc_auc_score(y_true=y, y_score=prob_pred)

In [None]:
## Hyperparameter grids
# Every key is "<pipeline step>__<parameter>", the addressing scheme used by
# GridSearchCV for parameters of a step inside a Pipeline.

# Logistic regression: inverse regularisation strength
lr_hyperparameters = dict(lr__C=[0.01, 0.1, 1.0, 10, 100])

# Random forest: number of trees, share of features per split, tree depth
rf_hyperparameters = dict(
    rf__n_estimators=np.arange(20, 100, 10),
    rf__max_features=np.arange(0.5, 1.0, 0.1),
    rf__max_depth=np.arange(1, 20, 5),
)

# XGBoost: tree depth, learning rate, number of boosting rounds
xgb_hyperparameters = dict(
    xgb__max_depth=np.arange(2, 12, 2),
    xgb__learning_rate=[0.1, 0.3],
    xgb__n_estimators=np.arange(1, 80, 10),
)

# LightGBM: number of boosting rounds, minimum samples per leaf, tree depth
lgbm_hyperparameters = dict(
    lgbm__n_estimators=np.arange(10, 140, 20),
    lgbm__min_data_in_leaf=np.arange(100, 1000, 100),
    lgbm__max_depth=np.arange(2, 10, 2),
)

# Grid per model key (the baseline 'bl' has nothing to tune and no entry)
hyperparameters = dict(
    lr=lr_hyperparameters,
    rf=rf_hyperparameters,
    lgbm=lgbm_hyperparameters,
    xgb=xgb_hyperparameters,
)

# One single-step pipeline per classifier; the step name equals the dictionary key
# so that the grid keys above resolve. Insertion order is the training order.
pipelines = {
    step: Pipeline([(step, estimator)])
    for step, estimator in (
        ('bl', DummyClassifier(strategy='most_frequent')),  # base line
        ('lr', LogisticRegression(penalty='l2', random_state=seed, n_jobs=-1)),
        ('rf', RandomForestClassifier(oob_score=True, random_state=seed, n_jobs=-1)),
        ('xgb', XGBClassifier(random_state=seed, n_jobs=-1)),
        ('lgbm', LGBMClassifier(random_state=seed, n_jobs=-1)),
    )
}

In [None]:
# Train every pipeline, collect AUC ROC / Brier score per model, draw the
# calibration curves of the tuned models and persist each tuned estimator.

# One result dict per model; turned into a dataframe after the loop
results = []
# Display names used in the result table, the plot legend and the file names
model_names = {"bl":"Baseline", "lr": "Logistic Regression" , "rf":"Random Forest", 
               "xgb": "XGBoost", "lgbm": "Light Gradient Boosting"}

# Calibration figure; the dotted diagonal is the reference line of a perfectly calibrated model
fig, ax =  plt.subplots(figsize=(8, 8))
ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

for key, pipeline in tqdm(pipelines.items()):
  # The baseline is fitted as is; every other pipeline is tuned by a
  # cross-validated grid search over its hyperparameter grid
  if key == 'bl':
      model = pipeline
  else:
      model = GridSearchCV(pipeline, hyperparameters[key], cv=cv, scoring=scoring, n_jobs=-1)

  # xs / y are the training features and training labels
  model.fit(xs,y)

  # After a grid search, pull the bare estimator out of the best pipeline;
  # the baseline has no best_estimator_ and is used directly
  if hasattr(model,'best_estimator_'):
      best = model.best_estimator_.named_steps[key]
  else:
      best = model

  result = {}

  result["model"] = model_names[key]

  # Column 1 of predict_proba is the probability of the second class
  # (presumably the label 1 = goal - confirm against best.classes_)
  train_prob_pred = best.predict_proba(xs)[:,1]

  result["training (auc roc)"] = calc_auc_roc(y, train_prob_pred)
  
  validation_prob_pred = best.predict_proba(valid_xs)[:,1]

  result["validation (auc roc)"]  = calc_auc_roc(valid_y, validation_prob_pred)


  # Points of the reliability curve on the validation set, using 10 bins
  fraction_of_positives, mean_predicted_value = calibration_curve(valid_y, validation_prob_pred, n_bins=10)

  result["Brier score"] = brier_score_loss(valid_y, validation_prob_pred)

  results.append(result)

  # The baseline only appears in the result table: skip its calibration curve
  # and do not write it to disk
  if key == 'bl':
    continue

  # Add the calibration curve of this model to the shared figure
  ax.plot(mean_predicted_value, fraction_of_positives, "s-", label="%s" % (model_names[key], ))
    
  # Save the tuned estimator as "<display name>.joblib" in the working directory
  joblib.dump(best, f'{model_names[key]}.joblib')

# Result table, rounded to three decimals
results_df = pd.DataFrame(results).round(3)
display(results_df)

# Finish, save and show the calibration figure
ax.set_xlabel("Mean predicted value")
ax.set_ylabel("Fraction of positives")
ax.set_ylim([-0.05, 1.05])
ax.legend(loc="lower right")
ax.set_title('Calibration plot  (reliability curve)')

plt.tight_layout()
plt.savefig('calibartion_plot.png', bbox_inches='tight')
plt.show()

In [None]:
# Reload the persisted logistic regression from the working directory
best_model_name = "Logistic Regression"
model_file = f"{best_model_name}.joblib"
selected_model = joblib.load(model_file)

# Report which model was selected and list its parameters
print(f"selected model is {best_model_name}.\n")
print("Its parameters are:")
selected_model.get_params()

In [None]:
# Draw the ROC curve of the selected model on the validation split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator - confirm the pinned scikit-learn version.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('AUC ROC Curve of the Logistic Regression Model')
plot_roc_curve(estimator=selected_model, X=valid_xs, y=valid_y, ax=ax);

In [None]:
# Intercept of the selected model, rounded to two decimals
intercept = round(selected_model.intercept_[0], 2)
print(f"The model intercept is {intercept}\n")

# One rounded coefficient per feature, in the order of `features`
coefficients = [round(weight, 2) for weight in selected_model.coef_[0]]

# Table of coefficients indexed by feature name, largest first
print("The model coefficients are:")
coef_table = pd.DataFrame({'coef': coefficients}, index=features)
coef_table.sort_values(by='coef', ascending=False)

In [None]:
# Features to inspect one at a time
explore_cols = ['polygon_players_count', 'distance_shot_goalcenter', 'shooting_angle']

# Validation features as a dataframe so that columns can be addressed by name
valid_xs_df = pd.DataFrame(valid_xs, columns=features)

# One partial dependence plot per feature, each on its own wide figure.
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2 in favour of
# PartialDependenceDisplay.from_estimator - confirm the pinned scikit-learn version.
for col in explore_cols:
    fig, ax = plt.subplots(figsize=(12, 4))
    plot_partial_dependence(selected_model, valid_xs_df, [col], ax=ax, grid_resolution=20);

In [None]:
# Feature pairs for the two-way partial dependence plots
paired_features = [
    ("shooting_angle", "distance_shot_goalcenter"),
    ("shooting_angle", "polygon_players_count"),
    ("distance_shot_goalcenter", "polygon_players_count"),
]

# One two-way partial dependence plot per pair, each on its own square figure
for pair in paired_features:
    fig, ax = plt.subplots(figsize=(8, 8))
    plot_partial_dependence(selected_model, valid_xs_df, [pair], ax=ax, grid_resolution=20);

In [None]:
# Calculate xG value
def calc_xG(instance):
    """Return the goal probability the selected model predicts for one shot.

    `instance` is one dataframe row; its `features` entries are reshaped into a
    single-sample matrix, scored with `selected_model.predict_proba`, and the
    probability in column 1 is returned rounded to two decimals.
    """
    feature_row = instance[features].values.reshape(1, -1)
    goal_probability = selected_model.predict_proba(feature_row)[0, 1]
    return round(goal_probability, 2)

# Score every shot row by row and store the result in a new xG column
df['xG'] = df.apply(calc_xG, axis=1)

# Display dataframe
df

In [None]:
# Binning xG

def calc_xg_quality(xg):
    """Map an xG value to a coarse chance-quality label.

    Bins: below 0.07 -> "poor", [0.07, 0.15) -> "fair",
    [0.15, 0.30) -> "good", 0.30 and above -> "very good".
    A value for which every comparison is false (e.g. NaN) yields None.
    """
    if xg >= 0.30:
        return "very good"
    if xg >= 0.15:
        return "good"
    if xg >= 0.07:
        return "fair"
    if xg < 0.07:
        return "poor"
    return None

# Label every shot with the quality bin of its xG value
df["xG_quality"] = df["xG"].apply(calc_xg_quality)
df

In [None]:
# Binary prediction per shot: 1 (goal) when xG reaches 0.5, otherwise 0.
# The threshold is applied to the whole column at once, so the result does not
# depend on the index labels being exactly 0..n-1 as a df.at[row, ...] loop would;
# a missing xG compares false and therefore yields 0.
df['pred'] = (df['xG'] >= 0.5).astype('int64')
df

In [None]:
# Show the shots ordered by ascending xG (df itself is left unsorted)
df.sort_values("xG")

In [None]:
# Import package
from sklearn.metrics import confusion_matrix

# Observed outcomes and thresholded predictions for every shot in df
y_test = np.array(df['outcome'].to_list())
y_pred = np.array(df['pred'].to_list())

# NOTE: the name confusion_matrix is rebound from the imported function to the
# resulting array; the import at the top of this cell restores the function
# whenever the cell is run again.
confusion_matrix = confusion_matrix(y_test, y_pred)

# Rows are the true classes, columns the predicted classes
(TN, FP), (FN, TP) = confusion_matrix
accuracy = (TP + TN) / (TP + FP + TN + FN)

# Display results
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
print()
print('Number of correct predictions:', TN + TP)
print('Number of incorrect predictions:', FP + FN)
print()
print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))
print()
print(classification_report(y_test, y_pred))