### Import

In [None]:
# Import standard packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os
from numpy.linalg import norm

# Import sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn import preprocessing
from sklearn import metrics

# Import specific packages
from mplsoccer import Pitch
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

### Configuration

In [None]:
# Directory with all event files (every *.json file in it is read by create_dataframe)
# NOTE(review): absolute, machine-specific path - has to be adapted on any other machine
directory = '/Users/gian-andreagottini/Documents/Masterarbeit (offline)/statsbomb360-euro2020/events/'

# Filepath to one event file (displayed as an example in the Thesis section)
filepath = directory + '3788745.json'

# Filepath to match file (read in the Thesis section to compute goal differences)
matchpath = '/Users/gian-andreagottini/Documents/Masterarbeit (offline)/statsbomb360-euro2020/matches.json'

# List with columns relevant for shots dataframe (passed to filter_event)
ls_shots = ['possession_team', 'shot', 'location']

# Dive radius of gk, compared against distances between event coordinates in get_togkp
# NOTE(review): read as a module-level value by get_togkp and temporarily
# overwritten by the dive radius search in the Experimental section
dive_radius = 3.12

### Methods

In [None]:
# Create dataframe including all json files
def create_dataframe(directory):
    """Collect the event columns used by this notebook from all JSON files in a directory.

    Parameters:
        directory: path of the folder that contains the event files.

    Returns:
        A dataframe with the columns 'possession_team', 'type', 'shot' and
        'location', holding the rows of all files one after another. A column
        that is missing in a file is filled with NaN for the rows of that file.
    """

    # Columns that are taken over from every file
    columns = ['possession_team', 'type', 'shot', 'location']
    collected = {column: [] for column in columns}

    # Iterate over all files in directory; sorted so that the row order does
    # not depend on the arbitrary order in which os.listdir returns the names
    for filename in sorted(os.listdir(directory)):

        # Skip everything that is not a json file; endswith also copes with
        # names that contain no dot at all or more than one dot
        if not filename.endswith('.json'):
            continue

        # Create dataframe from current json file
        df_file = pd.read_json(os.path.join(directory, filename))
        # Guarantee that all expected columns exist (missing ones become NaN)
        df_file = df_file.reindex(columns=columns)
        # Add columns of current dataframe as lists to existing lists
        for column in columns:
            collected[column].extend(df_file[column].tolist())

    # Create empty dataframe and add completed lists to dataframe
    df = pd.DataFrame()
    for column in columns:
        df[column] = collected[column]

    # Return dataframe
    return df

# Apply method and display dataframe; df_all is the frame handed to the later pipeline calls
df_all = create_dataframe(directory)
print('Number of rows:', len(df_all))
df_all.head(1)

In [None]:
# Create dataframe filtered for one specific type of event
def filter_event(df, ls):
    """Reduce the events to the given columns and split the location into coordinates.

    Parameters:
        df: dataframe with event data.
        ls: list of column names to keep; it has to contain 'location'.

    Returns:
        A new dataframe with a fresh index that only holds the rows without
        missing values in the given columns. The 'location' column is replaced
        by the columns 'x_loc' and 'y_loc'.
    """

    # Create event dataframe
    df = df[ls].dropna().reset_index(drop = True)

    # Take the first two components of every location; unlike unpacking
    # zip(*locations) this also works when no row is left and when a location
    # carries additional components
    locations = df['location']
    df['x_loc'] = [location[0] for location in locations]
    df['y_loc'] = [location[1] for location in locations]
    del df['location']

    # Return event dataframe
    return df

# Apply method to all events with the shot columns and display the first row
df_filter_shots = filter_event(df_all, ls_shots)
print('Number of rows:', len(df_filter_shots))
df_filter_shots.head(1)

In [None]:
# Create shots dataframe with additional data
def complete_shots(df):
    """Build one row per shot that comes with freeze_frame information.

    Parameters:
        df: dataframe with event data; it is filtered with filter_event and
            the module-level column list ls_shots first.

    Returns:
        A dataframe with the columns 'possession_team', 'shot', 'outcome',
        'x_loc', 'y_loc', 'x_gk', 'y_gk' and 'players'. 'players' holds the
        locations of all freeze_frame players except the opposing goalkeeper,
        or an empty string if there is no such player. 'x_gk' and 'y_gk' are
        NaN when the freeze_frame contains no opposing goalkeeper, so every
        column keeps exactly one entry per shot.
    """

    # Create shots dataframe
    df = filter_event(df, ls_shots)

    # Create empty lists
    ls_possession_team = []
    ls_shot = []
    ls_x_loc = []
    ls_y_loc = []
    ls_outcome = []
    ls_x_gk = []
    ls_y_gk = []
    ls_players = []

    # Iterate over all rows in dataframe
    for team, shot, x_loc, y_loc in zip(df['possession_team'], df['shot'], df['x_loc'], df['y_loc']):

        # Skip shots without freeze_frame information
        if 'freeze_frame' not in shot:
            continue

        # Location of the opposing goalkeeper and of all other players
        gk_location = None
        ls_players_nested = []

        # Iterate over all players in one freeze_frame
        for player in shot['freeze_frame']:
            if player['position']['name'] == 'Goalkeeper' and not player['teammate']:
                # Only the first opposing goalkeeper is used, so that a shot
                # never contributes more than one pair of gk coordinates
                if gk_location is None:
                    gk_location = player['location']
            else:
                ls_players_nested.append(player['location'])

        # Add values to lists (exactly one entry per list and shot)
        ls_possession_team.append(team)
        ls_shot.append(shot)
        ls_x_loc.append(x_loc)
        ls_y_loc.append(y_loc)
        # Add nested outcome values to list
        ls_outcome.append(shot['outcome']['name'])
        # Add gk coordinates, NaN if the goalkeeper is not part of the freeze_frame
        if gk_location is None:
            ls_x_gk.append(float('nan'))
            ls_y_gk.append(float('nan'))
        else:
            ls_x_gk.append(gk_location[0])
            ls_y_gk.append(gk_location[1])
        # Add list or empty string to list
        ls_players.append(ls_players_nested if ls_players_nested else '')

    # Create empty dataframe and add completed lists to dataframe
    df = pd.DataFrame()
    df['possession_team'] = ls_possession_team
    df['shot'] = ls_shot
    df['outcome'] = ls_outcome
    df['x_loc'] = ls_x_loc
    df['y_loc'] = ls_y_loc
    df['x_gk'] = ls_x_gk
    df['y_gk'] = ls_y_gk
    df['players'] = ls_players

    # Return complete shots dataframe
    return df

# Apply method to all events and display the first completed shot
df_complete_shots = complete_shots(df_all)
print('Number of rows:', len(df_complete_shots))
df_complete_shots.head(1)

In [None]:
# Create shots dataframe with additional data about players in shot polygon
def polygon_players(df):
    """Add the players that stand between the shot location and the goal.

    Parameters:
        df: dataframe with event data; it is passed on to complete_shots.

    Returns:
        The complete shots dataframe without the shots that have no other
        player locations and without rows that contain missing values,
        extended by the column 'polygon_players'. The column holds the
        locations of the players inside the triangle spanned by the shot
        location and the goalposts (120, 36) and (120, 44), or an empty string
        if no player is inside.
    """

    # Create complete shots dataframe and exclude shots without coordinates
    # from other players; the mask is built from this very dataframe so that
    # the result does not depend on the notebook variable df_complete_shots
    df = complete_shots(df)
    df = df[df['players'] != ''].dropna().reset_index(drop = True)

    # Create empty list
    ls_players = []

    # Iterate over all rows in dataframe
    for x_loc, y_loc, players in zip(df['x_loc'], df['y_loc'], df['players']):

        # Define polygon once per shot
        polygon = Polygon([(x_loc, y_loc), (120, 36), (120, 44)])

        # Keep the players whose location lies inside the polygon
        ls_players_nested = [player for player in players
                             if polygon.contains(Point(player[0], player[1]))]

        # Add list or empty string to list
        ls_players.append(ls_players_nested if ls_players_nested else '')

    # Add list to dataframe
    df['polygon_players'] = ls_players

    # Return polygon players dataframe
    return df

# Apply method to all events and display the first two shots
df_polygon_players = polygon_players(df_all)
print('Number of rows:', len(df_polygon_players))
df_polygon_players.head(2)

In [None]:
# Calculate angle between the two lines from event location to goalposts
def shooting_angle(x, y):
    """Return the angle under which the goal is seen from an event location.

    Parameters:
        x: x coordinate of the event location.
        y: y coordinate of the event location.

    Returns:
        The absolute angle in radians between the two lines from (x, y) to the
        goalposts (120, 36) and (120, 44).
    """

    # Direction from event location to each goalpost; for x < 120 atan2 gives
    # the same value as atan of the slope, but it is also defined for
    # locations on the goal line (x = 120) where the slope is a division by zero
    a1 = math.atan2(36 - y, 120 - x)
    a2 = math.atan2(44 - y, 120 - x)

    # Return absolute value of angle in radians
    return abs(a1 - a2)

# Apply method to the example location (112, 44) and display the angle converted to degrees
print('Angle in degrees:', math.degrees(shooting_angle(112, 44)))

In [None]:
# Calculate slope of bisector
def bisector_slope(x, y):
    """Return the slope of the bisector of the shooting angle.

    The bisector is the line through the event location that halves the angle
    between the lines to the goalposts (120, 36) and (120, 44).

    Parameters:
        x: x coordinate of the event location.
        y: y coordinate of the event location.

    Returns:
        The slope of the bisector.
    """

    # Direction from event location to each goalpost in radians
    a1 = math.atan2(36 - y, 120 - x)
    a2 = math.atan2(44 - y, 120 - x)

    # The bisector points in the mean of both directions. This is the same
    # line as rotating the post at (120, 36) by half the shooting angle, but
    # it needs no division by a coordinate difference, which makes it defined
    # on the goal line as well
    return math.tan((a1 + a2) / 2)

# Apply method to the example location (112, 40) and display the slope
print('Bisector slope:', bisector_slope(112, 40))

In [None]:
# Create dataframe with TOGKP
def get_togkp(df):
    """Add the TOGKP coordinates ('x_opt', 'y_opt') to every shot.

    For every shot the goalpost that is closer to the shot location is
    projected onto the bisector of the shooting angle. If this point is
    further away from the post than the module-level dive_radius, the point is
    moved along the bisector towards the shot location until its distance to
    the lines between shot location and goalposts equals dive_radius. Only
    the moved coordinates are rounded to two decimals.

    Parameters:
        df: dataframe with event data; it is passed on to polygon_players.

    Returns:
        The polygon players dataframe extended by 'x_opt' and 'y_opt', with
        reordered columns.
    """

    # Create polygon players dataframe
    df = polygon_players(df)

    # Create empty list
    ls_x_opt = []
    ls_y_opt = []

    # Iterate over all rows in dataframe
    for i in range(0, len(df)):

        # Define slope of bisector through shot location
        x_loc = df.iloc[i]['x_loc']
        y_loc = df.iloc[i]['y_loc']
        m_loc = bisector_slope(x_loc, y_loc)
        # Define half of shooting angle in radians
        ang = shooting_angle(x_loc, y_loc)/2

        # Goalpost that is closer to the shot location; defined for every
        # shot (the post at y = 44 is used when both are equally far away),
        # so the check below never works with the post of a previous row
        d1 = math.dist([x_loc, y_loc], [120, 36])
        d2 = math.dist([x_loc, y_loc], [120, 44])
        x_p = 120
        if d1 < d2:
            y_p = 36
        else:
            y_p = 44

        # Project the post onto the bisector. This is the intersection of the
        # bisector with its perpendicular through the post; written as a
        # projection it also covers central shots (slope 0), for which the
        # perpendicular has no finite slope
        t = ((x_p - x_loc) + m_loc*(y_p - y_loc))/(1 + m_loc*m_loc)
        x_opt = x_loc + t
        y_opt = y_loc + t*m_loc

        # If covered length is not enough
        if math.dist([x_p, y_p], [x_opt, y_opt]) > dive_radius:

            # Distance from event location to optimal location to cover dive radius
            distance = (math.cos(ang)*dive_radius)/math.sin(ang)
            # Shift from shot location to optimal location
            x_shift = distance/(math.sqrt(1+m_loc*m_loc))
            y_shift = x_shift*m_loc
            # Calculate optimal location
            x_opt = round(x_loc + x_shift, 2)
            y_opt = round(y_loc + y_shift, 2)

        # Add value to list
        ls_x_opt.append(x_opt)
        ls_y_opt.append(y_opt)

    # Add list to dataframe
    df['x_opt'] = ls_x_opt
    df['y_opt'] = ls_y_opt

    # Return reordered togkp dataframe
    return df[['possession_team', 'shot', 'outcome', 'x_loc', 'y_loc', 'x_gk', 'y_gk', 'x_opt', 'y_opt', 'players', 'polygon_players',]]

# Apply method to all events and display the first two shots with their TOGKP
df_togkp = get_togkp(df_all)
print('Number of rows:', len(df_togkp))
df_togkp.head(2)

In [None]:
# Calculate distance between GKP and TOGKP
def get_distance(df):
    """Add the distance between the goalkeeper position and the TOGKP.

    Parameters:
        df: dataframe with event data; it is passed on to get_togkp.

    Returns:
        The TOGKP dataframe extended by the column 'distance' (distance
        between ('x_gk', 'y_gk') and ('x_opt', 'y_opt'), rounded to two
        decimals), with reordered columns.
    """

    # Create TOGKP dataframe
    togkp = get_togkp(df)

    # Pair up the TOGKP and the goalkeeper coordinates of every shot
    coordinates = zip(togkp['x_opt'], togkp['y_opt'], togkp['x_gk'], togkp['y_gk'])

    # Add rounded distances to dataframe
    togkp['distance'] = [
        round(math.dist([x_opt, y_opt], [x_gk, y_gk]), 2)
        for x_opt, y_opt, x_gk, y_gk in coordinates
    ]

    # Return reordered TOGKP dataframe
    column_order = ['possession_team', 'shot', 'outcome', 'x_loc', 'y_loc', 'x_gk', 'y_gk',
                    'x_opt', 'y_opt', 'distance', 'players', 'polygon_players']
    return togkp[column_order]
    
# Apply method to all events and display the first two shots with their distance
df_distance = get_distance(df_all)
print('Number of rows:', len(df_distance))
df_distance.head(2)

In [None]:
# Reduce dimensionality
def get_reduced(df):
    """Add the geometric features of every shot that are used for the model.

    All features are rounded to two decimals, angles are given in degrees:
        shooting_angle: angle between the lines from shot location to the goalposts
        goal_angle: absolute angle of the line from shot location to (120, 40)
        goal_distance: distance between shot location and (120, 40)
        gk_distance: distance between goalkeeper and (120, 40)
        perpendicular_distance: distance of the goalkeeper from the line
            through shot location and TOGKP
        angular_deviation: absolute difference between the angle of the line
            from shot location to goalkeeper and the angle of the bisector
        ratio_shooter_perp: distance between shot location and goalkeeper
            divided by perpendicular_distance, 500 if that distance is 0

    Parameters:
        df: dataframe with event data; it is passed on to get_distance.

    Returns:
        The distance dataframe extended by the features above, without the
        columns 'possession_team' and 'shot'.
    """

    # Create distance dataframe
    df = get_distance(df)

    # Create one empty list per feature
    feature_names = ['shooting_angle', 'goal_angle', 'goal_distance', 'gk_distance',
                     'perpendicular_distance', 'angular_deviation', 'ratio_shooter_perp']
    features = {name: [] for name in feature_names}

    # Iterate over all rows in dataframe
    for row in range(0, len(df)):

        # Initialize variables necessary for calculation
        x_loc = df.at[row, 'x_loc']
        y_loc = df.at[row, 'y_loc']
        x_gk = df.at[row, 'x_gk']
        y_gk = df.at[row, 'y_gk']
        x_opt = df.at[row, 'x_opt']
        y_opt = df.at[row, 'y_opt']
        loc = np.asarray((x_loc, y_loc))
        gk = np.asarray((x_gk, y_gk))
        opt = np.asarray((x_opt, y_opt))
        m_loc_middle = (40-y_loc)/(120-x_loc)
        m_loc_gk = (y_gk-y_loc)/(x_gk-x_loc)
        m_bisector = bisector_slope(x_loc, y_loc)

        # Cross product of the two-dimensional vectors, written out because
        # np.cross is deprecated for vectors with two elements since NumPy 2.0
        v1 = loc - opt
        v2 = opt - gk
        cross = v1[0]*v2[1] - v1[1]*v2[0]

        # Calculate values
        s_angle = round(math.degrees(shooting_angle(x_loc, y_loc)), 2)
        g_angle = round(math.degrees(abs(math.atan(m_loc_middle))), 2)
        g_distance = round(math.dist([x_loc, y_loc], [120, 40]), 2)
        gk_distance = round(math.dist([x_gk, y_gk], [120, 40]), 2)
        perpendicular_distance = round(abs(cross / norm(v1)), 2)
        angular_deviation = round(math.degrees(abs(math.atan(m_loc_gk) - math.atan(m_bisector))), 2)
        if perpendicular_distance != 0:
            ratio_shooter_perp = round(math.dist([x_loc, y_loc], [x_gk, y_gk]) / perpendicular_distance, 2)
        else:
            # Fixed replacement value, the ratio is not defined for a distance of 0
            ratio_shooter_perp = 500

        # Add value to list
        features['shooting_angle'].append(s_angle)
        features['goal_angle'].append(g_angle)
        features['goal_distance'].append(g_distance)
        features['gk_distance'].append(gk_distance)
        features['perpendicular_distance'].append(perpendicular_distance)
        features['angular_deviation'].append(angular_deviation)
        features['ratio_shooter_perp'].append(ratio_shooter_perp)

    # Add list to dataframe
    for name in feature_names:
        df[name] = features[name]

    # Return reduced dataframe
    return df[['outcome', 'x_loc', 'y_loc', 'x_gk', 'y_gk', 'x_opt', 'y_opt', 'distance', 'players', 'polygon_players', 'shooting_angle', 'goal_angle', 'goal_distance', 'gk_distance', 'perpendicular_distance', 'angular_deviation', 'ratio_shooter_perp']]


# Apply method to all events and display the first shot with all features
df_reduced = get_reduced(df_all)
print('Number of rows:', len(df_reduced))
df_reduced.head(1)

In [None]:
# Plot one shot
def plot_shot(df, row):
    """Draw one shot on a soccer pitch and print its coordinates.

    The plot shows the shot location (red for a goal, blue otherwise), the
    goalkeeper (brown), the players inside the shot polygon (yellow), the
    TOGKP (orange) and the lines from the shot location to both goalposts.

    Parameters:
        df: dataframe with the columns 'x_loc', 'y_loc', 'outcome', 'x_gk',
            'y_gk', 'x_opt', 'y_opt' and 'polygon_players'.
        row: row of the shot; used as index label for the coordinates and as
            position for the polygon players.
    """

    # Soccer pitch
    pitch = Pitch(pitch_color='grass', line_color='white', stripe=True)
    fig, ax = pitch.draw()

    # Player location, red dot for a goal and blue dot for the rest
    x_loc, y_loc = df.at[row, 'x_loc'], df.at[row, 'y_loc']
    shot_color = 'red' if df.at[row, 'outcome'] == 'Goal' else 'blue'
    plt.scatter(x_loc, y_loc, color = shot_color)

    # Goalkeeper location
    x_gk, y_gk = df.at[row, 'x_gk'], df.at[row, 'y_gk']
    plt.scatter(x_gk, y_gk, color = 'brown')

    # Other players location (nothing is drawn for an empty string)
    for player in df.iloc[row]['polygon_players']:
        plt.scatter(player[0], player[1], color = 'yellow')

    # TOGKP
    x_opt, y_opt = df.at[row, 'x_opt'], df.at[row, 'y_opt']
    plt.scatter(x_opt, y_opt, color = 'orange')

    # Lines between player location and both goalposts
    # (the bisector from player location to TOGKP is not drawn)
    for y_post in (36, 44):
        plt.plot([x_loc, 120], [y_loc, y_post], color = 'black')

    # Scale plot
    plt.axis('scaled')
    plt.xlim([-5, 125])
    plt.ylim([-5, 85])

    # Display plot
    plt.show()

    # Display player, gk and TOGKP location
    print('x_loc:', x_loc, 'y_loc:', y_loc)
    print('x_gk:', x_gk, 'y_gk:', y_gk)
    print('x_opt:', x_opt, 'y_opt:', y_opt)

In [None]:
# Plot all shots
def plot_all(df):
    """Draw the locations of all shots of a dataframe on one soccer pitch.

    Parameters:
        df: dataframe with the columns 'x_loc', 'y_loc' and 'outcome' whose
            index labels run from 0 to len(df) - 1.
    """

    # Soccer pitch
    pitch = Pitch(pitch_color='grass', line_color='white', stripe=True)
    fig, ax = pitch.draw()

    # Plot one point per row, red dots for goals and blue dots for the rest
    for row in range(len(df)):
        is_goal = df.at[row, 'outcome'] == 'Goal'
        plt.scatter(df.at[row, 'x_loc'], df.at[row, 'y_loc'], color = 'red' if is_goal else 'blue')

    # Scale plot
    plt.axis('scaled')
    plt.xlim([-5, 125])
    plt.ylim([-5, 85])

    # Display plot
    plt.show()

Check deviations from TOGKP for different outcomes:

In [None]:
# Assess success of TOGKP
def check_deviation(df):
    """Print the average distance from GKP to TOGKP for saves, nogoals and goals.

    Rows that contain a missing value in any column are ignored. The averages
    are rounded to two decimals.

    Parameters:
        df: dataframe with the columns 'outcome' and 'distance'.
    """

    # One printed label and one row selection per group
    outcome = df['outcome']
    groups = [
        ('Deviation for saves:', outcome == 'Saved'),
        ('Deviation for nogoals:', outcome != 'Goal'),
        ('Deviation for goals:', outcome == 'Goal'),
    ]

    # Average distance from GKP to TOGKP for every group
    for label, selected in groups:
        df_group = df[selected].dropna().reset_index(drop = True)
        print(label, round(df_group['distance'].mean(), 2))
    
# Apply method for latest dataframe (df_reduced holds the 'outcome' and 'distance' columns that are needed)
check_deviation(df_reduced)

### Data

In [None]:
# Apply method and display dataframe; df_final is the frame used by the cells below
# NOTE(review): same call as for df_reduced above, so the whole pipeline runs once more
df_final = get_reduced(df_all)
print('Number of rows:', len(df_final))
df_final.head(1)

In [None]:
# Filter for 1v1 situations: keep the shots without players inside the shot polygon
# (rows with a missing value in any column are dropped as well)
df_1v1 = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)
print('Number of rows:', len(df_1v1))
df_1v1.head(1)

### Plot

In [None]:
# Plot one shot (row 105 of df_final)
plot_shot(df_final, 105)

In [None]:
# Plot the locations of all shots in df_final
plot_all(df_final)

In [None]:
# Plot the locations of the 1v1 shots only
plot_all(df_1v1)

### Deviations

Plot GKP relative to centered TOGKP (https://pygmalion.nitri.org/cartesian-coordinates-with-matplotlib-1263.html):

In [None]:
# Get inliine charts
%matplotlib inline
# Define x range, y range, and tick interval for both axes
xmin, xmax, ymin, ymax = -12, 12, -12, 12
ticks_frequency = 1
# Create figure and axes object
fig, ax = plt.subplots(figsize=(10, 10))
# Set face color
fig.patch.set_facecolor('#ffffff')
# Apply ranges to axes
ax.set(xlim=(xmin-1, xmax+1), ylim=(ymin-1, ymax+1), aspect='equal')
# Set both axes to zero position
ax.spines['bottom'].set_position('zero')
ax.spines['left'].set_position('zero')
# Hide the top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Set x label, y label, and add an origin label
ax.set_xlabel('$x$', size=14, labelpad=-24, x=1.02)
ax.set_ylabel('$y$', size=14, labelpad=-21, y=1.02, rotation=0)
plt.text(0.49, 0.49, r"$O$", ha='right', va='top',
    transform=ax.transAxes,
         horizontalalignment='center', fontsize=14)
# Create x tick, y tick, and apply them to both axes
x_ticks = np.arange(xmin, xmax+1, ticks_frequency)
y_ticks = np.arange(ymin, ymax+1, ticks_frequency)
ax.set_xticks(x_ticks[x_ticks != 0])
ax.set_yticks(y_ticks[y_ticks != 0])
ax.set_xticks(np.arange(xmin, xmax+1), minor=True)
ax.set_yticks(np.arange(ymin, ymax+1), minor=True)
# Add a grid
ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Iterate over all rows in dataframe
for row in range(0, len(df)):
    
    # Save goalkeeper position relative to centered TOGKP
    x = df.at[row, 'x_gk'] - df.at[row, 'x_opt']
    y = df.at[row, 'y_gk'] - df.at[row, 'y_opt']
    # Color for every outcome besides goals and saves
    color = 'orange'
    # Color for goals
    if df.at[row, 'outcome'] == 'Goal':
        color = 'red'
    # Color for saves
    if df.at[row, 'outcome'] == 'Saved':
        color = 'green'
    # Plot point
    plt.scatter(x, y, color = color)

In [None]:
# Get inliine charts
%matplotlib inline
# Define x range, y range, and tick interval for both axes
xmin, xmax, ymin, ymax = -24, 24, -12, 12
ticks_frequency = 1
# Create figure and axes object
fig, ax = plt.subplots(figsize=(10, 10))
# Set face color
fig.patch.set_facecolor('#ffffff')
# Apply ranges to axes
ax.set(xlim=(xmin-1, xmax+1), ylim=(ymin-1, ymax+1), aspect='equal')
# Set both axes to zero position
ax.spines['bottom'].set_position('zero')
ax.spines['left'].set_position('zero')
# Hide the top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Set x label, y label, and add an origin label
ax.set_xlabel('$x$', size=14, labelpad=-24, x=1.02)
ax.set_ylabel('$y$', size=14, labelpad=-21, y=1.02, rotation=0)
plt.text(0.49, 0.49, r"$O$", ha='right', va='top',
    transform=ax.transAxes,
         horizontalalignment='center', fontsize=14)
# Create x tick, y tick, and apply them to both axes
x_ticks = np.arange(xmin, xmax+1, ticks_frequency)
y_ticks = np.arange(ymin, ymax+1, ticks_frequency)
ax.set_xticks(x_ticks[x_ticks != 0])
ax.set_yticks(y_ticks[y_ticks != 0])
ax.set_xticks(np.arange(xmin, xmax+1), minor=True)
ax.set_yticks(np.arange(ymin, ymax+1), minor=True)
# Add a grid
ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Iterate over all rows in dataframe
for row in range(0, len(df)):
    
    # Save goalkeeper position relative to centered TOGKP
    x = df.at[row, 'gk_distance']
    y = df.at[row, 'perpendicular_distance']
    # Color for every outcome besides goals and saves
    color = 'orange'
    # Color for goals
    if df.at[row, 'outcome'] == 'Goal':
        color = 'red'
    # Color for saves
    if df.at[row, 'outcome'] == 'Saved':
        color = 'green'
    # Plot point
    plt.scatter(x, y, color = color)

Separately check average deviations from TOGKP in x and y direction:

In [None]:
# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Goalkeeper position relative to the TOGKP for every shot. The values are
# read from df itself, so that the number of rows and the values always
# belong to the same dataframe
x_offset = df['x_gk'] - df['x_opt']
y_offset = df['y_gk'] - df['y_opt']

# Calculate average deviation and average absolute deviation
# (NaN instead of a division by zero if df has no rows)
x_dev = round(x_offset.mean(), 2)
y_dev = round(y_offset.mean(), 2)
x_dev_abs = round(x_offset.abs().mean(), 2)
y_dev_abs = round(y_offset.abs().mean(), 2)

# Display results
print('Deviation in x direction:', x_dev)
print('Deviation in y direction:', y_dev)
print('Absolute deviation in x direction:', x_dev_abs)
print('Absolute deviation in y direction:', y_dev_abs)

### ML-Algorithm to predict Goal Probability

In [None]:
# Initialize settings
plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# Initialize 1v1 dataframe
df = df_final.where(df_final['polygon_players'] == '').dropna().reset_index(drop = True)

# Exactly one of the following feature selections is active at a time

# Select all features
# df = df[['outcome', 'x_loc', 'y_loc', 'x_gk', 'y_gk', 'x_opt', 'y_opt', 'shooting_angle', 'goal_angle', 'goal_distance', 'perpendicular_distance', 'angular_deviation', 'ratio_shooter_perp']]

# Select reduced features (copy, so that the encoding below writes to an
# independent dataframe and not to a slice of the 1v1 dataframe)
df = df[['outcome', 'shooting_angle', 'goal_angle', 'goal_distance', 'perpendicular_distance', 'angular_deviation', 'ratio_shooter_perp']].copy()
# Select reduced features without bisector features
# df = df[['outcome', 'shooting_angle', 'goal_angle', 'goal_distance']]

# Select coordinate features
# df = df[['outcome', 'x_loc', 'y_loc', 'x_gk', 'y_gk', 'x_opt', 'y_opt']]
# Select coordinate features without bisector features
# df = df[['outcome', 'x_loc', 'y_loc', 'x_gk', 'y_gk']]

# Select experimental features
# df = df[['outcome', 'perpendicular_distance', 'angular_deviation', 'ratio_shooter_perp']]


# Encode outcome to binary: 1 for goals and 0 for every other outcome
df['outcome'] = (df['outcome'] == 'Goal').astype('int')

# Display dataframe
df.head(1)

In [None]:
# Import package
from sklearn.metrics import confusion_matrix

# Initialize input vector and label (the label as a one-dimensional column,
# which is the shape the model expects)
X = df.loc[:, df.columns != 'outcome']
y = df['outcome']

# Implement model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Save predictions
y_pred = logreg.predict(X_test)

# Calculate confusion matrix and values. The result gets its own name so that
# the imported function is not overwritten, and the labels are fixed so that
# the matrix always has two rows and two columns
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
accuracy =  (TP+TN) /(TP+FP+TN+FN)

# Display results
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
print('')
print('Number of correct predictions:', cm[0][0] + cm[1][1])
print('Number of incorrect predictions:', cm[0][1] + cm[1][0])
print('')
print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))
print('')
print(classification_report(y_test, y_pred))

In [None]:
# Predicted probability of a goal for every shot in the test set
y_score = logreg.predict_proba(X_test)[:,1]
# The area is calculated from the same probabilities as the curve, so that
# the value in the legend belongs to the curve that is plotted (with the
# predicted classes only one threshold would be evaluated)
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal as reference line
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save the figure before it is displayed
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Show class imbalance: share of shots with outcome 0 (no goal) and 1 (goal)
encoded_outcome = df['outcome']
count_no_goal = int((encoded_outcome == 0).sum())
count_goal = int((encoded_outcome == 1).sum())
count_total = count_no_goal + count_goal
pct_of_no_goal = count_no_goal/count_total
pct_of_goal = count_goal/count_total
print("percentage of no goal is", pct_of_no_goal*100)
print("percentage of goal", pct_of_goal*100)

### Thesis

Display exemplary json file as dataframe:

In [None]:
# Create dataframe from the single event file that filepath points to
df_test = pd.read_json(filepath)

# Display dataframe
df_test

Display column names of the dataframe:

In [None]:
# Display columns (only the first three names are shown)
df_test.columns.to_list()[:3]

Calculate total number of games and average number of events per game:

In [None]:
# Initialize variables
filecount = 0
eventcount = 0

# Iterate over all files in directory
for filename in os.listdir(directory):

    # Skip everything that is not a json file; endswith also copes with
    # names that contain no dot at all or more than one dot
    if not filename.endswith('.json'):
        continue

    # Path of the current file; it gets its own name so that the configured
    # filepath of the example file is not overwritten
    event_file = os.path.join(directory, filename)

    # Create dataframe from current json file
    df_file = pd.read_json(event_file)
    # Update counters
    eventcount = eventcount + len(df_file)
    filecount = filecount + 1

# Calculate average number of events per game (NaN if no json file was found)
avg_events = eventcount / filecount if filecount else float('nan')

# Display result
print("Number of games:", filecount)
print("Average number of events per game:", avg_events)

Calculate the total number of games with a goal difference of zero or one:

In [None]:
# Create dataframe from the match file
df_matches = pd.read_json(matchpath)

# Calculate goal difference as absolute difference of both scores
df_matches['goal_difference'] = (df_matches['home_score'] - df_matches['away_score']).abs()

# Display number of games with a goal difference of at most one
(df_matches['goal_difference'] <= 1).sum()

### Experimental

Check which dive radius value gives the smallest mean distance between GKP and TOGKP:
- Final dive radius: 3.5
- Final mean distance: 1.7

In [None]:
# """
# Initialize variables
dive_radius_copy = dive_radius
start_radius = 3.1
step = 0.1
mean_distance = math.inf
result = None
count = 0
runs = 0

try:
    # Do as long as result does not get worse three times in a row
    while count < 3:

        # Current dive radius, calculated from the number of runs and rounded
        # so that repeated additions of 0.1 do not add up to values such as
        # 3.2000000000000006
        dive_radius = round(start_radius + runs * step, 1)

        # Create dataframe with current dive radius (get_togkp reads the
        # module-level dive_radius)
        df_experimental = get_reduced(df_all)
        current_mean = df_experimental['distance'].mean()

        # If mean distance is smaller update parameters and reset count to zero
        if current_mean < mean_distance:
            mean_distance = current_mean
            result = dive_radius
            count = 0
        # If mean distance is not smaller add one to count
        else:
            count = count + 1

        # Continue with next dive radius
        runs = runs + 1
finally:
    # Recreate original dive radius, also if one of the runs fails
    dive_radius = dive_radius_copy

# Print results (result stays None if no run produced a mean distance)
print("Final dive radius:", result)
print("Final mean distance:", mean_distance)
# """