In [1]:
from statsbombpy import sb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
import math

In [2]:
# Read the raw data for all the Euros
raw_events = sb.competition_events(
    country="Europe",
    division= "UEFA Euro",
    season="2020",
    split=True
)
raw_frames = sb.competition_frames(
    country="Europe",
    division= "UEFA Euro",
    season="2020"
)



In [3]:
# Convert to data frame
events = pd.DataFrame(raw_events['passes'])
frames = pd.DataFrame(raw_frames)
# Join tables
frames_subset = frames.rename(columns={'event_uuid': 'id'}).drop(columns=['match_id', 'visible_area'])
events = events.join(frames_subset.set_index('id'), on='id', how='left')

In [4]:
# See which columns are available
events.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'match_id',
       'pass_recipient', 'pass_length', 'pass_angle', 'pass_height',
       'pass_end_location', 'pass_body_part', 'pass_type',
       'possession_team_id', 'player_id', 'under_pressure', 'pass_aerial_won',
       'pass_outcome', 'pass_outswinging', 'pass_technique', 'pass_switch',
       'off_camera', 'pass_cross', 'pass_no_touch', 'pass_assisted_shot_id',
       'pass_shot_assist', 'out', 'pass_miscommunication', 'pass_through_ball',
       'pass_goal_assist', 'counterpress', 'pass_cut_back', 'pass_straight',
       'pass_inswinging', 'pass_deflected', 'freeze_frame'],
      dtype='object')

In [5]:
# Count successful passes leading up to this one
possession = None
count = 0
for index, row in events.sort_values(['match_id', 'index']).iterrows():
    if row['possession'] != possession:
        count = 0
        possession = row['possession']
    else:
        count += 1
    events.at[index, 'prior_passes'] = count

In [6]:
# Clean the data
events.pass_outcome = events.pass_outcome.fillna('Complete')
events.pass_technique = events.pass_technique.fillna('Other')
events.pass_body_part = events.pass_body_part.fillna('Other')
events.pass_type = events.pass_type.fillna('Other')
events = events.fillna(False)
events['location_x'] = [p[0] for p in events['location']]
events['location_y'] = [p[1] for p in events['location']]

In [7]:
# Simplify positions
events['position'] = events['position'].str.replace('Right ', '')
events['position'] = events['position'].str.replace('Left ', '')
events['position'] = events['position'].str.replace('Attacking ', '')
events['position'] = events['position'].str.replace('Center ', '')
events['position'] = events['position'].str.replace('Defensive ', '')

In [8]:
# Calculate attackers/defenders near the pass cone

# Calculate the distance between two points
def dist(a, b):
    return math.sqrt((a[0]-b[0])**2 + (a[1]-b[1])**2)

# Calculate the angle from point a to b 
def angle(a, b):
    if b[0] >= a[0]:
        return -1 * math.atan((a[1]-b[1])/(a[0]-b[0]))
    elif b[1] >= a[1]:
        return -1 * math.pi - math.atan((a[1]-b[1])/(a[0]-b[0]))
    else:
        return math.pi - math.atan((a[1]-b[1])/(a[0]-b[0]))

# Check if a given player is near an event
def is_nearby(event, player):
    # If players are within a roughly 3 yard radius, consider them nearby
    player_dist = dist(event['location'], player['location'])
    if player_dist < 3:
        return True
    # If a player is further than 120% of the pass distance, they are not nearby
    if player_dist > 1.25 * event['pass_length']:
        return False

    # If the player is within pi/8 degrees from the pass, consider them nearby
    player_angle = angle(event['location'], player['location'])
    if abs(event['pass_angle'] - player_angle) < math.pi / 4:
        return True
    elif player_angle < 0 and event['pass_angle'] > 0 and math.pi * 2 + player_angle - event['pass_angle'] < math.pi / 4:
        return True
    elif player_angle > 0 and event['pass_angle'] < 0 and math.pi * 2 + event['pass_angle'] - player_angle < math.pi / 4:
        return True
    return False

for index, event in events.iterrows():
    if not event['freeze_frame']:
        events.at[index, 'nearby_teammates'] = 0
        events.at[index, 'nearby_opponents'] = 0
        continue
    nearby_teammates = 0
    nearby_opponents = 0
    for player in event['freeze_frame']:
        if not player['actor'] and is_nearby(event, player):
            if player['teammate']:
                nearby_teammates += 1
            else:
                nearby_opponents += 1
    events.at[index, 'nearby_teammates'] = nearby_teammates
    events.at[index, 'nearby_opponents'] = nearby_opponents

In [9]:
events.head()

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,pass_cut_back,pass_straight,pass_inswinging,pass_deflected,freeze_frame,prior_passes,location_x,location_y,nearby_teammates,nearby_opponents
0,94dbc5c3-ef37-445e-9154-3d9f9ea9245d,5,1,00:00:00.490,0,0,Pass,2,Russia,From Kick Off,...,False,False,False,False,False,0.0,60.0,40.0,0.0,0.0
1,c943a37b-f9ae-4784-beb7-f9efce2ca855,8,1,00:00:03.374,0,3,Pass,2,Russia,From Kick Off,...,False,False,False,False,"[{'teammate': True, 'actor': True, 'keeper': F...",1.0,41.4,42.2,4.0,6.0
2,2bf9d325-fba4-4dfa-9f9f-08a25e70b073,11,1,00:00:06.707,0,6,Pass,2,Russia,From Kick Off,...,False,False,False,False,"[{'teammate': True, 'actor': False, 'keeper': ...",2.0,85.1,23.9,2.0,5.0
3,842fd78c-c050-4a88-b48d-758414697864,13,1,00:00:08.441,0,8,Pass,2,Russia,From Kick Off,...,False,False,False,False,"[{'teammate': True, 'actor': False, 'keeper': ...",3.0,90.8,9.5,0.0,0.0
4,60034587-15ac-4edd-bf3d-19a059043557,17,1,00:00:10.361,0,10,Pass,2,Russia,From Kick Off,...,False,False,False,False,"[{'teammate': True, 'actor': False, 'keeper': ...",4.0,94.5,20.9,1.0,4.0


In [10]:
# Keep useful columns and remove any that include data from after the pass is made
events = events[['minute', 'play_pattern', 'team', 'position', 'location_x', 'location_y', 'duration',
                 'pass_length', 'pass_angle', 'pass_height','pass_body_part', 'pass_type',
                 'under_pressure', 'pass_outcome', 'pass_outswinging', 'pass_technique', 'pass_switch',
                 'pass_cross', 'pass_no_touch', 'pass_through_ball', 'pass_cut_back', 'pass_straight',
                 'pass_inswinging', 'prior_passes', 'nearby_teammates', 'nearby_opponents']]

# Remove ones with low feature importance that don't impact accuracy
events = events[['position', 'location_x', 'location_y', 'duration', 'pass_length',
                 'pass_angle', 'pass_height','pass_body_part', 'under_pressure', 'pass_outcome',
                 'pass_switch', 'pass_cross', 'prior_passes', 'nearby_teammates', 'nearby_opponents']]

In [11]:
# Preview the cleaned data
events.head()

Unnamed: 0,position,location_x,location_y,duration,pass_length,pass_angle,pass_height,pass_body_part,under_pressure,pass_outcome,pass_switch,pass_cross,prior_passes,nearby_teammates,nearby_opponents
0,Midfield,60.0,40.0,1.373215,22.357325,3.069967,Ground Pass,Left Foot,False,Complete,False,False,0.0,0.0,0.0
1,Back,41.4,42.2,3.332542,47.377,-0.396577,High Pass,Right Foot,False,Complete,False,False,1.0,4.0,6.0
2,Midfield,85.1,23.9,1.734028,15.487091,-1.193887,Low Pass,Other,True,Complete,False,False,2.0,2.0,5.0
3,Wing Back,90.8,9.5,0.863683,13.417153,1.352936,Ground Pass,Right Foot,False,Complete,False,False,3.0,0.0,0.0
4,Forward,94.5,20.9,1.99964,21.883327,-0.31597,Ground Pass,Left Foot,False,Complete,False,False,4.0,1.0,4.0


In [12]:
# Split into features and response variables
response = events['pass_outcome'] == 'Complete'
features = events.drop(columns=['pass_outcome'])

In [13]:
# Use one-hot encoding to get quantitative features
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False), ['position', 'pass_height', 'pass_body_part']),
    remainder='passthrough')
transformed = transformer.fit_transform(features)
transformed_features = pd.DataFrame(transformed, columns=transformer.get_feature_names())
transformed_features.head()

Unnamed: 0,onehotencoder__x0_Back,onehotencoder__x0_Forward,onehotencoder__x0_Goalkeeper,onehotencoder__x0_Midfield,onehotencoder__x0_Wing,onehotencoder__x0_Wing Back,onehotencoder__x1_Ground Pass,onehotencoder__x1_High Pass,onehotencoder__x1_Low Pass,onehotencoder__x2_Drop Kick,...,location_y,duration,pass_length,pass_angle,under_pressure,pass_switch,pass_cross,prior_passes,nearby_teammates,nearby_opponents
0,0,0,0,1,0,0,1,0,0,0,...,40.0,1.37322,22.3573,3.06997,False,False,False,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,42.2,3.33254,47.377,-0.396577,False,False,False,1,4,6
2,0,0,0,1,0,0,0,0,1,0,...,23.9,1.73403,15.4871,-1.19389,True,False,False,2,2,5
3,0,0,0,0,0,1,1,0,0,0,...,9.5,0.863683,13.4172,1.35294,False,False,False,3,0,0
4,0,1,0,0,0,0,1,0,0,0,...,20.9,1.99964,21.8833,-0.31597,False,False,False,4,1,4


In [14]:
# Basic classifier just using pass location
subset_features = transformed_features[['location_x', 'location_y']]
x_train, x_test, y_train, y_test = train_test_split(subset_features, response, test_size=1/3, random_state=13)
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=7).fit(x_train, y_train)

In [15]:
# Test accuracy
classifier.score(x_test, y_test)

0.8330414796979315

In [16]:
# Train accuracy
classifier.score(x_train, y_train)

0.8407760083182838

In [17]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[  223,  2883],
       [  168, 15000]])

In [62]:
# Use Gradient Boosted Decision Tree Classifier on all data
x_train, x_test, y_train, y_test = train_test_split(transformed_features, response, test_size=1/3, random_state=13)
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=7).fit(x_train, y_train)

In [63]:
# Test accuracy
classifier.score(x_test, y_test)

0.8913210025172376

In [64]:
# Train accuracy
classifier.score(x_train, y_train)

0.8962677173972528

In [65]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[ 1781,  1325],
       [  661, 14507]])

In [66]:
# Analyze feature importance
pd.DataFrame({'feature': x_train.columns, 'importance': classifier.feature_importances_})

Unnamed: 0,feature,importance
0,onehotencoder__x0_Back,0.0
1,onehotencoder__x0_Forward,0.004045
2,onehotencoder__x0_Goalkeeper,0.000752
3,onehotencoder__x0_Midfield,0.001174
4,onehotencoder__x0_Wing,0.002647
5,onehotencoder__x0_Wing Back,0.001843
6,onehotencoder__x1_Ground Pass,0.465839
7,onehotencoder__x1_High Pass,0.002864
8,onehotencoder__x1_Low Pass,0.082624
9,onehotencoder__x2_Drop Kick,0.0


In [23]:
# Write likelihoods to csv
ind = np.where(classifier.classes_ == True)[0][0]
out = pd.DataFrame(raw_events['passes']['id'])
out['likelihood'] = classifier.predict_proba(transformed_features)[:, ind]
out.to_csv('../web/data/xp.csv', index=False)