In [1]:
from statsbombpy import sb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix

In [134]:
# Read the raw data for all the Euros
raw_events = sb.competition_events(
    country="Europe",
    division= "UEFA Euro",
    season="2020",
    split=True
)
raw_frames = sb.competition_frames(
    country="Europe",
    division= "UEFA Euro",
    season="2020"
)



In [345]:
# Convert to data frame
events = pd.DataFrame(raw_events['passes'])
frames = pd.DataFrame(raw_frames)

In [346]:
# See which columns are available
events.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'match_id',
       'pass_recipient', 'pass_length', 'pass_angle', 'pass_height',
       'pass_end_location', 'pass_body_part', 'pass_type',
       'possession_team_id', 'player_id', 'under_pressure', 'pass_aerial_won',
       'pass_outcome', 'pass_outswinging', 'pass_technique', 'pass_switch',
       'off_camera', 'pass_cross', 'pass_no_touch', 'pass_assisted_shot_id',
       'pass_shot_assist', 'out', 'pass_miscommunication', 'pass_through_ball',
       'pass_goal_assist', 'counterpress', 'pass_cut_back', 'pass_straight',
       'pass_inswinging', 'pass_deflected', 'prior_passes'],
      dtype='object')

In [332]:
# Count successful passes leading up to this one
possession = None
count = 0
for index, row in events.sort_values(['match_id', 'index']).iterrows():
    if row['possession'] != possession:
        count = 0
        possession = row['possession']
    else:
        count += 1
    events.at[index, 'prior_passes'] = count

In [333]:
# Clean the data
events.pass_outcome = events.pass_outcome.fillna('Complete')
events.pass_technique = events.pass_technique.fillna('Other')
events.pass_body_part = events.pass_body_part.fillna('Other')
events.pass_type = events.pass_type.fillna('Other')
events = events.fillna(False)
events['location_x'] = [p[0] for p in events['location']]
events['location_y'] = [p[1] for p in events['location']]

In [334]:
# Simplify positions
events['position'] = events['position'].str.replace('Right ', '')
events['position'] = events['position'].str.replace('Left ', '')
events['position'] = events['position'].str.replace('Attacking ', '')
events['position'] = events['position'].str.replace('Center ', '')
events['position'] = events['position'].str.replace('Defensive ', '')

In [335]:
# Keep useful columns and remove any that include data from after the pass is made
events = events[['minute', 'play_pattern', 'team', 'position', 'location_x', 'location_y', 'duration',
                 'pass_length', 'pass_angle', 'pass_height','pass_body_part', 'pass_type',
                 'under_pressure', 'pass_outcome', 'pass_outswinging', 'pass_technique', 'pass_switch',
                 'pass_cross', 'pass_no_touch', 'pass_through_ball', 'pass_cut_back', 'pass_straight',
                 'pass_inswinging', 'prior_passes']]

# Remove ones with low feature importance that don't impact accuracy
events = events[['play_pattern', 'position', 'location_x', 'location_y', 'duration', 'pass_length',
                 'pass_angle', 'pass_height','pass_body_part', 'under_pressure', 'pass_outcome',
                 'pass_switch', 'pass_cross', 'prior_passes']]

In [336]:
# TODO(agale): Increase features by adding:
# - # of passes in possession: Increased accuracy by 1%
# - # of teammates near pass
# - # of opponents near pass
# - unobstructed angle

In [337]:
# Preview the cleaned data
events.head()

Unnamed: 0,play_pattern,position,location_x,location_y,duration,pass_length,pass_angle,pass_height,pass_body_part,under_pressure,pass_outcome,pass_switch,pass_cross
0,From Kick Off,Midfield,60.0,40.0,1.373215,22.357325,3.069967,Ground Pass,Left Foot,False,Complete,False,False
1,From Kick Off,Back,41.4,42.2,3.332542,47.377,-0.396577,High Pass,Right Foot,False,Complete,False,False
2,From Kick Off,Midfield,85.1,23.9,1.734028,15.487091,-1.193887,Low Pass,Other,True,Complete,False,False
3,From Kick Off,Wing Back,90.8,9.5,0.863683,13.417153,1.352936,Ground Pass,Right Foot,False,Complete,False,False
4,From Kick Off,Forward,94.5,20.9,1.99964,21.883327,-0.31597,Ground Pass,Left Foot,False,Complete,False,False


In [338]:
# Split into features and response variables
response = events['pass_outcome'] == 'Complete'
features = events.drop(columns=['pass_outcome'])

In [339]:
# Use one-hot encoding to get quantitative features
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False), ['play_pattern', 'position', 'pass_height', 'pass_body_part']),
    remainder='passthrough')
transformed = transformer.fit_transform(features)
transformed_features = pd.DataFrame(transformed, columns=transformer.get_feature_names())
transformed_features.head()

Unnamed: 0,onehotencoder__x0_From Corner,onehotencoder__x0_From Counter,onehotencoder__x0_From Free Kick,onehotencoder__x0_From Goal Kick,onehotencoder__x0_From Keeper,onehotencoder__x0_From Kick Off,onehotencoder__x0_From Throw In,onehotencoder__x0_Other,onehotencoder__x0_Regular Play,onehotencoder__x1_Back,...,onehotencoder__x3_Other,onehotencoder__x3_Right Foot,location_x,location_y,duration,pass_length,pass_angle,under_pressure,pass_switch,pass_cross
0,0,0,0,0,0,1,0,0,0,0,...,0,0,60.0,40.0,1.37322,22.3573,3.06997,False,False,False
1,0,0,0,0,0,1,0,0,0,1,...,0,1,41.4,42.2,3.33254,47.377,-0.396577,False,False,False
2,0,0,0,0,0,1,0,0,0,0,...,1,0,85.1,23.9,1.73403,15.4871,-1.19389,True,False,False
3,0,0,0,0,0,1,0,0,0,0,...,0,1,90.8,9.5,0.863683,13.4172,1.35294,False,False,False
4,0,0,0,0,0,1,0,0,0,0,...,0,0,94.5,20.9,1.99964,21.8833,-0.31597,False,False,False


In [340]:
# Basic classifier just using pass location
subset_features = transformed_features[['location_x', 'location_y']]
x_train, x_test, y_train, y_test = train_test_split(subset_features, response, test_size=1/3, random_state=13)
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=7).fit(x_train, y_train)
classifier.score(x_test, y_test)

0.8330414796979315

In [341]:
confusion_matrix(y_test, classifier.predict(x_test))

array([[  223,  2883],
       [  168, 15000]])

In [342]:
# Use Gradient Boosted Decision Tree Classifier on all data
x_train, x_test, y_train, y_test = train_test_split(transformed_features, response, test_size=1/3, random_state=13)
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=7).fit(x_train, y_train)
classifier.score(x_test, y_test)

0.889405713034913

In [343]:
confusion_matrix(y_test, classifier.predict(x_test))

array([[ 1767,  1339],
       [  682, 14486]])

In [344]:
# Analyze feature importance
pd.DataFrame({'feature': x_train.columns, 'importance': classifier.feature_importances_})

Unnamed: 0,feature,importance
0,onehotencoder__x0_From Corner,0.0
1,onehotencoder__x0_From Counter,0.001673
2,onehotencoder__x0_From Free Kick,0.0
3,onehotencoder__x0_From Goal Kick,0.0
4,onehotencoder__x0_From Keeper,0.0
5,onehotencoder__x0_From Kick Off,0.000144
6,onehotencoder__x0_From Throw In,0.0
7,onehotencoder__x0_Other,0.0
8,onehotencoder__x0_Regular Play,0.001639
9,onehotencoder__x1_Back,0.001523


In [44]:
# Write likelihoods to csv
ind = np.where(classifier.classes_ == True)[0][0]
out = pd.DataFrame(raw_events['passes']['id'])
out['likelihood'] = classifier.predict_proba(transformed_features)[:, ind]
out.to_csv('../web/data/xp.csv', index=False)