In [1]:
from statsbombpy import sb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
import math
import xp_clean
from joblib import dump

In [2]:
# Download the raw StatsBomb data for UEFA Euro 2020.
# Both calls share the same competition selectors, so keep them in one place.
euro_2020 = {"country": "Europe", "division": "UEFA Euro", "season": "2020"}

# split=True returns the events grouped by event type (the 'passes' group is used below)
raw_events = sb.competition_events(**euro_2020, split=True)
raw_frames = sb.competition_frames(**euro_2020)



In [3]:
# Materialise the pass events and the frame data as data frames
events = pd.DataFrame(raw_events['passes'])
frames = pd.DataFrame(raw_frames)

# Frames identify their event through 'event_uuid'; rename it to match the
# events' 'id' key and drop the columns that are not joined onto the events.
frames_subset = (
    frames
    .rename(columns={'event_uuid': 'id'})
    .drop(columns=['match_id', 'visible_area'])
)

# Left join so every pass is kept, whether or not it has frame data
frames_by_id = frames_subset.set_index('id')
events = events.join(frames_by_id, on='id', how='left')

In [4]:
# Inspect the columns of the joined pass-event table to see what is
# available as raw input for feature engineering
events.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'match_id',
       'pass_recipient', 'pass_length', 'pass_angle', 'pass_height',
       'pass_end_location', 'pass_body_part', 'pass_type',
       'possession_team_id', 'player_id', 'under_pressure', 'pass_aerial_won',
       'pass_outcome', 'pass_outswinging', 'pass_technique', 'pass_switch',
       'off_camera', 'pass_cross', 'pass_no_touch', 'pass_assisted_shot_id',
       'pass_shot_assist', 'out', 'pass_miscommunication', 'pass_through_ball',
       'pass_goal_assist', 'counterpress', 'pass_cut_back', 'pass_straight',
       'pass_inswinging', 'pass_deflected', 'freeze_frame'],
      dtype='object')

In [5]:
# Run the project's cleaning step: it returns the response to predict and the
# engineered, model-ready feature table derived from the raw events
response, features = xp_clean.clean_data(events)

In [7]:
# Baseline model: predict the response from the pass start location only
subset_features = features[['location_x', 'location_y']]

# Hold out one third of the passes for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    subset_features,
    response,
    test_size=1/3,
    random_state=13,
)

# Gradient boosting over depth-1 trees (decision stumps)
classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=7,
)
classifier.fit(x_train, y_train)

In [8]:
# Mean accuracy of the location-only model on the held-out test split
classifier.score(X=x_test, y=y_test)

0.8330414796979315

In [9]:
# Mean accuracy of the location-only model on the data it was fitted to
# (compare with the test accuracy to gauge overfitting)
classifier.score(X=x_train, y=y_train)

0.8407760083182838

In [10]:
# Confusion matrix on the test split: rows are the true classes,
# columns are the classes predicted by the location-only model
test_predictions = classifier.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=test_predictions)

array([[  223,  2883],
       [  168, 15000]])

In [11]:
# Full model: same gradient boosted stumps, now trained on every engineered feature.
# The split uses the same seed and test fraction as the baseline.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    response,
    test_size=1/3,
    random_state=13,
)

classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=7,
)
classifier.fit(x_train, y_train)

In [12]:
# Mean accuracy of the all-features model on the held-out test split
classifier.score(X=x_test, y=y_test)

0.8913210025172376

In [13]:
# Mean accuracy of the all-features model on the data it was fitted to
# (compare with the test accuracy to gauge overfitting)
classifier.score(X=x_train, y=y_train)

0.8962677173972528

In [14]:
# Confusion matrix on the test split: rows are the true classes,
# columns are the classes predicted by the all-features model
test_predictions = classifier.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=test_predictions)

array([[ 1781,  1325],
       [  661, 14507]])

In [15]:
# Pair every training column with the importance the fitted model assigned to it
importance_table = pd.DataFrame(
    data={
        'feature': x_train.columns,
        'importance': classifier.feature_importances_,
    }
)
importance_table

Unnamed: 0,feature,importance
0,onehotencoder__x0_Back,0.0
1,onehotencoder__x0_Forward,0.004045
2,onehotencoder__x0_Goalkeeper,0.000752
3,onehotencoder__x0_Midfield,0.001174
4,onehotencoder__x0_Wing,0.002647
5,onehotencoder__x0_Wing Back,0.001843
6,onehotencoder__x1_Ground Pass,0.465839
7,onehotencoder__x1_High Pass,0.002864
8,onehotencoder__x1_Low Pass,0.082624
9,onehotencoder__x2_Drop Kick,0.0


In [23]:
# Write the predicted likelihood of the True class for every pass to csv

# predict_proba orders its columns like classifier.classes_, so look up the
# column that belongs to the True class instead of assuming its position
ind = np.where(classifier.classes_ == True)[0][0]

out = pd.DataFrame(raw_events['passes']['id'])

# Score the full feature table the classifier was trained on. The previous
# `transformed_features` name is never defined in this notebook, so the cell
# failed with a NameError on a fresh "Restart & Run All".
# NOTE(review): assumes xp_clean.clean_data keeps one row per pass, in the
# original order - confirm against xp_clean before trusting the id mapping.
if len(features) != len(out):
    raise ValueError(
        f"features has {len(features)} rows but there are {len(out)} passes; "
        "cannot attach likelihoods to pass ids"
    )
out['likelihood'] = classifier.predict_proba(features)[:, ind]

out.to_csv('../web/data/xp.csv', index=False)

In [60]:
# Persist the fitted all-features classifier so it can score passes from other games
dump(value=classifier, filename='xp.joblib')

['xp.joblib']