In [60]:
from statsbombpy import sb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import math
import xp_clean
from joblib import dump

In [61]:
# Read the raw data for all the Euros
raw_events = sb.competition_events(
    country="Europe",
    division= "UEFA Euro",
    season="2020",
    split=True
)
raw_frames = sb.competition_frames(
    country="Europe",
    division= "UEFA Euro",
    season="2020"
)



In [62]:
# Convert to data frame
events = pd.DataFrame(raw_events['passes'])
frames = pd.DataFrame(raw_frames)
# Join tables
frames_subset = frames.rename(columns={'event_uuid': 'id'}).drop(columns=['match_id', 'visible_area'])
events = events.join(frames_subset.set_index('id'), on='id', how='left')

In [63]:
# See which columns are available
events.columns

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'match_id',
       'pass_recipient', 'pass_length', 'pass_angle', 'pass_height',
       'pass_end_location', 'pass_body_part', 'pass_type',
       'possession_team_id', 'player_id', 'under_pressure', 'pass_aerial_won',
       'pass_outcome', 'pass_outswinging', 'pass_technique', 'pass_switch',
       'off_camera', 'pass_cross', 'pass_no_touch', 'pass_assisted_shot_id',
       'pass_shot_assist', 'out', 'pass_miscommunication', 'pass_through_ball',
       'pass_goal_assist', 'counterpress', 'pass_cut_back', 'pass_straight',
       'pass_inswinging', 'pass_deflected', 'freeze_frame'],
      dtype='object')

In [64]:
# Add new features and clean features so they can be used in the model
(response, features) = xp_clean.clean_data(events)

In [65]:
# Basic classifier just using pass location
subset_features = features[['location_x', 'location_y']]
x_train, x_test, y_train, y_test = train_test_split(subset_features, response, test_size=1/3, random_state=13)
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=7).fit(x_train, y_train)

In [66]:
# Test accuracy
classifier.score(x_test, y_test)

0.8330414796979315

In [67]:
# Train accuracy
classifier.score(x_train, y_train)

0.8407760083182838

In [68]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[  223,  2883],
       [  168, 15000]])

In [69]:
# Use KNN Classifier on all data
from sklearn.neighbors import KNeighborsClassifier
x_train, x_test, y_train, y_test = train_test_split(features, response, test_size=1/3, random_state=13)
classifier = KNeighborsClassifier(3, weights="distance").fit(x_train, y_train)

In [70]:
# Test accuracy
classifier.score(x_test, y_test)

0.8474882346503229

In [71]:
# Train accuracy
classifier.score(x_train, y_train)

1.0

In [72]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[ 1121,  1985],
       [  802, 14366]])

In [73]:
# Use a Neural Network Classifier
x_train, x_test, y_train, y_test = train_test_split(features, response, test_size=1/3, random_state=13)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
classifier = MLPClassifier(random_state=7, max_iter=300).fit(x_train, y_train)

In [74]:
# Test accuracy
classifier.score(x_test, y_test)

0.8863959724198315

In [75]:
# Train accuracy
classifier.score(x_train, y_train)

0.9183768401466644

In [76]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[ 1741,  1365],
       [  711, 14457]])

In [91]:
# Use Gradient Boosted Decision Tree Classifier on all data
x_train, x_test, y_train, y_test = train_test_split(features, response, test_size=1/3, random_state=13)
classifier = GradientBoostingClassifier().fit(x_train, y_train)

In [92]:
# Test accuracy
classifier.score(x_test, y_test)

0.895808252161541

In [93]:
# Train accuracy
classifier.score(x_train, y_train)

0.9029715974388441

In [94]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[ 1934,  1172],
       [  732, 14436]])

In [95]:
# Analyze feature importance
pd.DataFrame({'feature': x_train.columns, 'importance': classifier.feature_importances_})

Unnamed: 0,feature,importance
0,onehotencoder__x0_Back,0.002035
1,onehotencoder__x0_Forward,0.003154
2,onehotencoder__x0_Goalkeeper,0.001063
3,onehotencoder__x0_Midfield,0.0
4,onehotencoder__x0_Wing,0.001367
5,onehotencoder__x0_Wing Back,0.000256
6,onehotencoder__x1_Ground Pass,0.360698
7,onehotencoder__x1_High Pass,0.067907
8,onehotencoder__x1_Low Pass,0.022998
9,onehotencoder__x2_Drop Kick,0.0


In [103]:
# Optimize hyperparameters for Gradient Boosted Decision Tree Classifier
from sklearn.model_selection import GridSearchCV
gbc = GradientBoostingClassifier()
parameter_space = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.05, 0.1, 1.0],
    'max_depth': [3, 5, 7],
}
classifier = GridSearchCV(gbc, parameter_space).fit(x_train, y_train)

In [104]:
classifier.best_params_

{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300}

In [105]:
# Test accuracy
classifier.score(x_test, y_test)

0.8969574258509357

In [106]:
# Train accuracy
classifier.score(x_train, y_train)

0.9209489410605812

In [107]:
# Confusion matrix
confusion_matrix(y_test, classifier.predict(x_test))

array([[ 1942,  1164],
       [  719, 14449]])

In [112]:
# Use Gradient Boosted Decision Tree with original parameters which have better test performance
classifier = GradientBoostingClassifier().fit(features, response)
classifier.score(features, response)

0.9009485589201022

In [110]:
# Write likelihoods from the original Gradient Boosted Decision Tree Classifier to csv
ind = np.where(classifier.classes_ == True)[0][0]
out = pd.DataFrame(raw_events['passes']['id'])
out['likelihood'] = classifier.predict_proba(features)[:, ind]
out.to_csv('../web/data/xp.csv', index=False)

In [113]:
# Dump model for use in evaluating other games
dump(classifier, 'xp.joblib') 

['xp.joblib']