In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.calibration import calibration_curve, CalibratedClassifierCV


In [2]:
# Competition names; each one is used as the suffix of the events/matches
# JSON files that are read from the Wyscout directory.
datasets = [
    'England',
    'European_Championship',
    'France',
    'Germany',
    'Spain',
    'Italy',
    'World_Cup',
]

In [3]:
def _read_json_frame(path):
    """Read the JSON file at `path` and return its contents as a DataFrame.

    The file is decoded explicitly as UTF-8. JSON is UTF-8 by specification,
    and without `encoding=` Python would fall back to the platform locale,
    which is not UTF-8 on every system.
    """
    with open(path, encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))


# One events frame and one matches frame per competition.
event_dataframes = [_read_json_frame('Wyscout/events_' + dataset + '.json') for dataset in datasets]
match_dataframes = [_read_json_frame('Wyscout/matches_' + dataset + '.json') for dataset in datasets]

# Stack the per-competition frames and renumber the rows 0..n-1.
all_events_df = pd.concat(event_dataframes, axis=0).reset_index(drop=True)
matches_df = pd.concat(match_dataframes, axis=0).reset_index(drop=True)

player_df = _read_json_frame('Wyscout/players.json')

In [4]:
# Attach player and match metadata to every event, then order the events by
# match, period and time within the period.
# Both merges use pandas' default inner join, so events whose playerId or
# matchId has no counterpart in the lookup frame are dropped.
player_lookup = player_df[['wyId', 'foot', 'firstName', 'lastName']]
match_lookup = matches_df[['wyId', 'label', 'venue', 'date']]

all_events_df = (
    all_events_df
    .merge(player_lookup, left_on='playerId', right_on='wyId')
    .merge(match_lookup, left_on='matchId', right_on='wyId')
    # The two lookup keys collide on the second merge and come out suffixed.
    .drop(columns=['wyId_x', 'wyId_y'])
    .sort_values(['matchId', 'matchPeriod', 'eventSec'])
)

In [5]:
# Sub-event name of the event immediately before each event.
# The shift is done within each (match, period) group so that the first event
# of a period gets NaN instead of inheriting the last event of the previous
# period or of a different match. Relies on all_events_df already being sorted
# by matchId, matchPeriod and eventSec.
all_events_df['previous_event'] = (
    all_events_df
    .groupby(['matchId', 'matchPeriod'], sort=False)['subEventName']
    .shift(1)
)

In [6]:
# Keep only open-play shots and direct free-kick shots. The copy makes
# all_shots independent of all_events_df so columns can be added safely.
shot_event_names = ['Shot', 'Free kick shot']
is_shot_event = all_events_df['subEventName'].isin(shot_event_names)
all_shots = all_events_df.loc[is_shot_event].copy()

# 1 when the shot was taken directly from a free kick, otherwise 0.
all_shots['free_kick'] = (all_shots['subEventName'] == 'Free kick shot').astype(int)


In [7]:
def _previous_event_flag(event_names):
    """Return a 0/1 Series marking shots whose preceding event is in `event_names`."""
    return all_shots['previous_event'].isin(event_names).astype(int)


# Each flag is 1 when the event right before the shot belongs to the listed
# sub-event names, and 0 otherwise (including when there is no previous event).
all_shots['rebound'] = _previous_event_flag(
    ['Penalty', 'Free kick shot', 'Shot', 'Save attempt'])

all_shots['prev_cross'] = _previous_event_flag(
    ['Corner', 'Free kick cross', 'Cross'])

all_shots['prev_touch'] = _previous_event_flag(['Touch'])

all_shots['prev_pass'] = _previous_event_flag(
    ['Simple pass', 'Head pass', 'Goal kick'])

all_shots['prev_smart_pass'] = _previous_event_flag(['Smart pass'])

# The original compared against 'Ground loose ball duel duel', a typo that can
# never match, so loose-ball duels were silently left out of this flag.
all_shots['prev_duel'] = _previous_event_flag(
    ['Air duel', 'Ground defending duel', 'Ground attacking duel', 'Ground loose ball duel'])


In [8]:
# Start from an empty table that declares the base shot-feature columns.
base_feature_columns = [
    'Goal',
    'X',
    'Y',
    'side_of_field',
    'left_foot',
    'right_foot',
    'header',
    'counter_attack',
    'strong_foot',
]
shots_model = pd.DataFrame(columns=base_feature_columns)

Note: this code takes a while to run. If you have any suggestions for improving it, please submit a PR.

Special thanks to David Sumpter for the code that determines the distance and angle from the goal.

In [9]:
# Build every shot feature in one vectorised pass instead of writing single
# cells with .at inside iterrows(), which is what made this cell slow.
# The result keeps the index of all_shots and the same column order as before.

# Tag ids looked up in each shot's `tags` list.
GOAL_TAG = 101
LEFT_FOOT_TAG = 401
RIGHT_FOOT_TAG = 402
HEADER_TAG = 403
COUNTER_ATTACK_TAG = 1901

# Scale factors turning the 0-100 event coordinates into metres, and the goal width.
PITCH_LENGTH_M = 105
PITCH_WIDTH_M = 65
GOAL_WIDTH_M = 7.32

# Pull the nested fields out of the object columns once.
shot_x = np.array([positions[0]['x'] for positions in all_shots['positions']])
shot_y = np.array([positions[0]['y'] for positions in all_shots['positions']])
shot_tags = [{tag['id'] for tag in tags} for tags in all_shots['tags']]


def _has_tag(tag_id):
    """Return a 0/1 integer array marking the shots that carry `tag_id`."""
    return np.array([int(tag_id in tags) for tags in shot_tags], dtype=int)


left_foot = _has_tag(LEFT_FOOT_TAG)
right_foot = _has_tag(RIGHT_FOOT_TAG)

# A shot counts as "strong foot" when the foot used matches the player's
# recorded `foot` value ('left' or 'right'); any other value never matches.
prefers_left = all_shots['foot'].eq('left').to_numpy()
prefers_right = all_shots['foot'].eq('right').to_numpy()
strong_foot = ((left_foot == 1) & prefers_left) | ((right_foot == 1) & prefers_right)

# X is measured from the goal line being attacked; C is the offset from the
# centre line of the pitch (y == 50), stored as float as in the original frame.
goal_line_offset = 100 - shot_x
centre_offset = np.abs(shot_y - 50).astype(float)

# Distance in metres and shot angle in radians.
x_m = goal_line_offset * PITCH_LENGTH_M / 100
y_m = centre_offset * PITCH_WIDTH_M / 100
dist_sq = x_m**2 + y_m**2
angle = np.arctan(GOAL_WIDTH_M * x_m / (dist_sq - (GOAL_WIDTH_M / 2)**2))
# arctan returns a negative value when the denominator is negative; shift
# those results by pi so the angle stays non-negative.
angle = np.where(angle < 0, angle + np.pi, angle)

shots_model = pd.DataFrame(
    {
        'Goal': _has_tag(GOAL_TAG),
        'X': goal_line_offset,
        'Y': shot_y,
        'side_of_field': (shot_y < 50).astype(int),
        'left_foot': left_foot,
        'right_foot': right_foot,
        'header': _has_tag(HEADER_TAG),
        'counter_attack': _has_tag(COUNTER_ATTACK_TAG),
        'strong_foot': strong_foot.astype(int),
        'C': centre_offset,
        'Distance': np.sqrt(dist_sq),
        'Angle': angle,
    },
    index=all_shots.index,
)
# Kept from the original: any NaN (e.g. an undefined angle) becomes 0.
shots_model = shots_model.fillna(0)


In [10]:
# Combine the side-of-field flag with the foot used for the shot.
on_side_0 = shots_model['side_of_field'] == 0
on_side_1 = shots_model['side_of_field'] == 1
used_left = shots_model['left_foot'] == 1
used_right = shots_model['right_foot'] == 1

# out_swinging: right foot on side 0, or left foot on side 1.
shots_model['out_swinging'] = ((on_side_0 & used_right) | (on_side_1 & used_left)).astype(int)
# in_swinging: left foot on side 0, or right foot on side 1.
shots_model['in_swinging'] = ((on_side_0 & used_left) | (on_side_1 & used_right)).astype(int)


In [11]:
# Bring the previous-event flags and the descriptive columns over from
# all_shots. Rows are matched on the shared index, so this must run before
# the index is reset on the last line.
context_columns = [
    'rebound', 'prev_cross', 'prev_touch', 'prev_pass', 'prev_smart_pass',
    'free_kick', 'prev_duel', 'firstName', 'lastName', 'label', 'venue',
    'date', 'eventSec', 'matchPeriod',
]
shots_model = shots_model.join(all_shots[context_columns], how='left')
shots_model = shots_model.reset_index(drop=True)


In [12]:
# Display the assembled design matrix as the cell output.
shots_model

Unnamed: 0,Goal,X,Y,side_of_field,left_foot,right_foot,header,counter_attack,strong_foot,C,...,prev_smart_pass,free_kick,prev_duel,firstName,lastName,label,venue,date,eventSec,matchPeriod
0,0,9,29,1,0,1,0,0,0,21.0,...,0,0,0,Blaise,Matuidi,"France - Romania, 2 - 1",Stade de France,"June 10, 2016 at 9:00:00 PM GMT+2",31.226217,1H
1,0,29,29,1,0,1,0,0,1,21.0,...,0,0,0,Mihai Doru,Pintilii,"France - Romania, 2 - 1",Stade de France,"June 10, 2016 at 9:00:00 PM GMT+2",143.119551,1H
2,0,4,57,0,0,1,0,0,1,7.0,...,0,0,1,Bogdan Sorin,Stancu,"France - Romania, 2 - 1",Stade de France,"June 10, 2016 at 9:00:00 PM GMT+2",219.576026,1H
3,0,4,61,0,0,0,1,0,0,11.0,...,0,0,1,Florin,Andone,"France - Romania, 2 - 1",Stade de France,"June 10, 2016 at 9:00:00 PM GMT+2",247.532561,1H
4,0,25,33,1,1,0,0,0,1,17.0,...,0,0,0,Antoine,Griezmann,"France - Romania, 2 - 1",Stade de France,"June 10, 2016 at 9:00:00 PM GMT+2",557.319065,1H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45279,0,5,45,1,1,0,0,0,1,5.0,...,0,0,0,Diego Sebasti\u00e1n,Laxalt Su\u00e1rez,"Genoa - Torino, 1 - 2",,"May 20, 2018 at 3:00:00 PM GMT+2",1152.032980,2H
45280,0,7,38,1,1,0,0,0,1,12.0,...,1,0,0,Giuseppe,Rossi,"Genoa - Torino, 1 - 2",,"May 20, 2018 at 3:00:00 PM GMT+2",1251.730517,2H
45281,1,10,46,1,1,0,0,0,1,4.0,...,0,0,1,Goran,Pandev,"Genoa - Torino, 1 - 2",,"May 20, 2018 at 3:00:00 PM GMT+2",2065.034482,2H
45282,0,21,32,1,0,1,0,0,1,18.0,...,0,0,0,Stephane,Omeonga,"Genoa - Torino, 1 - 2",,"May 20, 2018 at 3:00:00 PM GMT+2",2367.252041,2H


In [13]:
# Save the design matrix to disk. The row index is written too (to_csv's
# default), so it appears as the first, unnamed column of the CSV.
output_csv_path = 'shots_design_matrix.csv'
shots_model.to_csv(output_csv_path)