In [1]:
import json
import pandas as pd
from glob import glob
import re
import numpy as np
import pickle

In [11]:
# Load every 'data/events*.json' file whose path does not contain 'World',
# tagging each event with the league name taken from the file name.
frames = []

for f in glob('data/events*.json'):
    if 'World' in f:
        continue
    print('Loading {}...'.format(f))
    with open(f, 'r') as js:
        x = pd.read_json(js)
    # e.g. 'data/events_France.json' -> 'France'
    ln = f.split('events_')[1].split('.json')[0]
    x['League'] = ln
    frames.append(x)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# calling it inside the loop re-copied every previously loaded row each time.
# Row labels are kept as they come from each file (no ignore_index), which is
# what append did. An empty frame is produced when no file matched.
df = pd.concat(frames) if frames else pd.DataFrame()

print('{} events collected.'.format(df.shape[0]))
display(df.head(2))

Loading data/events_France.json...
Loading data/events_Spain.json...
Loading data/events_England.json...
Loading data/events_European_Championship.json...
Loading data/events_Italy.json...
Loading data/events_Germany.json...
3149535 events collected.


Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,League
0,8,Simple pass,[{'id': 1801}],253784,"[{'y': 51, 'x': 50}, {'y': 46, 'x': 31}]",2500686,Pass,3799,1H,1.935181,85,176505119,France
1,8,High pass,[{'id': 1801}],29474,"[{'y': 46, 'x': 31}, {'y': 74, 'x': 68}]",2500686,Pass,3799,1H,3.599295,83,176505121,France


In [12]:
def unlist(x):
    """Return the ``'id'`` value of every mapping in ``x`` as one ``', '``-joined string.

    Each id is converted with ``str``; an empty ``x`` yields ``''``.
    """
    return ', '.join(str(tag['id']) for tag in x)

# Replace each event's list of tag dicts with a comma-separated string of ids.
# This overwrites the column in place, so the cell cannot be run a second time:
# unlist would then receive strings instead of lists of dicts.
df['tags'] = df['tags'].map(unlist)
display(df.head(2))

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,League
0,8,Simple pass,1801,253784,"[{'y': 51, 'x': 50}, {'y': 46, 'x': 31}]",2500686,Pass,3799,1H,1.935181,85,176505119,France
1,8,High pass,1801,29474,"[{'y': 46, 'x': 31}, {'y': 74, 'x': 68}]",2500686,Pass,3799,1H,3.599295,83,176505121,France


In [13]:
# Extract every run of digits from the textual form of each `positions` entry.
# In the sample rows shown above the entries look like
# "[{'y': 51, 'x': 50}, {'y': 46, 'x': 31}]", so the matches come out in the
# order y0, x0, y1, x1.
# NOTE(review): this relies on 'y' preceding 'x' in every entry and on the
# coordinates being non-negative integers — confirm against the raw data.
pos = [re.findall('[0-9]+', str(x)) for x in df['positions']]


def _coord(found, i):
    """Return the i-th extracted number, or NaN when fewer than i + 1 were found."""
    return found[i] if len(found) > i else np.nan


# Each index is bounds-checked on its own. The previous `len(p) > 2` guard still
# let p[3] raise IndexError for an entry with exactly three numbers.
# Values are kept as the matched strings (not converted to numbers).
df['x0'] = [_coord(p, 1) for p in pos]
df['y0'] = [_coord(p, 0) for p in pos]
df['x1'] = [_coord(p, 3) for p in pos]
df['y1'] = [_coord(p, 2) for p in pos]

del df['positions']

display(df.head(2))

Unnamed: 0,eventId,subEventName,tags,playerId,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,League,x0,y0,x1,y1
0,8,Simple pass,1801,253784,2500686,Pass,3799,1H,1.935181,85,176505119,France,50,51,31,46
1,8,High pass,1801,29474,2500686,Pass,3799,1H,3.599295,83,176505121,France,31,46,68,74


In [14]:
with open('data/teams.json', 'r') as js:
    teams = pd.read_json(js)

display(teams.head(2))

# Keep only the id and the name. The explicit copy makes the column assignment
# below operate on an independent frame instead of a slice of the loaded one
# (avoids SettingWithCopyWarning). The 'League' column that used to be derived
# from 'area' here was dropped by this very selection, so it is no longer built.
teams = teams[['wyId', 'name']].copy()

# Re-interpret backslash escape sequences contained in the names.
# NOTE(review): presumably the file stores literal '\uXXXX' sequences; this
# round-trip would garble names that already hold non-ASCII characters — confirm.
teams['name'] = [n.encode().decode('unicode-escape') for n in teams['name']]
teams.columns = ['teamId', 'teamName']
display(teams.head(2))

Unnamed: 0,city,name,wyId,officialName,area,type
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
1,Vigo,Celta de Vigo,692,Real Club Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club


Unnamed: 0,teamId,teamName
0,1613,Newcastle United
1,692,Celta de Vigo


In [15]:
with open('data/players.json', 'r') as js:
    players = pd.read_json(js)

# Each 'role' entry is a mapping; keep only its 'name' field.
players['role'] = players['role'].map(lambda x: x['name'])

# Explicit copy so the assignments below act on an independent frame rather
# than a slice of the loaded one (avoids SettingWithCopyWarning).
players = players[['shortName', 'role', 'wyId']].copy()
players.columns = ['playerName', 'playerRole', 'playerId']

# Re-interpret backslash escape sequences contained in the names.
# NOTE(review): presumably the file stores literal '\uXXXX' sequences; this
# round-trip would garble names that already hold non-ASCII characters — confirm.
players['playerName'] = [n.encode().decode('unicode-escape') for n in players['playerName']]

display(players.head(2))

Unnamed: 0,playerName,playerRole,playerId
0,H. Tekin,Goalkeeper,32777
1,M. Sarr,Defender,393228


In [16]:
# Attach team and player attributes to every event (left joins keep events
# whose ids have no match), then discard events recorded with playerId 0.
# The filter runs after the merges, so the surviving rows keep the row labels
# assigned by the merge.
df = (
    df.merge(teams, how='left', on='teamId')
      .merge(players, how='left', on='playerId')
)
df = df[df['playerId'].ne(0)]

display(df.head(3))

Unnamed: 0,eventId,subEventName,tags,playerId,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,League,x0,y0,x1,y1,teamName,playerName,playerRole
0,8,Simple pass,1801,253784,2500686,Pass,3799,1H,1.935181,85,176505119,France,50,51,31,46,Angers,B. Guillaume,Forward
1,8,High pass,1801,29474,2500686,Pass,3799,1H,3.599295,83,176505121,France,31,46,68,74,Angers,R. Thomas,Defender
2,1,Air duel,"703, 1801",253784,2500686,Duel,3799,1H,6.827043,10,176505122,France,68,74,72,54,Angers,B. Guillaume,Forward


In [17]:
# Order events chronologically within each match and period before exporting.
df = df.sort_values(['matchId', 'matchPeriod', 'eventSec'])

# index must be the boolean False: the string 'False' is truthy, so the row
# labels were still being written as an extra unnamed first column.
df.to_csv('clean/events.csv', index=False)

In [18]:
# Export the events whose League is 'Italy' to their own file.
italy = df.loc[df['League'] == 'Italy']

# index must be the boolean False: the string 'False' is truthy, so the row
# labels were still being written as an extra unnamed first column.
italy.to_csv('clean/italy.csv', index=False)

In [5]:
events_name = ['Pass']
subevents_id = [10, 11, 70, 72] # Air duel, Ground attacking duel, Acceleration, Touch

def get_passes(df):
    """Group consecutive same-team, pass-like events into chains.

    An event is flagged (``pass == 1``) when the next event *of the same match
    half* belongs to the same team and the event itself is either one of
    ``events_name`` or has a ``subEventId`` listed in ``subevents_id``.
    Consecutive flagged events form a chain; the first unflagged event after
    them closes the chain, which is then stored under that event's team and
    match.

    Rows are processed in the order they appear in ``df``, so ``df`` is expected
    to be ordered by time within every match half.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns matchId, matchPeriod, teamId, playerId, eventName,
        subEventId, eventSec, x0, y0, x1 and y1. It is not modified.

    Returns
    -------
    passes : dict
        ``passes[teamId][matchId]`` is a list of chains; a chain is a list of
        dicts with the keys player1, player2, matchPeriod, eventSec, x0, y0,
        x1 and y1. Every team found in ``df`` has an entry, possibly empty.
    tmp : pandas.DataFrame
        Copy of ``df`` with the helper columns halfmatch, teamId2, playerId2
        and pass added.
    """
    tmp = df.copy()
    tmp['halfmatch'] = tmp['matchId'].astype(str) + '_' + tmp['matchPeriod']

    # Look up the following event inside each half only. A plain shift over the
    # whole frame paired the last event of a half with the first event of the
    # next half (or next match), and a chain left open at the end of a half
    # was then silently dropped.
    halves = tmp.groupby('halfmatch', sort=False)
    next_team = halves['teamId'].shift(-1).to_numpy()
    next_player = halves['playerId'].shift(-1).to_numpy()
    tmp['teamId2'] = next_team
    tmp['playerId2'] = next_player

    pass_like = tmp['eventName'].isin(events_name) | tmp['subEventId'].isin(subevents_id)
    tmp['pass'] = ((tmp['teamId'] == tmp['teamId2']) & pass_like).astype('int64')

    passes = {team_id: dict() for team_id in df['teamId'].unique()}

    needed = ['teamId', 'matchId', 'pass', 'playerId', 'playerId2',
              'matchPeriod', 'eventSec', 'x0', 'y0', 'x1', 'y1']

    # One group per half, in order of first appearance. Plain tuples are used
    # because 'pass' is a Python keyword and cannot be a namedtuple field.
    for _, half in tmp.groupby('halfmatch', sort=False):
        chain = []

        for (team, match, flag, player1, player2,
             period, sec, x0, y0, x1, y1) in half[needed].itertuples(index=False, name=None):

            # the chain of passes goes on
            if flag == 1:
                chain.append({'player1': player1,
                              'player2': player2,
                              'matchPeriod': period,
                              'eventSec': sec,
                              'x0': x0, 'y0': y0,
                              'x1': x1, 'y1': y1})

            # chain interrupted: file it under the current event's team, which
            # is also the team of the chain (the previous event was flagged
            # only because this one belongs to the same team)
            elif chain:
                passes[team].setdefault(match, []).append(chain)
                chain = []

        # The last event of a half has no successor, so it is never flagged and
        # any open chain has already been stored by the time the loop ends.

    return passes, tmp

# Build the per-team, per-match pass chains from the events in df. The second
# value returned by get_passes is bound to `_` and not used afterwards here.
passes, _ = get_passes(df)


In [6]:
# Serialize the `passes` dictionary to disk with pickle.
with open('clean/passes.pickle', mode='wb') as fh:
    pickle.dump(passes, fh)

In [4]:
# NOTE(review): `events` is not assigned in any cell visible here — presumably
# it is the table exported to clean/events.csv; confirm where it is loaded.
# Keep every row whose League is not 'European_Championship' and export it.
events = events[~events['League'].eq('European_Championship')]

events.to_csv('clean/events_no_champions.csv', index=False)

In [2]:
# One row per distinct (teamId, teamName) pair, keeping the first occurrence.
# NOTE(review): `events` is not assigned in any cell visible here — confirm
# which cell is meant to create it.
feats = events.drop_duplicates(subset=['teamId', 'teamName'])[['teamId', 'teamName']]
feats.to_csv('clean/feats.csv', index=False)