This notebook takes the Wyscout data and turns it into parquet files. Parquet files are extremely fast to load, which makes them a good fit for this kind of prototyping analysis.

##### References:
Pappalardo, Luca; Massucco, Emanuele (2019): Soccer match event dataset. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.4415000

Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of spatio-temporal match events in soccer competitions. Sci Data 6, 236 (2019). https://doi.org/10.1038/s41597-019-0247-7

Data link: https://figshare.com/collections/Soccer_match_event_dataset/4415000/2

In [None]:
import requests
import zipfile
import os
import pandas as pd
import numpy as np
import glob
from mplsoccer.statsbomb import _split_location_cols, _split_dict_col, _list_dictionary_to_df

# Change these paths/ parameters
You will need to change the path below depending on where you want the Wyscout data to be downloaded and where the resulting parquet files should be saved.

In [None]:
# Root folder for the downloaded Wyscout files and the parquet output
# (two levels above the current directory, in data/wyscout).
# Point this somewhere else if the data should be stored in a different place.
DATA_FOLDER = os.path.join(os.pardir, os.pardir, 'data', 'wyscout')

# Links to the data

In [None]:
# files that are jsons: local file name -> download link
# (kept together in one mapping so a name can never be paired with the wrong link)
_JSON_SOURCES = {
    'coach.json': 'https://ndownloader.figshare.com/files/15073868',
    # 'referees.json': 'https://ndownloader.figshare.com/files/15074030',  # <- not downloaded as corrupt
    'player.json': 'https://ndownloader.figshare.com/files/15073721',
    'team.json': 'https://ndownloader.figshare.com/files/15073697',
    'competition.json': 'https://ndownloader.figshare.com/files/15073685',
    # my decode tags (lookup used later to turn event tag ids into tag names)
    'event_tag.json': 'https://raw.githubusercontent.com/andrewRowlinson/mplsoccer/master/wyscout_event_tags.json',
}
JSON_LINKS = list(_JSON_SOURCES.values())
JSON_FILES = list(_JSON_SOURCES.keys())

In [None]:
# Files that are zipped: name used for the downloaded archive -> download link
_ZIP_SOURCES = {
    'events.zip': 'https://ndownloader.figshare.com/files/14464685',
    'matches.zip': 'https://ndownloader.figshare.com/files/14464622',
}
ZIP_LINKS = list(_ZIP_SOURCES.values())
ZIP_FILES = list(_ZIP_SOURCES.keys())

# Make the directory structure

In [None]:
# make the directory structure
# os.makedirs also creates DATA_FOLDER itself (and any missing parents), which
# os.mkdir cannot do, so this cell works even when the data folder does not exist yet.
# exist_ok=True makes the cell safe to re-run.
for folder in ['json', 'event_raw', 'match_raw']:
    os.makedirs(os.path.join(DATA_FOLDER, folder), exist_ok=True)

# Download files

In [None]:
def download_url(url, save_path, chunk_size=128, json=False):
    '''Stream the content at ``url`` into the file ``save_path``.

    Source: https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url

    Parameters
    ----------
    url : str
        Address to download.
    save_path : str
        File the response body is written to (opened in binary mode, overwritten if present).
    chunk_size : int, default 128
        Number of bytes requested per chunk while streaming.
    json : bool, default False
        If True the response encoding is set to 'unicode-escape'.
        NOTE(review): the body is written from raw byte chunks, so this setting
        does not appear to change what is saved - confirm before relying on it.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Previously the error page
        would have been saved silently as if it were the data.
    '''
    # the context manager releases the connection even if writing fails part way
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        if json:
            r.encoding = 'unicode-escape'
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

In [None]:
# fetch every json link and store it in the json folder under its matching file name
json_folder = os.path.join(DATA_FOLDER, 'json')
for link, json_name in zip(JSON_LINKS, JSON_FILES):
    download_url(link, os.path.join(json_folder, json_name), json=True)

In [None]:
# each archive is downloaded into the json folder, unpacked there, and then deleted
for link, zip_name in zip(ZIP_LINKS, ZIP_FILES):
    extract_folder = os.path.join(DATA_FOLDER, 'json')
    save_path = os.path.join(extract_folder, zip_name)
    download_url(link, save_path)
    with zipfile.ZipFile(save_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
    os.remove(save_path)

# Coach

In [None]:
# load the coaches and expand the dictionary columns into separate columns
df_coach = pd.read_json(os.path.join(DATA_FOLDER, 'json', 'coach.json'), encoding='unicode-escape')
for col in ['passportArea', 'birthArea']:
    df_coach = _split_dict_col(df_coach, col)
# rename BEFORE saving so the parquet file has the same 'coach_id' key as the
# in-memory dataframe (the player and competition cells also rename first)
df_coach = df_coach.rename(columns={'wyId': 'coach_id'})
df_coach.to_parquet(os.path.join(DATA_FOLDER, 'coach.parquet'))
df_coach.info()

# Players

In [None]:
# load the players and expand the dictionary columns into separate columns
player_json = os.path.join(DATA_FOLDER, 'json', 'player.json')
df_player = pd.read_json(player_json, encoding='unicode-escape')
for dict_col in ('passportArea', 'role', 'birthArea'):
    df_player = _split_dict_col(df_player, dict_col)

# missing ids arrive both as real nulls and as the text 'null':
# blank out both kinds, then store the ids as floats so the gaps can stay as NaN
for col in ['currentTeamId', 'currentNationalTeamId', 'passportArea_id', 'birthArea_id']:
    mask_null = df_player[col].isnull() | df_player[col].eq('null')
    df_player[col] = df_player[col].mask(mask_null, np.nan).astype(np.float32)

df_player = df_player.rename(columns={'wyId': 'player_id'})
df_player.to_parquet(os.path.join(DATA_FOLDER, 'player.parquet'))
df_player.info()

# Teams

In [None]:
# load the teams and expand the 'area' dictionary into area_* columns
df_team = pd.read_json(os.path.join(DATA_FOLDER, 'json', 'team.json'), encoding='unicode-escape')
df_team = _split_dict_col(df_team, 'area')
df_team['area_id'] = df_team['area_id'].astype(np.int32)
df_team = df_team.rename(columns={'wyId': 'team_id'})

# shorten the official names to the commonly used team names
team_rename = {'Real Club Celta de Vigo': 'Celta Vigo',
               'Valencia Club de Fútbol': 'Valencia',
               'FC Barcelona': 'Barcelona',
               'Real Betis Balompié': 'Real Betis',
               'Girona FC': 'Girona',
               'CD Leganés': 'Leganés',
               'Real Sociedad de Fútbol': 'Real Sociedad',
               'Real Club Deportivo de La Coruña': 'Deportivo La Coruna',
               'Sevilla FC': 'Sevilla',
               'Getafe Club de Fútbol': 'Getafe',
               'Athletic Club Bilbao': 'Athletic Bilbao',
               'Real Madrid Club de Fútbol': 'Real Madrid',
               'Málaga Club de Fútbol': 'Málaga',
               'Levante UD': 'Levante',
               'Reial Club Deportiu Espanyol': 'Espanyol',
               'UD Las Palmas': 'Las Palmas',
               'SD Eibar': 'Eibar',
               'Villarreal Club de Fútbol': 'Villarreal',
               'Club Atlético de Madrid': 'Atlético Madrid',
               'Korea Republic': 'South Korea'}
# assign the result back: calling replace(..., inplace=True) on df_team.officialName
# is a chained in-place update that pandas warns about and that does not reach
# the dataframe when Copy-on-Write is enabled
df_team['officialName'] = df_team['officialName'].replace(team_rename)

# save AFTER the renames so team.parquet carries the same 'team_id' key and team
# names as the dataframe that is merged into the matches later on
df_team.to_parquet(os.path.join(DATA_FOLDER, 'team.parquet'))
df_team.info()

# Competitions

In [None]:
# load the competitions and expand the 'area' dictionary into area_* columns
df_competition = pd.read_json(os.path.join(DATA_FOLDER, 'json', 'competition.json'), encoding='unicode-escape')
df_competition = _split_dict_col(df_competition, 'area')
# if the area id is '0' as text for internationals set to missing
df_competition.loc[df_competition['format'] == 'International cup', 'area_id'] = np.nan
df_competition['area_id'] = df_competition['area_id'].astype(np.float32)
# make same format as StatsBomb: competition_country_name
mask = df_competition['type'] == 'club'
df_competition.loc[mask, 'competition_country_name'] = df_competition.loc[mask, 'area_name']
mask = df_competition['type'] == 'international'
df_competition.loc[mask, 'competition_country_name'] = 'International'
# add gender
df_competition['competition_gender'] = 'male'
# replace with competition real names
# (assigned back instead of replace(..., inplace=True) on df_competition.name: that
# chained in-place update does not reach the dataframe when pandas Copy-on-Write is
# enabled, which would leave the season_name lookups below without any match)
df_competition['name'] = df_competition['name'].replace({'Spanish first division': 'La Liga',
                                                         'World Cup': 'FIFA World Cup',
                                                         'Italian first division': 'Serie A',
                                                         'English first division': 'Premier League',
                                                         'French first division': 'Ligue 1',
                                                         'German first division': 'Bundesliga',
                                                         'European Championship': 'UEFA Euro'})
# rename competition name
df_competition = df_competition.rename(columns={'name': 'competition_name', 'wyId': 'competition_id'})
# add season name
df_competition.loc[df_competition['type'] == 'club', 'season_name'] = '2017/2018'
df_competition.loc[df_competition['competition_name'] == 'UEFA Euro', 'season_name'] = '2016'
df_competition.loc[df_competition['competition_name'] == 'FIFA World Cup', 'season_name'] = '2018'
df_competition.to_parquet(os.path.join(DATA_FOLDER, 'competition.parquet'))
df_competition.info()

# Matches

I am not interested in formations or lineups so did not parse them

In [None]:
# Convert every matches*.json file in the json folder into one parquet file in the
# match_raw folder. For each file: flatten the two entries of teamsData, derive
# home/away scores, team ids and coach ids, attach competition and team names, and
# drop the raw columns. Relies on df_competition and df_team created in the cells above.

# list of match files
match_list = glob.glob(os.path.join(DATA_FOLDER, 'json', 'matches*.json'))

# loop through match files and save as separate parquet files
for file in match_list:
    
    # match dataframe
    df_match = pd.read_json(file, encoding='unicode-escape')
    
    # split the team information from the teamsData column into two separate columns
    col = 'teamsData'
    # missing teamsData is swapped for an empty dict
    # (note: an empty dict would still make the [0]/[1] key lookups below raise IndexError)
    df_match[col] = df_match[col].apply(lambda x: {} if pd.isna(x) else x)
    # team1/team2 are simply the first and second key of the dictionary;
    # which one is home or away is decided further down from the *_side column
    df_match['team1'] = df_match.teamsData.apply(lambda x: x[list(x.keys())[0]])
    df_match['team2'] = df_match.teamsData.apply(lambda x: x[list(x.keys())[1]])
    
    # split team information stored as a dictionary into separate columns (team1_*, team2_*)
    df_match = _split_dict_col(df_match, 'team1')
    df_match = _split_dict_col(df_match, 'team2')
    
    # add home and away teams and scores up to extra time
    # mask: rows where team1 is the home side (so ~mask means team2 is the home side)
    mask = df_match.team1_side == 'home'
    # mask_et: rows where either team has an extra-time score above zero
    mask_et = (df_match.team1_scoreET > 0) | (df_match.team2_scoreET > 0)
    # start from the regular score...
    df_match.loc[mask,'home_score'] = df_match.loc[mask,'team1_score']
    df_match.loc[mask,'away_score'] = df_match.loc[mask,'team2_score']
    df_match.loc[~mask,'home_score'] = df_match.loc[~mask,'team2_score']
    df_match.loc[~mask,'away_score'] = df_match.loc[~mask,'team1_score']
    # ...and overwrite it with the extra-time score for the mask_et rows
    df_match.loc[mask_et & mask,'home_score'] = df_match.loc[mask_et & mask,'team1_scoreET']
    df_match.loc[mask_et & mask,'away_score'] = df_match.loc[mask_et & mask,'team2_scoreET']
    df_match.loc[mask_et & ~mask,'home_score'] = df_match.loc[mask_et & ~mask,'team2_scoreET']
    df_match.loc[mask_et & ~mask,'away_score'] = df_match.loc[mask_et & ~mask,'team1_scoreET']    
    
    # add away/ home team info
    df_match.loc[mask, 'home_team_id'] = df_match.loc[mask, 'team1_teamId']
    df_match.loc[~mask, 'home_team_id'] = df_match.loc[~mask, 'team2_teamId']
    df_match.loc[mask, 'away_team_id'] = df_match.loc[mask, 'team2_teamId']
    df_match.loc[~mask, 'away_team_id'] = df_match.loc[~mask, 'team1_teamId']
    
    # add away/home coach info
    df_match.loc[mask, 'home_team_coach_id'] = df_match.loc[mask, 'team1_coachId']
    df_match.loc[~mask, 'home_team_coach_id'] = df_match.loc[~mask, 'team2_coachId']
    df_match.loc[mask, 'away_team_coach_id'] = df_match.loc[mask, 'team2_coachId']
    df_match.loc[~mask, 'away_team_coach_id'] = df_match.loc[~mask, 'team1_coachId']

    # format date columns
    df_match['dateutc'] = pd.to_datetime(df_match.dateutc)
    # the last six characters of the date text are cut off before parsing
    # NOTE(review): presumably a time-zone suffix - confirm against the raw data
    df_match['kick_off'] = pd.to_datetime(df_match.date.astype(str).str[:-6])
    
    # rename columns
    df_match.rename({'wyId': 'match_id',
                     'gameweek': 'match_week',
                     'seasonId': 'season_id',
                     'competitionId': 'competition_id',
                     'venue': 'stadium_name'}, axis=1, inplace=True)
    
    # add competition info
    df_match = df_match.merge(df_competition[['competition_id',
                                              'competition_country_name',
                                              'competition_name',
                                              'season_name',
                                              'competition_gender']], on='competition_id', how='left')
    
    # add team info
    # the first merge adds team_id/officialName for the home team without a suffix;
    # the second merge adds the same two columns for the away team, so the clashing
    # names get the '_home' (already present) and '_away' (newly added) suffixes
    df_match = df_match.merge(df_team[['team_id', 'officialName']],
                              left_on='home_team_id', right_on='team_id', how='left')
    df_match = df_match.merge(df_team[['team_id', 'officialName']],
                              left_on='away_team_id', right_on='team_id', how='left', suffixes=['_home', '_away'])
    
    df_match.rename({'officialName_home': 'home_team_name',
                     'officialName_away': 'away_team_name'}, axis=1, inplace=True)
    
    # drop columns (raw inputs, helper columns from the merges, and the unparsed
    # formation/lineup information)
    df_match.drop(['date', 'status', 'winner', 'referees', 'team_id_away', 'team_id_home',
                   'team1_formation_bench', 'team1_formation_lineup', 'team1_formation_substitutions',
                   'team2_formation_bench', 'team2_formation_lineup', 'team2_formation_substitutions',
                   'team1_hasFormation', 'team2_hasFormation',
                   'team1_score', 'team1_scoreP', 'team1_scoreHT', 'team1_scoreET',
                   'team2_score', 'team2_scoreP', 'team2_scoreHT', 'team2_scoreET',
                   'teamsData', 'team1_teamId', 'team2_teamId', 'team2_side',
                   'team1_side', 'team1_coachId', 'team2_coachId'], axis=1, inplace=True)
    
    # [:-4] removes 'json' but keeps the dot, so the file is saved as '<name>.parquet'
    save_path = os.path.join(DATA_FOLDER, 'match_raw', f'{os.path.basename(file)[:-4]}parquet')
    df_match.to_parquet(save_path)

Get matches as a single dataframe

In [None]:
# stack the parquet files written to match_raw into one dataframe and save the combined copy
match_files = glob.glob(os.path.join(DATA_FOLDER, 'match_raw', '*.parquet'))
match_frames = list(map(pd.read_parquet, match_files))
df_match = pd.concat(match_frames)
df_match.to_parquet(os.path.join(DATA_FOLDER, 'match.parquet'))
df_match.info()

# Events

In [None]:
# Convert every events*.json file in the json folder into one parquet file in the
# event_raw folder. For each file: split the positions into start/end coordinates,
# blank out unreliable coordinates, turn the list of tag ids into one boolean column
# per tag plus a text 'position' column, and rename columns for consistency.

# list of event files
events_list = glob.glob(os.path.join(DATA_FOLDER, 'json', 'events*.json'))

# the tag lookup (tag_id -> tag_name) is the same for every event file,
# so it is loaded once here instead of once per file inside the loop
df_tag = pd.read_json(os.path.join(DATA_FOLDER, 'json', 'event_tag.json'))
is_position_tag = df_tag.tag_name.str[:8] == 'position'
position_tags = df_tag.loc[is_position_tag, 'tag_id'].values
other_tags = df_tag.loc[~is_position_tag, 'tag_name'].values

# loop through event files and save as separate parquet files
for file in events_list:

    print(os.path.basename(file))

    # load as dataframe
    df_event = pd.read_json(file, encoding='unicode-escape')

    # split start and end positions
    _split_location_cols(df_event, 'positions', ['start', 'end'])

    # create separate columns for the x/y coordinates
    for col in ['start', 'end']:
        df_event = _split_dict_col(df_event, col)

    # set dodgy end coordinates to null
    mask = df_event.eventName.isin(['Shot', 'Interruption', 'Offside'])
    mask2 = df_event.subEventName.isin(['Free kick shot', 'Hand foul', 'Late card foul', 'Out of game foul', 'Protest',
                                        'Simulation', 'Time lost foul', 'Violent Foul'])
    df_event.loc[mask | mask2, 'end_x'] = np.nan
    df_event.loc[mask | mask2, 'end_y'] = np.nan

    # wyscout has some dodgy end_y/ end_x near the corners. Convert to np.nan
    mask_dodgy_end = (((df_event.end_y == 100) & (df_event.end_x == 100)) |
                      ((df_event.end_x == 0) & (df_event.end_y == 0)))
    df_event.loc[mask_dodgy_end, 'end_y'] = np.nan
    df_event.loc[mask_dodgy_end, 'end_x'] = np.nan

    # set dodgy start coordinates to null
    df_event.loc[df_event.eventName.isin(['Save attempt', 'Goalkeeper leaving line']), 'start_x'] = np.nan
    df_event.loc[df_event.eventName.isin(['Save attempt', 'Goalkeeper leaving line']), 'start_y'] = np.nan

    # fix start coordinates for goal kicks
    df_event.loc[df_event.subEventName == 'Goal kick', 'start_x'] = 6.
    df_event.loc[df_event.subEventName == 'Goal kick', 'start_y'] = 50.

    # create a separate column for each tag in the list of tag dictionaries
    # and keep only the tag id from each dictionary
    df_new = pd.DataFrame(df_event['tags'].tolist(), index=df_event.index)
    for tag in df_new.columns:
        df_new.loc[df_new[tag].notnull(), tag] = df_new.loc[df_new[tag].notnull(), tag].apply(lambda x: x['id'])

    # summarise tag id columns into boolean columns for each tag and a string column for position
    cols_to_drop = df_new.columns
    # compare against a snapshot that only holds the tag id columns: df_new grows by
    # one column per tag below, and those new columns must not take part in the
    # comparison (it is slower, and True == 1 would be a false match for a tag id of 1)
    df_tag_ids = df_new.copy()
    for tag_id, tag_name in zip(df_tag.tag_id, df_tag.tag_name):
        has_tag = (df_tag_ids == tag_id).any(axis=1)
        if tag_id not in position_tags:
            df_new.loc[has_tag, tag_name] = True
        else:
            df_new.loc[has_tag, 'position'] = tag_name

    # remove 'position' and '_' from text in the position column
    df_new['position'] = df_new['position'].str[9:].str.replace('_', ' ')
    df_new.loc[df_new['position'].isnull(), 'position'] = None

    # replace missing with False for boolean columns
    df_new[other_tags] = df_new[other_tags].replace({np.nan: False})

    # drop tag id columns
    df_new.drop(cols_to_drop, axis=1, inplace=True)

    # add tags to the dataset
    df_event = pd.concat([df_event, df_new], axis=1)

    # drop tag column
    df_event.drop('tags', axis=1, inplace=True)

    # deal with blank subEventId
    df_event.loc[df_event.subEventId=='', 'subEventId'] = None
    df_event['subEventId'] = df_event['subEventId'].astype(np.float32)

    # rename columns for consistency with other datasets
    df_event.rename({'playerId': 'player_id',
                     'start_y': 'y',
                     'start_x': 'x',
                     'matchId': 'match_id',
                     'teamId': 'team_id',}, axis=1, inplace=True)

    # save to parquet: same file name as the json file with a .parquet extension
    parquet_name = os.path.splitext(os.path.basename(file))[0] + '.parquet'
    save_path = os.path.join(DATA_FOLDER, 'event_raw', parquet_name)
    df_event.to_parquet(save_path)

Get events as a single dataframe

In [None]:
# stack the parquet files written to event_raw into one dataframe, order the events
# by match, period and time within the period, and save the combined copy
event_files = glob.glob(os.path.join(DATA_FOLDER, 'event_raw', '*.parquet'))
df_event = pd.concat([pd.read_parquet(file) for file in event_files])
df_event.sort_values(['match_id', 'matchPeriod', 'eventSec'], inplace=True)
df_event.to_parquet(os.path.join(DATA_FOLDER, 'event.parquet'))
# `null_counts` was deprecated in pandas 1.2 and removed in pandas 2.0;
# `show_counts` is the replacement with the same meaning
df_event.info(verbose=True, show_counts=True)