Download the events (into ./events/), matches (into ./matches/) and players (./players.json) files of the Soccer match event dataset from https://figshare.com/collections/Soccer_match_event_dataset/4415000/3

In [None]:
import pandas as pd
import numpy as np
from collections import ChainMap
# Competition name substituted into the data file paths below.
competition = 'England'
# Event records loaded from events/events_<competition>.json, indexed by their 'id' field.
events = pd.read_json('events/events_{}.json'.format(competition)).set_index('id')
# Match records loaded from matches/matches_<competition>.json, indexed by their 'wyId' field.
matches = pd.read_json('matches/matches_{}.json'.format(competition)).set_index('wyId')
#players = pd.read_json('players.json').set_index('wyId')

# Matches 

In [None]:
def filter_last_minutes_events(events, matchId, minutes=10):
    """Return the second-half events of one match after the cut-off time.

    The cut-off is ``(45 - minutes) * 60`` seconds; only rows whose
    ``eventSec`` is strictly greater than it are kept.
    NOTE(review): this assumes ``eventSec`` is counted from the start of
    each period and that a half lasts 45 minutes -- confirm against the data.

    Args:
        events: DataFrame with ``matchId``, ``matchPeriod`` and ``eventSec``
            columns.
        matchId: Identifier of the match to keep.
        minutes: Length of the closing window, in minutes.

    Returns:
        The matching rows of ``events``.
    """
    cutoff_sec = (45 - minutes) * 60
    in_match = events['matchId'] == matchId
    in_second_half = events['matchPeriod'] == '2H'
    after_cutoff = events['eventSec'] > cutoff_sec
    return events[in_match & in_second_half & after_cutoff]

In [None]:
# Window length in minutes.
# NOTE(review): not referenced by any code visible here; presumably meant to be
# passed as the `minutes` argument of filter_last_minutes_events -- confirm.
minutes = 10

In [None]:
def get_shots_season(events):
    """Return the mean and standard deviation of shots per match.

    A shot is any event whose ``eventName`` is ``'Shot'``, or a
    ``'Free Kick'`` event whose ``subEventName`` is ``'Free kick shot'`` or
    ``'Penalty'``. Both kinds are counted together per ``matchId``, so a
    match with no free-kick shots (or no open-play shots) still contributes
    its total instead of being dropped.

    Matches with no shot event of either kind do not appear in ``events``
    as a group and therefore are not part of the statistics.

    Args:
        events: DataFrame with ``matchId``, ``eventName`` and
            ``subEventName`` columns. It is not modified.

    Returns:
        Tuple ``(mean, std)`` of the per-match shot totals (population
        standard deviation, as computed by ``np.std``). ``(nan, nan)`` when
        ``events`` contains no shots at all.
    """
    fk_shot_kinds = ['Free kick shot', 'Penalty']
    is_open_play_shot = events.eventName == 'Shot'
    is_fk_shot = (events.eventName == 'Free Kick') & events.subEventName.isin(fk_shot_kinds)

    # One combined count per match: merging two separate counts would lose
    # every match that is missing from either side.
    shots_per_match = events[is_open_play_shot | is_fk_shot].groupby('matchId').size()
    if shots_per_match.empty:
        return np.nan, np.nan

    totals = shots_per_match.to_numpy()
    return np.mean(totals), np.std(totals)
# Mean and standard deviation of shots per match for the loaded events.
get_shots_season(events)

# Classification

In [None]:
def get_goals_per_game(teamsData):
    """Map each of the first two team keys of ``teamsData`` to its score.

    Args:
        teamsData: Mapping from team id to a mapping holding a ``'score'``
            entry.

    Returns:
        Dict ``{first_team_id: score, second_team_id: score}``.
    """
    team_ids = list(teamsData.keys())
    first_team, second_team = team_ids[0], team_ids[1]
    return {
        team_id: teamsData[team_id]['score']
        for team_id in (first_team, second_team)
    }

In [None]:
def get_points_per_score(teamsData):
    """Award league points to the two teams of a match from their scores.

    The scores come from ``get_goals_per_game``; for example goals
    ``{'3783': 2, '3767': 1}`` give points ``{'3783': 3, '3767': 0}``.

    Args:
        teamsData: Mapping from team id to a mapping holding a ``'score'``
            entry.

    Returns:
        Dict mapping the two team ids to 3/0 (win/loss) or 1/1 (draw).
        ``None`` if the two scores compare as neither equal, greater nor
        smaller.
    """
    team_ids = list(teamsData.keys())
    first_team, second_team = team_ids[0], team_ids[1]
    goals = get_goals_per_game(teamsData)
    first_goals, second_goals = goals[first_team], goals[second_team]

    if first_goals > second_goals:
        return {first_team: 3, second_team: 0}
    if first_goals < second_goals:
        return {first_team: 0, second_team: 3}
    if first_goals == second_goals:
        return {first_team: 1, second_team: 1}
    return None

In [None]:
def get_points_round(round_matches):
    """Collect the points every team earned in the given matches.

    Args:
        round_matches: DataFrame of matches with ``gameweek`` and
            ``teamsData`` columns.

    Returns:
        Dict mapping team id to points, merged over all matches. If a team
        id occurs in several matches, the entry of the earliest match in
        ``gameweek`` sort order wins (``ChainMap`` lookup order).
    """
    def _points_for_match(match_row):
        return get_points_per_score(match_row['teamsData'])

    ordered_matches = round_matches.sort_values('gameweek')
    per_match_points = ordered_matches.apply(_points_for_match, axis=1)
    return dict(ChainMap(*per_match_points))

In [None]:
def get_points_league(matches):
    """Build a table of points earned per team in each gameweek.

    Args:
        matches: DataFrame of matches with ``gameweek`` and ``teamsData``
            columns.

    Returns:
        DataFrame with one row per team id and one column per gameweek,
        holding the points from ``get_points_round`` for that gameweek.
    """
    points_by_round = {
        week: get_points_round(week_matches)
        for week, week_matches in matches.groupby('gameweek')
    }
    return pd.DataFrame.from_dict(points_by_round)

In [None]:
# points = get_points_league(matches)

In [None]:
def get_points_aggregated_by_round(matches):
    """Return each team's running points total after every gameweek.

    The per-gameweek table from ``get_points_league`` is accumulated from
    left to right across its columns, so the result does not depend on the
    gameweeks being labelled exactly ``1..N``.

    Args:
        matches: DataFrame of matches with ``gameweek`` and ``teamsData``
            columns.

    Returns:
        DataFrame with one row per team id and one column per gameweek,
        holding the cumulative points up to and including that gameweek.
    """
    points = get_points_league(matches)
    # A team without a match in a gameweek has NaN there; count it as 0 so
    # the running total carries forward and later integer casts do not fail.
    return points.fillna(0).cumsum(axis=1)

In [None]:
# points_agg = get_points_aggregated_by_round(matches)

In [None]:
def get_classification_by_round(matches):
    """Return each team's league position after every gameweek.

    Positions are ranks of the cumulative points within each gameweek
    column, 1 being the most points. Ties are broken by row order
    (``method='first'``).

    Args:
        matches: DataFrame of matches with ``gameweek`` and ``teamsData``
            columns.

    Returns:
        Integer DataFrame with one row per team id and one column per
        gameweek.
    """
    cumulative_points = get_points_aggregated_by_round(matches)
    positions = cumulative_points.rank(ascending=False, method='first')
    return positions.astype(int)

In [None]:
#classification = get_classification_by_round(matches)

In [None]:
def get_frequency_of_position_changes(matches):
    """Count how many times each league position was newly taken by a team.

    For every team, its position in the first gameweek is counted, plus its
    position in each later gameweek where it differs from the previous one.

    Args:
        matches: DataFrame of matches with ``gameweek`` and ``teamsData``
            columns.

    Returns:
        Series indexed by position, holding the number of occurrences.
    """
    classification = get_classification_by_round(matches)

    entered_positions = []
    for team_positions in classification.values:
        for round_index, position in enumerate(team_positions):
            is_first_round = round_index == 0
            if is_first_round or position != team_positions[round_index - 1]:
                entered_positions.append(position)

    positions, occurrences = np.unique(entered_positions, return_counts=True)
    return pd.Series(dict(zip(positions, occurrences)))

In [None]:
import matplotlib.pyplot
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline
# Horizontal bar chart of the per-position counts returned by
# get_frequency_of_position_changes for the loaded matches.
get_frequency_of_position_changes(matches).plot.barh()

In [None]:
# win / month