In [39]:
import os
import warnings
import pandas as pd
pd.set_option('display.max_columns', None)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")
from tqdm import tqdm
import statsbombpy

In [40]:
%reload_ext autoreload
%autoreload 2
from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl

In [56]:
#!pip install mplsoccer

In [55]:
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.image as mpimg 
import seaborn as sns

import matplotsoccer as mps
import matplotlib.pyplot as plt
from mplsoccer import Pitch, VerticalPitch
import altair as alt

import warnings
warnings.filterwarnings("ignore")

In [51]:
SBL = StatsBombLoader(getter="remote", creds={"user": None, "passwd": None})

In [12]:
competitions = SBL.competitions()

In [13]:
competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
0,4,16,Champions League,Europe,male,2018/2019
1,1,16,Champions League,Europe,male,2017/2018
2,2,16,Champions League,Europe,male,2016/2017
3,27,16,Champions League,Europe,male,2015/2016
4,26,16,Champions League,Europe,male,2014/2015
5,25,16,Champions League,Europe,male,2013/2014
6,24,16,Champions League,Europe,male,2012/2013
7,23,16,Champions League,Europe,male,2011/2012
8,22,16,Champions League,Europe,male,2010/2011
9,21,16,Champions League,Europe,male,2009/2010


In [14]:
selected_competitions = competitions[competitions.competition_name == "Premier League"]
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
37,44,2,Premier League,England,male,2003/2004


In [15]:
games = pd.concat([
    SBL.games(row.competition_id, row.season_id)
    for row in selected_competitions.itertuples()
])


In [19]:
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams, players = [], []
actions = {}
for game in games_verbose:
    # load data
    teams.append(SBL.teams(game.game_id))
    players.append(SBL.players(game.game_id))
    events = SBL.events(game.game_id)
    # convert data
    actions[game.game_id] = spadl.statsbomb.convert_to_actions(events, game.home_team_id)

teams = pd.concat(teams).drop_duplicates(subset="team_id")
players = pd.concat(players)


Loading game data: 100%|████████████████████████| 33/33 [00:43<00:00,  1.32s/it]


In [28]:
datafolder = "data"

# Create data folder if it doesn't exist
if not os.path.exists(datafolder):
    os.mkdir(datafolder)
    print(f"Directory {datafolder} created.")

spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")

# Store all spadl data in h5-file
with pd.HDFStore(spadl_h5) as spadlstore:
    spadlstore["competitions"] = selected_competitions
    spadlstore["games"] = games
    spadlstore["teams"] = teams
    spadlstore["players"] = players[['player_id', 'player_name', 'nickname']].drop_duplicates(subset='player_id')
    spadlstore["player_games"] = players[['player_id', 'game_id', 'team_id', 'is_starter', 'starting_position_id', 'starting_position_name', 'minutes_played']]
    for game_id in actions.keys():
        spadlstore[f"actions/game_{game_id}"] = actions[game_id]

print(spadlstore)


<class 'pandas.io.pytables.HDFStore'>
File path: data/spadl-statsbomb.h5



In [36]:
#Player Position Plot - combining
data = "data/spadl-statsbomb.h5"
games = pd.read_hdf(data,key="games")
epl16 = games[(games.competition_id == 2) & (games.season_id == 44)]

In [67]:
def get_actions(games, hdf_url):
    actions = []
    for game in tqdm(list(games.itertuples())):
        a = pd.read_hdf(hdf_url, key="actions/game_" + str(game.game_id))
        a["left_to_right"] = a["team_id"] == game.home_team_id
        actions.append(a)
    
    actions = pd.concat(actions)
    actions = always_ltr(actions)

    return actions


def always_ltr(actions):
    away_idx = ~actions.left_to_right
    actions.loc[away_idx, "start_x"] = 105 - actions[away_idx].start_x.values
    actions.loc[away_idx, "start_y"] = 68 - actions[away_idx].start_y.values
    actions.loc[away_idx, "end_x"] = 105 - actions[away_idx].end_x.values
    actions.loc[away_idx, "end_y"] = 68 - actions[away_idx].end_y.values
    return actions

actions = get_actions(epl16,data)
players = pd.read_hdf(data,key="players")
actions = actions.merge(players)

#sort player_name list for dropdown
player_name = list(set(actions.player_name))[0]

#for player position
pa = actions[actions.player_name.str.contains(player_name)]
x = pa.start_x
y = pa.start_y

#for player passing
pa2 = actions[actions.player_name.str.contains(player_name)]
pa2 = actions[actions["type_id"]==0]
x2 = pa2.start_x
y2 = pa2.start_y

#for player shot
pa3 = actions[actions.player_name.str.contains(player_name)]
pa3 = actions[actions["type_id"]==11]
x3 = pa3.start_x
y3 = pa3.start_y

#for player dribble
pa4 = actions[actions.player_name.str.contains(player_name)]
pa4 = actions[actions["type_id"]==21]
x4 = pa4.start_x
y4 = pa4.start_y

#call Pitch()
pitch = Pitch()
fig, ax = pitch.draw(nrows=2, ncols=2)

#try/except for players who don't have data for position/pass/shot/dribble
try:
    sns.kdeplot(x, y, cmap="Reds", shade=True, bw=.15, cbar=True, n_levels=6, ax=ax[0,0]).collections[0].set_alpha(0)
    sns.kdeplot(x2, y2, cmap="Blues", shade=True, bw=.15, cbar=True, n_levels=6, ax=ax[0,1]).collections[0].set_alpha(0)
    sns.kdeplot(x3, y3, cmap="YlOrBr", shade=True, bw=.15, cbar=True, n_levels=6, ax=ax[1,0]).collections[0].set_alpha(0)
    sns.kdeplot(x4, y4, cmap="Purples", shade=True, bw=.15, cbar=True, n_levels=6, ax=ax[1,1]).collections[0].set_alpha(0)
except:
    None

#set title
ax[0, 0].set_title('Position for '+ player_name)
ax[0, 1].set_title('Passing for '+ player_name)
ax[1, 0].set_title('Shot for '+ player_name)
ax[1, 1].set_title('Dribble for '+ player_name)

fig.tight_layout()
plt.show()



100%|███████████████████████████████████████████| 33/33 [00:00<00:00, 77.62it/s]


In [64]:
#Player Position Plot 
data = "data/spadl-statsbomb.h5"
games = pd.read_hdf(data,key="games")
epl16 = games[(games.competition_id == 2) & (games.season_id == 44)]

def get_actions(games, hdf_url):
    actions = []
    for game in tqdm(list(games.itertuples())):
        a = pd.read_hdf(hdf_url, key="actions/game_" + str(game.game_id))
        a["left_to_right"] = a["team_id"] == game.home_team_id
        actions.append(a)
    
    actions = pd.concat(actions)
    actions = always_ltr(actions)

    return actions


def always_ltr(actions):
    away_idx = ~actions.left_to_right
    actions.loc[away_idx, "start_x"] = 105 - actions[away_idx].start_x.values
    actions.loc[away_idx, "start_y"] = 68 - actions[away_idx].start_y.values
    actions.loc[away_idx, "end_x"] = 105 - actions[away_idx].end_x.values
    actions.loc[away_idx, "end_y"] = 68 - actions[away_idx].end_y.values
    return actions

actions = get_actions(epl16,data)
players = pd.read_hdf(data,key="players")
actions = actions.merge(players)

# choose a player to visualize, feel free to change the index
player_name = sorted(list(set(actions.player_name)))
    
pa = actions[actions.player_name.str.contains(player_name)]

x = pa.start_x
y = pa.start_y

base = mps.field("green",figsize=8,show=False)
hmax = sns.kdeplot(x, y, cmap="Reds", shade=True, bw=.15, cbar=True, n_levels=6)
hmax.collections[0].set_alpha(0)


plt.title("Position Visualization for " + player_name)
plt.show()

100%|███████████████████████████████████████████| 33/33 [00:00<00:00, 74.17it/s]


Aaron Hughes
Aaron Lennon
Adrian Mutu
Alain Goma
Alan Shearer
Alan Smith
Alan Stubbs
Alan Wright
Alessandro Pistone
Alex Rae
Alexei Gennadyevich Smertin
Aliou Cissé
Amdy Moustapha Faye
Andrew Davies
Andy Cole
Andy Melville
Andy O"Brien
Andy Todd
Anthony Gardner
Anthony Le Tallec
Antoine Sibierski
Antti Mikko Niemi
Arjan de Zeeuw
Ashley Cole
Augustine Azuka "Jay-Jay" Okocha
Barry Ferguson
Barry Hayles
Billy McKinlay
Bobby Zamora
Boris Živković
Boudewijn Zenden
Brad Friedel
Brett Emerton
Brett Ormerod
Brian McBride
Bruno N"Gotty
Bryan Hughes
Carl Cort
Carlo Cudicini
Carlos Manuel Bocanegra
Carlton Cole Okirie
Chris Baird
Chris Perry
Chris Riggott
Claude Makélélé
Claudio Reyna
Claus Jensen
Clinton Morrison
Colin Cameron
Colin Cooper
Collins John
Craig Bellamy
Craig Short
Cristiano Ronaldo dos Santos Aveiro
Damien Duff
Damien Johnson
Danny Coyne
Danny Higginbotham
Danny Mills
Danny Murphy
Danny Tiatto
Darren Ambrose
Darren Anderton
Darren Fletcher
Darren Kenton
David Batty
David Dunn
David

TypeError: unhashable type: 'list'

In [None]:
#Passing Vis
pa2 = actions[actions.player_name.str.contains(player_name)]
#filter only passing actions (==0)
pa2 = actions[actions["type_id"]==0]

x = pa2.start_x
y = pa2.start_y

base = mps.field("green",figsize=8,show=False)
hmax = sns.kdeplot(x, y, cmap="Blues", shade=True, bw=.15, cbar=True, n_levels=6)
hmax.collections[0].set_alpha(0)

plt.title("Passing Visualization for " + player_name)
plt.show()

In [141]:
#Shot Vis
pa3 = actions[actions.player_name.str.contains(player_name)]
#filter only passing actions (==0)
pa3 = actions[actions["type_id"]==11]

x = pa3.start_x
y = pa3.start_y

base = mps.field("green",figsize=8,show=False)
hmax = sns.kdeplot(x, y, cmap="YlOrBr", shade=True, bw=.15, cbar=True, n_levels=6)
hmax.collections[0].set_alpha(0)

plt.title("Shot Visualization for " + player_name)
plt.show()



In [136]:
#Dribble Vis
pa4 = actions[actions.player_name.str.contains(player_name)]
pa4 = actions[actions["type_id"]==20]

x = pa4.start_x
y = pa4.start_y

base = mps.field("green",figsize=8,show=False)
hmax = sns.kdeplot(x, y, cmap="Purples", shade=True, bw=.15, cbar=True, n_levels=6)
hmax.collections[0].set_alpha(0)

plt.title("Dribble Visualization for " + player_name)
plt.show()


  0%|                                                    | 0/33 [00:00<?, ?it/s][A
 21%|█████████▎                                  | 7/33 [00:00<00:00, 65.40it/s][A
 45%|███████████████████▌                       | 15/33 [00:00<00:00, 68.97it/s][A
 70%|█████████████████████████████▉             | 23/33 [00:00<00:00, 70.39it/s][A
100%|███████████████████████████████████████████| 33/33 [00:00<00:00, 69.46it/s][A


IndexError: list index out of range

In [133]:
actions[actions["type_id"] == 22]

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,bodypart_id,action_id,left_to_right,player_name,nickname
12221,3749052,887bb629-e316-4309-8beb-c4d889765596,1,770.0,46,40319.0,4.411765,37.873418,78.705882,46.136709,22,0,0,261,True,Paul Jones,
12228,3749052,69b23be0-1ae4-4d40-9e6e-d3f2839f377b,1,1312.0,46,40319.0,4.411765,37.873418,63.970588,47.083544,22,0,0,399,True,Paul Jones,
12230,3749052,ec640769-f858-4247-a416-f868654834d2,1,1686.0,46,40319.0,4.411765,37.873418,77.735294,47.341772,22,1,0,501,True,Paul Jones,
12236,3749052,49c591fe-2e9b-4c49-9474-1b051c8c617a,1,2369.0,46,40319.0,4.411765,37.873418,74.382353,57.412658,22,1,0,718,True,Paul Jones,
12237,3749052,6b9c97dc-d9cb-4f0b-b4db-a43ac8e91337,1,2534.0,46,40319.0,4.411765,30.987342,62.117647,43.468354,22,0,0,773,True,Paul Jones,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63368,3749403,fd605d39-8c8c-4d95-b6a7-0a0bb04aa46f,2,453.0,328,40385.0,4.411765,37.873418,67.676471,47.427848,22,1,0,1262,False,Jussi Jääskeläinen,
63373,3749403,a08a5ebb-64e5-4fe3-9401-866874f9c18e,2,1179.0,328,40385.0,4.411765,37.873418,73.764706,50.956962,22,0,0,1484,False,Jussi Jääskeläinen,
63374,3749403,9aa04dc3-83ad-4e6e-b6ec-0f15cd806edd,2,1305.0,328,40385.0,4.411765,37.873418,70.852941,47.858228,22,1,0,1522,False,Jussi Jääskeläinen,
63377,3749403,204a2283-6bf5-46aa-9827-dcac9bc723c3,2,1882.0,328,40385.0,4.411765,37.873418,57.882353,43.210127,22,0,0,1723,False,Jussi Jääskeläinen,


In [39]:
player_name

'Aaron Hughes'

In [60]:

data = "data/spadl-statsbomb.h5"
games = pd.read_hdf(data,key="games")
epl16 = games[(games.competition_id == 2) & (games.season_id == 44)]


def get_actions(games, hdf_url):
    actions = []
    for game in tqdm(list(games.itertuples())):
        a = pd.read_hdf(hdf_url, key="actions/game_" + str(game.game_id))
        a["left_to_right"] = a["team_id"] == game.home_team_id
        actions.append(a)
    
    actions = pd.concat(actions)
    actions = always_ltr(actions)

    return actions


def always_ltr(actions):
    away_idx = ~actions.left_to_right
    actions.loc[away_idx, "start_x"] = 105 - actions[away_idx].start_x.values
    actions.loc[away_idx, "start_y"] = 68 - actions[away_idx].start_y.values
    actions.loc[away_idx, "end_x"] = 105 - actions[away_idx].end_x.values
    actions.loc[away_idx, "end_y"] = 68 - actions[away_idx].end_y.values
    return actions

actions = get_actions(epl16,data)
players = pd.read_hdf(data,key="players")
actions = actions.merge(players)

# choose a player to visualize, feel free to change the index
player_name = list(set(actions.player_name))[0]

pa = actions[actions.player_name.str.contains(player_name)]

x = pa.start_x
y = pa.start_y

mps.field("green",figsize=8,show=False)
    
plt.scatter(x,y)
plt.axis("on")
plt.title("position visualization for " + player_name)
plt.show()


  0%|                                                    | 0/33 [00:00<?, ?it/s][A
 21%|█████████▎                                  | 7/33 [00:00<00:00, 68.48it/s][A
 45%|███████████████████▌                       | 15/33 [00:00<00:00, 71.74it/s][A
 70%|█████████████████████████████▉             | 23/33 [00:00<00:00, 73.73it/s][A
100%|███████████████████████████████████████████| 33/33 [00:00<00:00, 73.25it/s][A
