In [24]:
import pandas as pd
import numpy as np
import seaborn as sns

Read data either in parquet or CSV format

In [2]:
# Load the tracking sample into `position_df`. The Parquet file is read by
# default; to use the CSV export instead, comment out the Parquet line and
# enable the CSV line below.
position_df = pd.read_parquet(path='sample_tracking_data.parquet')
# position_df = pd.read_csv('sample_tracking_data.csv', index_col=0)

In [3]:
# Show the loaded frame; the rendered output elides the middle rows.
position_df

Unnamed: 0,N,T,X,Y,Z,D,A,S,M,GameSection,TeamId,PersonId,BallPossession,BallStatus,IS_LTR,X_LTR,Y_LTR
0,34855,2014-01-15 10:07:40+01:00,5.49,-5.57,0.0,12.86,-1.90,12.02,17,_Half,DFL-CLU-ZZZZ01,DFL-OBJ-ZZZZ06,2,1,True,5.49,-5.57
1,34855,2014-01-15 10:07:40+01:00,5.33,-24.32,0.0,3.31,-0.61,2.81,17,_Half,DFL-CLU-ZZZZ00,DFL-OBJ-ZZZZ09,2,1,False,-5.33,24.32
2,34855,2014-01-15 10:07:40+01:00,-4.61,-7.26,0.0,4.97,-0.44,5.00,17,_Half,DFL-CLU-ZZZZ00,DFL-OBJ-ZZZZ08,2,1,False,4.61,7.26
3,34855,2014-01-15 10:07:40+01:00,-5.68,-31.77,0.0,4.27,0.20,3.96,17,_Half,DFL-CLU-ZZZZ00,DFL-OBJ-ZZZZ07,2,1,False,5.68,31.77
4,34855,2014-01-15 10:07:40+01:00,-37.97,-1.59,0.0,1.55,0.33,0.24,17,_Half,DFL-CLU-ZZZZ01,DFL-OBJ-ZZZZ05,2,1,True,-37.97,-1.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365021,87355,2014-01-15 10:42:40+01:00,20.56,-4.68,0.0,5.42,-0.33,4.96,52,_Half,DFL-CLU-ZZZZ00,DFL-OBJ-ZZZZ22,2,0,False,-20.56,4.68
1365022,87355,2014-01-15 10:42:40+01:00,3.25,14.81,0.0,8.20,-1.17,7.47,52,_Half,DFL-CLU-ZZZZ01,DFL-OBJ-ZZZZ23,2,0,True,3.25,14.81
1365023,87355,2014-01-15 10:42:40+01:00,12.08,11.80,0.0,2.57,-0.07,4.06,52,_Half,referee,DFL-OBJ-ZZZZ24,2,0,False,-12.08,-11.80
1365024,87355,2014-01-15 10:42:40+01:00,17.25,0.89,0.0,5.27,-0.05,4.70,52,_Half,DFL-CLU-ZZZZ01,DFL-OBJ-ZZZZ14,2,0,True,17.25,0.89


Positional data is collected at 25 frames per second. Each row of the DataFrame represents the gathered data of one "agent", i.e. player, referee, or ball, at one frame.


* N: FrameID. Numeric ID of the current frame
* T: Real time of that frame
* X: X-position of the agent at frame N. This represents the dimension between the two goals in meters, where the "left" goal is at X=-52.5 and the "right" goal at 52.5.
* Y: Y-position of the agent at frame N. This represents the dimension between the two sidelines in meters, with range -34 to 34.
* Z: Z-position of the agent at frame N. This is always 0 except for the ball, where it represents the height of the ball in meters.
* A: Acceleration of agent.
* S: Speed of agent.
* M: Minute of play
* GameSection: Generally represents the half of the game (firstHalf, secondHalf)
* TeamId: ID of the team. Can be one of 'referee', 'BALL', and the two team IDs
* PersonId: ID of the agent
* BallPossession: Determines which team has ball possession. One of {1,2}.
* BallStatus: Determines whether the ball is in play or if the game has stopped, e.g. due to a foul. One of {0,1}
* IS_LTR: Determines whether the row's player (and thus his team) is playing 'from left to right'
* X_LTR: The X-position the player would have if his team were playing from left to right. I.e. if IS_LTR is False, the sign of the X-position is flipped; otherwise it equals X.
* Y_LTR: The Y-position the player would have if his team were playing from left to right. I.e. if IS_LTR is False, the sign of the Y-position is flipped; otherwise it equals Y.


Overall, this sample data consists of 35 minutes of play. Player and team IDs, frames, times, minutes, and GameSections are anonymized. 

In [4]:
# Show only the rows whose TeamId is "BALL", i.e. the ball's own track.
position_df.query('TeamId == "BALL"')

Unnamed: 0,N,T,X,Y,Z,D,A,S,M,GameSection,TeamId,PersonId,BallPossession,BallStatus,IS_LTR,X_LTR,Y_LTR
19,34855,2014-01-15 10:07:40+01:00,5.59,-7.77,0.21,43.02,0.0,29.22,17,_Half,BALL,DFL-OBJ-ZZZZ13,2,1,False,-5.59,7.77
39,34856,2014-01-15 10:07:40.040000+01:00,5.98,-7.58,0.28,43.41,0.0,9.96,17,_Half,BALL,DFL-OBJ-ZZZZ13,2,1,False,-5.98,7.58
71,34857,2014-01-15 10:07:40.080000+01:00,6.36,-7.37,0.35,42.82,0.0,24.01,17,_Half,BALL,DFL-OBJ-ZZZZ13,2,1,False,-6.36,7.37
94,34858,2014-01-15 10:07:40.120000+01:00,6.75,-7.15,0.42,45.09,0.0,39.49,17,_Half,BALL,DFL-OBJ-ZZZZ13,2,1,False,-6.75,7.15
123,34859,2014-01-15 10:07:40.160000+01:00,7.10,-6.94,0.44,40.88,0.0,40.14,17,_Half,BALL,DFL-OBJ-ZZZZ13,2,1,False,-7.10,6.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364909,87351,2014-01-15 10:42:39.840000+01:00,8.36,34.06,0.00,0.00,0.0,29.28,52,_Half,BALL,DFL-OBJ-ZZZZ13,2,0,False,-8.36,-34.06
1364935,87352,2014-01-15 10:42:39.880000+01:00,8.36,34.06,0.00,0.00,0.0,29.28,52,_Half,BALL,DFL-OBJ-ZZZZ13,2,0,False,-8.36,-34.06
1364967,87353,2014-01-15 10:42:39.920000+01:00,8.36,34.06,0.00,0.00,0.0,29.28,52,_Half,BALL,DFL-OBJ-ZZZZ13,2,0,False,-8.36,-34.06
1364990,87354,2014-01-15 10:42:39.960000+01:00,8.36,34.06,0.00,0.00,0.0,29.28,52,_Half,BALL,DFL-OBJ-ZZZZ13,2,0,False,-8.36,-34.06


Get the teams and their players (including referees and ball)

In [5]:
# List the distinct TeamId values that occur in the data.
position_df['TeamId'].unique()

array(['DFL-CLU-ZZZZ01', 'DFL-CLU-ZZZZ00', 'referee', 'BALL'],
      dtype=object)

In [6]:
# Build a lookup from each TeamId to the array of distinct PersonIds recorded
# for it, walking the groupby one team at a time.
team_to_player_dict = {
    team_id: team_rows['PersonId'].unique()
    for team_id, team_rows in position_df.groupby("TeamId")
}
team_to_player_dict

{'BALL': array(['DFL-OBJ-ZZZZ13'], dtype=object),
 'DFL-CLU-ZZZZ00': array(['DFL-OBJ-ZZZZ09', 'DFL-OBJ-ZZZZ08', 'DFL-OBJ-ZZZZ07',
        'DFL-OBJ-ZZZZ10', 'DFL-OBJ-ZZZZ02', 'DFL-OBJ-ZZZZ01',
        'DFL-OBJ-ZZZZ18', 'DFL-OBJ-ZZZZ15', 'DFL-OBJ-ZZZZ16',
        'DFL-OBJ-ZZZZ22', 'DFL-OBJ-ZZZZ25'], dtype=object),
 'DFL-CLU-ZZZZ01': array(['DFL-OBJ-ZZZZ06', 'DFL-OBJ-ZZZZ05', 'DFL-OBJ-ZZZZ03',
        'DFL-OBJ-ZZZZ11', 'DFL-OBJ-ZZZZ20', 'DFL-OBJ-ZZZZ12',
        'DFL-OBJ-ZZZZ17', 'DFL-OBJ-ZZZZ19', 'DFL-OBJ-ZZZZ21',
        'DFL-OBJ-ZZZZ23', 'DFL-OBJ-ZZZZ14'], dtype=object),
 'referee': array(['DFL-OBJ-ZZZZ00', 'DFL-OBJ-ZZZZ04', 'DFL-OBJ-ZZZZ24'], dtype=object)}

Find out which team ID corresponds to which BallPossession indicator

In [7]:
# compute distances of all agents to the ball (including ball)
# Index by frame ID `N` so each agent row can be aligned with the ball row of
# the same frame. The guard keeps this cell re-runnable: once 'N' has become
# the index it is no longer a column, and a second set_index('N') would raise
# KeyError.
if 'N' in position_df.columns:
    position_df = position_df.set_index('N')
ball_df = position_df.query('TeamId == "BALL"')
# Attach the ball's X/Y for the same frame to every row (index-on-index join).
dist_df = pd.merge(
    position_df,
    ball_df[['X','Y']].rename(columns={'X':'Ball_X','Y':'Ball_Y'}),
    left_index=True,
    right_index=True,
)
# Euclidean distance in the X/Y plane. `.values` strips the column labels so
# the subtraction is positional instead of being aligned on mismatched names.
dist_df['Dist'] = np.linalg.norm(dist_df[['X','Y']] - dist_df[['Ball_X','Ball_Y']].values, axis=1)

In [12]:
# For every frame, keep the row with the second-smallest distance to the ball.
# dist_df also contains the ball's own row (distance 0 to itself), so the
# second-ranked row is normally the agent closest to the ball.
# NOTE(review): if an agent is also at distance exactly 0, the order of the
# tied rows is not guaranteed - confirm this is acceptable.
ball_candidates = (
    dist_df
    .sort_values('Dist')
    .groupby('N')
    .nth(1)
)
# Summarise the frames where that agent is within 0.5 of the ball and belongs
# to team DFL-CLU-ZZZZ01; the mean of BallPossession indicates which flag
# value goes with this team being on the ball.
(
    ball_candidates
    .query('Dist < 0.5')
    .query('TeamId == "DFL-CLU-ZZZZ01"')
    .describe()
)

Unnamed: 0,X,Y,Z,D,A,S,M,BallPossession,BallStatus,X_LTR,Y_LTR,Ball_X,Ball_Y,Dist
count,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0,2088.0
mean,-5.395273,-0.783012,0.0,8.032438,-0.384373,7.31512,32.918103,1.061782,0.874521,-5.395273,-0.783012,-5.347974,-0.82579,0.336289
std,28.8146,19.67565,0.0,5.931759,2.054636,5.16294,9.958154,0.240816,0.331341,28.8146,19.67565,28.851345,19.64645,0.135897
min,-50.62,-34.42,0.0,0.0,-7.99,0.0,17.0,1.0,0.0,-50.62,-34.42,-50.53,-34.24,0.01
25%,-27.7225,-19.6825,0.0,3.29,-1.4825,3.12,24.0,1.0,1.0,-27.7225,-19.6825,-27.5725,-19.825,0.25
50%,-6.965,-1.85,0.0,7.21,0.0,6.8,30.0,1.0,1.0,-6.965,-1.85,-6.985,-1.92,0.380132
75%,16.1625,15.73,0.0,11.74,0.84,10.6025,41.0,1.0,1.0,16.1625,15.73,16.255,15.4175,0.447214
max,50.74,36.32,0.0,32.71,5.71,28.11,52.0,2.0,1.0,50.74,36.32,50.54,35.91,0.5


In [13]:
# Same summary for the other team. The mean BallPossession comes out at
# roughly 2 here, so DFL-CLU-ZZZZ00 corresponds to BallPossession 2.
(
    ball_candidates
    .query('Dist < 0.5')
    .query('TeamId == "DFL-CLU-ZZZZ00"')
    .describe()
)

Unnamed: 0,X,Y,Z,D,A,S,M,BallPossession,BallStatus,X_LTR,Y_LTR,Ball_X,Ball_Y,Dist
count,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0,2253.0
mean,18.406813,5.437958,0.0,6.882477,-0.431611,6.346951,34.538393,1.896138,0.728806,-18.406813,-5.437958,18.359458,5.446853,0.259724
std,23.312411,22.153031,0.0,6.890326,1.822879,6.120226,9.967352,0.305149,0.444675,23.312411,22.153031,23.312313,22.175632,0.135222
min,-50.17,-34.15,0.0,0.0,-7.99,0.0,17.0,1.0,0.0,-53.47,-34.58,-50.27,-33.9,0.0
25%,5.15,-11.5,0.0,1.15,-1.06,1.1,23.0,2.0,0.0,-37.82,-32.53,5.32,-11.49,0.15
50%,25.61,-1.16,0.0,4.83,-0.05,4.57,37.0,2.0,1.0,-25.61,1.16,25.7,-1.29,0.243516
75%,37.82,32.53,0.0,11.48,0.32,10.08,43.0,2.0,1.0,-5.15,11.5,37.76,32.74,0.382099
max,53.47,34.58,0.0,34.09,5.72,30.58,52.0,2.0,1.0,50.17,34.15,53.1,34.47,0.5
