In [1]:
import pandas as pd
data = pd.read_json('sb_events.json')

In [3]:
with open('data_columns.txt', 'w') as f:
    for col in data.columns:
        f.write(col + '\n')

In [2]:
print(data['obv_for_after'].unique())
print(data['obv_for_before'].unique())
print(data['obv_for_net'].unique())
print(data['obv_against_after'].unique())
print(data['obv_against_before'].unique())
print(data['obv_against_net'].unique())
print(data['obv_total_net'].unique())


[   nan 0.0149 0.0215 ... 0.3202 0.3872 0.7519]
[   nan 0.0074 0.0149 ... 0.1875 0.2329 0.2338]
[        nan  7.5000e-03 -1.0000e-04 ... -2.3330e-06 -6.4448e-06
  1.4270e-01]
[   nan 0.0111 0.0112 ... 0.1935 0.2338 0.1062]
[   nan 0.0086 0.0111 ... 0.6314 0.5233 0.1654]
[        nan  2.5000e-03  1.0000e-04 ... -1.0730e-01  1.4594e-06
 -1.6750e-01]
[        nan  5.0000e-03 -2.0000e-04 ... -6.4196e-06  2.4214e-06
 -5.2536e-06]


In [3]:
print(data['play_pattern.name'].unique())

['Regular Play' 'From Kick Off' 'From Throw In' 'From Counter'
 'From Free Kick' 'From Goal Kick' 'From Corner' 'From Keeper' 'Other']


In [7]:
gk = data[data['play_pattern.name']== 'From Goal Kick']
gk_under_pressure = gk[gk['under_pressure'] == True]
print(f"Range of obv_for_net values: {gk_under_pressure['obv_for_net'].min()} to {gk_under_pressure['obv_for_net'].max()}")

Range of obv_for_net values: -0.2225 to 0.727


In [8]:
gk_under_pressure.head()

Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,related_events,location,...,shot.saved_to_post,goalkeeper.shot_saved_to_post,shot.shot_goal_assist,shot.follows_dribble,goalkeeper.lost_out,goalkeeper.lost_in_play,player_off.permanent,shot.redirect,goalkeeper.success_out,goalkeeper.success_in_play
76,3cbfc7da-a0b0-40f1-a62b-142d3be8345e,77,1,2026-02-02 00:03:17.671,3,17,8,0.0,3e475b94-8e4c-4e2d-9a37-a616c294b7da,"[68, 68.3]",...,,,,,,,,,,
77,3e475b94-8e4c-4e2d-9a37-a616c294b7da,78,1,2026-02-02 00:03:17.671,3,17,8,0.0,"[3cbfc7da-a0b0-40f1-a62b-142d3be8345e, 0e43c91...","[52.1, 11.8]",...,,,,,,,,,,
632,6025932c-db6f-42b1-8cd2-4218025c255c,633,1,2026-02-02 00:34:19.237,34,19,48,1.0731,"[0a07b231-bd60-426d-8bc3-c5c91902941b, 7ce6219...","[13.9, 13.8]",...,,,,,,,,,,
634,7ce62196-627c-4fd0-a0be-18af8e8d44e5,635,1,2026-02-02 00:34:20.310,34,20,48,2.128,"[2fe597ce-5d64-4a71-a6cf-b67899fb9ae6, 1341518...","[15.4, 11.5]",...,,,,,,,,,,
639,e6d5b1f2-a07a-4145-a705-d0d87cfd974c,640,1,2026-02-02 00:34:23.794,34,23,48,1.0669,"[ba841be5-065c-4fa2-bf0c-bbf0cfc32fda, b55ba45...","[34.4, 20.1]",...,,,,,,,,,,


In [11]:
test_event = data[data['id'] == '3e475b94-8e4c-4e2d-9a37-a616c294b7da']

In [12]:
print(test_event['play_pattern.name'])

77    From Goal Kick
Name: play_pattern.name, dtype: object


In [13]:
# Check what columns help identify sequences
print("Columns related to possession/sequence:")
for col in data.columns:
    if 'possession' in col.lower() or 'index' in col.lower():
        print(f"  {col}")

Columns related to possession/sequence:
  index
  possession
  possession_team.id
  possession_team.name


In [14]:
# Look at a sample of the gk dataframe
print(gk[['id', 'index', 'possession', 'type.name', 'play_pattern.name']].head(10))

                                       id  index  possession      type.name  \
74   0e43c912-b1e0-445f-a0c4-c648f0eace10     75           8           Pass   
75   46b36d6e-57bd-470c-90db-c20bc240061c     76           8  Ball Receipt*   
76   3cbfc7da-a0b0-40f1-a62b-142d3be8345e     77           8           Duel   
77   3e475b94-8e4c-4e2d-9a37-a616c294b7da     78           8      Clearance   
78   103fbfba-4c3e-47b3-b892-89991ec64264     79           8           Pass   
79   385d0b5b-51e8-4f14-9eb9-360d652424b6     80           8       Pressure   
80   4818b9c1-83c2-4cf8-bfbd-73c70cf3779b     81           8  Ball Receipt*   
81   6dde9062-a617-4c6b-a01f-b2a362cbefd6     82           8    Goal Keeper   
102  30fa38a2-ebea-45fe-a282-beb2f39c6b4e    103          11           Pass   
103  66cf4e41-c18c-4217-bce0-85915eaa183a    104          11  Ball Receipt*   

    play_pattern.name  
74     From Goal Kick  
75     From Goal Kick  
76     From Goal Kick  
77     From Goal Kick  
78     Fro

## Find the First Event of Each Goal Kick Sequence

Since each sequence is identified by the `possession` column, we can group by possession and find the event with the minimum `index` value.

In [15]:
# Find the first event (minimum index) for each goal kick possession sequence
first_events = gk.loc[gk.groupby('possession')['index'].idxmin()]

print(f"Total goal kick sequences: {len(first_events)}")
print(f"\nFirst few first events:")
print(first_events[['id', 'index', 'possession', 'type.name', 'play_pattern.name']].head(10))

Total goal kick sequences: 201

First few first events:
                                          id  index  possession type.name  \
64832   a4f59348-9ad8-4b9d-bb70-2ca5cf9fd3b2      9           3      Pass   
483096  fc2d95f5-353b-4248-ae6d-cfff626be073     12           4      Pass   
211428  bfe6b737-1eb9-40d8-93ac-60a2b553d33e     17           5      Pass   
92310   0f417b24-b778-47dc-8572-d38c857b1494     25           6      Pass   
144699  939ceb02-3910-490d-ace4-7fef4270de79     38           7      Pass   
230727  1e5e58a1-726c-4d9e-b099-ca7f8566180c     54           8      Pass   
659744  067c29ac-e8bb-4c98-9ad1-d8ab8b227c61     49           9      Pass   
556046  06cb3788-a32c-4306-8a9f-cea4b1f32828     85          10      Pass   
36153   7f2a3fbb-f385-427d-b67d-718a3ff958a0     78          11      Pass   
174645  6fadf8fc-90a9-4bf6-87f0-c1897e8c09ec    110          12      Pass   

       play_pattern.name  
64832     From Goal Kick  
483096    From Goal Kick  
211428    From 

In [16]:
# See what types of events start goal kick sequences
print("Distribution of first event types in goal kick sequences:")
print(first_events['type.name'].value_counts())

Distribution of first event types in goal kick sequences:
type.name
Pass    201
Name: count, dtype: int64


In [17]:
print(first_events['obv_for_net'].min())
print(first_events['obv_for_net'].max())

-0.0014
0.0245


In [18]:
print(first_events['obv_against_net'].min())
print(first_events['obv_against_net'].max())

-0.0054
0.0148
