*Copyright 2025 Jaeyoung Chun / Winning Twelve*

You may not make copies of this and use or distribute it for any purpose.

# Preparation

```
pip install statsbombpy
pip install pyarrow
```

In [1]:
import os
import pandas as pd
from statsbombpy import sb
import warnings
warnings.filterwarnings("ignore", message="credentials were not supplied")

## Load

FIFA World Cup - South Korea vs. Portugal

In [2]:
# StatsBomb match id for the match named in the heading above
# (FIFA World Cup - South Korea vs. Portugal). Kept as a named constant so the
# notebook can be pointed at another match by editing a single line.
MATCH_ID = 3857262

# Fetch the event records for that match; the result is displayed as a
# DataFrame in the next cell.
df_events = sb.events(match_id=MATCH_ID)

In [3]:
# Preview the loaded events table (one row per event).
df_events

Unnamed: 0,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block_deflection,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,...,substitution_outcome,substitution_outcome_id,substitution_replacement,substitution_replacement_id,tactics,team,team_id,timestamp,type,under_pressure
0,,,,,,,,,,,...,,,,,"{'formation': 433, 'lineup': [{'player': {'id'...",South Korea,791,00:00:00.000,Starting XI,
1,,,,,,,,,,,...,,,,,"{'formation': 433, 'lineup': [{'player': {'id'...",Portugal,780,00:00:00.000,Starting XI,
2,,,,,,,,,,,...,,,,,,Portugal,780,00:00:00.000,Half Start,
3,,,,,,,,,,,...,,,,,,South Korea,791,00:00:00.000,Half Start,
4,,,,,,,,,,,...,,,,,,South Korea,791,00:00:00.000,Half Start,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3374,,,,,,,,,,,...,Tactical,103.0,Ui-Jo Hwang,29966.0,,South Korea,791,00:36:13.753,Substitution,
3375,,,,,,,,,,,...,Tactical,103.0,William Silva de Carvalho,5214.0,,Portugal,780,00:36:13.762,Substitution,
3376,,,,,,,,,,,...,Tactical,103.0,Bernardo Mota Veiga de Carvalho e Silva,3193.0,,Portugal,780,00:36:13.790,Substitution,
3377,,,,,,,,,,,...,Tactical,103.0,Yu-Min Cho,99479.0,,South Korea,791,00:47:46.061,Substitution,


## Extract and Transform

### Fix Timestamp

In [4]:
# Inspect the raw timestamp column before converting it.
df_events.timestamp

0       00:00:00.000
1       00:00:00.000
2       00:00:00.000
3       00:00:00.000
4       00:00:00.000
            ...     
3374    00:36:13.753
3375    00:36:13.762
3376    00:36:13.790
3377    00:47:46.061
3378    00:47:36.651
Name: timestamp, Length: 3379, dtype: object

In [5]:
# Parse the timestamp strings into pandas timedeltas so they can be compared
# and used in arithmetic instead of being treated as plain text.
timestamp_td = pd.to_timedelta(df_events["timestamp"])
df_events["timestamp"] = timestamp_td

In [6]:
# Check the column again to confirm the conversion took effect.
df_events.timestamp

0             0 days 00:00:00
1             0 days 00:00:00
2             0 days 00:00:00
3             0 days 00:00:00
4             0 days 00:00:00
                ...          
3374   0 days 00:36:13.753000
3375   0 days 00:36:13.762000
3376   0 days 00:36:13.790000
3377   0 days 00:47:46.061000
3378   0 days 00:47:36.651000
Name: timestamp, Length: 3379, dtype: timedelta64[ns]

### Remove Events with No Location

In [7]:
# Keep only the events that carry a location, as an independent copy.
# NOTE: despite its name, df_pass still contains every event type at this
# point; it is narrowed to passes in the "Pass Only" step further down.
has_location = df_events["location"].notna()
df_pass = df_events.loc[has_location].copy()

In [8]:
# Preview the events that have a location (not yet restricted to passes).
df_pass

Unnamed: 0,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block_deflection,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,...,substitution_outcome,substitution_outcome_id,substitution_replacement,substitution_replacement_id,tactics,team,team_id,timestamp,type,under_pressure
6,,,,,,,,,,,...,,,,,,South Korea,791,0 days 00:00:00.462000,Pass,
7,,,,,,,,,,,...,,,,,,South Korea,791,0 days 00:00:02.792000,Pass,
8,,,,,,,,,,,...,,,,,,South Korea,791,0 days 00:00:08.623000,Pass,
9,,,,,,,,,,,...,,,,,,South Korea,791,0 days 00:00:11.725000,Pass,
10,,,,,,,,,,,...,,,,,,South Korea,791,0 days 00:00:12.629000,Pass,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3354,,,,,,,,,,,...,,,,,,Portugal,780,0 days 00:44:44.620000,Referee Ball-Drop,
3355,,,,,,,,,,,...,,,,,,South Korea,791,0 days 00:09:00.383000,Referee Ball-Drop,
3356,,,,,,,,,,,...,,,,,,Portugal,780,0 days 00:09:02.163000,Referee Ball-Drop,
3357,,,,,,,,,,,...,,,,,,Portugal,780,0 days 00:34:02.851000,Referee Ball-Drop,


### Start (X, Y)

In [9]:
# Inspect the location column; the rows shown hold [x, y] pairs in one cell.
df_pass.location

6        [60.0, 40.0]
7        [39.7, 30.0]
8         [37.4, 8.9]
9         [42.1, 3.4]
10        [30.6, 4.1]
            ...      
3354    [105.8, 35.7]
3355      [60.8, 5.6]
3356     [59.3, 74.5]
3357     [92.1, 56.6]
3358     [28.0, 23.5]
Name: location, Length: 3338, dtype: object

In [10]:
# Expand each location pair into two scalar columns. The helper frame reuses
# df_pass's index so the new values align row-for-row on assignment.
start_xy = pd.DataFrame(df_pass["location"].to_list(), index=df_pass.index)
df_pass[["x_start", "y_start"]] = start_xy

In [11]:
# The coordinates now live in x_start / y_start, so the list-valued column is
# redundant. Reassigning (rather than inplace=True) avoids mutating the frame
# behind the reader's back, and errors="ignore" makes this cell a no-op when
# it is re-run after the column has already been removed.
df_pass = df_pass.drop(columns="location", errors="ignore")

### Pass Only

In [12]:
# Narrow the frame to pass events; copy so the column assignments that follow
# operate on an independent frame rather than a slice.
is_pass = df_pass["type"].eq("Pass")
df_pass = df_pass.loc[is_pass].copy()

### End (X, Y)

In [13]:
# Expand each pass end location into two scalar columns, keeping df_pass's
# index on the helper frame so values align row-for-row on assignment.
end_xy = pd.DataFrame(df_pass["pass_end_location"].to_list(), index=df_pass.index)
df_pass[["x_end", "y_end"]] = end_xy

In [14]:
# The end coordinates now live in x_end / y_end, so the list-valued column is
# redundant. Reassigning (rather than inplace=True) avoids mutating the frame
# behind the reader's back, and errors="ignore" makes this cell a no-op when
# it is re-run after the column has already been removed.
df_pass = df_pass.drop(columns="pass_end_location", errors="ignore")

## Write to File

In [15]:
# Directory that receives the exported files (relative path).
path_data = "./data"

In [16]:
# Create the output directory; exist_ok=True keeps re-runs from failing when
# it is already there.
os.makedirs(path_data, exist_ok=True)

### Parquet (Full)

In [17]:
# List the columns of the pass frame before it is written to Parquet.
df_pass.columns

Index(['bad_behaviour_card', 'ball_receipt_outcome', 'ball_recovery_offensive',
       'ball_recovery_recovery_failure', 'block_deflection',
       'carry_end_location', 'clearance_aerial_won', 'clearance_body_part',
       'clearance_head', 'clearance_left_foot', 'clearance_right_foot',
       'counterpress', 'dribble_nutmeg', 'dribble_outcome', 'dribble_overrun',
       'duel_outcome', 'duel_type', 'duration', 'foul_committed_card',
       'foul_committed_type', 'foul_won_defensive', 'goalkeeper_body_part',
       'goalkeeper_end_location', 'goalkeeper_outcome', 'goalkeeper_position',
       'goalkeeper_technique', 'goalkeeper_type', 'id', 'index',
       'injury_stoppage_in_chain', 'interception_outcome', 'match_id',
       'minute', 'miscontrol_aerial_won', 'off_camera', 'out',
       'pass_aerial_won', 'pass_angle', 'pass_assisted_shot_id',
       'pass_body_part', 'pass_cross', 'pass_cut_back', 'pass_deflected',
       'pass_goal_assist', 'pass_height', 'pass_inswinging', 'pass_l

In [18]:
# Write the full pass frame to Parquet. index=False leaves the row labels
# inherited from the original events table out of the file.
parquet_path = os.path.join(path_data, "events_pass_full.parquet")
df_pass.to_parquet(parquet_path, index=False)

### CSV (Simplified)

In [19]:
# Columns kept in the simplified export: period, team, event type, and the
# start/end coordinates of each pass.
simple_columns = ["period", "team", "type", "x_start", "y_start", "x_end", "y_end"]
df_simple = df_pass[simple_columns]
df_simple

Unnamed: 0,period,team,type,x_start,y_start,x_end,y_end
6,1,South Korea,Pass,60.0,40.0,39.9,29.0
7,1,South Korea,Pass,39.7,30.0,31.9,16.7
8,1,South Korea,Pass,37.4,8.9,49.8,3.6
9,1,South Korea,Pass,42.1,3.4,31.5,3.6
10,1,South Korea,Pass,30.6,4.1,57.7,11.7
...,...,...,...,...,...,...,...
1008,2,South Korea,Pass,94.3,76.4,89.5,78.9
1009,2,South Korea,Pass,88.1,78.2,79.6,76.2
1010,2,South Korea,Pass,79.3,75.4,81.9,72.3
1011,2,Portugal,Pass,38.2,7.8,68.1,12.9


In [20]:
# Write the simplified pass table to CSV. index=False leaves the row labels
# out of the file.
csv_path = os.path.join(path_data, "events_pass_simple.csv")
df_simple.to_csv(csv_path, index=False)