This notebook extracts event positions from the StatsBomb dataset
and uses them as starting positions to evaluate a multi-agent policy.

The focus is on shots.

In [2]:
import subprocess
import sys

# Third-party packages this notebook needs; each name is used both for the
# import check and for the pip install.
required_packages = [
    "pandas",
    "numpy",
    "matplotlib",
    "seaborn",
    "networkx",
]

def install_package(package, import_name=None):
    """Install ``package`` with pip unless it can already be imported.

    Args:
        package: Name passed to ``pip install``.
        import_name: Module name used for the import check. Defaults to
            ``package``; pass it when the distribution name and the module
            name differ, e.g.
            ``install_package("scikit-learn", import_name="sklearn")``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    import importlib

    module_name = package if import_name is None else import_name
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {package}...")
        # Run pip through the interpreter executing this notebook so the
        # package is installed into the same environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the newly installed files.
        importlib.invalidate_caches()
    else:
        print(f"{package} is already installed.")

# Check and install missing packages: install_package tries to import each
# name first and only calls pip when that import fails.
for package in required_packages:
    install_package(package)


pandas is already installed.
numpy is already installed.
matplotlib is already installed.
seaborn is already installed.
networkx is already installed.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random

from matplotlib.patches import Arc

In [4]:
# statsbombpy is not part of required_packages, so it is checked/installed here
# before the `from statsbombpy import sb` import that follows.
install_package("statsbombpy")

statsbombpy is already installed.


In [1]:
from statsbombpy import sb

In [40]:
# Suppress the authentication warning: every NoAuthWarning is ignored from here on.
# NOTE(review): presumably statsbombpy emits it when no API credentials are
# supplied (open-data access) — confirm against the statsbombpy docs.
import warnings
from statsbombpy.api_client import NoAuthWarning
warnings.simplefilter("ignore", NoAuthWarning)

In [6]:
# Load every match of one competition/season.
competition_id = 55   # e.g., Euro 2024
season_id = 282       # 2024
all_matches = sb.matches(competition_id=competition_id, season_id=season_id)

# Sort by date just to see them in order
all_matches = all_matches.sort_values(by='match_date')

# Pick one match by its position in the date-sorted frame (2 -> the 3rd match).
# This is a fixed choice, not a random one.
match_pos = 2
chosen_match = all_matches.iloc[match_pos]
match_id = chosen_match['match_id']

# Print out some basic info.
# Every field is read from chosen_match: sort_values keeps the original index
# labels, so all_matches['competition'][match_pos] would look up the row
# *labelled* 2, which is not necessarily the row at position 2.
print("Chosen Match ID:", match_id)
print("Competition:", chosen_match['competition'])
print("Stage:", chosen_match['competition_stage'])
print("Date:", chosen_match['match_date'])
print("Teams:", chosen_match['home_team'], "vs.", chosen_match['away_team'])
print("Score:", chosen_match['home_score'], "-", chosen_match['away_score'])
print("Stadium:", chosen_match['stadium'])
print("Referee", chosen_match['referee'])

Chosen Match ID: 3930161
Competition: Europe - UEFA Euro
Stage: Group Stage
Date: 2024-06-15
Teams: Italy vs. Albania
Score: 2 - 1
Stadium: Signal-Iduna-Park
Referee Felix Zwayer




In [7]:
# Fetch the event stream of the chosen match, then keep an independent copy
# of the rows whose event type is 'Shot'.
events = sb.events(match_id=match_id)
shots = events.loc[events['type'].eq('Shot')].copy()



In [31]:
# Turn each shot location list into a tuple of its first two entries;
# any value that is not a list becomes NaN.
positions = shots['location'].apply(
    lambda loc: np.nan if not isinstance(loc, list) else (loc[0], loc[1])
)

In [46]:
# Load all competitions
comps = sb.competitions()

# Extract the numeric year from 'season_name' (e.g. "2023/2024" -> 2023)
comps['season_year'] = comps['season_name'].str[:4].astype(int)

# Sort competitions ascending by 'season_year'
comps_sorted = comps.sort_values(by='season_year', ascending=True).reset_index(drop=True)

all_matches_list = []
total_matches = 0

# Iterate over competition/season pairs in chronological order
for cid, sid in zip(comps_sorted['competition_id'], comps_sorted['season_id']):
    # Load matches for the competition + season, sorted by match_date ascending
    comp_matches = sb.matches(competition_id=cid, season_id=sid).sort_values(
        by='match_date', ascending=True
    )

    # Accumulate into an overall total and list; the row count replaces a
    # per-row loop that did nothing but count.
    total_matches += len(comp_matches)
    all_matches_list.append(comp_matches)

# This will create a single DataFrame with all matches.
# Note: this rebinds all_matches, which previously held a single competition.
all_matches = pd.concat(all_matches_list, ignore_index=True)

# Print the grand total of matches across all competitions/seasons
print("\n-------------------------------------------")
print(f"Grand total across all comps/seasons: {total_matches} matches")


-------------------------------------------
Grand total across all comps/seasons: 3464 matches


In [82]:
# Stop once this many matches containing at least one shot-assist pass
# have been collected.
MAX_MATCHES_WITH_ASSISTS = 10

all_assists_list = []

for match_id in all_matches['match_id']:
    # Fetch events for the current match
    events = sb.events(match_id=match_id)

    # The column is not guaranteed to exist for every match; skip if absent
    if 'pass_shot_assist' not in events.columns:
        continue

    # Keep only the passes flagged as shot assists (missing flags compare unequal)
    assists = events[events['pass_shot_assist'].eq(True)].copy()
    if assists.empty:
        continue

    # Add the match_id column so you know which match the assist belongs to
    assists['match_id'] = match_id
    all_assists_list.append(assists)

    # One list entry per contributing match, so the list length is the counter
    if len(all_assists_list) >= MAX_MATCHES_WITH_ASSISTS:
        break

# Concatenate everything into one single DataFrame
if all_assists_list:
    final_assists_df = pd.concat(all_assists_list, ignore_index=True)
    print("--- Success! ---")
    print(f"Total assists collected: {len(final_assists_df)}")
else:
    print("No assists found in any of the matches.")
    final_assists_df = pd.DataFrame()  # Empty frame so later cells still have the name defined



--- Success! ---
Total assists collected: 286


In [88]:
# Display the end location of every collected assist pass.
# DataFrame.get returns None instead of raising when the column is missing.
final_assists_df.get('pass_end_location')

0      [106.3, 27.4]
1      [100.4, 39.7]
2      [105.2, 50.8]
3      [106.8, 29.5]
4      [109.6, 32.3]
           ...      
281    [105.6, 31.4]
282     [99.5, 42.5]
283    [104.4, 31.7]
284    [103.9, 28.6]
285     [65.1, 37.2]
Name: pass_end_location, Length: 286, dtype: object

In [107]:
# Show every populated (non-null) field of the first collected assist.
if final_assists_df.empty:
    # The collection cell falls back to an empty DataFrame when nothing was
    # found; iloc[0] on it would raise IndexError.
    print("No assists available to inspect.")
else:
    first_assist = final_assists_df.iloc[0].dropna()
    print(len(first_assist))
    for col, el in first_assist.items():
        print(f"{col}: {el}")

29
duration: 1.241699
id: ebbe861a-64e6-44f7-a628-931dba508bc0
index: 56
location: [85.6, 11.1]
match_id: 3888704
minute: 0
pass_angle: 0.6670351
pass_assisted_shot_id: 6302f961-10dc-47af-b393-92a93e5cc4df
pass_body_part: Right Foot
pass_end_location: [106.3, 27.4]
pass_height: Ground Pass
pass_length: 26.347296
pass_recipient: Édson Arantes do Nascimento
pass_recipient_id: 39712.0
pass_shot_assist: True
period: 1
play_pattern: Regular Play
player: Waldyr Pereira
player_id: 397703.0
position: Left Defensive Midfield
possession: 4
possession_team: Brazil
possession_team_id: 781
related_events: ['467455c9-f4fd-4106-b972-875fd02be4da']
second: 38
team: Brazil
team_id: 781
timestamp: 00:00:38.408
type: Pass


## Creating the new dataset

From the selected failed assists, we extract the useful information and save it in a dataset.

Dataset columns:

- `match_id`
- `pass_id`
- `start_position`
- `end_position`