### Install required Libraries (if needed)

In [8]:
import importlib
import subprocess
import sys
import time

# Packages this notebook imports directly. tqdm is listed explicitly because the
# extraction cells import `tqdm.notebook` for their progress bars.
required_packages = [
    "pandas", "numpy", "statsbombpy", "tqdm"
]

def install_package(package, import_name=None):
    """Install ``package`` with pip unless it can already be imported.

    Args:
        package: Name handed to ``pip install``.
        import_name: Module name used for the "is it installed?" check.
            Defaults to ``package``; pass it explicitly when the two differ
            (e.g. ``install_package("scikit-learn", import_name="sklearn")``).

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    module_name = import_name or package
    try:
        importlib.import_module(module_name)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        # sys.executable targets the interpreter running this kernel, so the
        # package lands in the environment the notebook actually uses.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # A package installed while the interpreter is running may be hidden by
        # the import system's cached directory listings; refresh them so the
        # imports that follow in this session can find it.
        importlib.invalidate_caches()

# Make sure every listed dependency is importable, installing the missing ones
for package in required_packages:
    install_package(package)

# Ignore statsbombpy's NoAuthWarning for the rest of the session
import warnings
from statsbombpy.api_client import NoAuthWarning
warnings.simplefilter(action="ignore", category=NoAuthWarning)


pandas is already installed.
numpy is already installed.
statsbombpy is already installed.


### Extracting Formations and Player Roles from StatsBomb Open Data

Here, we iterate over **all competitions and seasons** available in the StatsBomb open data (accessed through the `statsbombpy` library) to extract:

- The **formations** (e.g., 4-3-3, 3-5-2, ...) used by teams in every match
- The **player roles/positions** (e.g., Left Back, Attacking Midfield, ...) assigned in the starting lineups

By doing this, we aim to:

1. **Validate** whether our custom set of tactical formations and role abbreviations covers all real-world cases
2. **Identify missing roles or rare formations** that we may want to include or support
3. **Build a complete reference** of how real teams are structured in StatsBomb data

We will also count how many matches were skipped due to data errors (if any), and optionally map full role names to standard abbreviations like `CM`, `CB`, `RW`, etc.

In [9]:
from statsbombpy import sb
import pandas as pd
from tqdm.notebook import tqdm

# Initialize containers to store unique formations and player roles
all_formations = set()
all_roles = set()

# Counter for matches that still could not be processed after all retries
skipped_matches = 0

# Event downloads can fail with transient errors (e.g. an HTTP 503 from the data
# host). Each match is therefore attempted a few times, with a growing pause,
# before it is counted as skipped - otherwise a momentary network hiccup would
# silently remove matches from the survey.
MAX_EVENT_FETCH_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 2


def load_events_with_retry(match_id, attempts=MAX_EVENT_FETCH_ATTEMPTS, delay=RETRY_DELAY_SECONDS):
    """Download the events of one match, retrying when the download fails.

    Args:
        match_id: Match identifier passed to ``sb.events``.
        attempts: Maximum number of download attempts (at least one is made).
        delay: Base pause in seconds; after the k-th failed attempt the
            function sleeps ``delay * k`` seconds before trying again.

    Returns:
        Whatever ``sb.events`` returns for the match.

    Raises:
        Exception: The error raised by the final attempt when all attempts fail.
    """
    attempts = max(1, attempts)
    for attempt in range(1, attempts + 1):
        try:
            return sb.events(match_id=match_id)
        except Exception:
            # The error type cannot be told apart here (transient vs. permanent),
            # so every failure is retried until the attempts are used up.
            if attempt == attempts:
                raise
            time.sleep(delay * attempt)


# Load all available competitions from StatsBomb
comps = sb.competitions()

# Order competitions by season name (a string sort, which is close to chronological)
comps = comps.sort_values(by='season_name')

# Loop over each competition and season
for _, comp in tqdm(comps.iterrows(), total=len(comps), desc="Competitions"):
    comp_id = comp['competition_id']
    season_id = comp['season_id']
    comp_name = comp['competition_name']
    season_name = comp['season_name']

    print(f"\nProcessing: {comp_name} - {season_name}")

    try:
        # Load all matches for this competition and season
        matches = sb.matches(competition_id=comp_id, season_id=season_id)
    except Exception as e:
        print(f"Failed to load matches for {comp_name} - {season_name}: {e}")
        continue

    # Loop over each match
    for match_id in tqdm(matches['match_id'].unique(), desc="  ↳ Matches", leave=False):
        try:
            # Load the event data for this match (with retries)
            events = load_events_with_retry(match_id)

            # Keep only the "Starting XI" events (one per team)
            starting_xi = events[events['type'] == 'Starting XI']

            for tactics in starting_xi["tactics"]:
                # Record the formation used by this team
                formation = tactics.get("formation")
                if formation:
                    all_formations.add(str(formation))

                # Record the role of every player in the starting lineup
                for player in tactics.get("lineup", []):
                    role = player.get("position", {}).get("name")
                    if role:
                        all_roles.add(role)

        except Exception as e:
            # Download failed on every attempt, or the data was malformed:
            # log the match ID and move on to the next match
            skipped_matches += 1
            print(f"Skipped match {match_id}: {e}")

# Print the final list of unique formations found across all competitions
print("\nUnique formations found:")
for f in sorted(all_formations):
    print("-", f)

# Print the final list of unique player roles found
print("\nUnique player roles found:")
for r in sorted(all_roles):
    print("-", r)

# Summary of skipped matches
print(f"\nTotal skipped matches due to errors: {skipped_matches}")

Competitions:   0%|          | 0/74 [00:00<?, ?it/s]


Processing: FIFA World Cup - 1958


  ↳ Matches:   0%|          | 0/2 [00:00<?, ?it/s]


Processing: FIFA World Cup - 1962


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: FIFA World Cup - 1970


  ↳ Matches:   0%|          | 0/6 [00:00<?, ?it/s]


Processing: Champions League - 1970/1971


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 1971/1972


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 1972/1973


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 1973/1974


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: FIFA World Cup - 1974


  ↳ Matches:   0%|          | 0/6 [00:00<?, ?it/s]


Processing: North American League - 1977


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Copa del Rey - 1977/1978


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: FIFA U20 World Cup - 1979


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Liga Profesional - 1981


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Copa del Rey - 1982/1983


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Copa del Rey - 1983/1984


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: FIFA World Cup - 1986


  ↳ Matches:   0%|          | 0/3 [00:00<?, ?it/s]


Processing: Serie A - 1986/1987


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: UEFA Europa League - 1988/1989


  ↳ Matches:   0%|          | 0/3 [00:00<?, ?it/s]


Processing: FIFA World Cup - 1990


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Liga Profesional - 1997/1998


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 1999/2000


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Premier League - 2003/2004


  ↳ Matches:   0%|          | 0/38 [00:00<?, ?it/s]


Processing: Champions League - 2003/2004


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 2004/2005


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2004/2005


  ↳ Matches:   0%|          | 0/7 [00:00<?, ?it/s]


Processing: La Liga - 2005/2006


  ↳ Matches:   0%|          | 0/17 [00:00<?, ?it/s]


Processing: Champions League - 2006/2007


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2006/2007


  ↳ Matches:   0%|          | 0/26 [00:00<?, ?it/s]


Processing: La Liga - 2007/2008


  ↳ Matches:   0%|          | 0/28 [00:00<?, ?it/s]


Processing: Champions League - 2008/2009


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2008/2009


  ↳ Matches:   0%|          | 0/31 [00:00<?, ?it/s]


Processing: La Liga - 2009/2010


  ↳ Matches:   0%|          | 0/35 [00:00<?, ?it/s]


Processing: Champions League - 2009/2010


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2010/2011


  ↳ Matches:   0%|          | 0/33 [00:00<?, ?it/s]


Processing: Champions League - 2010/2011


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 2011/2012


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2011/2012


  ↳ Matches:   0%|          | 0/37 [00:00<?, ?it/s]


Processing: Champions League - 2012/2013


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2012/2013


  ↳ Matches:   0%|          | 0/32 [00:00<?, ?it/s]


Processing: La Liga - 2013/2014


  ↳ Matches:   0%|          | 0/31 [00:00<?, ?it/s]


Processing: Champions League - 2013/2014


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 2014/2015


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2014/2015


  ↳ Matches:   0%|          | 0/38 [00:00<?, ?it/s]


Processing: 1. Bundesliga - 2015/2016


  ↳ Matches:   0%|          | 0/306 [00:00<?, ?it/s]


Processing: Ligue 1 - 2015/2016


  ↳ Matches:   0%|          | 0/377 [00:00<?, ?it/s]


Processing: Premier League - 2015/2016


  ↳ Matches:   0%|          | 0/380 [00:00<?, ?it/s]


Processing: Champions League - 2015/2016


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2015/2016


  ↳ Matches:   0%|          | 0/380 [00:00<?, ?it/s]


Processing: Serie A - 2015/2016


  ↳ Matches:   0%|          | 0/380 [00:00<?, ?it/s]

Skipped match 3879551: 503 Server Error: between bytes timeout for url: https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/3879551.json
Skipped match 3879550: 503 Server Error: between bytes timeout for url: https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/3879550.json
Skipped match 3879549: 503 Server Error: between bytes timeout for url: https://raw.githubusercontent.com/statsbomb/open-data/master/data/events/3879549.json

Processing: La Liga - 2016/2017


  ↳ Matches:   0%|          | 0/34 [00:00<?, ?it/s]


Processing: Champions League - 2016/2017


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Champions League - 2017/2018


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: La Liga - 2017/2018


  ↳ Matches:   0%|          | 0/36 [00:00<?, ?it/s]


Processing: NWSL - 2018


  ↳ Matches:   0%|          | 0/36 [00:00<?, ?it/s]


Processing: FIFA World Cup - 2018


  ↳ Matches:   0%|          | 0/64 [00:00<?, ?it/s]


Processing: La Liga - 2018/2019


  ↳ Matches:   0%|          | 0/34 [00:00<?, ?it/s]


Processing: FA Women's Super League - 2018/2019


  ↳ Matches:   0%|          | 0/108 [00:00<?, ?it/s]


Processing: Champions League - 2018/2019


  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing: Women's World Cup - 2019


  ↳ Matches:   0%|          | 0/52 [00:00<?, ?it/s]


Processing: FA Women's Super League - 2019/2020


  ↳ Matches:   0%|          | 0/87 [00:00<?, ?it/s]


Processing: La Liga - 2019/2020


  ↳ Matches:   0%|          | 0/33 [00:00<?, ?it/s]


Processing: UEFA Euro - 2020


  ↳ Matches:   0%|          | 0/51 [00:00<?, ?it/s]


Processing: La Liga - 2020/2021


  ↳ Matches:   0%|          | 0/35 [00:00<?, ?it/s]


Processing: FA Women's Super League - 2020/2021


  ↳ Matches:   0%|          | 0/131 [00:00<?, ?it/s]


Processing: Ligue 1 - 2021/2022


  ↳ Matches:   0%|          | 0/26 [00:00<?, ?it/s]


Processing: Indian Super league - 2021/2022


  ↳ Matches:   0%|          | 0/115 [00:00<?, ?it/s]


Processing: FIFA World Cup - 2022


  ↳ Matches:   0%|          | 0/64 [00:00<?, ?it/s]


Processing: UEFA Women's Euro - 2022


  ↳ Matches:   0%|          | 0/31 [00:00<?, ?it/s]


Processing: Ligue 1 - 2022/2023


  ↳ Matches:   0%|          | 0/32 [00:00<?, ?it/s]


Processing: Major League Soccer - 2023


  ↳ Matches:   0%|          | 0/6 [00:00<?, ?it/s]


Processing: Women's World Cup - 2023


  ↳ Matches:   0%|          | 0/64 [00:00<?, ?it/s]


Processing: African Cup of Nations - 2023


  ↳ Matches:   0%|          | 0/52 [00:00<?, ?it/s]


Processing: 1. Bundesliga - 2023/2024


  ↳ Matches:   0%|          | 0/34 [00:00<?, ?it/s]


Processing: Copa America - 2024


  ↳ Matches:   0%|          | 0/32 [00:00<?, ?it/s]


Processing: UEFA Euro - 2024


  ↳ Matches:   0%|          | 0/51 [00:00<?, ?it/s]


Unique formations found:
- 3142
- 32221
- 3232
- 3412
- 3421
- 343
- 3511
- 352
- 41212
- 41221
- 4141
- 42121
- 42211
- 4222
- 4231
- 4312
- 4321
- 433
- 4411
- 442
- 451
- 5221
- 532
- 541

Unique player roles found:
- Center Attacking Midfield
- Center Back
- Center Defensive Midfield
- Center Forward
- Center Midfield
- Goalkeeper
- Left Attacking Midfield
- Left Back
- Left Center Back
- Left Center Forward
- Left Center Midfield
- Left Defensive Midfield
- Left Midfield
- Left Wing
- Left Wing Back
- Right Attacking Midfield
- Right Back
- Right Center Back
- Right Center Forward
- Right Center Midfield
- Right Defensive Midfield
- Right Midfield
- Right Wing
- Right Wing Back
- Secondary Striker

Total skipped matches due to errors: 3


In [10]:
# Write each collection, sorted, to its own single-column CSV (no index, no header row)
for values, csv_path in (
    (all_formations, "../Csv/unique_formations.csv"),
    (all_roles, "../Csv/unique_roles.csv"),
):
    pd.Series(sorted(values)).to_csv(csv_path, index=False, header=False)

### Extracting Player Roles per Formation from StatsBomb Open Data

In the next section, we extract the unique player roles associated with each formation used in real matches, based on the official StatsBomb dataset. 

For every formation (e.g., "4-3-3", "4-2-3-1", "3-5-2"), we record the player roles of the first starting lineup found with that formation, provided that lineup contains at least 10 distinct roles; the search stops as soon as every formation has such a reference lineup. This allows us to validate and cross-check our own formation definitions against real-world data and ensure we are modeling player positions correctly in our reinforcement learning environment.

In [11]:
# Load formations from CSV
target_formations = pd.read_csv("../Csv/unique_formations.csv", header=None)[0].astype(str).tolist()
# Set view of the targets: constant-time membership tests and a cheap
# "every formation has been found" comparison inside the loops
target_formation_set = set(target_formations)

# A lineup is only accepted as the reference for its formation when it has at
# least this many distinct roles.
# NOTE(review): presumably this filters out lineups with incomplete position data - confirm.
MIN_DISTINCT_ROLES = 10

# formation -> sorted role names of the first accepted lineup using that formation
formation_roles = {}
found_formations = set()

# Sort competitions by season
comps = sb.competitions().sort_values(by='season_name')

# Loop through competitions and matches
for _, comp in tqdm(comps.iterrows(), total=len(comps), desc="Competitions"):
    comp_id = comp["competition_id"]
    season_id = comp["season_id"]

    # Load all matches for this competition and season.
    # `except Exception` (not a bare `except`) keeps the kernel interruptible:
    # KeyboardInterrupt is not swallowed by these handlers.
    try:
        matches = sb.matches(competition_id=comp_id, season_id=season_id)
    except Exception as e:
        print(f"Failed to load matches for competition {comp_id}, season {season_id}: {e}")
        continue

    # Loop through matches
    for match_id in tqdm(matches["match_id"].unique(), desc="  ↳ Matches", leave=False):
        try:
            # Load the event data for this match and keep the Starting XI tactics
            events = sb.events(match_id=match_id)
            starting_xi = events[events["type"] == "Starting XI"]
            lineup_tactics = list(starting_xi["tactics"])
        except Exception as e:
            print(f"Skipped match {match_id}: {e}")
            continue

        # Loop through starting XI events (one per team)
        for tactics in lineup_tactics:
            try:
                # Extract formation and lineup
                formation = str(tactics.get("formation"))
                lineup = tactics.get("lineup", [])

                # Only formations we are looking for and have not recorded yet
                if formation in target_formation_set and formation not in found_formations:
                    role_names = (player.get("position", {}).get("name") for player in lineup)
                    roles = {name for name in role_names if name}
                    if len(roles) >= MIN_DISTINCT_ROLES:
                        formation_roles[formation] = sorted(roles)
                        found_formations.add(formation)
            except Exception:
                # Malformed tactics/lineup entry: ignore this lineup only
                continue

            # Stop early if all found
            if found_formations == target_formation_set:
                break
        # Stop early if all found
        if found_formations == target_formation_set:
            break
    # Stop early if all found
    if found_formations == target_formation_set:
        break

Competitions:   0%|          | 0/74 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/2 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/6 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/6 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/3 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/3 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/38 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/7 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/17 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/26 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/28 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/31 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/35 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/33 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/37 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/32 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/31 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/38 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/306 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/377 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/380 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/380 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/380 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/34 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/1 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/36 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/36 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/64 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/34 [00:00<?, ?it/s]

  ↳ Matches:   0%|          | 0/108 [00:00<?, ?it/s]

In [26]:
# Display order for roles: goalkeeper, then defenders, central/defensive
# midfield and wing-backs, wide players, attacking midfielders, forwards
role_order = [
    "Goalkeeper",

    "Left Back", "Left Center Back", "Center Back", "Right Center Back", "Right Back",

    "Left Wing Back", "Left Defensive Midfield", "Left Center Midfield", "Center Defensive Midfield", "Center Midfield", "Center Attacking Midfield", "Right Center Midfield", "Right Defensive Midfield", "Right Wing Back",

    "Left Midfield", "Right Midfield", "Left Wing", "Right Wing",

    "Left Attacking Midfield", "Right Attacking Midfield",

    "Left Center Forward", "Center Forward", "Right Center Forward"
]


# Re-order each formation's roles: roles listed in role_order come first (in
# that order); any role missing from role_order keeps its position at the end
for formation in list(formation_roles):
    present_roles = list(formation_roles[formation])
    known_roles = [role for role in role_order if role in present_roles]
    unlisted_roles = [role for role in present_roles if role not in role_order]
    formation_roles[formation] = known_roles + unlisted_roles

# Save to CSV: a single "Formation" column holding "<formation>, <role> | <role> | ..."
csv_rows = []
for formation_name, ordered in formation_roles.items():
    csv_rows.append({"Formation": f"{formation_name}, {' | '.join(ordered)}"})
df = pd.DataFrame(csv_rows)
df.to_csv("../Csv/formation_roles.csv", index=False)