In [1]:
import warnings
import sys
import numpy as np
from tqdm import tqdm
import pandas as pd
from statsbombpy.api_client import NoAuthWarning
warnings.simplefilter('ignore', NoAuthWarning)

from socceraction.data.statsbomb import StatsBombLoader

api = StatsBombLoader(getter="remote", creds=None)
from statsbombpy import sb 


In [2]:
def get_statsbomb_data(competition_name='Champions League'):

    competitions = pd.DataFrame(sb.competitions())
    
    comp_seasons = competitions[
        (competitions['competition_name'] == competition_name) &
        (competitions['competition_gender'] == 'male') &
        (competitions['competition_youth'] == False)
    ]

    full_metadata = []
    full_events = {}
    full_lineups = {}

    for _, row in tqdm(comp_seasons.iterrows(), total=len(comp_seasons), desc=f"Fetching all seasons for {competition_name}"):
        
        season_id = row["season_id"]
        comp_id = row["competition_id"]
        season_name = row["season_name"]

        try:
            matches = sb.matches(comp_id, season_id)
            for _, match in matches.iterrows():
                match_id = match["match_id"]

                try:
                    events = sb.events(match_id = match_id)
                    lineups = sb.lineups(match_id = match_id)

                    full_events[match_id] = events
                    full_lineups[match_id] = lineups
                    full_metadata.append({**row.to_dict(), **match.to_dict()})

                except:
                    print(f"Failed to get events for match {match_id}")
                    continue
        except:
            print(f"Failed to get matches for season {season_name}")
            continue

    finals_df = pd.DataFrame(full_metadata)
    return finals_df, full_events, full_lineups

cl_finals_df, cl_events, cl_lineups = get_statsbomb_data("Champions League")
epl_df, epl_events, epl_lineups = get_statsbomb_data("Premier League")
ity_df, ity_events, ity_lineups = get_statsbomb_data("Serie A")


Fetching all seasons for Champions League: 100%|██████████| 18/18 [00:13<00:00,  1.31it/s]
Fetching all seasons for Premier League: 100%|██████████| 2/2 [04:43<00:00, 141.81s/it]
Fetching all seasons for Serie A: 100%|██████████| 2/2 [06:05<00:00, 182.83s/it]


In [3]:
all_events_combined = pd.concat([ # MERGE UCL, EPL, AND SERIE A EVENTS
    pd.concat([df.assign(match_id = mid) for mid, df in cl_events.items()]),
    pd.concat([df.assign(match_id = mid) for mid, df in epl_events.items()]),
    pd.concat([df.assign(match_id = mid) for mid, df in ity_events.items()]),
], ignore_index=True)

# FLATTEN COLUMNS SINCE SOME ARE NESTED DICTIONARIES WHICH IS WACK ASF 

# Flatten location
all_events_combined['x'] = all_events_combined['location'].apply(lambda loc: loc[0] if isinstance(loc, list) else None)
all_events_combined['y'] = all_events_combined['location'].apply(lambda loc: loc[1] if isinstance(loc, list) else None)

# Flatten pass end location
all_events_combined['pass_end_x'] = all_events_combined['pass_end_location'].apply(lambda loc: loc[0] if isinstance(loc, list) and len(loc) > 1 else None)
all_events_combined['pass_end_y'] = all_events_combined['pass_end_location'].apply(lambda loc: loc[1] if isinstance(loc, list) and len(loc) > 1 else None)

# Flatten carry end location
all_events_combined['carry_end_x'] = all_events_combined['carry_end_location'].apply(lambda loc: loc[0] if isinstance(loc, list) and len(loc) > 1 else None)
all_events_combined['carry_end_y'] = all_events_combined['carry_end_location'].apply(lambda loc: loc[1] if isinstance(loc, list) and len(loc) > 1 else None)


  all_events_combined['x'] = all_events_combined['location'].apply(lambda loc: loc[0] if isinstance(loc, list) else None)
  all_events_combined['y'] = all_events_combined['location'].apply(lambda loc: loc[1] if isinstance(loc, list) else None)
  all_events_combined['pass_end_x'] = all_events_combined['pass_end_location'].apply(lambda loc: loc[0] if isinstance(loc, list) and len(loc) > 1 else None)
  all_events_combined['pass_end_y'] = all_events_combined['pass_end_location'].apply(lambda loc: loc[1] if isinstance(loc, list) and len(loc) > 1 else None)
  all_events_combined['carry_end_x'] = all_events_combined['carry_end_location'].apply(lambda loc: loc[0] if isinstance(loc, list) and len(loc) > 1 else None)
  all_events_combined['carry_end_y'] = all_events_combined['carry_end_location'].apply(lambda loc: loc[1] if isinstance(loc, list) and len(loc) > 1 else None)


Attempting to *predict* the Man of the Match (using the 2018 Champions League Final as an example)

I want to kinda create a report for each player in a match, kinda summarizing their performance frrom Offensive and Defensive perspective

For now just assign basic information available of the players


In [None]:
# -------- Aggregate action counts and xG --------


agg_df = all_events_combined.groupby(['match_id', 'possession']).agg(
    sequence_length=('type', 'size'),
    num_passes=('type', lambda x: (x == 'Pass').sum()),
    num_shots=('type', lambda x: (x == 'Shot').sum()),
    xg_total=('shot_statsbomb_xg', 'sum')
).reset_index()

# -------- Compute goal_occured and shot_outcome for each possession --------
shots = all_events_combined[all_events_combined['type'] == 'Shot']
shot_summary = shots.groupby(['match_id', 'possession']).agg(
    goal_occured = ('shot_outcome', lambda x: (x == 'Goal').any()),
    shot_outcome=('shot_outcome', 'first')
).reset_index()

agg_df = agg_df.merge(shot_summary, on=['match_id', 'possession'], how='left')

goal_occured = (shots['shot_outcome'] == 'Goal').any()

agg_df['goal_occurred'] = goal_occured

# print(agg_df["shot_outcome"].unique())
# sys.exit()

agg_df['goal_occurred'] = agg_df['goal_occurred'].fillna(False)
agg_df['shot_outcome'] = agg_df['shot_outcome'].fillna(False)


# --- 3. Get start and end events for each possession ---
start_events = all_events_combined.groupby(['match_id', 'possession']).head(1).copy()
end_events = all_events_combined.groupby(['match_id', 'possession']).tail(1).copy()

start_events = start_events[['match_id', 'possession', 'possession_team', 'period', 'minute', 'second', 'type', 'player', 'x', 'y']]
end_events = end_events[['match_id', 'possession', 'minute', 'second', 'type', 'pass_end_x', 'pass_end_y', 'x', 'y']]

start_events = start_events.rename(columns={
    'minute': 'start_minute', 'second': 'start_second', 'type': 'start_type',
    'player': 'start_player', 'x': 'start_x', 'y': 'start_y'
})
end_events = end_events.rename(columns={
    'minute': 'end_minute', 'second': 'end_second', 'type': 'end_type',
    'pass_end_x': 'end_x', 'pass_end_y': 'end_y', 'x': 'end_x_fallback', 'y': 'end_y_fallback'
})

# --- 4. Merge all together ---
summary = agg_df.merge(start_events, on=['match_id', 'possession'])
summary = summary.merge(end_events, on=['match_id', 'possession'])

# Use pass_end_x/y if available, else fallback to x/y
summary['end_x'] = summary['end_x'].combine_first(summary['end_x_fallback'])
summary['end_y'] = summary['end_y'].combine_first(summary['end_y_fallback'])
summary = summary.drop(['end_x_fallback', 'end_y_fallback'], axis=1)

# --- 5. Final possession summary DataFrame ---
possession_summary_df = summary

print(possession_summary_df)


        match_id  possession  sequence_length  num_passes  num_shots  \
0          18235           1                4           0          0   
1          18235           2               31          12          0   
2          18235           3                8           3          0   
3          18235           4                6           1          0   
4          18235           5                2           1          0   
...          ...         ...              ...         ...        ...   
160004   3889182         223               15           5          0   
160005   3889182         224               17           2          0   
160006   3889182         225                9           2          1   
160007   3889182         226                7           1          1   
160008   3889182         227                7           1          0   

        xg_total goal_occured shot_outcome  goal_occurred possession_team  \
0       0.000000          NaN        False           True 

Finalize the player summary for the match:
- Take in the KPI Groupings (Offense, Defense)
- Take in the Event that is being wanted, should I take in the match_id as well?

In [10]:

def summarize_player_kpis_by_group(events):
    player_summary_off = {}
    players = events['player'].dropna().unique()

    for player in players:
        player_events = events[events['player'] == player]
        player_id = player_events['player_id'].dropna().unique()

        summary = {
            'Player': player,
            'Team': player_events['team'].dropna().unique()[0] if 'team' in player_events.columns and not player_events['team'].dropna().empty else None,
            'Position': player_events['position'].dropna().unique()[0] if 'position' in player_events.columns and not player_events['position'].dropna().empty else None,
            'Player ID': int(player_id[0]) if len(player_id) > 0 else None
        }

        summary['Goals'] = (player_events['shot_outcome'] == 'Goal').sum() if 'shot_outcome' in player_events.columns else 0
        summary['Assists'] = player_events['pass_goal_assist'].fillna(False).sum() if 'pass_goal_assist' in player_events.columns else 0
        summary['Accumulated xG'] = round(player_events['shot_statsbomb_xg'].dropna().sum(), 3) if 'shot_statsbomb_xg' in player_events.columns else 0
        summary['Shots_On_Target'] = player_events['shot_outcome'].isin(['Goal', 'Saved']).sum() if 'shot_outcome' in player_events.columns else 0

        if 'dribble_outcome' in player_events.columns:
            dribble_data = player_events[~player_events['dribble_outcome'].isna()]
            dribbles_completed = (dribble_data['dribble_outcome'] == 'Complete').sum()
            dribbles_attempted = len(dribble_data)
            summary['Dribbles Completed'] = dribbles_completed
            summary['Dribbles Attempted'] = dribbles_attempted
            summary['Dribble Completion %'] = round(dribbles_completed / dribbles_attempted * 100, 2) if dribbles_attempted > 0 else 0
        else:
            summary['Dribbles Completed'] = 0
            summary['Dribbles Attempted'] = 0
            summary['Dribble Completion %'] = 0

        complete_passes = player_events['pass_recipient'].notna().sum() if 'pass_recipient' in player_events.columns else 0
        misplaced_passes = player_events['pass_outcome'].notna().sum() if 'pass_outcome' in player_events.columns else 0
        summary['Total Passes'] = complete_passes + misplaced_passes
        summary['Pass Completion Percentage'] = round(complete_passes / summary['Total Passes'] * 100, 2) if summary['Total Passes'] > 0 else 0

        summary['Clearances'] = (player_events['type'] == 'Clearance').sum() if 'type' in player_events.columns else 0
        summary['Aerial Duels Won'] = (player_events['clearance_aerial_won'] == 'Aerial Clearance that is was also an aerial duel won').sum() if 'clearance_aerial_won' in player_events.columns else 0

        if 'interception_outcome' in player_events.columns:
            interceptions = player_events[player_events['type'] == 'Interception'] if 'type' in player_events.columns else pd.DataFrame()
            successful_interceptions = ['Won', 'Success', 'Success In Play', 'Success Out']
            summary['Interceptions'] = interceptions['interception_outcome'].isin(successful_interceptions).sum()
        else:
            summary['Interceptions'] = 0

        if '50_50' in player_events.columns:
            fifty_fifty_events = player_events[~player_events['50_50'].isna()].copy()
            fifty_fifty_events['outcome_name'] = fifty_fifty_events['50_50'].apply(
                lambda x: x.get('outcome', {}).get('name') if isinstance(x, dict) else None
            )
            won = (fifty_fifty_events['outcome_name'] == 'Won').sum()
            lost = (fifty_fifty_events['outcome_name'] == 'Lost').sum()
            summary['Duels Won (50/50)'] = won
            summary['Duels Lost (50/50)'] = lost
            summary['50/50 Win %'] = round((won / (won + lost)) * 100, 2) if (won + lost) > 0 else 0
        else:
            summary['Duels Won (50/50)'] = 0
            summary['Duels Lost (50/50)'] = 0
            summary['50/50 Win %'] = 0

        if 'duel_type' in player_events.columns and 'duel_outcome' in player_events.columns:
            tackles = player_events[player_events['duel_type'] == 'Tackle']
            success_labels = ['Won', 'Success', 'Success In Play', 'Success Out']
            fail_labels = ['Lost In Play', 'Lost Out']
            summary['Tackles Attempted'] = len(tackles)
            summary['Successful Tackles'] = tackles['duel_outcome'].isin(success_labels).sum()
            summary['Unsuccessful Tackles'] = tackles['duel_outcome'].isin(fail_labels).sum()
            summary['Tackle Success %'] = round(summary['Successful Tackles'] / summary['Tackles Attempted'] * 100, 2) if summary['Tackles Attempted'] > 0 else 0
        else:
            summary['Tackles Attempted'] = 0
            summary['Successful Tackles'] = 0
            summary['Unsuccessful Tackles'] = 0
            summary['Tackle Success %'] = 0

        summary['Fouls Committed'] = player_events['foul_committed_type'].notna().sum() if 'foul_committed_type' in player_events.columns else 0

        player_summary_off[player] = summary

    return pd.DataFrame.from_dict(player_summary_off, orient='index').reset_index(drop=True)

all_events_with_labels = all_events_combined.merge(
    possession_summary_df,
    on=['match_id', 'possession'],
    suffixes=('', '_seq'),
    how='left'
)

print(all_events_with_labels)
print(all_events_with_labels.columns)

all_events_with_labels.to_csv("All_events_labeled.csv", index = False)

%store all_events_with_labels

        50_50 ball_receipt_outcome ball_recovery_recovery_failure  \
0         NaN                  NaN                            NaN   
1         NaN                  NaN                            NaN   
2         NaN                  NaN                            NaN   
3         NaN                  NaN                            NaN   
4         NaN                  NaN                            NaN   
...       ...                  ...                            ...   
2865557   NaN                  NaN                            NaN   
2865558   NaN                  NaN                            NaN   
2865559   NaN                  NaN                            NaN   
2865560   NaN                  NaN                            NaN   
2865561   NaN                  NaN                            NaN   

        block_deflection block_offensive carry_end_location  \
0                    NaN             NaN                NaN   
1                    NaN             NaN     

I want to now pivot to sequence modeling, aka how Quantify how each individual action (pass, dribble, interception, etc.) contributes to the likelihood of a valuable outcome (like a goal or shot).
- Not all passes are equal
- A sideways pass in your own half = low value
- A through ball that breaks the defensive line = high value

Assigning a numerical "impact value" to each event in the match based on its contribution toward future success (e.g., a shot, goal, or high xG chance).

1. The game as a sequence of actions.

2. Each action shifts the probability of scoring (or conceding) in the near future.

3. You calculate the change in scoring probability before and after each action.

Impact of an action = P(goal | after action) – P(goal | before action): Conditional Probability, how the probability of an event changes given a certain condition
 - Captures the true value of buildip actions that don't directly lead to a goal but "move the needle"
 - Enables comparability across action types

Data Format: 
: Event level data (StatsBomb)
: For each action:
 - Match ID, player, timestamp, team
 - Action Type (Pass, shot, Dribble, etc. ) [We can focus on those three for now]
 - Start and end location
 - Possession Sequence ID: Every touch or action by a team between the time they win the ball and the time they lose it.
 - Outcome (successful, failed)
 - xG (for shots), pass outcome

Sequence Visualization on a Pitch
Input: A sequence of events from match data
Output: A plotted graphic of those actions mapped on a soccer pitch

Step 1: Data Collection & Structure
- Pull Event data from Statsbomb
- Ensure each row represents one action
 - Features: type, x, y, end_x, end_y, team, player, minute, outcome
 - Context: match_id, period, timestamp, possession, index in possession

Step 2: Create Possession Sequences
Goal: Group events into sequences of continuous possession by one team

- Use the possession field from StatsBomb or generate ones own
- Label each sequence with:
 - Posession_id
 - Whether it leads to a shot, goal or valuable outcome

Step 3: Define the Target Variable (action value)
What are you predicting? 
Possible Options:
 - xT Delta = xT(after action) - xT(before action)
 - P(goal within N actions) = Binary Classification label 
 - Expected Goal Contribution = Weighted sum of shot probabilities
 - Expected Threat (xT) = Location based scoring potential

Step 4: Feature Engineering:
Goal: Capture action context and predictive cues

-----------------------------------------------
Feature Type
Spatial : x, y, end_x, end_y, zones, angles
Action Type : Pass, dribble, shot, interception
Outcome : Success/Failure
Temporal : Minute, time since possession start
Player Context : Previous actions, touches in game
Team Context : Possession Length, Score differential

Step 5: Model Training

Step 6: Calculate Action Value
Goal: Use the model to assign a value to each action

Step 7: Visualization & Sequence Plotting

