In [3]:
# %pip install -q statsbombpy==1.16.0   # uncomment and run once to install the dependency into this kernel's environment

Collecting statsbombpy
  Using cached statsbombpy-1.16.0-py3-none-any.whl.metadata (63 kB)
Collecting requests-cache (from statsbombpy)
  Using cached requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting inflect (from statsbombpy)
  Using cached inflect-7.5.0-py3-none-any.whl.metadata (24 kB)
Collecting joblib (from statsbombpy)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting more_itertools>=8.5.0 (from inflect->statsbombpy)
  Downloading more_itertools-10.7.0-py3-none-any.whl.metadata (37 kB)
Collecting typeguard>=4.0.1 (from inflect->statsbombpy)
  Using cached typeguard-4.4.4-py3-none-any.whl.metadata (3.3 kB)
Collecting cattrs>=22.2 (from requests-cache->statsbombpy)
  Using cached cattrs-25.1.1-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache->statsbombpy)
  Using cached url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Using cached statsbombpy-1.16.0-py3-none-any.whl (16 kB)
Using cached inflect-7.5.

In [9]:
import os
import warnings
import pandas as pd
import statsbombpy.sb as sb

# Defining Constants
DEFAULT_SEASON = '2015/2016'
TOP5_LEAGUES = ['Italy', 'England', 'Spain', 'Germany', 'France']

# Create data directory in current working directory
DATA_DIR = os.path.join(os.getcwd(), 'data')
    
warnings.filterwarnings("ignore", category=UserWarning, module="statsbombpy")

def collect_events_data(league, save_path=DATA_DIR, season_name=DEFAULT_SEASON, save_files=True):
    """
    Retrieves and saves all event data from selected league for specified season.
    
    Parameters:
    -----------
    league : str or list
        League name or list of leagues (e.g. 'Italy', 'England', 'Spain', 'Germany', 'France')
    save_path: str, optional
        Target file save path (default: 'data' folder in current directory)
    season_name: str, optional
        Season name (default: '2015/2016')
    save_files: bool, optional
        Whether to save CSV files (default: True)
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with all events
    """
    # Create directory if it doesn't exist
    if save_files and not os.path.exists(save_path):
        os.makedirs(save_path)
        print(f"Created directory: {save_path}")
    
    # Handle single league or list of leagues
    if isinstance(league, str):
        leagues_to_process = [league]
    else:
        leagues_to_process = league
    
    all_events_data = []
    
    # Process each league separately
    for current_league in leagues_to_process:
        print(f"\nStarting data retrieval from league: {current_league}")
        
        # Retrieve league data
        try:
            free_comps = sb.competitions()
            
            # Filter selected league
            league_data = free_comps[(free_comps['season_name']==season_name) & 
                               (free_comps['country_name']==current_league)]
            
            if league_data.empty:
                print(f"No data found for league {current_league} in season {season_name}. Skipping.")
                continue
            
            competitions = list(league_data['competition_id'])
            
            # Retrieve match IDs
            season_id = league_data['season_id'].iloc[0]
            all_matches = pd.concat([sb.matches(competition_id=comp_id, season_id=season_id) 
                                  for comp_id in competitions])
            matches_id = list(all_matches['match_id'])
            print(f"Found {len(matches_id)} matches to analyze")
        except Exception as e:
            print(f"Error retrieving matches for league {current_league}: {str(e)}")
            continue
        
        # Retrieve event data
        event_data = []
        
        for idx, match_id in enumerate(matches_id):
            try:
                print(f"Processing match {idx+1}/{len(matches_id)}", end='\r')
                
                # Get all events for this match
                events = sb.events(match_id=match_id)
                
                # Add match_id to the events for tracking
                if not events.empty:
                    events['match_id'] = match_id
                    event_data.append(events)
                                
            except Exception as e:
                print(f"\nError with match {match_id}: {str(e)}")
                continue
        
        if event_data:
            # Combine data from this league
            print("\nCombining data...")
            league_events = pd.concat(event_data, ignore_index=True)
            
            # Basic data info
            print(f"Total events collected: {len(league_events)}")
            print(f"Event types found: {league_events['type'].nunique()}")
            print(f"Most common events:")
            print(league_events['type'].value_counts().head(10))
            
            # Save file for this league
            if save_files:
                # Create proper filename
                season_str = season_name.replace("/", "_")
                output_filename = os.path.join(save_path, f'all_events_{current_league}_{season_str}.csv')
                league_events.to_csv(output_filename, index=False)
                print(f"Data saved to file: {output_filename}")
            
            # Add to collective data
            all_events_data.append(league_events)
    
    # Combine data from all leagues if there's more than one
    if len(all_events_data) > 0:
        all_events = pd.concat(all_events_data, ignore_index=True)
        
        # Print summary statistics
        print(f"\n=== SUMMARY STATISTICS ===")
        print(f"Total events: {len(all_events)}")
        print(f"Total matches: {all_events['match_id'].nunique()}")
        print(f"Event types: {all_events['type'].nunique()}")
        print(f"\nTop 15 event types:")
        print(all_events['type'].value_counts().head(15))
        
        # Save collective file if more than one league was processed
        if save_files and len(leagues_to_process) > 1:
            season_str = season_name.replace("/", "_")
            output_filename = os.path.join(save_path, f'all_events_combined_{season_str}.csv')
            all_events.to_csv(output_filename, index=False)
            print(f"\nCollective data saved to file: {output_filename}")
        
        return all_events
    else:
        print("No data retrieved.")
        return None

def analyze_events_for_xt_model(events_df):
    """
    Analyze events data for xT model preprocessing insights.
    
    Parameters:
    -----------
    events_df : pd.DataFrame
        DataFrame with all events
    """
    if events_df is None or events_df.empty:
        print("No data to analyze")
        return
    
    print(f"\n=== ANALYSIS FOR xT MODEL ===")
    
    # Essential columns for xT model
    essential_cols = ['type', 'location', 'possession', 'possession_team', 'period', 'minute', 'second']
    missing_cols = [col for col in essential_cols if col not in events_df.columns]
    if missing_cols:
        print(f"WARNING: Missing essential columns: {missing_cols}")
    
    # Event types analysis
    print(f"\nTotal unique event types: {events_df['type'].nunique()}")
    print("Event type distribution:")
    event_counts = events_df['type'].value_counts()
    for event_type, count in event_counts.head(20).items():
        pct = (count / len(events_df)) * 100
        print(f"  {event_type}: {count:,} ({pct:.1f}%)")
    
    # Location data analysis
    if 'location' in events_df.columns:
        events_with_location = events_df.dropna(subset=['location'])
        print(f"\nEvents with location data: {len(events_with_location):,} ({len(events_with_location)/len(events_df)*100:.1f}%)")
    
    # Possession analysis
    if 'possession' in events_df.columns:
        possessions = events_df['possession'].nunique()
        avg_events_per_possession = len(events_df) / possessions
        print(f"Total possessions: {possessions:,}")
        print(f"Average events per possession: {avg_events_per_possession:.1f}")
    
    # Goals analysis for class imbalance
    goals = events_df[events_df['type'] == 'Shot']['shot_outcome'] == 'Goal' if 'shot_outcome' in events_df.columns else 0
    if hasattr(goals, 'sum'):
        goal_count = goals.sum()
        shot_count = len(events_df[events_df['type'] == 'Shot'])
        if shot_count > 0:
            goal_rate = (goal_count / shot_count) * 100
            print(f"\nGoals: {goal_count}, Shots: {shot_count}, Goal rate: {goal_rate:.1f}%")

if __name__ == "__main__":
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
        print(f"Created data directory: {DATA_DIR}")
    

    
    # if events_df is not None:
    #     print(f"\nRetrieved data for {len(events_df):,} events from {len(TOP5_LEAGUES)} leagues")
        
    #     # Analyze data for xT model insights
    #     analyze_events_for_xt_model(events_df)
        
    #     # Save a sample for quick inspection
    #     sample_size = min(1000, len(events_df))
    #     sample_df = events_df.sample(n=sample_size, random_state=42)
    #     sample_filename = os.path.join(DATA_DIR, f'events_sample_{sample_size}.csv')
    #     sample_df.to_csv(sample_filename, index=False)
    #     print(f"\nSample of {sample_size} events saved to: {sample_filename}")
    


In [None]:
# Retrieve data for TOP5 leagues
print(f"Starting data retrieval for leagues: {', '.join(TOP5_LEAGUES)}")
events_df = collect_events_data("Italy", save_path=DATA_DIR)

Starting data retrieval for leagues: Italy, England, Spain, Germany, France

Starting data retrieval from league: Italy
Found 380 matches to analyze
Processing match 107/380