In [1]:
import os
import pandas as pd
import warnings
from statsbombpy import sb

In [2]:
# Notebook-wide configuration: default season, league list, output folder
DEFAULT_SEASON = '2015/2016'
TOP5_LEAGUES = [
    'Italy',
    'England',
    'Spain',
    'Germany',
    'France',
]

# `__file__` is only defined when this code runs as a script. In a notebook
# kernel the lookup raises NameError, so the 'data' folder is resolved next to
# the parent of the current working directory instead.
try:
    DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
except NameError:
    DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Silence UserWarning messages raised from the statsbombpy package
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="statsbombpy",
)

In [None]:
def collect_shots_data(league, save_path=DATA_DIR, season_name=DEFAULT_SEASON, save_files=True):
    """
    Retrieve shot events for one or more leagues and optionally save them as CSV.

    For every requested league, the competitions whose ``season_name`` and
    ``country_name`` match are looked up, their matches are listed, and the
    events of each match are filtered down to rows whose ``type`` is ``'Shot'``.
    Leagues or matches that fail to download are reported on stdout and skipped.

    Parameters:
    -----------
    league : str or iterable of str
        League country name or several of them
        (e.g. 'Italy', 'England', 'Spain', 'Germany', 'France')
    save_path : str, optional
        Directory the CSV files are written to (default: module-level DATA_DIR).
        It is created when missing and ``save_files`` is True.
    season_name : str, optional
        Season name as it appears in the competitions table
        (default: DEFAULT_SEASON)
    save_files : bool, optional
        Whether to save CSV files (default: True). One file is written per
        league, plus a combined file when more than one league was requested.

    Returns:
    --------
    pd.DataFrame or None
        DataFrame with all retrieved shots. Row labels are carried over from
        the per-match frames and are not reset. None when nothing was retrieved.
    """
    # Create directory if it doesn't exist. exist_ok covers the case where the
    # directory appears between the check and the call.
    if save_files and not os.path.exists(save_path):
        os.makedirs(save_path, exist_ok=True)
        print(f"Created directory: {save_path}")

    # Handle a single league or any iterable of leagues. Materialising the
    # iterable keeps the len() check for the combined file valid for generators.
    if isinstance(league, str):
        leagues_to_process = [league]
    else:
        leagues_to_process = list(league)

    # "/" (as in '2015/2016') cannot be part of a file name
    season_str = season_name.replace("/", "_") if save_files else None

    # Competitions table; it does not depend on the league, so it is fetched
    # once and reused. It stays None after a failed download so the next
    # league triggers another attempt.
    free_comps = None

    all_shots_data = []

    # Process each league separately
    for current_league in leagues_to_process:
        print(f"\nStarting data retrieval from league: {current_league}")

        # Retrieve league data
        try:
            if free_comps is None:
                free_comps = sb.competitions()

            # Filter selected league
            league_data = free_comps[(free_comps['season_name'] == season_name) &
                                     (free_comps['country_name'] == current_league)]

            if league_data.empty:
                print(f"No data found for league {current_league} in season {season_name}. Skipping.")
                continue

            # Retrieve match IDs, pairing every competition with the season_id
            # of its own row instead of reusing the first row's value
            competition_seasons = zip(league_data['competition_id'], league_data['season_id'])
            all_matches = pd.concat([sb.matches(competition_id=comp_id, season_id=season_id)
                                     for comp_id, season_id in competition_seasons])
            matches_id = list(all_matches['match_id'])
            print(f"Found {len(matches_id)} matches to analyze")
        except Exception as e:
            # Best effort: report the problem and move on to the next league
            print(f"Error retrieving matches for league {current_league}: {str(e)}")
            continue

        # Retrieve shot data
        shot_data = []
        total_matches = len(matches_id)

        for idx, match_id in enumerate(matches_id):
            try:
                # '\r' keeps the progress counter on a single output line
                print(f"Processing match {idx+1}/{total_matches}", end='\r')

                events = sb.events(match_id=match_id)
                shots = events[events['type'] == 'Shot']

                # Add only non-empty dataframes
                if not shots.empty:
                    shot_data.append(shots)

            except Exception as e:
                # A single failing match must not abort the whole league
                print(f"\nError with match {match_id}: {str(e)}")
                continue

        if shot_data:
            # Combine data from this league
            print("\nCombining data...")
            league_shots = pd.concat(shot_data)

            # Save file for this league
            if save_files:
                output_filename = os.path.join(save_path, f'all_shots_{current_league}_{season_str}.csv')
                league_shots.to_csv(output_filename, index=False)
                print(f"Data saved to file: {output_filename}")

            # Add to collective data
            all_shots_data.append(league_shots)

    if not all_shots_data:
        print("No data retrieved.")
        return None

    # Combine data from all leagues
    all_shots = pd.concat(all_shots_data)

    # Save collective file if more than one league was requested
    if save_files and len(leagues_to_process) > 1:
        output_filename = os.path.join(save_path, f'all_shots_combined_{season_str}.csv')
        all_shots.to_csv(output_filename, index=False)
        print(f"\nCollective data saved to file: {output_filename}")

    return all_shots

if __name__ == "__main__":
    # Make sure the output folder is in place before any download starts
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
        print(f"Created data directory: {DATA_DIR}")

    # Download the shots of every TOP5 league into DATA_DIR
    print(f"Starting data retrieval for leagues: {', '.join(TOP5_LEAGUES)}")
    shots_df = collect_shots_data(
        league=TOP5_LEAGUES,
        save_path=DATA_DIR,
    )

    # collect_shots_data returns None when nothing could be retrieved
    if shots_df is not None:
        print(f"Retrieved data for {len(shots_df)} shots from {len(TOP5_LEAGUES)} leagues")

Starting data retrieval for leagues: Italy, England, Spain, Germany, France

Starting data retrieval from league: Italy
Found 380 matches to analyze
Processing match 380/380
Combining data...
Data saved to file: C:\Users\barto\Football-xG-Predictor\data\all_shots_Italy_2015_2016.csv

Starting data retrieval from league: England
Found 380 matches to analyze
Processing match 380/380
Combining data...
Data saved to file: C:\Users\barto\Football-xG-Predictor\data\all_shots_England_2015_2016.csv

Starting data retrieval from league: Spain
Found 380 matches to analyze
Processing match 380/380
Combining data...
Data saved to file: C:\Users\barto\Football-xG-Predictor\data\all_shots_Spain_2015_2016.csv

Starting data retrieval from league: Germany
Found 306 matches to analyze
Processing match 306/306
Combining data...
Data saved to file: C:\Users\barto\Football-xG-Predictor\data\all_shots_Germany_2015_2016.csv

Starting data retrieval from league: France
Found 377 matches to analyze
Processing