# Flagrant Fouls Data Collection

Extract flagrant foul data from NBA API for seasons 2020-21 through 2024-25.

- Loads existing CSV to avoid duplicate game IDs
- Fetches new games from API
- Saves to CSV immediately after each successful API call
- Stops on first read timeout (1 hour cooldown needed)

In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
from pathlib import Path
from nba_api.stats.endpoints.leaguegamefinder import LeagueGameFinder
from nba_api.stats.endpoints.playbyplayv3 import PlayByPlayV3
import warnings
warnings.filterwarnings('ignore')

## 1. Load Existing Data

In [2]:
csv_file = Path('nba_flagrant_fouls.csv')
skipped_file = Path('nba_skipped_games.csv')

# Load existing data if it exists
if csv_file.exists():
    existing_df = pd.read_csv(csv_file, dtype={'game_id': str})
    # Ensure game_id is string and pad to 10 digits
    existing_df['game_id'] = existing_df['game_id'].astype(str).str.zfill(10)
    existing_game_ids = set(existing_df['game_id'].unique())
    print(f"Loaded existing CSV with {len(existing_df)} games")
    print(f"Unique game IDs to skip: {len(existing_game_ids)}")
else:
    existing_df = pd.DataFrame()
    existing_game_ids = set()
    print("No existing CSV found. Starting fresh.")

# Load skipped games if exists
if skipped_file.exists():
    skipped_df = pd.read_csv(skipped_file, dtype={'game_id': str})
    skipped_df['game_id'] = skipped_df['game_id'].astype(str).str.zfill(10)
    existing_game_ids.update(skipped_df['game_id'].unique())
    print(f"Loaded {len(skipped_df)} previously skipped games")
else:
    skipped_df = pd.DataFrame()
    print("No skipped games file found.")

Loaded existing CSV with 2024 games
Unique game IDs to skip: 2024
No skipped games file found.


## 2. Define Data Extraction Function

In [3]:
def extract_game_data(game_id):
    """
    Extract flagrant fouls and game outcome from a single game.

    Args:
        game_id: NBA game ID

    Returns:
        tuple: (game_data dict, error) - one will be None
    """
    try:
        response = PlayByPlayV3(game_id=game_id)
        pbp = response.play_by_play.get_data_frame()

        # Check if play-by-play data is empty
        if pbp.empty:
            return None, ValueError("No play-by-play data available for this game")

        # Extract flagrants by team
        flagrants = pbp[pbp['subType'].isin(['Flagrant Type 1', 'Flagrant Type 2'])]

        home_flagrants = len(flagrants[flagrants['location'] == 'h'])
        away_flagrants = len(flagrants[flagrants['location'] == 'v'])

        # Get final score (from last row)
        final_row = pbp.iloc[-1]
        home_score = final_row['scoreHome']
        away_score = final_row['scoreAway']

        # Get team IDs - safely extract with validation
        team_rows = pbp[(pbp['teamId'] != 0) & (pbp['location'].isin(['h', 'v']))]

        if len(team_rows) == 0:
            return None, ValueError("No team data found in play-by-play")

        home_candidates = team_rows[team_rows['location'] == 'h']['teamId']
        away_candidates = team_rows[team_rows['location'] == 'v']['teamId']

        if len(home_candidates) == 0 or len(away_candidates) == 0:
            return None, ValueError("Missing home or away team data")

        home_team = home_candidates.iloc[0]
        away_team = away_candidates.iloc[0]

        game_data = {
            'game_id': game_id,
            'home_team': home_team,
            'away_team': away_team,
            'home_flagrants': home_flagrants,
            'away_flagrants': away_flagrants,
            'home_score': home_score,
            'away_score': away_score
        }
        return game_data, None

    except Exception as e:
        return None, e

## 3. Fetch Games from NBA API

In [4]:
# Define seasons to extract
seasons = ['2020-21', '2021-22', '2022-23', '2023-24', '2024-25']

all_game_ids = []

print(f"Fetching game IDs from {len(seasons)} seasons...")
for season in seasons:
    try:
        gamefinder = LeagueGameFinder(season_nullable=season)
        games_df = gamefinder.get_data_frames()[0]
        game_ids = games_df['GAME_ID'].unique()
        all_game_ids.extend(game_ids)
        print(f"  {season}: {len(game_ids)} games")
    except Exception as e:
        print(f"  {season}: ERROR - {type(e).__name__}")
        continue

print(f"\nTotal unique games: {len(set(all_game_ids))}")

# Keep game IDs as strings (DO NOT convert to integers - this strips leading zeros!)
# API requires 10-digit strings like '0022000605'
all_game_ids_str = [str(gid) for gid in all_game_ids]

# Filter out games already in CSV or skipped
new_game_ids = [gid for gid in all_game_ids_str if gid not in existing_game_ids]
print(f"New games to extract: {len(new_game_ids)}")
print(f"Games to skip (already processed): {len(all_game_ids_str) - len(new_game_ids)}")

Fetching game IDs from 5 seasons...
  2020-21: 1221 games
  2021-22: 1394 games
  2022-23: 1395 games
  2023-24: 1397 games
  2024-25: 1401 games

Total unique games: 6808
New games to extract: 4784
Games to skip (already processed): 2024


## 4. Extract Flagrant Foul Data

In [5]:
print(f"\nStarting data extraction for {len(new_game_ids)} new games...")

successful_count = 0
failed_count = 0
skipped_count = 0

try:
    for i, game_id in enumerate(new_game_ids):
        if i % 100 == 0 and i > 0:
            print(f"Progress: {i}/{len(new_game_ids)} | Saved: {successful_count} | Skipped: {skipped_count} | Errors: {failed_count}")

        game_data, error = extract_game_data(game_id)

        if game_data:
            # Ensure game_id is stored as string with leading zeros
            game_data['game_id'] = str(game_data['game_id']).zfill(10)
            
            # Convert to DataFrame and save immediately
            game_df = pd.DataFrame([game_data])

            # Append to CSV
            if csv_file.exists():
                game_df.to_csv(csv_file, mode='a', header=False, index=False)
            else:
                game_df.to_csv(csv_file, mode='w', header=True, index=False)

            successful_count += 1
        else:
            # Check if it's a read timeout
            if 'ReadTimeout' in str(type(error).__name__) or 'timeout' in str(error).lower():
                print(f"\n{'='*70}")
                print(f"READ TIMEOUT DETECTED - IP likely rate limited")
                print(f"{'='*70}")

                # Calculate 1 hour from now
                retry_time = datetime.now() + timedelta(hours=1)
                print(f"\nPlease wait until: {retry_time.strftime('%Y-%m-%d %H:%M:%S')} (local time)")
                print(f"This is approximately 1 hour from now.")
                print(f"\nProgress so far:")
                print(f"  Games extracted: {successful_count}")
                print(f"  Games processed: {i + 1}/{len(new_game_ids)}")
                print(f"\nRun this notebook again in 1 hour to continue extraction.")
                break
            elif isinstance(error, ValueError):
                # Track games with no data in separate file
                skipped_game_df = pd.DataFrame([{
                    'game_id': str(game_id).zfill(10),  # Ensure string format
                    'reason': str(error),
                    'timestamp': datetime.now().isoformat()
                }])
                
                if skipped_file.exists():
                    skipped_game_df.to_csv(skipped_file, mode='a', header=False, index=False)
                else:
                    skipped_game_df.to_csv(skipped_file, mode='w', header=True, index=False)
                
                skipped_count += 1
            else:
                print(f"Error extracting game {game_id}: {type(error).__name__} - {error}")
                failed_count += 1

        # Throttle to avoid rate limiting
        time.sleep(0.600)

except KeyboardInterrupt:
    print(f"\n\nInterrupted by user")

print(f"\n{'='*70}")
print(f"EXTRACTION COMPLETE")
print(f"{'='*70}")
print(f"Successfully saved: {successful_count} games")
print(f"Skipped (no data): {skipped_count} games")
print(f"Errors encountered: {failed_count} games")
print(f"Total processed: {successful_count + skipped_count + failed_count}/{len(new_game_ids)}")
print(f"Data saved to: {csv_file}")
if skipped_count > 0:
    print(f"Skipped games logged to: {skipped_file}")


Starting data extraction for 4784 new games...
Progress: 100/4784 | Saved: 100 | Skipped: 0 | Errors: 0
Progress: 200/4784 | Saved: 200 | Skipped: 0 | Errors: 0

READ TIMEOUT DETECTED - IP likely rate limited

Please wait until: 2025-11-30 09:02:26 (local time)
This is approximately 1 hour from now.

Progress so far:
  Games extracted: 258
  Games processed: 260/4784

Run this notebook again in 1 hour to continue extraction.

EXTRACTION COMPLETE
Successfully saved: 258 games
Skipped (no data): 1 games
Errors encountered: 0 games
Total processed: 259/4784
Data saved to: nba_flagrant_fouls.csv
Skipped games logged to: nba_skipped_games.csv


## 5. Verify Final Data

In [6]:
# Load and display summary (with correct dtype to preserve string format)
final_df = pd.read_csv(csv_file, dtype={'game_id': str})

print(f"\nFinal CSV Summary:")
print(f"Total games: {len(final_df)}")
print(f"Unique games: {final_df['game_id'].nunique()}")
print(f"\nSample game IDs (first 5):")
for gid in final_df['game_id'].head():
    print(f"  {gid}")

print(f"\nGames per season (inferred from game_id):")
final_df['season'] = final_df['game_id'].str[1:5]  # Extract season code from positions 1-4
print(final_df.groupby('season').size().sort_index())

print(f"\nData shape: {final_df.shape}")
print(f"Columns: {final_df.columns.tolist()}")
print(f"Game ID format: {final_df['game_id'].dtype} (10-digit strings with leading zeros)")


Final CSV Summary:
Total games: 2282
Unique games: 2282

Sample game IDs (first 5):
  0042300405
  0042300404
  0042300403
  0042300402
  0042300401

Games per season (inferred from game_id):
season
0120      49
0220    1080
0221     364
0223     507
0320       1
0321       4
0323       5
0420      85
0421      87
0423      82
0520       6
0521       6
0523       6
dtype: int64

Data shape: (2282, 8)
Columns: ['game_id', 'home_team', 'away_team', 'home_flagrants', 'away_flagrants', 'home_score', 'away_score', 'season']
Game ID format: object (10-digit strings with leading zeros)
