# Flagrant Foul Impact on Win Probability (2023-24 Season)

Logistic regression: Does committing a flagrant foul decrease the probability of winning?

Model: win ~ committed_flagrant_foul

In [1]:
import pandas as pd
import numpy as np
import time
import os
from pathlib import Path
from nba_api.stats.endpoints.leaguegamefinder import LeagueGameFinder
from nba_api.stats.endpoints.playbyplayv3 import PlayByPlayV3
import statsmodels.api as sm
from statsmodels.formula.api import logit
import warnings
warnings.filterwarnings('ignore')

## 1. Data Collection

In [None]:
# Check if CSV already exists
csv_file = Path('nba_flagrant_fouls.csv')
games_with_data = None

if csv_file.exists():
    print(f"Found existing data file: {csv_file}")
    games_with_data = pd.read_csv(csv_file)
    print(f"Loaded {len(games_with_data)} games from CSV")
else:
    # Get all 2023-24 season games
    print("Fetching 2023-24 season games...")
    gamefinder = LeagueGameFinder(season_nullable='2023-24')
    games_df = gamefinder.get_data_frames()[0]
    game_ids = games_df['GAME_ID'].unique()
    print(f"Found {len(game_ids)} unique games to process\n")
    
    print("Extracting flagrant fouls from all games...")
    print("(Throttled to 1 second between API calls - approximately 20-25 minutes)\n")
    
    # Initialize CSV with headers
    csv_file.unlink(missing_ok=True)  # Remove if exists
    csv_initialized = False
    processed_count = 0
    
    for i, game_id in enumerate(game_ids):
        if i % 100 == 0:
            print(f"Progress: {i}/{len(game_ids)} games processed, {processed_count} saved to CSV")
        
        game_data = extract_game_data(game_id)
        
        if game_data:
            # Convert to DataFrame and append to CSV immediately
            game_df = pd.DataFrame([game_data])
            
            if csv_initialized:
                # Append to existing CSV
                game_df.to_csv(csv_file, mode='a', header=False, index=False)
            else:
                # Create new CSV with headers
                game_df.to_csv(csv_file, mode='w', header=True, index=False)
                csv_initialized = True
            
            processed_count += 1
        
        # 1 second throttle between every play-by-play API call
        time.sleep(1.0)
    
    games_with_data = pd.read_csv(csv_file)
    
    print(f"\n{'='*70}")
    print(f"Successfully extracted and saved {processed_count} games")
    print(f"Data saved to: {csv_file}")
    print(f"{'='*70}\n")

In [None]:
def extract_game_data(game_id):
    """
    Extract flagrant fouls and game outcome from a single game.
    
    Args:
        game_id: NBA game ID
    
    Returns:
        dict: Game data or None if error occurs
    """
    try:
        response = PlayByPlayV3(game_id=game_id)
        pbp = response.play_by_play.get_data_frame()
        
        # Extract flagrants by team
        flagrants = pbp[pbp['subType'].isin(['Flagrant Type 1', 'Flagrant Type 2'])]
        
        home_flagrants = len(flagrants[flagrants['location'] == 'h'])
        away_flagrants = len(flagrants[flagrants['location'] == 'v'])
        
        # Get final score (from last row)
        final_row = pbp.iloc[-1]
        home_score = final_row['scoreHome']
        away_score = final_row['scoreAway']
        
        # Get team IDs (from first non-empty row)
        team_rows = pbp[pbp['teamId'] != 0]
        home_team = team_rows[team_rows['location'] == 'h']['teamId'].iloc[0]
        away_team = team_rows[team_rows['location'] == 'v']['teamId'].iloc[0]
        
        return {
            'game_id': game_id,
            'home_team': home_team,
            'away_team': away_team,
            'home_flagrants': home_flagrants,
            'away_flagrants': away_flagrants,
            'home_score': home_score,
            'away_score': away_score
        }
    
    except Exception as e:
        print(f"  ERROR on game {game_id}: {type(e).__name__} - {str(e)}")
        return None

# Check if CSV already exists
csv_file = Path('flagrant_fouls_2023_24.csv')
games_with_data = None

if csv_file.exists():
    print(f"Found existing data file: {csv_file}")
    games_with_data = pd.read_csv(csv_file)
    print(f"Loaded {len(games_with_data)} games from CSV\n")
else:
    # Get all 2023-24 season games
    print("Fetching 2023-24 season games...")
    gamefinder = LeagueGameFinder(season_nullable='2023-24')
    games_df = gamefinder.get_data_frames()[0]
    game_ids = games_df['GAME_ID'].unique()
    print(f"Found {len(game_ids)} unique games to process\n")
    
    print("Extracting flagrant fouls from all games...")
    print("(Throttled to 1 second between API calls)\n")
    
    # Initialize CSV with headers
    csv_file.unlink(missing_ok=True)  # Remove if exists
    csv_initialized = False
    processed_count = 0
    
    for i, game_id in enumerate(game_ids):
        if i % 100 == 0:
            print(f"Progress: {i}/{len(game_ids)} games processed, {processed_count} saved to CSV")
        
        game_data = extract_game_data(game_id)
        
        if game_data:
            # Convert to DataFrame and append to CSV immediately
            game_df = pd.DataFrame([game_data])
            
            if csv_initialized:
                # Append to existing CSV
                game_df.to_csv(csv_file, mode='a', header=False, index=False)
            else:
                # Create new CSV with headers
                game_df.to_csv(csv_file, mode='w', header=True, index=False)
                csv_initialized = True
            
            processed_count += 1
        
        # 1 second throttle between every play-by-play API call
        time.sleep(1.0)
    
    print(f"\n{'='*70}")
    print(f"Successfully extracted and saved {processed_count} games")
    print(f"Data saved to: {csv_file}")
    print(f"{'='*70}\n")
    
    # Load the complete CSV
    games_with_data = pd.read_csv(csv_file)

In [8]:
games_with_data = pd.DataFrame(all_games)
games_with_data

Unnamed: 0,game_id,home_team,away_team,home_flagrants,away_flagrants,home_score,away_score
0,0042300405,1610612738,1610612742,0,0,106,88
1,0042300404,1610612742,1610612738,0,0,122,84
2,0042300403,1610612742,1610612738,0,0,99,106
3,0042300402,1610612738,1610612742,0,0,105,98
4,0042300401,1610612738,1610612742,0,0,107,89
...,...,...,...,...,...,...,...
595,0022300702,1610612742,1610612749,0,0,117,129
596,0022300698,1610612755,1610612751,0,0,121,136
597,0022300700,1610612741,1610612758,0,0,115,123
598,0022300693,1610612763,1610612744,0,0,101,121


## 2. Data Preparation

In [7]:
# Create long-format dataset: one row per team per game
home_data = games_with_data[['game_id', 'home_team', 'home_flagrants', 'home_score', 'away_score']].copy()
home_data.columns = ['game_id', 'team_id', 'committed_flagrant', 'team_score', 'opp_score']
home_data['won'] = (home_data['team_score'] > home_data['opp_score']).astype(int)

away_data = games_with_data[['game_id', 'away_team', 'away_flagrants', 'away_score', 'home_score']].copy()
away_data.columns = ['game_id', 'team_id', 'committed_flagrant', 'team_score', 'opp_score']
away_data['won'] = (away_data['team_score'] > away_data['opp_score']).astype(int)

# Combine home and away
df = pd.concat([home_data, away_data], ignore_index=True)

# Convert flagrant count to binary (committed at least one flagrant foul)
df['committed_flagrant'] = (df['committed_flagrant'] > 0).astype(int)

print(f"Dataset shape: {df.shape}")
print(f"\nOutcome distribution:")
print(f"  Wins: {df['won'].sum()} ({df['won'].mean()*100:.1f}%)")
print(f"  Losses: {(1-df['won']).sum()} ({(1-df['won']).mean()*100:.1f}%)")
print(f"\nFlagrant foul distribution:")
print(f"  Committed flagrant: {df['committed_flagrant'].sum()} ({df['committed_flagrant'].mean()*100:.1f}%)")
print(f"  No flagrant: {(1-df['committed_flagrant']).sum()} ({(1-df['committed_flagrant']).mean()*100:.1f}%)")

Dataset shape: (1200, 6)

Outcome distribution:
  Wins: 600 (50.0%)
  Losses: 600 (50.0%)

Flagrant foul distribution:
  Committed flagrant: 64 (5.3%)
  No flagrant: 1136 (94.7%)


## 3. Descriptive Statistics

In [15]:
from scipy.stats import norm

# Calculate effect size from observed proportions
p_win_with_flagrant = df[df['committed_flagrant'] == 1]['won'].mean()
p_win_no_flagrant = df[df['committed_flagrant'] == 0]['won'].mean()
n_with_flagrant = (df['committed_flagrant'] == 1).sum()
n_no_flagrant = (df['committed_flagrant'] == 0).sum()

print("="*70)
print("POWER CALCULATION")
print("="*70)

print(f"\nObserved Effect Sizes:")
print(f"  Win rate WITH flagrant: {p_win_with_flagrant:.3f} (n={n_with_flagrant})")
print(f"  Win rate WITHOUT flagrant: {p_win_no_flagrant:.3f} (n={n_no_flagrant})")
print(f"  Difference: {p_win_no_flagrant - p_win_with_flagrant:.3f}")

# Calculate odds ratio from proportions
odds_with = p_win_with_flagrant / (1 - p_win_with_flagrant) if p_win_with_flagrant < 1 else float('inf')
odds_without = p_win_no_flagrant / (1 - p_win_no_flagrant)
obs_odds_ratio = odds_without / odds_with

print(f"  Observed Odds Ratio: {obs_odds_ratio:.4f}")

# Power calculation using normal approximation for logistic regression
# Based on Hsieh et al. formula
def power_logistic(n_cases, n_controls, p_exposed, odds_ratio, alpha=0.05):
    """
    Calculate power for logistic regression (two-sided test).
    
    Args:
        n_cases: number of cases (outcomes=1)
        n_controls: number of controls (outcomes=0)
        p_exposed: proportion exposed in population
        odds_ratio: hypothesized/observed odds ratio
        alpha: significance level (two-sided)
    
    Returns:
        power: statistical power
    """
    n_total = n_cases + n_controls
    
    # Calculate effect size on log scale
    log_or = np.log(odds_ratio)
    
    # Variance of log(OR)
    variance = (1 / (n_cases * p_exposed * (1 - p_exposed)) + 
                1 / (n_controls * p_exposed * (1 - p_exposed)))
    
    # Z-score for observed effect
    z_effect = log_or / np.sqrt(variance)
    
    # Critical z-value (two-sided)
    z_crit = norm.ppf(1 - alpha / 2)
    
    # Power = P(|Z| > z_crit)
    power = 1 - norm.cdf(z_crit - np.abs(z_effect)) + norm.cdf(-z_crit - np.abs(z_effect))
    
    return power

# Calculate current power
current_power = power_logistic(
    n_cases=(df['won'] == 1).sum(),
    n_controls=(df['won'] == 0).sum(),
    p_exposed=(df['committed_flagrant'] == 1).sum() / len(df),
    odds_ratio=obs_odds_ratio,
    alpha=0.05
)

print(f"\nCurrent Sample (2023-24 season only):")
print(f"  Total team-games: {len(df)}")
print(f"  Statistical Power: {current_power:.3f} ({current_power*100:.1f}%)")
print(f"  Interpretation: {('ADEQUATE' if current_power >= 0.80 else 'INSUFFICIENT')} for 80% target")

# Calculate sample size needed for 80% power
def sample_size_for_power(target_power=0.80, p_exposed=0.5, odds_ratio=1.1, alpha=0.05):
    """Calculate sample size needed to achieve target power."""
    z_crit = norm.ppf(1 - alpha / 2)
    z_power = norm.ppf(target_power)
    
    log_or = np.log(odds_ratio)
    
    # Approximate formula (Hsieh et al.)
    n_total = ((z_crit + z_power) ** 2) / (
        (p_exposed * (1 - p_exposed) * log_or) ** 2
    )
    
    return n_total

needed_sample = sample_size_for_power(
    target_power=0.80,
    p_exposed=(df['committed_flagrant'] == 1).sum() / len(df),
    odds_ratio=obs_odds_ratio,
    alpha=0.05
)

print(f"\nSample Size Needed for 80% Power:")
print(f"  Total team-games required: {int(np.ceil(needed_sample))}")

current_coverage = (len(df) / needed_sample) * 100
print(f"  Current coverage: {current_coverage:.1f}% of needed sample")

# Estimate how many seasons needed
team_games_per_season = 2460
seasons_needed = needed_sample / team_games_per_season

print(f"\nSeasonal Breakdown:")
print(f"  Team-games per season (2023-24): {team_games_per_season}")
print(f"  Seasons needed for 80% power: {seasons_needed:.2f}")

if seasons_needed <= 1:
    print(f"  → 2023-24 season is SUFFICIENT ✓")
elif seasons_needed <= 2:
    print(f"  → Need 2023-24 + one more season")
else:
    print(f"  → Need approximately {int(np.ceil(seasons_needed))} seasons of data")

POWER CALCULATION

Observed Effect Sizes:
  Win rate WITH flagrant: 0.469 (n=64)
  Win rate WITHOUT flagrant: 0.502 (n=1136)
  Difference: 0.033
  Observed Odds Ratio: 1.1413

Current Sample (2023-24 season only):
  Total team-games: 1200
  Statistical Power: 0.081 (8.1%)
  Interpretation: INSUFFICIENT for 80% target

Sample Size Needed for 80% Power:
  Total team-games required: 176165
  Current coverage: 0.7% of needed sample

Seasonal Breakdown:
  Team-games per season (2023-24): 2460
  Seasons needed for 80% power: 71.61
  → Need approximately 72 seasons of data


## 3B. Power Calculation

In [9]:
# Win rate by flagrant status
print("\nWin rate by flagrant foul status:")
win_by_flagrant = df.groupby('committed_flagrant')['won'].agg(['count', 'sum', 'mean'])
win_by_flagrant.columns = ['n_teams', 'n_wins', 'win_rate']
win_by_flagrant.index = ['No Flagrant', 'Committed Flagrant']
print(win_by_flagrant)

print(f"\nDifference: {win_by_flagrant.loc['No Flagrant', 'win_rate'] - win_by_flagrant.loc['Committed Flagrant', 'win_rate']:.1%}")


Win rate by flagrant foul status:
                    n_teams  n_wins  win_rate
No Flagrant            1136     570  0.501761
Committed Flagrant       64      30  0.468750

Difference: 3.3%


## 4. Logistic Regression Model

In [None]:
# Fit logistic regression: win ~ committed_flagrant
model = logit('won ~ committed_flagrant', data=df).fit(disp=0)

print(model.summary())

## 5. Model Results Extraction

In [None]:
# Extract key statistics
print("="*70)
print("LOGISTIC REGRESSION RESULTS: Win Probability ~ Committed Flagrant Foul")
print("="*70)

# Coefficients table
coef_table = model.summary2().tables[1]
print("\n1. COEFFICIENT TABLE:")
print(coef_table)

# Extract for flagrant coefficient
flagrant_coef = model.params['committed_flagrant']
flagrant_se = model.bse['committed_flagrant']
flagrant_pval = model.pvalues['committed_flagrant']
flagrant_ci = model.conf_int().loc['committed_flagrant']
flagrant_odds_ratio = np.exp(flagrant_coef)
flagrant_or_ci = np.exp(flagrant_ci)

print("\n2. FLAGRANT FOUL EFFECT:")
print(f"  Coefficient: {flagrant_coef:.4f}")
print(f"  Odds Ratio: {flagrant_odds_ratio:.4f}")
print(f"  95% CI (OR): [{flagrant_or_ci[0]:.4f}, {flagrant_or_ci[1]:.4f}]")
print(f"  P-value: {flagrant_pval:.4f}")
print(f"  Significance: {'Yes (p < 0.05)' if flagrant_pval < 0.05 else 'No (p >= 0.05)'}")

# Interpretation
pct_change = (flagrant_odds_ratio - 1) * 100
print(f"\n  Interpretation: Committing a flagrant foul is associated with")
print(f"  a {abs(pct_change):.1f}% {'DECREASE' if pct_change < 0 else 'INCREASE'} in odds of winning")

# Model fit
print(f"\n3. MODEL FIT:")
print(f"  AIC: {model.aic:.2f}")
print(f"  BIC: {model.bic:.2f}")
print(f"  Log-Likelihood: {model.llf:.2f}")
print(f"  Pseudo R-squared: {model.prsquared:.4f}")

# Sample size
print(f"\n4. SAMPLE SIZE:")
print(f"  Total team-games: {len(df)}")
print(f"  Team-games with flagrant: {df['committed_flagrant'].sum()}")
print(f"  Team-games without flagrant: {(1-df['committed_flagrant']).sum()}")

## 6. Summary Table

In [None]:
# Clean summary table
summary_data = {
    'Variable': ['Committed Flagrant Foul'],
    'Coefficient': [f"{flagrant_coef:.4f}"],
    'Odds Ratio': [f"{flagrant_odds_ratio:.4f}"],
    'OR 95% CI': [f"[{flagrant_or_ci[0]:.4f}, {flagrant_or_ci[1]:.4f}]"],
    'P-value': [f"{flagrant_pval:.4f}"],
    'Sig': ['*' if flagrant_pval < 0.05 else '']
}

summary_table = pd.DataFrame(summary_data)
print("\nSUMMARY TABLE:")
print(summary_table.to_string(index=False))
print("\nNote: * indicates p < 0.05")