# Basketball Reference Scraper EDA

This notebook demonstrates the capabilities of the `BRefScraper` module for collecting NBA data from Basketball Reference.

**Features:**
- Season-wide player statistics (per_game, totals, advanced, per_minute, per_poss)
- Individual player game logs
- Team ratings (offensive/defensive)
- DraftKings fantasy point calculations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import sys

# Make the project packages importable by putting the parent of the current
# working directory at the front of sys.path.
# NOTE(review): assumes the notebook is run from a directory directly under src/ — confirm.
sys.path.insert(0, str(pathlib.Path('..').resolve()))

from data.bref_scraper import BRefScraper

# Initialize scraper (3.1s delay respects Basketball Reference rate limits)
scraper = BRefScraper(delay=3.1)

# Raise pandas' display limits so wide stat tables are less likely to be truncated
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 200)

## 1. Season Per-Game Statistics

Fetch all player per-game averages for the 2024-25 season (passed to the scraper as `2025`).

In [None]:
# Pull league-wide per-game averages for the 2024-25 season
per_game = scraper.get_season_stats(2025, stat_type="per_game")

# Report the row count and the available columns, then preview the first rows
print(
    f"Retrieved {len(per_game)} player entries",
    f"\nColumns: {per_game.columns.tolist()}",
    sep="\n",
)
per_game.head(10)

In [None]:
# Rank the 20 highest scoring averages
if 'PTS' in per_game.columns:
    # Coerce to numbers so the ranking is by value; unparseable entries become NaN
    per_game['PTS'] = pd.to_numeric(per_game['PTS'], errors='coerce')

    scorer_columns = ['player_name', 'team', 'games', 'PTS', 'TRB', 'AST']
    top_scorers = per_game.nlargest(20, 'PTS')[scorer_columns]

    print("Top 20 Scorers (Per Game):")
    display(top_scorers)

## 2. Advanced Statistics

Fetch advanced metrics: PER, TS%, USG%, WS, BPM, VORP.

In [None]:
# Pull league-wide advanced metrics for the same season
advanced = scraper.get_season_stats(2025, stat_type="advanced")

# Report the row count and the available columns, then preview the first rows
print(
    f"Retrieved {len(advanced)} player entries",
    f"\nAdvanced columns: {advanced.columns.tolist()}",
    sep="\n",
)
advanced.head(10)

In [None]:
# Top players by PER (Player Efficiency Rating)
if 'PER' in advanced.columns:
    advanced['PER'] = pd.to_numeric(advanced['PER'], errors='coerce')

    # The per-game cells read games played from 'games' while this cell used
    # 'G'; accept either name so a differently labelled frame does not raise
    # KeyError.
    games_col = next((c for c in ('G', 'games') if c in advanced.columns), None)

    if games_col is None:
        # No games-played column at all: rank everyone rather than fail
        qualified = advanced.copy()
    else:
        advanced[games_col] = pd.to_numeric(advanced[games_col], errors='coerce')
        # Drop small samples: keep players with at least 10 games played
        qualified = advanced[advanced[games_col] >= 10].copy()

    # nlargest already caps the result at 20 rows
    top_per = qualified.nlargest(20, 'PER')

    # Show only the columns that are actually present in the frame
    wanted_cols = ['player_name', 'team', games_col, 'PER', 'TS%', 'USG%', 'WS', 'BPM']
    shown_cols = [c for c in wanted_cols if c is not None and c in top_per.columns]

    print("Top 20 by PER (min 10 games):")
    display(top_per[shown_cols])

In [None]:
# Visualize PER vs Usage Rate
if 'PER' in advanced.columns and 'USG%' in advanced.columns:
    qualified = advanced.copy()

    # Coerce every column used below *before* comparing or plotting, so this
    # cell does not depend on an earlier cell having converted them already.
    for col in ('G', 'games', 'PER', 'USG%', 'WS'):
        if col in qualified.columns:
            qualified[col] = pd.to_numeric(qualified[col], errors='coerce')

    # Games played may be labelled 'G' or 'games'; filter on whichever exists
    games_col = next((c for c in ('G', 'games') if c in qualified.columns), None)
    if games_col is not None:
        # Drop small samples: keep players with at least 10 games played
        qualified = qualified[qualified[games_col] >= 10]

    # Colour by Win Shares only when that column is available
    has_ws = 'WS' in qualified.columns
    color_kwargs = {'c': qualified['WS'], 'cmap': 'viridis'} if has_ws else {}

    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        qualified['USG%'],
        qualified['PER'],
        alpha=0.6,
        s=50,
        **color_kwargs,
    )
    if has_ws:
        plt.colorbar(scatter, label='Win Shares')
    plt.xlabel('Usage Rate (%)')
    plt.ylabel('Player Efficiency Rating')
    plt.title('PER vs Usage Rate (color = Win Shares)' if has_ws else 'PER vs Usage Rate')
    plt.grid(True, alpha=0.3)

    # Annotate top 5 PER players
    top5 = qualified.nlargest(5, 'PER')
    for _, row in top5.iterrows():
        plt.annotate(
            row['player_name'],
            (row['USG%'], row['PER']),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=8,
        )

    plt.tight_layout()
    plt.show()

## 3. Player Game Logs

Fetch individual game logs for a specific player.

In [None]:
# Pull the 2025 game log for one star player
player_name = "Luka Doncic"
game_logs = scraper.get_player_game_logs(player_name, 2025)

# Report the number of games and the available columns, then preview the first rows
print(
    f"Retrieved {len(game_logs)} games for {player_name}",
    f"\nColumns: {game_logs.columns.tolist()}",
    sep="\n",
)
game_logs.head(10)

In [None]:
# Summarise and plot the selected player's per-game fantasy output
has_fantasy_points = 'fantasy_points' in game_logs.columns
if has_fantasy_points and not game_logs.empty:
    fpts = game_logs['fantasy_points']

    print(f"\n{player_name} Fantasy Points Summary:")
    print(fpts.describe())

    fig, (trend_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: game-by-game series with the average as a dashed reference line
    trend_ax.plot(range(len(game_logs)), fpts.values, marker='o', markersize=4)
    trend_ax.axhline(fpts.mean(), color='red', linestyle='--', label=f"Avg: {fpts.mean():.1f}")
    trend_ax.set_xlabel('Game Number')
    trend_ax.set_ylabel('Fantasy Points')
    trend_ax.set_title(f'{player_name} - Fantasy Points by Game')
    trend_ax.legend()
    trend_ax.grid(True, alpha=0.3)

    # Right panel: histogram of the same values
    hist_ax.hist(fpts, bins=15, edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel('Fantasy Points')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'{player_name} - Fantasy Points Distribution')

    fig.tight_layout()
    plt.show()

In [None]:
# Compare multiple players' game logs
players_to_compare = ["Shai Gilgeous-Alexander", "Jayson Tatum", "Anthony Edwards"]
player_logs = {}

for name in players_to_compare:
    logs = scraper.get_player_game_logs(name, 2025)
    if logs.empty:
        # Say so instead of silently dropping the player from the comparison
        print(f"{name}: no game logs returned")
        continue

    player_logs[name] = logs

    # Guard the column the same way the plotting cells do, so a log without
    # fantasy points does not abort the whole loop with a KeyError.
    if 'fantasy_points' in logs.columns:
        print(f"{name}: {len(logs)} games, Avg FPts: {logs['fantasy_points'].mean():.1f}")
    else:
        print(f"{name}: {len(logs)} games, fantasy points unavailable")

In [None]:
# Overlay each player's game-by-game fantasy points on a single chart
if player_logs:
    fig, ax = plt.subplots(figsize=(12, 6))

    for player, log_df in player_logs.items():
        # Players whose logs carry no fantasy points are left off the chart
        if 'fantasy_points' not in log_df.columns:
            continue
        ax.plot(
            range(len(log_df)),
            log_df['fantasy_points'].values,
            marker='o',
            markersize=3,
            label=player,
            alpha=0.7,
        )

    ax.set_xlabel('Game Number')
    ax.set_ylabel('Fantasy Points')
    ax.set_title('Fantasy Points Comparison')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## 4. Team Ratings

Fetch team offensive and defensive ratings for opponent adjustments.

In [None]:
# Pull team-level ratings for the 2025 season
team_ratings = scraper.get_team_ratings(2025)

# Report the team count and the available columns, then show the whole table
print(
    f"Retrieved ratings for {len(team_ratings)} teams",
    f"\nColumns: {team_ratings.columns.tolist()}",
    sep="\n",
)
team_ratings

In [None]:
# Scatter each team's offensive rating against its defensive rating
has_ratings = {'ORtg', 'DRtg'}.issubset(team_ratings.columns)
if not team_ratings.empty and has_ratings:
    for rating in ('ORtg', 'DRtg'):
        team_ratings[rating] = pd.to_numeric(team_ratings[rating], errors='coerce')

    # Use the provided net rating when there is one, otherwise derive it as ORtg - DRtg
    if 'NRtg' in team_ratings.columns:
        net_rating = team_ratings['NRtg']
    else:
        net_rating = team_ratings['ORtg'] - team_ratings['DRtg']
    team_ratings['NRtg'] = pd.to_numeric(net_rating, errors='coerce')

    fig, ax = plt.subplots(figsize=(12, 8))

    points = ax.scatter(
        team_ratings['ORtg'],
        team_ratings['DRtg'],
        c=team_ratings['NRtg'],
        cmap='RdYlGn',
        s=150,
        alpha=0.8,
    )
    fig.colorbar(points, ax=ax, label='Net Rating')

    # Label every point with its team when a name column is available
    if 'Team' in team_ratings.columns:
        labelled = zip(team_ratings['Team'], team_ratings['ORtg'], team_ratings['DRtg'])
        for team, ortg, drtg in labelled:
            ax.annotate(team, (ortg, drtg), fontsize=8, ha='center')

    ax.set_xlabel('Offensive Rating (higher = better)')
    ax.set_ylabel('Defensive Rating (lower = better)')
    ax.set_title('Team Offensive vs Defensive Ratings')
    ax.grid(True, alpha=0.3)

    # Dashed guide lines at the mean of each rating split the chart into quadrants
    avg_ortg = team_ratings['ORtg'].mean()
    avg_drtg = team_ratings['DRtg'].mean()
    ax.axvline(avg_ortg, color='gray', linestyle='--', alpha=0.5)
    ax.axhline(avg_drtg, color='gray', linestyle='--', alpha=0.5)

    fig.tight_layout()
    plt.show()

## 5. Totals Statistics

Fetch cumulative season totals for all players.

In [None]:
# Pull cumulative season totals for every player
totals = scraper.get_season_stats(2025, stat_type="totals")

# Report the row count, then preview the first rows
n_total_rows = len(totals)
print(f"Retrieved {n_total_rows} player entries")
totals.head(10)

In [None]:
# League leaders in counting stats
if not totals.empty:
    # One mapping drives both the numeric coercion and the chart titles
    leader_stats = {
        'PTS': 'Points',
        'TRB': 'Rebounds',
        'AST': 'Assists',
        'STL': 'Steals',
        'BLK': 'Blocks',
    }

    for col in leader_stats:
        if col in totals.columns:
            totals[col] = pd.to_numeric(totals[col], errors='coerce')

    fig, axes = plt.subplots(1, len(leader_stats), figsize=(20, 5))

    for ax, (stat, title) in zip(axes, leader_stats.items()):
        if stat not in totals.columns:
            # Hide the panel rather than leave an empty set of axes
            ax.set_visible(False)
            continue

        # Fewer than 10 rows can come back (small frame or unparseable
        # values), so size the bars from the result instead of assuming 10.
        leaders = totals.dropna(subset=[stat]).nlargest(10, stat)
        positions = list(range(len(leaders)))

        ax.barh(positions, leaders[stat].values)
        ax.set_yticks(positions)
        ax.set_yticklabels(leaders['player_name'].values)
        ax.set_xlabel('Total')
        ax.set_title(f'{title} Leaders')
        # Put the leader at the top of the chart
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

## 6. DFS Value Analysis

Estimate DraftKings-style fantasy points from per-game averages to identify potential DFS value plays.

In [None]:
# Estimate expected fantasy points from per-game averages.
# Only the per-game frame is read here, so it is the only one that has to be
# non-empty (nothing is merged with the advanced stats).
if not per_game.empty:
    dfs_df = per_game.copy()

    # Scoring weight per stat; turnovers count against the player
    fpts_weights = {
        'PTS': 1.0,
        '3P': 0.5,
        'TRB': 1.25,
        'AST': 1.5,
        'STL': 2.0,
        'BLK': 2.0,
        'TOV': -0.5,
    }

    dfs_df['exp_fpts'] = 0.0
    for col, weight in fpts_weights.items():
        if col in dfs_df.columns:
            dfs_df[col] = pd.to_numeric(dfs_df[col], errors='coerce').fillna(0)
        else:
            # A stat missing from the frame contributes nothing instead of
            # raising KeyError (it is added as an all-zero column).
            dfs_df[col] = 0.0
        dfs_df['exp_fpts'] += dfs_df[col] * weight

    # Add double-double potential (simplified): the bonus is granted when at
    # least two of the season *averages* reach 10, not per individual game.
    dd_cats = sum((dfs_df[col] >= 10).astype(int) for col in ('PTS', 'TRB', 'AST'))
    dfs_df['exp_fpts'] += (dd_cats >= 2).astype(float) * 1.5

    # Top fantasy producers; show only the columns that are actually present
    wanted_cols = ['player_name', 'team', 'games', 'PTS', 'TRB', 'AST', 'exp_fpts']
    shown_cols = [c for c in wanted_cols if c in dfs_df.columns]
    top_fpts = dfs_df.nlargest(25, 'exp_fpts')[shown_cols]
    print("Top 25 Expected Fantasy Point Averages:")
    display(top_fpts)

In [None]:
# Plot how expected fantasy points are spread across players
if 'exp_fpts' in dfs_df.columns:
    fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 5))

    exp_fpts = dfs_df['exp_fpts']
    mean_fpts = exp_fpts.mean()

    # Left panel: histogram with the mean marked by a dashed line
    hist_ax.hist(exp_fpts.dropna(), bins=50, edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel('Expected Fantasy Points')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Expected Fantasy Points')
    hist_ax.axvline(mean_fpts, color='red', linestyle='--', label=f"Mean: {mean_fpts:.1f}")
    hist_ax.legend()

    # Right panel: points against rebounds, coloured by expected fantasy points
    points = scatter_ax.scatter(
        dfs_df['PTS'],
        dfs_df['TRB'],
        c=exp_fpts,
        cmap='viridis',
        alpha=0.5,
        s=30,
    )
    fig.colorbar(points, ax=scatter_ax, label='Expected FPts')
    scatter_ax.set_xlabel('Points Per Game')
    scatter_ax.set_ylabel('Rebounds Per Game')
    scatter_ax.set_title('Points vs Rebounds (color = FPts)')

    fig.tight_layout()
    plt.show()

## 7. Historical Season Comparison

Compare stats across multiple seasons.

In [None]:
# Collect per-game stats for earlier seasons (slow: the scraper is rate limited)
seasons = [2024, 2023]  # Add more if needed
historical_stats = {}

for season in seasons:
    season_df = scraper.get_season_stats(season, stat_type="per_game")
    # Seasons that return nothing are left out of the dictionary
    if season_df.empty:
        continue
    historical_stats[season] = season_df
    season_label = f"{season-1}-{str(season)[2:]}"
    print(f"Season {season_label}: {len(season_df)} players")

In [None]:
# Compare a player across seasons
import unicodedata


def _fold_name(name):
    """Return *name* lower-cased with combining accents removed.

    Lets an ASCII spelling such as 'Jokic' match an accented one such as
    'Jokić', which exact string equality would silently miss.
    """
    decomposed = unicodedata.normalize('NFKD', str(name))
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).casefold()


target_player = "Nikola Jokic"

if historical_stats:
    # NOTE(review): needed only if the scraper keeps accented player names;
    # if it already strips them the folding below is a no-op — confirm.
    target_key = _fold_name(target_player)
    player_seasons = []

    for season, stats in historical_stats.items():
        player_data = stats[stats['player_name'].map(_fold_name) == target_key]
        if player_data.empty:
            # Report the miss instead of skipping the season silently
            print(f"No {season} entry found for {target_player}")
            continue
        # Take the first matching row when a player has several rows in a season
        row = player_data.iloc[0].copy()
        row['season'] = season
        player_seasons.append(row)

    if player_seasons:
        comparison = pd.DataFrame(player_seasons)
        print(f"\n{target_player} Season Comparison:")
        display(comparison[['season', 'team', 'games', 'PTS', 'TRB', 'AST']])

## 8. Data Export

Save collected data for modeling.

In [None]:
# Make sure the destination folder exists before writing anything
output_dir = pathlib.Path('../../data/raw')
output_dir.mkdir(parents=True, exist_ok=True)


def _save_if_present(frame, filename, label):
    """Write *frame* to a parquet file in output_dir, skipping empty frames."""
    if frame.empty:
        return
    frame.to_parquet(output_dir / filename, index=False)
    print(f"Saved {label}: {len(frame)} rows")


# Same order as the data was collected: per-game, advanced, totals, team ratings
_save_if_present(per_game, 'bref_per_game_2025.parquet', 'per-game stats')
_save_if_present(advanced, 'bref_advanced_2025.parquet', 'advanced stats')
_save_if_present(totals, 'bref_totals_2025.parquet', 'totals')
_save_if_present(team_ratings, 'bref_team_ratings_2025.parquet', 'team ratings')

print(f"\nData saved to {output_dir.resolve()}")

## Summary

### BRefScraper Capabilities Demonstrated:

1. **Season Statistics:**
   - `get_season_stats(season, stat_type)` - per_game, totals, advanced, per_minute, per_poss
   - Automatic cleaning of header rows and data type conversion

2. **Player Game Logs:**
   - `get_player_game_logs(name, season)` - Individual player game-by-game data
   - Automatic fantasy point calculation

3. **Team Ratings:**
   - `get_team_ratings(season)` - Offensive/defensive ratings for opponent adjustments

4. **Rate Limiting:**
   - 3.1s delay between requests (respects 20 req/min limit)
   - Automatic retry with exponential backoff

### Data Applications:
- Player projection models using per-game and advanced stats
- Opponent adjustments using team defensive ratings
- Consistency analysis using game logs
- DFS value identification