In [1]:
import numpy as np
import pandas as pd

import pathlib
import os
import sys

# Ensure the project `src` directory is on sys.path so `data` imports work
sys.path.insert(0, str(pathlib.Path('..').resolve()))

from data.nba_scraper import NBAStatsScraper



# NBA Data Analysis Notebook

This notebook demonstrates the capabilities of the NBA data collection modules:
- **NBAStatsScraper**: Fetch player game logs, stats, and team data from the NBA API
- **Salary Loaders**: Parse DraftKings and FanDuel salary files


## 1. NBA Stats Scraper - Recent Game Logs

Fetch recent player game logs for the current season.

In [2]:
# Initialize the scraper
scraper = NBAStatsScraper(delay=0.6)

# Fetch recent game logs (limit to last few days to speed up the example)
from datetime import datetime, timedelta

today = datetime.now()
week_ago = today - timedelta(days=7)

game_logs = scraper.get_league_game_logs(
    season="2024-25",
    season_type="Regular Season",
    date_from=week_ago.strftime("%m/%d/%Y"),
    date_to=today.strftime("%m/%d/%Y")
)

print(f"Fetched {len(game_logs)} game logs")
game_logs.head(10)

2025-11-28 01:57:56,103 - INFO - Fetching league game logs for 2024-25 Regular Season
2025-11-28 01:57:56,781 - INFO - Retrieved 0 game log entries
2025-11-28 01:57:56,781 - INFO - Retrieved 0 game log entries


Fetched 0 game logs


Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE


### Top Performers by Fantasy Points

In [3]:
# Show top 20 performances by fantasy points
top_performances = game_logs.nlargest(20, 'fantasy_points')[
    ['PLAYER_NAME', 'TEAM_ABBREVIATION', 'GAME_DATE', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'fantasy_points']
]

print("Top 20 Fantasy Performances:")
top_performances

KeyError: 'fantasy_points'

### Fantasy Points Distribution

Visualize the distribution of fantasy points across all recent games.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 5))

# Histogram
plt.subplot(1, 2, 1)
plt.hist(game_logs['fantasy_points'], bins=50, edgecolor='black', alpha=0.7)
plt.xlabel('Fantasy Points')
plt.ylabel('Frequency')
plt.title('Distribution of Fantasy Points')
plt.axvline(game_logs['fantasy_points'].mean(), color='red', linestyle='--', label=f'Mean: {game_logs["fantasy_points"].mean():.1f}')
plt.axvline(game_logs['fantasy_points'].median(), color='green', linestyle='--', label=f'Median: {game_logs["fantasy_points"].median():.1f}')
plt.legend()

# Box plot
plt.subplot(1, 2, 2)
plt.boxplot(game_logs['fantasy_points'], vert=True)
plt.ylabel('Fantasy Points')
plt.title('Fantasy Points Box Plot')

plt.tight_layout()
plt.show()

print(f"Fantasy Points Statistics:")
print(game_logs['fantasy_points'].describe())

## 2. Player-Specific Analysis

Analyze a specific player's recent performance.

In [None]:
# Get all active players
players = scraper.get_all_players(season="2024-25", active_only=True)

print(f"Total active players: {len(players)}")
print("\nSample players:")
players[['DISPLAY_FIRST_LAST', 'TEAM_ABBREVIATION']].head(10)

In [None]:
# Analyze average stats by player from recent games
player_stats = game_logs.groupby('PLAYER_NAME').agg({
    'fantasy_points': ['mean', 'std', 'max', 'count'],
    'PTS': 'mean',
    'REB': 'mean',
    'AST': 'mean',
    'MIN': 'mean'
}).round(2)

player_stats.columns = ['_'.join(col).strip() for col in player_stats.columns.values]
player_stats = player_stats.rename(columns={
    'fantasy_points_mean': 'avg_fpts',
    'fantasy_points_std': 'std_fpts',
    'fantasy_points_max': 'max_fpts',
    'fantasy_points_count': 'games',
    'PTS_mean': 'avg_pts',
    'REB_mean': 'avg_reb',
    'AST_mean': 'avg_ast',
    'MIN_mean': 'avg_min'
})

# Filter to players with at least 2 games
player_stats = player_stats[player_stats['games'] >= 2]

# Sort by average fantasy points
player_stats_sorted = player_stats.sort_values('avg_fpts', ascending=False)

print("Top 15 Players by Average Fantasy Points:")
player_stats_sorted.head(15)

### Consistency Analysis

Find players with high performance and low variance (consistent performers).

In [None]:
# Calculate coefficient of variation (lower is more consistent)
player_stats['cv'] = (player_stats['std_fpts'] / player_stats['avg_fpts']).round(3)

# Filter high performers (avg > 30 fpts)
high_performers = player_stats[player_stats['avg_fpts'] > 30].copy()

# Sort by consistency (low CV)
consistent_stars = high_performers.sort_values('cv')

print("Most Consistent High Performers (avg > 30 fpts):")
consistent_stars[['avg_fpts', 'std_fpts', 'cv', 'games']].head(10)

In [None]:
# Scatter plot: Average Fantasy Points vs Consistency
plt.figure(figsize=(12, 8))

scatter = plt.scatter(
    player_stats['avg_fpts'], 
    player_stats['cv'],
    alpha=0.6,
    s=player_stats['games'] * 20,  # Size by number of games
    c=player_stats['avg_min'],  # Color by minutes
    cmap='viridis'
)

plt.colorbar(scatter, label='Avg Minutes')
plt.xlabel('Average Fantasy Points')
plt.ylabel('Coefficient of Variation (lower = more consistent)')
plt.title('Player Performance vs Consistency\n(bubble size = number of games)')
plt.grid(True, alpha=0.3)

# Annotate top performers
top_5 = player_stats.nlargest(5, 'avg_fpts')
for player_name, row in top_5.iterrows():
    plt.annotate(
        player_name, 
        (row['avg_fpts'], row['cv']),
        xytext=(5, 5), 
        textcoords='offset points',
        fontsize=8,
        alpha=0.8
    )

plt.show()

## 3. Team Statistics

Analyze team-level metrics for opponent adjustments.

In [None]:
# Get team statistics
team_stats = scraper.get_team_stats(season="2024-25")

print(f"Retrieved stats for {len(team_stats)} teams")
print("\nAvailable columns:")
print(team_stats.columns.tolist())
print("\nSample team stats:")
team_stats.head()

## 4. Salary Data Integration

Load and analyze DFS salary data. Note: This requires actual salary CSV files.

In [None]:
from data.salary_loader import load_draftkings, load_fanduel, load_salary_file, normalize_player_name

# Example: How to load salary files (uncomment when you have files)
# salary_df = load_draftkings('path/to/dk_salaries.csv')
# salary_df = load_fanduel('path/to/fd_salaries.csv')
# salary_df = load_salary_file('path/to/salary.csv')  # Auto-detects platform

print("Salary loader functions available:")
print("- load_draftkings(filepath): Load DraftKings CSV")
print("- load_fanduel(filepath): Load FanDuel CSV")
print("- load_salary_file(filepath): Auto-detect and load")
print("- normalize_player_name(name): Standardize names for matching")

# Demo name normalization
print("\nName Normalization Examples:")
test_names = ["LeBron James", "Michael Jordan Jr.", "Karl-Anthony Towns III"]
for name in test_names:
    normalized = normalize_player_name(name)
    print(f"  {name:30} -> {normalized}")

### Value Analysis (Example with Mock Data)

Demonstrate how to combine salary data with performance data.

In [None]:
# Create mock salary data for demonstration
mock_salaries = pd.DataFrame({
    'name': player_stats_sorted.head(20).index,
    'salary': np.random.randint(5000, 12000, 20) * 100,  # Mock salaries
    'team': ['LAL', 'BOS', 'DEN', 'PHX', 'MIL'] * 4
})

# Merge with player stats
value_analysis = mock_salaries.merge(
    player_stats_sorted.reset_index(),
    left_on='name',
    right_on='PLAYER_NAME',
    how='left'
)

# Calculate value (points per $1000)
value_analysis['value'] = (value_analysis['avg_fpts'] / value_analysis['salary'] * 1000).round(3)

# Sort by value
value_analysis = value_analysis.sort_values('value', ascending=False)

print("Best Value Plays (Mock Data):")
value_analysis[['name', 'salary', 'avg_fpts', 'value', 'games']].head(15)

In [None]:
# Visualize value
plt.figure(figsize=(10, 6))

plt.scatter(value_analysis['salary'], value_analysis['avg_fpts'], 
           s=100, alpha=0.6, c=value_analysis['value'], cmap='RdYlGn')

plt.colorbar(label='Value (FPts per $1K)')
plt.xlabel('Salary ($)')
plt.ylabel('Average Fantasy Points')
plt.title('Salary vs Performance (Mock Data)')
plt.grid(True, alpha=0.3)

# Annotate best values
for idx, row in value_analysis.head(5).iterrows():
    plt.annotate(
        row['name'], 
        (row['salary'], row['avg_fpts']),
        xytext=(5, 5), 
        textcoords='offset points',
        fontsize=8
    )

plt.show()

## 5. Data Export

Save processed data for later use.

In [None]:
# Example: Save aggregated player stats
output_dir = pathlib.Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Save to parquet (efficient for large datasets)
player_stats_sorted.to_parquet(output_dir / 'player_stats_recent.parquet')

# Also save as CSV for easy viewing
player_stats_sorted.to_csv(output_dir / 'player_stats_recent.csv')

print(f"Data saved to {output_dir}")

## Summary

This notebook demonstrated:

1. **NBAStatsScraper capabilities:**
   - Fetching league-wide game logs with date filters
   - Automatic fantasy points calculation
   - Player roster data
   - Team statistics

2. **Data Analysis:**
   - Top performers identification
   - Fantasy points distribution
   - Consistency metrics (coefficient of variation)
   - Performance vs minutes analysis

3. **Salary Integration:**
   - Loading DraftKings/FanDuel CSV files
   - Name normalization for matching
   - Value calculations (points per dollar)

4. **Data Export:**
   - Saving processed data in multiple formats

**Next Steps:**
- Add actual salary files for real value analysis
- Build predictive models using historical data
- Implement opponent adjustments using team stats
- Create lineup optimization algorithms