In [3]:
import pandas as pd
import psycopg2
import json

with open('../endpoints/config/database_config.json', 'r') as f:
    config = json.load(f)

conn = psycopg2.connect(
    host=config['host'],
    database=config['name'],
    user=config['user'],
    password=config['password'],
    port=config['port']
)

# Use existing connection
# Load NBA master games into dataframe
nba_games_df = pd.read_sql_query("SELECT * FROM wnba_games ORDER BY gameid", conn)


  nba_games_df = pd.read_sql_query("SELECT * FROM wnba_games ORDER BY gameid", conn)


In [4]:
# Compare Master Games vs Endpoint Data vs Failed API Calls
print("üîç GAMES COLLECTION ANALYSIS")
print("="*50)

# Get all master games (all leagues)
master_games_query = """
SELECT DISTINCT gameid FROM (
    SELECT gameid FROM nba_games
) AS all_games
ORDER BY gameid
"""
master_games_df = pd.read_sql_query(master_games_query, conn)
master_gameids = set(master_games_df['gameid'])

print(f"üìä Master Games (all leagues): {len(master_gameids):,}")

# Get games from boxscoretraditionalv3 endpoint tables (with league prefix and _n suffix)
endpoint_gameids = set()
boxscore_tables = []

# Check for NBA boxscoretraditionalv3 tables with nba_ prefix and _n suffix
for i in range(10):  # Check _0 through _9
    table_name = f"nba_boxscoretraditionalv3_{i}"
    try:
        test_query = f"SELECT COUNT(*) FROM {table_name}"
        pd.read_sql_query(test_query, conn)
        boxscore_tables.append(table_name)
        
        # Get gameids from this table
        games_df = pd.read_sql_query(f"SELECT DISTINCT gameid FROM {table_name}", conn)
        table_gameids = set(games_df['gameid'])
        endpoint_gameids.update(table_gameids)
        print(f"‚úÖ {table_name}: {len(table_gameids):,} games")
        
    except Exception:
        # Table doesn't exist, skip
        continue

if endpoint_gameids:
    print(f"üìä Total BoxScore Traditional V3 Games: {len(endpoint_gameids):,}")
else:
    print(f"‚ùå BoxScore Traditional V3: No tables found")
    
print(f"üìã Found tables: {boxscore_tables}")

# Get failed API calls
try:
    failed_games_df = pd.read_sql_query(
        "SELECT DISTINCT id_value as gameid FROM failed_api_calls WHERE id_column = 'gameid'", 
        conn
    )
    failed_gameids = set(failed_games_df['gameid'])
    print(f"üö´ Failed API Calls: {len(failed_gameids):,}")
except Exception as e:
    print(f"‚ùå Failed API Calls: Table not found or empty")
    failed_gameids = set()

# Calculate what's missing
successfully_collected = endpoint_gameids
attempted_but_failed = failed_gameids
never_attempted = master_gameids - successfully_collected - attempted_but_failed

print(f"\nüìà BREAKDOWN:")
print(f"   ‚úÖ Successfully Collected: {len(successfully_collected):,}")
print(f"   üö´ Attempted but Failed: {len(attempted_but_failed):,}")
print(f"   ‚ùì Never Attempted: {len(never_attempted):,}")
print(f"   üéØ TOTAL MISSING: {len(attempted_but_failed) + len(never_attempted):,}")

# Calculate percentages
if len(master_gameids) > 0:
    success_pct = (len(successfully_collected) / len(master_gameids)) * 100
    failed_pct = (len(attempted_but_failed) / len(master_gameids)) * 100
    never_pct = (len(never_attempted) / len(master_gameids)) * 100
    
    print(f"\nüìä PERCENTAGES:")
    print(f"   ‚úÖ Success Rate: {success_pct:.1f}%")
    print(f"   üö´ Failed Rate: {failed_pct:.1f}%") 
    print(f"   ‚ùì Never Attempted: {never_pct:.1f}%")

print(f"\nüéØ GAMES LEFT TO PULL: {len(master_gameids) - len(successfully_collected):,}")
print("="*50)

üîç GAMES COLLECTION ANALYSIS


  master_games_df = pd.read_sql_query(master_games_query, conn)


üìä Master Games (all leagues): 52,782


  pd.read_sql_query(test_query, conn)
  games_df = pd.read_sql_query(f"SELECT DISTINCT gameid FROM {table_name}", conn)
  games_df = pd.read_sql_query(f"SELECT DISTINCT gameid FROM {table_name}", conn)


‚úÖ nba_boxscoretraditionalv3_0: 25,813 games
‚úÖ nba_boxscoretraditionalv3_1: 25,812 games
‚úÖ nba_boxscoretraditionalv3_1: 25,812 games
‚úÖ nba_boxscoretraditionalv3_2: 25,812 games
‚úÖ nba_boxscoretraditionalv3_2: 25,812 games
üìä Total BoxScore Traditional V3 Games: 25,813
üìã Found tables: ['nba_boxscoretraditionalv3_0', 'nba_boxscoretraditionalv3_1', 'nba_boxscoretraditionalv3_2']
üìä Total BoxScore Traditional V3 Games: 25,813
üìã Found tables: ['nba_boxscoretraditionalv3_0', 'nba_boxscoretraditionalv3_1', 'nba_boxscoretraditionalv3_2']


  failed_games_df = pd.read_sql_query(


üö´ Failed API Calls: 26,058

üìà BREAKDOWN:
   ‚úÖ Successfully Collected: 25,813
   üö´ Attempted but Failed: 26,058
   ‚ùì Never Attempted: 20,627
   üéØ TOTAL MISSING: 46,685

üìä PERCENTAGES:
   ‚úÖ Success Rate: 48.9%
   üö´ Failed Rate: 49.4%
   ‚ùì Never Attempted: 39.1%

üéØ GAMES LEFT TO PULL: 26,969


In [None]:
# Show sample games from each category
print("\nüìã SAMPLE GAMES BY CATEGORY")
print("="*40)

if successfully_collected:
    print(f"\n‚úÖ SUCCESSFULLY COLLECTED (showing first 5):")
    sample_success = sorted(list(successfully_collected))[:5]
    for game in sample_success:
        print(f"   {game}")
else:
    print(f"\n‚úÖ SUCCESSFULLY COLLECTED: None")

if attempted_but_failed:
    print(f"\nüö´ ATTEMPTED BUT FAILED (showing first 5):")
    sample_failed = sorted(list(attempted_but_failed))[:5]
    for game in sample_failed:
        print(f"   {game}")
else:
    print(f"\nüö´ ATTEMPTED BUT FAILED: None")

if never_attempted:
    print(f"\n‚ùì NEVER ATTEMPTED (showing first 10):")
    sample_never = sorted(list(never_attempted))[:10]
    for game in sample_never:
        print(f"   {game}")
        
    print(f"\nüéØ PRIORITY FOR NEXT COLLECTION:")
    print(f"   Focus on the {len(never_attempted):,} games that were never attempted")
    print(f"   Consider retrying the {len(attempted_but_failed):,} games that failed")
else:
    print(f"\n‚ùì NEVER ATTEMPTED: None")

print("="*40)