# Notebook 30: Create Opening Stats JSON/CSV

## Purpose
Create opening-specific statistics files (JSON and CSV) for use in the inference pipeline.
These files contain the opening means needed for hierarchical Bayesian shrinkage.

## What This Creates
- `opening_stats_white.json`: Opening means for White openings (keyed by training_id)
- `opening_stats_black.json`: Opening means for Black openings (keyed by training_id)
- Also creates CSV versions for easy inspection

## Data Source
- Uses EXISTING openings in the database
- Calculates mean scores from all player-opening interactions
- Matches the calculation done in training notebook 28

## Output Location
`data/models/<model_dir>/opening_stats_<color>.json`

## Step 1: Setup and Imports

In [1]:
import json
import pandas as pd
import numpy as np
import duckdb
from pathlib import Path
import sys

# Assumes the notebook is launched from a direct sub-directory of the project,
# so the project root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))  # lets project-local modules be imported

DATA_DIR = PROJECT_ROOT / "data"
MODELS_DIR = DATA_DIR / "models"
# NOTE: despite the name, this is the registry JSON path; the connection helper
# only uses its parent directory to locate chess_games.db.
DB_PATH = DATA_DIR / "processed" / "file_registry.json"

print("Setup complete")
print(
    f"Project root: {PROJECT_ROOT}\n"
    f"DB path: {DB_PATH}\n"
    f"Models dir: {MODELS_DIR}"
)

Setup complete
Project root: /Users/a/Documents/personalprojects/chess-opening-recommender
DB path: /Users/a/Documents/personalprojects/chess-opening-recommender/data/processed/file_registry.json
Models dir: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models


## Step 2: Load Database Connection

In [None]:
def get_db_connection(db_dir: str) -> duckdb.DuckDBPyConnection:
    """
    Open a read-only DuckDB connection to chess_games.db.

    The database file is resolved as ``<parent of db_dir>/chess_games.db``, so
    ``db_dir`` should be a path located inside the directory that holds the
    database (the notebook passes the registry file path).

    Raises:
    -------
    FileNotFoundError
        If chess_games.db does not exist at the resolved location.
    """
    database_file = Path(db_dir).parent.joinpath("chess_games.db")

    if database_file.exists():
        # read_only=True: this notebook only queries the database.
        return duckdb.connect(str(database_file), read_only=True)

    raise FileNotFoundError(f"Database not found: {database_file}")

# Shared read-only connection used by the query cells further down.
conn = get_db_connection(str(DB_PATH))
print("Database connection established")
# Round-trip a trivial query to show which database is attached.
print("Database:", conn.execute("SELECT current_database()").fetchone()[0])

ValueError: No current_db found in registry

## Step 3: Define Processing Function

This matches the calculation from training notebook 28's hierarchical Bayesian shrinkage section.

In [None]:
def calculate_opening_stats(color: str, min_games_threshold: int = 10, connection=None) -> pd.DataFrame:
    """
    Calculate opening-specific statistics from the database.

    Loads every player-opening row for ``color`` that has at least
    ``min_games_threshold`` games and a non-NULL ECO code, scores each row as
    ``(wins + 0.5 * draws) / games`` and averages those scores per opening, so
    every player-opening row carries equal weight in the opening mean.
    Progress and summary statistics are printed along the way.

    Parameters:
    -----------
    color : str
        'white' or 'black'
    min_games_threshold : int
        Minimum games per player-opening to include (default: 10)
    connection : optional
        Connection-like object whose ``execute(query, params)`` result exposes
        ``.df()``. When omitted, the notebook-level ``conn`` is used.

    Returns:
    --------
    pd.DataFrame with columns:
        - opening_id (db id)
        - opening_mean (mean score for this opening)
        - opening_total_games (total games across all players)
        - opening_num_players (number of player rows counted for this opening)
        - eco (ECO code for reference)
        - name (opening name for reference)
    """
    print(f"\n{'='*60}")
    print(f"CALCULATING OPENING STATS FOR {color.upper()}")
    print(f"{'='*60}")

    if connection is None:
        # Fall back to the connection opened earlier in the notebook.
        connection = conn

    # Filter values are bound as query parameters instead of being pasted into
    # the SQL text, so quoting is handled by the driver.
    query = """
        SELECT 
            pos.opening_id,
            pos.player_id,
            pos.num_games,
            pos.num_wins,
            pos.num_draws,
            pos.num_losses,
            o.eco,
            o.name
        FROM player_opening_stats pos
        JOIN opening o ON pos.opening_id = o.id
        WHERE pos.color = ?
          AND pos.num_games >= ?
          AND o.eco IS NOT NULL
    """

    print("\n1. Loading data from database...")
    data = connection.execute(query, [color, min_games_threshold]).df()
    print(f"   Loaded {len(data):,} player-opening entries")
    print(f"   Unique openings: {data['opening_id'].nunique():,}")
    print(f"   Unique players: {data['player_id'].nunique():,}")

    print("\n2. Calculating raw scores...")
    # assign() returns a new frame, leaving the query result untouched.
    data = data.assign(
        score=(data['num_wins'] + 0.5 * data['num_draws']) / data['num_games']
    )
    print(f"   Score range: [{data['score'].min():.4f}, {data['score'].max():.4f}]")
    print(f"   Global mean score: {data['score'].mean():.4f}")

    print("\n3. Aggregating by opening...")
    opening_stats = (
        data.groupby('opening_id')
        .agg({
            'score': 'mean',        # unweighted mean over player-opening rows
            'num_games': 'sum',
            'player_id': 'count',   # row count per opening
            'eco': 'first',
            'name': 'first'
        })
        .rename(columns={
            'score': 'opening_mean',
            'num_games': 'opening_total_games',
            'player_id': 'opening_num_players'
        })
        .reset_index()
    )

    opening_means = opening_stats['opening_mean']
    total_games = opening_stats['opening_total_games']
    num_players = opening_stats['opening_num_players']

    print(f"   Calculated stats for {len(opening_stats):,} openings")
    print("\n4. Opening mean distribution:")
    print(f"   Min: {opening_means.min():.4f}")
    print(f"   25th percentile: {opening_means.quantile(0.25):.4f}")
    print(f"   Median: {opening_means.median():.4f}")
    print(f"   75th percentile: {opening_means.quantile(0.75):.4f}")
    print(f"   Max: {opening_means.max():.4f}")
    print(f"   Std: {opening_means.std():.4f}")

    print("\n5. Opening size distribution:")
    print(f"   Total games (median): {total_games.median():.0f}")
    print(f"   Players (median): {num_players.median():.0f}")
    print(f"   Total games range: [{total_games.min():.0f}, {total_games.max():.0f}]")
    print(f"   Players range: [{num_players.min():.0f}, {num_players.max():.0f}]")

    return opening_stats

## Step 4: Calculate Stats for White Openings

In [None]:
# Per-opening means for White, built from player-opening rows with >= 10 games.
white_stats = calculate_opening_stats(color='white', min_games_threshold=10)

## Step 5: Inspect White Opening Stats

In [None]:
def _format_opening_row(row) -> str:
    """Render one opening row as a fixed-width summary line."""
    return (
        f"   {row['eco']:>3} | {row['name']:<50} | Mean: {row['opening_mean']:.4f}"
        f" | Players: {row['opening_num_players']:>5} | Games: {row['opening_total_games']:>7}"
    )


print("\n" + "=" * 60, "WHITE OPENING STATS SAMPLE", "=" * 60, sep="\n")

print("\nFirst 10 openings:")
print(white_stats.head(10))

# Ten openings with the highest mean score.
print("\n\nTop 10 STRONGEST openings for White (highest win rate):")
strongest = white_stats.nlargest(10, 'opening_mean')
for _idx, opening in strongest.iterrows():
    print(_format_opening_row(opening))

# Ten openings with the lowest mean score.
print("\n\nTop 10 WEAKEST openings for White (lowest win rate):")
weakest = white_stats.nsmallest(10, 'opening_mean')
for _idx, opening in weakest.iterrows():
    print(_format_opening_row(opening))

print("\n\nData types:")
print(white_stats.dtypes)

# deep=True also counts the Python string objects in object columns.
bytes_per_column = white_stats.memory_usage(deep=True)
print("\nMemory usage:")
print(bytes_per_column)
print(f"\nTotal memory: {bytes_per_column.sum() / 1024:.2f} KB")

## Step 6: Load Training Mappings

We need to convert database IDs to training IDs before saving.

In [None]:
print("Available model directories:")
for model_dir in MODELS_DIR.iterdir():
    # Only list real directories; skip hidden entries (names starting with '.').
    if model_dir.name.startswith('.') or not model_dir.is_dir():
        continue
    print(f"   {model_dir.name}")

# UPDATE THIS TO YOUR MODEL DIRECTORY
MODEL_DIR_NAME = "20251111_155428_white"
MODEL_ARTIFACTS_DIR = MODELS_DIR.joinpath(MODEL_DIR_NAME)

print(f"\nUsing model directory: {MODEL_DIR_NAME}")
print(f"Full path: {MODEL_ARTIFACTS_DIR}")

# Only a warning is printed here; later cells check for the files they need.
if MODEL_ARTIFACTS_DIR.exists():
    print("Model directory exists")
else:
    print("\nWARNING: Model directory does not exist!")
    print("Please create it or update MODEL_DIR_NAME")

In [None]:
mappings_path = MODEL_ARTIFACTS_DIR.joinpath("opening_mappings.csv")

# opening_mappings stays None when the file is missing so that the following
# cells can skip their work instead of failing.
if mappings_path.exists():
    opening_mappings = pd.read_csv(mappings_path)
    print("Loaded opening mappings")
    print(f"   Shape: {opening_mappings.shape}")
    print(f"   Columns: {list(opening_mappings.columns)}")
    print("\nFirst few mappings:")
    print(opening_mappings.head(10))
else:
    opening_mappings = None
    print(f"WARNING: opening_mappings.csv not found at {mappings_path}")
    print("You need to run the training pipeline first to create this file.")

## Step 7: Merge Stats with Training IDs

In [None]:
if opening_mappings is not None:
    id_map = opening_mappings[['db_id', 'training_id']]

    # A db_id listed more than once makes the inner merge emit several rows for
    # the same opening, so surface that before merging.
    repeated_db_ids = id_map[id_map['db_id'].duplicated(keep=False)]
    if not repeated_db_ids.empty:
        print("WARNING: Found duplicate db_ids in opening_mappings!")
        print(repeated_db_ids)

    # Inner join: keep only openings that have a training ID.
    white_stats_with_training_ids = white_stats.merge(
        id_map,
        left_on='opening_id',
        right_on='db_id',
        how='inner'
    )

    # white_stats holds one row per opening_id, so the number of dropped
    # openings is counted on distinct matched ids rather than on merged rows
    # (which can exceed the input when the mapping contains repeats).
    matched_openings = len(set(white_stats_with_training_ids['opening_id']))

    print("Merged opening stats with training IDs")
    print(f"   Original openings: {len(white_stats)}")
    print(f"   After merge: {len(white_stats_with_training_ids)}")
    print(f"   Dropped: {len(white_stats) - matched_openings} (not in training set)")

    if white_stats_with_training_ids['training_id'].duplicated().any():
        print("\nWARNING: Found duplicate training_ids!")
        dups = white_stats_with_training_ids[white_stats_with_training_ids['training_id'].duplicated(keep=False)]
        print(dups)
    else:
        print("   No duplicate training_ids")

    print("\nSample of merged data:")
    print(white_stats_with_training_ids.head(10))
else:
    print("Skipping merge - no opening mappings available")
    white_stats_with_training_ids = None

## Step 8: Create JSON Output (Keyed by Training ID)

In [None]:
if white_stats_with_training_ids is not None:
    # Values are converted to plain Python types so the dict is JSON-serialisable.
    # NOTE: keys are ints here, but JSON object keys are always strings, so a
    # consumer loading the saved file must convert keys back with int().
    opening_stats_dict = {
        int(row['training_id']): {
            'opening_mean': float(row['opening_mean']),
            'opening_total_games': int(row['opening_total_games']),
            'opening_num_players': int(row['opening_num_players']),
            'eco': str(row['eco']),
            'name': str(row['name']),
            'db_id': int(row['db_id'])
        }
        for _, row in white_stats_with_training_ids.iterrows()
    }

    print("Created opening stats dictionary")
    print(f"   Total openings: {len(opening_stats_dict)}")

    # Rows sharing a training_id overwrite each other in the dict; report it.
    overwritten = len(white_stats_with_training_ids) - len(opening_stats_dict)
    if overwritten:
        print(f"   WARNING: {overwritten} row(s) overwritten by duplicate training_ids")

    if opening_stats_dict:
        print(f"   Training ID range: [{min(opening_stats_dict)}, {max(opening_stats_dict)}]")

        print("\nSample entries:")
        for training_id in sorted(opening_stats_dict)[:5]:
            stats = opening_stats_dict[training_id]
            print(f"   Training ID {training_id}: {stats['eco']} | Mean: {stats['opening_mean']:.4f} | {stats['name']}")
    else:
        # min()/max() would raise on an empty dict (e.g. no openings matched
        # the selected model's mappings).
        print("   WARNING: dictionary is empty - no openings matched the training mappings")
else:
    print("Skipping JSON creation - no merged data available")
    opening_stats_dict = None

## Step 9: Validate JSON Structure

In [None]:
if opening_stats_dict is None:
    print("Skipping validation - no data available")
elif not opening_stats_dict:
    # The checks below index the first entry and call min()/max(), which would
    # raise on an empty dictionary.
    print("Skipping validation - opening stats dictionary is empty")
else:
    print("Validating JSON structure...\n")

    all_int_keys = all(isinstance(k, int) for k in opening_stats_dict)
    print(f"All keys are integers: {all_int_keys}")

    required_fields = ['opening_mean', 'opening_total_games', 'opening_num_players', 'eco', 'name', 'db_id']
    all_have_fields = all(
        all(field in v for field in required_fields)
        for v in opening_stats_dict.values()
    )
    print(f"All values have required fields: {all_have_fields}")

    # Peek at one entry without materialising the whole values list.
    sample_value = next(iter(opening_stats_dict.values()))
    print("\nSample value structure:")
    for field, value in sample_value.items():
        print(f"   {field}: {type(value).__name__} = {value}")

    # NaN is only possible for float fields, hence the isinstance guard.
    has_none = any(
        any(v is None or (isinstance(v, float) and pd.isna(v)) for v in vals.values())
        for vals in opening_stats_dict.values()
    )
    print(f"\nNo None/NaN values: {not has_none}")

    means = [v['opening_mean'] for v in opening_stats_dict.values()]
    print(f"\nOpening mean range: [{min(means):.4f}, {max(means):.4f}]")
    # A NaN mean fails this comparison and is therefore reported as out of range.
    all_in_range = all(0 <= m <= 1 for m in means)
    print(f"All means in [0, 1]: {all_in_range}")

    print("\nJSON structure validation complete")

## Step 10: Check JSON Size

In [None]:
if opening_stats_dict is not None:
    import sys

    # sys.getsizeof() is shallow: on its own it reports only the outer dict's
    # table, not the per-opening entries. Add the keys, inner dicts and their
    # values for a rough total (shared objects may be counted more than once;
    # the inner field-name strings are deliberately left out).
    container_size = sys.getsizeof(opening_stats_dict)
    nested_size = sum(
        sys.getsizeof(key)
        + sys.getsizeof(entry)
        + sum(sys.getsizeof(value) for value in entry.values())
        for key, entry in opening_stats_dict.items()
    )
    memory_size = container_size + nested_size
    print(f"In-memory size (approx., incl. nested entries): {memory_size:,} bytes ({memory_size / 1024:.2f} KB)")

    # Size of the pretty-printed form.
    json_str = json.dumps(opening_stats_dict, indent=2)
    json_size = len(json_str.encode('utf-8'))
    print(f"JSON size (with indent=2): {json_size:,} bytes ({json_size / 1024:.2f} KB)")

    # Size without indentation, for comparison.
    json_str_compact = json.dumps(opening_stats_dict)
    json_size_compact = len(json_str_compact.encode('utf-8'))
    print(f"JSON size (compact): {json_size_compact:,} bytes ({json_size_compact / 1024:.2f} KB)")

    print("\nFirst 500 characters of JSON:")
    print(json_str[:500])
else:
    print("Skipping size check - no data available")

## Step 11: Save JSON File

In [None]:
# if opening_stats_dict is not None:
#     output_json_path = MODEL_ARTIFACTS_DIR / "opening_stats_white.json"
    
#     with open(output_json_path, 'w') as f:
#         json.dump(opening_stats_dict, f, indent=2)
    
#     print(f"Saved JSON to: {output_json_path}")
#     print(f"   File size: {output_json_path.stat().st_size:,} bytes ({output_json_path.stat().st_size / 1024:.2f} KB)")
    
#     with open(output_json_path, 'r') as f:
#         loaded_dict = json.load(f)
    
#     print(f"\nVerified file can be loaded")
#     print(f"   Loaded {len(loaded_dict)} openings")
#     print(f"   Matches original: {len(loaded_dict) == len(opening_stats_dict)}")
# else:
#     print("Skipping save - no data available")

## Step 12: Save CSV File (for easy inspection)

In [None]:
# if white_stats_with_training_ids is not None:
#     output_csv_path = MODEL_ARTIFACTS_DIR / "opening_stats_white.csv"
    
#     csv_data = white_stats_with_training_ids[[
#         'training_id', 'db_id', 'opening_id', 'eco', 'name', 
#         'opening_mean', 'opening_total_games', 'opening_num_players'
#     ]].sort_values('training_id')
    
#     csv_data.to_csv(output_csv_path, index=False)
    
#     print(f"Saved CSV to: {output_csv_path}")
#     print(f"   File size: {output_csv_path.stat().st_size:,} bytes ({output_csv_path.stat().st_size / 1024:.2f} KB)")
#     print(f"   Rows: {len(csv_data)}")
    
#     print("\nFirst 10 rows of CSV:")
#     print(csv_data.head(10))
# else:
#     print("Skipping CSV save - no data available")

## Step 13: Test Loading and Using the JSON

In [None]:
# if opening_stats_dict is not None:
#     print("Testing practical usage of opening_stats.json...\n")
    
#     test_training_ids = list(opening_stats_dict.keys())[:5]
    
#     print("Simulating inference lookup:")
#     for training_id in test_training_ids:
#         stats = opening_stats_dict[training_id]
#         print(f"\n   Training ID {training_id}:")
#         print(f"      ECO: {stats['eco']}")
#         print(f"      Name: {stats['name']}")
#         print(f"      Opening mean: {stats['opening_mean']:.4f}")
#         print(f"      Total games: {stats['opening_total_games']}")
#         print(f"      Num players: {stats['opening_num_players']}")
    
#     print("\n\nSimulating Bayesian shrinkage calculation:")
#     k = 50
    
#     player_games = 15
#     player_raw_score = 0.6
#     training_id = test_training_ids[0]
#     opening_mean = opening_stats_dict[training_id]['opening_mean']
    
#     adjusted_score = (
#         (player_games * player_raw_score) + (k * opening_mean)
#     ) / (player_games + k)
    
#     confidence = player_games / (player_games + k)
    
#     print(f"   Player: {player_games} games, raw score = {player_raw_score:.4f}")
#     print(f"   Opening: {opening_stats_dict[training_id]['eco']} (training_id={training_id})")
#     print(f"   Opening mean: {opening_mean:.4f}")
#     print(f"   Adjusted score: {adjusted_score:.4f}")
#     print(f"   Confidence: {confidence:.4f}")
#     print(f"   Adjustment: {adjusted_score - player_raw_score:+.4f}")
    
#     print("\nAll tests passed!")
# else:
#     print("Skipping usage test - no data available")

## Step 14: Calculate Stats for Black Openings

In [None]:
# Per-opening means for Black, built from player-opening rows with >= 10 games.
black_stats = calculate_opening_stats(color='black', min_games_threshold=10)

## Step 15: Process and Save Black Opening Stats

In [None]:
# Banner followed by manual instructions: the merge/JSON steps above are only
# wired up for White, so Black has to be processed by re-running them.
print("\n" + "=" * 60, "BLACK OPENING STATS", "=" * 60, sep="\n")
print(
    "\nTo process black openings:",
    "   1. Set MODEL_DIR_NAME to your black model directory",
    "   2. Load the black opening_mappings.csv",
    "   3. Repeat the merge, JSON creation, and save steps",
    "\nBlack stats preview:",
    sep="\n",
)
print(black_stats.head(10))

## Summary and Next Steps

In [None]:
# print("\n" + "="*60)
# print("SUMMARY")
# print("="*60)

# if opening_stats_dict is not None:
#     print(f"\nSuccessfully created opening_stats_white.json")
#     print(f"   Location: {MODEL_ARTIFACTS_DIR / 'opening_stats_white.json'}")
#     print(f"   Total openings: {len(opening_stats_dict)}")
#     print(f"   File size: {(MODEL_ARTIFACTS_DIR / 'opening_stats_white.json').stat().st_size / 1024:.2f} KB")
    
#     print(f"\nAlso created opening_stats_white.csv for inspection")
#     print(f"   Location: {MODEL_ARTIFACTS_DIR / 'opening_stats_white.csv'}")
    
#     print(f"\nNext steps:")
#     print(f"   1. Copy these files to your production model artifacts directory")
#     print(f"   2. Update inference pipeline to load opening_stats.json at startup")
#     print(f"   3. Use opening_mean values for Bayesian shrinkage during inference")
#     print(f"   4. Repeat for black openings (update MODEL_DIR_NAME and re-run)")
# else:
#     print(f"\nNo files created")
#     print(f"   Make sure opening_mappings.csv exists in your model directory")
#     print(f"   Update MODEL_DIR_NAME to point to the correct model")
#     print(f"   Re-run the notebook")

# print("\n" + "="*60)
# print("NOTEBOOK COMPLETE")
# print("="*60)

In [None]:
# Release the DuckDB connection opened in Step 2 once all queries are done.
conn.close()
print("Database connection closed")