# Notebook 30: Create Opening Stats JSON/CSV

## Purpose
Create opening-specific statistics files (JSON and CSV) for use in the inference pipeline.
These files contain the opening means needed for hierarchical Bayesian shrinkage.

## What This Creates
- `opening_stats_white.json`: Opening means for White openings (keyed by training_id)
- `opening_stats_black.json`: Opening means for Black openings (keyed by training_id)
- Also creates CSV versions for easy inspection

## Data Source
- Uses EXISTING openings in the database
- Calculates mean scores from all player-opening interactions
- Matches the calculation done in training notebook 28

## Output Location
`data/models/<model_dir>/opening_stats_<color>.json` (a matching `opening_stats_<color>.csv` is written alongside it)

## Step 1: Setup and Imports

In [1]:
import json
import pandas as pd
import numpy as np
import duckdb
from pathlib import Path
import sys

# The project root is taken to be the parent of the current working directory,
# so the notebook must be launched from a direct child of the root.
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

DATA_DIR = PROJECT_ROOT / "data"
MODELS_DIR = DATA_DIR / "models"
# NOTE: despite its name, DB_PATH points at file_registry.json; the connection
# helper in Step 2 only uses this path's parent directory to find the database.
DB_PATH = DATA_DIR / "processed" / "file_registry.json"

print(
    "Setup complete",
    f"Project root: {PROJECT_ROOT}",
    f"DB path: {DB_PATH}",
    f"Models dir: {MODELS_DIR}",
    sep="\n",
)

Setup complete
Project root: /Users/a/Documents/personalprojects/chess-opening-recommender
DB path: /Users/a/Documents/personalprojects/chess-opening-recommender/data/processed/file_registry.json
Models dir: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models


## Step 2: Load Database Connection

In [2]:
def get_db_connection(db_dir: str) -> duckdb.DuckDBPyConnection:
    """Open chess_games.db read-only from the directory that contains `db_dir`.

    `db_dir` is treated as a path *inside* the target directory: the database
    is looked up at ``Path(db_dir).parent / "chess_games.db"``.

    Raises FileNotFoundError if that database file does not exist.
    """
    database_file = Path(db_dir).parent / "chess_games.db"

    if database_file.exists():
        return duckdb.connect(str(database_file), read_only=True)

    raise FileNotFoundError(f"Database not found: {database_file}")

# Open one notebook-level connection; calculate_opening_stats reads this `conn`
# and the final cell closes it.
conn = get_db_connection(str(DB_PATH))
print("Database connection established")
# Sanity check: report which database the connection is attached to.
print(f"Database: {conn.execute('SELECT current_database()').fetchone()[0]}")

Database connection established
Database: chess_games


## Step 3: Define Processing Function

This matches the calculation from training notebook 28's hierarchical Bayesian shrinkage section.

In [3]:
def calculate_opening_stats(color: str, min_games_threshold: int = 10) -> pd.DataFrame:
    """
    Calculate opening-specific statistics from the database.

    Reads player-opening rows for one color through the notebook-level `conn`,
    scores each row as (wins + 0.5 * draws) / games, and takes the unweighted
    mean of those per-row scores for every opening. Progress and summary
    statistics are printed along the way.

    Parameters:
    -----------
    color : str
        'white' or 'black' ('w' / 'b' are also accepted; case-insensitive).
        Converted to 'w' or 'b' for the database query.
    min_games_threshold : int
        Minimum games per player-opening to include (default: 10)

    Returns:
    --------
    pd.DataFrame with columns:
        - opening_id (db id)
        - opening_mean (mean score for this opening)
        - opening_total_games (total games across all included rows)
        - opening_num_players (number of player rows for this opening)
        - eco (ECO code for reference)
        - name (opening name for reference)

    Raises:
    -------
    ValueError
        If `color` is not a recognised side. Previously any value other than
        'white' was silently treated as black.
    """
    color_codes = {'white': 'w', 'w': 'w', 'black': 'b', 'b': 'b'}
    color_key = str(color).strip().lower()
    if color_key not in color_codes:
        raise ValueError(
            f"color must be 'white' or 'black' (or 'w'/'b'), got {color!r}"
        )
    color_db = color_codes[color_key]

    print(f"\n{'='*60}")
    print(f"CALCULATING OPENING STATS FOR {color.upper()}")
    print(f"{'='*60}")

    # Values are bound as parameters rather than interpolated into the SQL text.
    query = """
        SELECT 
            pos.opening_id,
            pos.player_id,
            (pos.num_wins + pos.num_draws + pos.num_losses) as num_games,
            pos.num_wins,
            pos.num_draws,
            pos.num_losses,
            o.eco,
            o.name
        FROM player_opening_stats pos
        JOIN opening o ON pos.opening_id = o.id
        WHERE pos.color = ?
          AND (pos.num_wins + pos.num_draws + pos.num_losses) >= ?
          AND o.eco IS NOT NULL
    """

    print("\n1. Loading data from database...")
    data = conn.execute(query, [color_db, min_games_threshold]).df()
    print(f"   Loaded {len(data):,} player-opening entries")
    print(f"   Unique openings: {data['opening_id'].nunique():,}")
    print(f"   Unique players: {data['player_id'].nunique():,}")

    print("\n2. Calculating raw scores...")
    # A draw counts as half a win.
    data['score'] = (data['num_wins'] + 0.5 * data['num_draws']) / data['num_games']
    print(f"   Score range: [{data['score'].min():.4f}, {data['score'].max():.4f}]")
    print(f"   Global mean score: {data['score'].mean():.4f}")

    print("\n3. Aggregating by opening...")
    # NOTE(review): 'count' counts rows, which equals the number of players
    # only if player_opening_stats holds one row per (player, opening, color)
    # - confirm against the schema.
    opening_stats = (
        data.groupby('opening_id')
        .agg({
            'score': 'mean',
            'num_games': 'sum',
            'player_id': 'count',
            'eco': 'first',
            'name': 'first'
        })
        .rename(columns={
            'score': 'opening_mean',
            'num_games': 'opening_total_games',
            'player_id': 'opening_num_players'
        })
        .reset_index()
    )

    means = opening_stats['opening_mean']
    total_games = opening_stats['opening_total_games']
    num_players = opening_stats['opening_num_players']

    print(f"   Calculated stats for {len(opening_stats):,} openings")
    print("\n4. Opening mean distribution:")
    print(f"   Min: {means.min():.4f}")
    print(f"   25th percentile: {means.quantile(0.25):.4f}")
    print(f"   Median: {means.median():.4f}")
    print(f"   75th percentile: {means.quantile(0.75):.4f}")
    print(f"   Max: {means.max():.4f}")
    print(f"   Std: {means.std():.4f}")

    print("\n5. Opening size distribution:")
    print(f"   Total games (median): {total_games.median():.0f}")
    print(f"   Players (median): {num_players.median():.0f}")
    print(f"   Total games range: [{total_games.min():.0f}, {total_games.max():.0f}]")
    print(f"   Players range: [{num_players.min():.0f}, {num_players.max():.0f}]")

    return opening_stats

## Step 4: Calculate Opening Stats

In [4]:
# Set color here: 'white' or 'black'
# NOTE: keep this in sync with MODEL_DIR_NAME (Step 6) - output files are
# named after COLOR but written into that model's directory.
COLOR = 'black'

# Only player-opening pairs with at least 10 games contribute to the means.
opening_stats = calculate_opening_stats(COLOR, min_games_threshold=10)


CALCULATING OPENING STATS FOR BLACK

1. Loading data from database...
   Loaded 3,215,430 player-opening entries
   Unique openings: 2,731
   Unique players: 49,906

2. Calculating raw scores...
   Score range: [0.0000, 1.0000]
   Global mean score: 0.4733

3. Aggregating by opening...
   Loaded 3,215,430 player-opening entries
   Unique openings: 2,731
   Unique players: 49,906

2. Calculating raw scores...
   Score range: [0.0000, 1.0000]
   Global mean score: 0.4733

3. Aggregating by opening...
   Calculated stats for 2,731 openings

4. Opening mean distribution:
   Min: 0.0769
   25th percentile: 0.4571
   Median: 0.4769
   75th percentile: 0.4989
   Max: 0.7727
   Std: 0.0516

5. Opening size distribution:
   Total games (median): 4208
   Players (median): 146
   Total games range: [10, 7107061]
   Players range: [1, 39943]
   Calculated stats for 2,731 openings

4. Opening mean distribution:
   Min: 0.0769
   25th percentile: 0.4571
   Median: 0.4769
   75th percentile: 0.4989


## Step 5: Inspect Opening Stats

In [5]:
print("\n" + "="*60)
print(f"OPENING STATS SAMPLE FOR {COLOR.upper()}")
print("="*60)

print("\nFirst 10 openings:")
print(opening_stats.head(10))

# The same report is printed for both ends of the ranking; only the heading
# and the selector (largest vs. smallest opening_mean) differ.
for heading, pick_extreme in (
    ("\n\nTop 10 STRONGEST openings (highest win rate):", opening_stats.nlargest),
    ("\n\nTop 10 WEAKEST openings (lowest win rate):", opening_stats.nsmallest),
):
    print(heading)
    for _, row in pick_extreme(10, 'opening_mean').iterrows():
        print(f"   {row['eco']:>3} | {row['name']:<50} | Mean: {row['opening_mean']:.4f} | Players: {row['opening_num_players']:>5} | Games: {row['opening_total_games']:>7}")

print("\n\nData types:")
print(opening_stats.dtypes)

# deep=True also accounts for the Python string objects in eco/name.
memory_by_column = opening_stats.memory_usage(deep=True)
print("\nMemory usage:")
print(memory_by_column)
print(f"\nTotal memory: {memory_by_column.sum() / 1024:.2f} KB")


OPENING STATS SAMPLE FOR BLACK

First 10 openings:
   opening_id  opening_mean  opening_total_games  opening_num_players  eco  \
0           2      0.443428                  539                   39  A00   
1           3      0.500000                   14                    1  A00   
2           5      0.481769                25773                 1998  A00   
3           8      0.446429                   26                    2  A00   
4           9      0.500195                  338                   26  A00   
5          10      0.489484                11839                  875  A00   
6          12      0.461364                   21                    2  A00   
7          13      0.555856                  220                   19  A00   
8          17      0.498766                 1987                  159  A00   
9          18      0.529715                  156                   14  A00   

                                         name  
0                                Amar Ope

## Step 6: Load Training Mappings

We need to convert database IDs to training IDs before saving.

In [6]:
print("Available model directories:")
for candidate in MODELS_DIR.iterdir():
    # Skip hidden entries and anything that is not a directory.
    if candidate.name.startswith('.') or not candidate.is_dir():
        continue
    print(f"   {candidate.name}")

# UPDATE THIS TO YOUR MODEL DIRECTORY
MODEL_DIR_NAME = "20251212_152017_black"
MODEL_ARTIFACTS_DIR = MODELS_DIR / MODEL_DIR_NAME

print(f"\nUsing model directory: {MODEL_DIR_NAME}")
print(f"Full path: {MODEL_ARTIFACTS_DIR}")

# Only a warning is printed when the directory is missing; later cells still run.
if MODEL_ARTIFACTS_DIR.exists():
    print("Model directory exists")
else:
    print("\nWARNING: Model directory does not exist!")
    print("Please create it or update MODEL_DIR_NAME")

Available model directories:
   20251212_152017_black
   20251111_155428_white

Using model directory: 20251212_152017_black
Full path: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models/20251212_152017_black
Model directory exists


In [7]:
mappings_path = MODEL_ARTIFACTS_DIR / "opening_mappings.csv"

# opening_mappings stays None when the file is missing; downstream cells check
# for None and skip their work instead of failing.
if mappings_path.exists():
    opening_mappings = pd.read_csv(mappings_path)
    print("Loaded opening mappings")
    print(f"   Shape: {opening_mappings.shape}")
    print(f"   Columns: {list(opening_mappings.columns)}")
    print("\nFirst few mappings:")
    print(opening_mappings.head(10))
else:
    print(f"WARNING: opening_mappings.csv not found at {mappings_path}")
    print("You need to run the training pipeline first to create this file.")
    opening_mappings = None

Loaded opening mappings
   Shape: (2728, 4)
   Columns: ['db_id', 'eco', 'name', 'training_id']

First few mappings:
   db_id  eco                                        name  training_id
0      2  A00                                Amar Opening            0
1      3  A00                  Amar Opening: Paris Gambit            1
2      5  A00                         Anderssen's Opening            2
3      8  A00               Barnes Opening: Gedult Gambit            3
4      9  A00                Barnes Opening: Hammerschlag            4
5     10  A00                             Clemenz Opening            5
6     12  A00                                Crab Opening            6
7     13  A00  Creepy Crawly Formation: Classical Defense            7
8     17  A00                            Gedult's Opening            8
9     18  A00                              Global Opening            9


## Step 7: Merge Stats with Training IDs

In [8]:
if opening_mappings is None:
    print("Skipping merge - no opening mappings available")
    stats_with_training_ids = None
else:
    # Inner join: openings without a training_id are dropped from the output.
    id_lookup = opening_mappings[['db_id', 'training_id']]
    stats_with_training_ids = opening_stats.merge(
        id_lookup,
        left_on='opening_id',
        right_on='db_id',
        how='inner',
    )

    rows_before = len(opening_stats)
    rows_after = len(stats_with_training_ids)
    print("Merged opening stats with training IDs")
    print(f"   Original openings: {rows_before}")
    print(f"   After merge: {rows_after}")
    print(f"   Dropped: {rows_before - rows_after} (not in training set)")

    # keep=False flags every row of a duplicated training_id, not just repeats.
    repeated_mask = stats_with_training_ids['training_id'].duplicated(keep=False)
    if repeated_mask.any():
        print("\nWARNING: Found duplicate training_ids!")
        print(stats_with_training_ids[repeated_mask])
    else:
        print("   No duplicate training_ids")

    print("\nSample of merged data:")
    print(stats_with_training_ids.head(10))

Merged opening stats with training IDs
   Original openings: 2731
   After merge: 2728
   Dropped: 3 (not in training set)
   No duplicate training_ids

Sample of merged data:
   opening_id  opening_mean  opening_total_games  opening_num_players  eco  \
0           2      0.443428                  539                   39  A00   
1           3      0.500000                   14                    1  A00   
2           5      0.481769                25773                 1998  A00   
3           8      0.446429                   26                    2  A00   
4           9      0.500195                  338                   26  A00   
5          10      0.489484                11839                  875  A00   
6          12      0.461364                   21                    2  A00   
7          13      0.555856                  220                   19  A00   
8          17      0.498766                 1987                  159  A00   
9          18      0.529715                 

## Step 8: Create Compact JSON Output (Keyed by Training ID)

In [9]:
if stats_with_training_ids is None:
    print("Skipping JSON creation - no merged data available")
    opening_stats_dict = None
else:
    # COMPACT VERSION: training_id -> [opening_mean, total_games, db_id]
    #   index 0: mean score for Bayesian shrinkage
    #   index 1: total games
    #   index 2: db_id for reference
    # Values are cast to plain Python float/int so they serialise as JSON.
    opening_stats_dict = {
        int(training_id): [float(mean_score), int(total_games), int(db_id)]
        for training_id, mean_score, total_games, db_id in zip(
            stats_with_training_ids['training_id'],
            stats_with_training_ids['opening_mean'],
            stats_with_training_ids['opening_total_games'],
            stats_with_training_ids['db_id'],
        )
    }

    print("Created COMPACT opening stats dictionary")
    print(f"   Total openings: {len(opening_stats_dict)}")
    print(f"   Training ID range: [{min(opening_stats_dict)}, {max(opening_stats_dict)}]")
    print("   Format: training_id: [mean, total_games, db_id]")

    print("\nSample entries:")
    for training_id in sorted(opening_stats_dict)[:5]:
        print(f"   {training_id}: {opening_stats_dict[training_id]}")

    # Size check
    json_str_compact = json.dumps(opening_stats_dict)
    json_size_compact = len(json_str_compact.encode('utf-8'))
    print(f"\nCompact JSON size: {json_size_compact:,} bytes ({json_size_compact / 1024:.2f} KB)")

Created COMPACT opening stats dictionary
   Total openings: 2728
   Training ID range: [0, 2727]
   Format: training_id: [mean, total_games, db_id]

Sample entries:
   0: [0.44342817022621733, 539, 2]
   1: [0.5, 14, 3]
   2: [0.48176868177504945, 25773, 5]
   3: [0.4464285714285714, 26, 8]
   4: [0.5001945111597902, 338, 9]

Compact JSON size: 112,905 bytes (110.26 KB)


## Step 9: Validate JSON Structure

In [10]:
if opening_stats_dict is None:
    print("Skipping validation - no data available")
else:
    print("Validating compact JSON structure...\n")

    entries = list(opening_stats_dict.values())

    # Checks the in-memory dictionary (before serialisation).
    all_int_keys = all(isinstance(key, int) for key in opening_stats_dict)
    print(f"All keys are integers: {all_int_keys}")

    all_arrays = all(isinstance(entry, list) and len(entry) == 3 for entry in entries)
    print(f"All values are 3-element arrays: {all_arrays}")

    sample_value = entries[0]
    print(f"\nSample value structure: {sample_value}")
    print(f"   [0] Opening mean: {type(sample_value[0]).__name__}")
    print(f"   [1] Total games: {type(sample_value[1]).__name__}")
    print(f"   [2] DB ID: {type(sample_value[2]).__name__}")

    # A score is a win/draw fraction, so every mean must lie within [0, 1].
    means = [entry[0] for entry in entries]
    print(f"\nOpening mean range: [{min(means):.4f}, {max(means):.4f}]")
    all_in_range = all(0 <= mean <= 1 for mean in means)
    print(f"All means in [0, 1]: {all_in_range}")

    print("\nJSON structure validation complete")

Validating compact JSON structure...

All keys are integers: True
All values are 3-element arrays: True

Sample value structure: [0.44342817022621733, 539, 2]
   [0] Opening mean: float
   [1] Total games: int
   [2] DB ID: int

Opening mean range: [0.0769, 0.7727]
All means in [0, 1]: True

JSON structure validation complete


## Step 10: Spot Check Data Integrity

In [11]:
if opening_stats_dict is None or stats_with_training_ids is None:
    print("Skipping spot check - no data available")
else:
    print("="*80)
    print("SPOT CHECK: Verifying Opening Stats Integrity")
    print("="*80)

    import random
    # Fixed seed so the same entries are sampled on every run.
    random.seed(42)

    all_training_ids = list(opening_stats_dict)
    sample_size = min(200, len(all_training_ids))
    sample_training_ids = random.sample(all_training_ids, sample_size)

    print(f"\nSpot checking {sample_size} random entries...")

    verification_df = stats_with_training_ids.set_index('training_id')
    mismatches = []

    for checked, training_id in enumerate(sample_training_ids, start=1):
        json_data = opening_stats_dict[training_id]
        df_row = verification_df.loc[training_id]
        df_data = [
            float(df_row['opening_mean']),
            int(df_row['opening_total_games']),
            int(df_row['db_id']),
        ]

        # Means are compared with a small tolerance; counts and ids exactly.
        json_mean, json_games, json_db_id = json_data
        entry_agrees = (
            abs(json_mean - df_data[0]) < 1e-6
            and json_games == df_data[1]
            and json_db_id == df_data[2]
        )

        if not entry_agrees:
            mismatches.append({
                'training_id': training_id,
                'json_data': json_data,
                'df_data': df_data,
                'eco': df_row['eco'],
                'name': df_row['name'],
            })

        if checked % 50 == 0:
            print(f"   Checked {checked}/{sample_size} entries...")

    print("\nSpot check complete!")
    print(f"   Total entries checked: {sample_size}")
    print(f"   Mismatches found: {len(mismatches)}")

    if not mismatches:
        print(f"\nAll {sample_size} entries verified successfully!")
        print("   JSON data matches source DataFrame exactly")
    else:
        print(f"\nWARNING: Found {len(mismatches)} mismatches!")
        # Only the first ten mismatches are shown.
        for mismatch in mismatches[:10]:
            print(f"\n   Training ID {mismatch['training_id']}:")
            print(f"      Opening: {mismatch['eco']} - {mismatch['name']}")
            print(f"      JSON:  {mismatch['json_data']}")
            print(f"      DF:    {mismatch['df_data']}")

SPOT CHECK: Verifying Opening Stats Integrity

Spot checking 200 random entries...
   Checked 50/200 entries...
   Checked 100/200 entries...
   Checked 150/200 entries...
   Checked 200/200 entries...

Spot check complete!
   Total entries checked: 200
   Mismatches found: 0

All 200 entries verified successfully!
   JSON data matches source DataFrame exactly


## Step 11: Save JSON and CSV Files

In [12]:
if opening_stats_dict is not None and stats_with_training_ids is not None:
    # Save compact JSON
    output_json_path = MODEL_ARTIFACTS_DIR / f"opening_stats_{COLOR}.json"

    # NOTE: JSON object keys are always strings, so the integer training_ids
    # are stored as "0", "1", ...; consumers must convert keys back to int
    # (or look entries up by string).
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(opening_stats_dict, f)

    json_file_size = output_json_path.stat().st_size
    print(f"Saved compact JSON to: {output_json_path}")
    print(f"   File size: {json_file_size:,} bytes ({json_file_size / 1024:.2f} KB)")

    # Verify the JSON round-trips: compare full contents (keys cast back to
    # int), not just the number of entries.
    with open(output_json_path, 'r', encoding='utf-8') as f:
        loaded_dict = json.load(f)

    round_trip_ok = (
        {int(key): value for key, value in loaded_dict.items()} == opening_stats_dict
    )

    print("\nVerified JSON can be loaded")
    print(f"   Loaded {len(loaded_dict)} openings")
    print(f"   Matches original: {round_trip_ok}")
    if not round_trip_ok:
        print("   WARNING: reloaded JSON differs from the in-memory dictionary")

    # Save CSV for human inspection
    output_csv_path = MODEL_ARTIFACTS_DIR / f"opening_stats_{COLOR}.csv"

    csv_data = stats_with_training_ids[[
        'training_id', 'db_id', 'opening_id', 'eco', 'name', 
        'opening_mean', 'opening_total_games', 'opening_num_players'
    ]].sort_values('training_id')

    csv_data.to_csv(output_csv_path, index=False)

    csv_file_size = output_csv_path.stat().st_size
    print(f"\nSaved CSV to: {output_csv_path}")
    print(f"   File size: {csv_file_size:,} bytes ({csv_file_size / 1024:.2f} KB)")
    print(f"   Rows: {len(csv_data)}")

    print("\nFirst 5 rows of CSV:")
    print(csv_data.head(5))
else:
    print("Skipping save - no data available")

Saved compact JSON to: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models/20251212_152017_black/opening_stats_black.json
   File size: 112,905 bytes (110.26 KB)

Verified JSON can be loaded
   Loaded 2728 openings
   Matches original: True

Saved CSV to: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models/20251212_152017_black/opening_stats_black.csv
   File size: 257,957 bytes (251.91 KB)
   Rows: 2728

First 5 rows of CSV:
   training_id  db_id  opening_id  eco                           name  \
0            0      2           2  A00                   Amar Opening   
1            1      3           3  A00     Amar Opening: Paris Gambit   
2            2      5           5  A00            Anderssen's Opening   
3            3      8           8  A00  Barnes Opening: Gedult Gambit   
4            4      9           9  A00   Barnes Opening: Hammerschlag   

   opening_mean  opening_total_games  opening_num_players  
0      0.443428            

## Step 12: Test Bayesian Shrinkage Usage

In [13]:
if opening_stats_dict is None:
    print("Skipping usage test - no data available")
else:
    print("Testing production usage: Bayesian shrinkage calculation\n")

    test_training_ids = list(opening_stats_dict)[:5]
    k = 50  # Shrinkage constant

    # Simulate player with 15 games; the same player is used for every opening.
    player_games = 15
    player_raw_score = 0.6
    # Weight given to the player's own results versus the opening mean.
    confidence = player_games / (player_games + k)

    print(f"Using k={k} for Bayesian shrinkage\n")

    for i, training_id in enumerate(test_training_ids, 1):
        # Index 0: mean, index 1: total games
        opening_mean, total_games_db = opening_stats_dict[training_id][:2]

        # Blend of the raw score and the opening mean, weighted by game count and k.
        adjusted_score = ((player_games * player_raw_score) + (k * opening_mean)) / (player_games + k)

        print(f"{i}. Training ID {training_id}:")
        print(f"   Opening mean from DB: {opening_mean:.4f} (from {total_games_db:,} total games)")
        print(f"   Player: {player_games} games, raw score: {player_raw_score:.4f}")
        print(f"   Adjusted score: {adjusted_score:.4f}")
        print(f"   Confidence: {confidence:.4f}")
        print(f"   Adjustment: {adjusted_score - player_raw_score:+.4f}\n")

    print("Bayesian shrinkage test complete")

Testing production usage: Bayesian shrinkage calculation

Using k=50 for Bayesian shrinkage

1. Training ID 0:
   Opening mean from DB: 0.4434 (from 539 total games)
   Player: 15 games, raw score: 0.6000
   Adjusted score: 0.4796
   Confidence: 0.2308
   Adjustment: -0.1204

2. Training ID 1:
   Opening mean from DB: 0.5000 (from 14 total games)
   Player: 15 games, raw score: 0.6000
   Adjusted score: 0.5231
   Confidence: 0.2308
   Adjustment: -0.0769

3. Training ID 2:
   Opening mean from DB: 0.4818 (from 25,773 total games)
   Player: 15 games, raw score: 0.6000
   Adjusted score: 0.5091
   Confidence: 0.2308
   Adjustment: -0.0909

4. Training ID 3:
   Opening mean from DB: 0.4464 (from 26 total games)
   Player: 15 games, raw score: 0.6000
   Adjusted score: 0.4819
   Confidence: 0.2308
   Adjustment: -0.1181

5. Training ID 4:
   Opening mean from DB: 0.5002 (from 338 total games)
   Player: 15 games, raw score: 0.6000
   Adjusted score: 0.5232
   Confidence: 0.2308
   Adjustm

## Step 13: Summary

In [14]:
print("\n" + "="*60)
print("SUMMARY")
print("="*60)

if opening_stats_dict is None:
    print(
        "\nNo files created",
        "   Ensure opening_mappings.csv exists in model directory",
        "   Verify MODEL_DIR_NAME points to correct model",
        sep="\n",
    )
else:
    # Paths are rebuilt here from COLOR and the model directory.
    json_path = MODEL_ARTIFACTS_DIR / f'opening_stats_{COLOR}.json'
    csv_path = MODEL_ARTIFACTS_DIR / f'opening_stats_{COLOR}.csv'

    print(f"\nSuccessfully created opening stats for {COLOR.upper()}")
    print("\nJSON (compact format for production):")
    print(f"   Location: {json_path}")
    print(f"   Total openings: {len(opening_stats_dict)}")
    print(f"   File size: {json_path.stat().st_size / 1024:.2f} KB")
    print(f"   Format: {{training_id: [mean, total_games, db_id]}}")

    print("\nCSV (human-readable for inspection):")
    print(f"   Location: {csv_path}")
    print(f"   File size: {csv_path.stat().st_size / 1024:.2f} KB")

    print(
        "\nNext steps:",
        f"   1. Load opening_stats_{COLOR}.json in inference pipeline at startup",
        "   2. Use opening_mean (index 0) for Bayesian shrinkage during inference",
        "   3. To process other color: update COLOR variable and MODEL_DIR_NAME, then re-run",
        sep="\n",
    )

print("\n" + "="*60)
print("NOTEBOOK COMPLETE")
print("="*60)


SUMMARY

Successfully created opening stats for BLACK

JSON (compact format for production):
   Location: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models/20251212_152017_black/opening_stats_black.json
   Total openings: 2728
   File size: 110.26 KB
   Format: {training_id: [mean, total_games, db_id]}

CSV (human-readable for inspection):
   Location: /Users/a/Documents/personalprojects/chess-opening-recommender/data/models/20251212_152017_black/opening_stats_black.csv
   File size: 251.91 KB

Next steps:
   1. Load opening_stats_black.json in inference pipeline at startup
   2. Use opening_mean (index 0) for Bayesian shrinkage during inference
   3. To process other color: update COLOR variable and MODEL_DIR_NAME, then re-run

NOTEBOOK COMPLETE


In [15]:
# Release the read-only DuckDB connection opened in Step 2.
conn.close()
print("Database connection closed")

Database connection closed
