In [1]:
# Configuration
from pathlib import Path
from utils.database.db_utils import get_db_connection
 
# Locate the DuckDB file. When the working-directory path contains "notebooks",
# the project root is taken to be one level above the cwd; otherwise the cwd
# itself is used as the project root.
if "notebooks" in str(Path.cwd()):
    project_root = Path.cwd().parent
else:
    project_root = Path.cwd()
db_path = project_root.joinpath("data", "processed", "chess_games.db")

# Suffixes of the player_opening_stats_<suffix> partition tables
partitions = [*"ABCDE", "other"]

# Echo the configuration so the run is self-documenting
print(f"Database path: {db_path}")
print(f"Partitions: {partitions}")

Database path: /Users/a/Documents/personalprojects/chess-opening-recommender/data/processed/chess_games.db
Partitions: ['A', 'B', 'C', 'D', 'E', 'other']


# Purpose

To decrease the size of our opening stats DB, making queries faster and more efficient.

# Methods:

    1. Change INTs to SMALLINT
        - player_id and opening_id (IDs are renumbered from 1, and SMALLINT is used only when the table has at most 32,767 rows)
        - num_draws
            - Not doing this on num_wins and num_losses because those may exceed the SMALLINT maximum of 32,767
        - Not doing this with player_opening_stats ID because that's a composite string, not an int 
        - Saves 2 bytes per entry
    
    2. Change `color` VARCHAR to w/b enum
        - An enum value takes 1 byte
        - Compared to the 12-16 bytes the previous VARCHAR took up
        
# New Strategy: Export and Re-import

The previous in-place compaction strategy caused the database file to bloat due to how DuckDB handles large transactions and table modifications. The new approach is to export the optimized data to temporary Parquet files and then create a completely new, clean database from them. This ensures the final file is as small as possible, with no leftover transactional history or fragmentation.

# Considerations
    - Saving a copy of the DB in case I bork this

In [2]:
# Report how large the source database file currently is
import os

try:
    db_size_bytes = os.path.getsize(db_path)
except FileNotFoundError:
    # A size of 0 marks the source database as absent
    print("Database file not found.")
    db_size_mb = 0
else:
    db_size_mb = db_size_bytes / 1024 ** 2
    print(f"Current database file size: {db_size_mb:.2f} MB")

Current database file size: 1385.51 MB


In [3]:
# Diagnostic - check actual table schemas and data counts
def _print_schema(con, table):
    """Print every column name and type of `table`; return the DESCRIBE rows."""
    schema_rows = con.execute(f"DESCRIBE {table}").fetchall()
    print(f"\n{table.upper()} table schema:")
    for col in schema_rows:
        print(f"  {col[0]}: {col[1]}")
    return schema_rows


def _fmt(value, spec):
    """Format `value` with `spec`, or return 'n/a' when an aggregate came back NULL."""
    return "n/a" if value is None else format(value, spec)


with get_db_connection(db_path) as con:
    print("=== CURRENT TABLE SCHEMAS ===")

    player_schema = _print_schema(con, "player")
    opening_schema = _print_schema(con, "opening")
    # Only the "A" partition is described here
    stats_schema = _print_schema(con, "player_opening_stats_A")

    print("\n=== DATA RANGES ===")

    # Largest IDs currently in use (MAX is NULL on an empty table)
    max_player_id = con.execute("SELECT MAX(id) FROM player").fetchone()[0]
    max_opening_id = con.execute("SELECT MAX(id) FROM opening").fetchone()[0]
    print(f"Max player_id: {_fmt(max_player_id, ',')}")
    print(f"Max opening_id: {_fmt(max_opening_id, ',')}")

    # Row count per partition; iterate the configured `partitions` list so this
    # stays in sync with the configuration cell
    total_stats_rows = 0
    for letter in partitions:
        count = con.execute(f"SELECT COUNT(*) FROM player_opening_stats_{letter}").fetchone()[0]
        print(f"player_opening_stats_{letter}: {count:,} rows")
        total_stats_rows += count

    print(f"\nTotal stats rows: {total_stats_rows:,}")

    # Distinct color values and how often each occurs
    color_dist = con.execute("SELECT color, COUNT(*) FROM player_opening_stats_A GROUP BY color").fetchall()
    print(f"\nColor distribution in stats_A: {color_dist}")

    # Range of num_draws; AVG is NULL when the table has no rows
    draws_stats = con.execute("SELECT MIN(num_draws), MAX(num_draws), AVG(num_draws) FROM player_opening_stats_A").fetchone()
    print(f"\nnum_draws in stats_A - Min: {draws_stats[0]}, Max: {draws_stats[1]}, Avg: {_fmt(draws_stats[2], '.2f')}")

=== CURRENT TABLE SCHEMAS ===

PLAYER table schema:
  id: INTEGER
  name: VARCHAR
  title: VARCHAR

OPENING table schema:
  id: INTEGER
  eco: VARCHAR
  name: VARCHAR

PLAYER_OPENING_STATS_A table schema:
  player_id: INTEGER
  opening_id: INTEGER
  color: VARCHAR
  num_wins: INTEGER
  num_draws: INTEGER
  num_losses: INTEGER

=== DATA RANGES ===
Max player_id: 32,964,861
Max opening_id: 5,222,927
player_opening_stats_A: 3,137,793 rows
player_opening_stats_B: 3,339,867 rows
player_opening_stats_C: 4,175,489 rows
player_opening_stats_D: 1,771,597 rows
player_opening_stats_E: 442,866 rows
player_opening_stats_other: 0 rows

Total stats rows: 12,867,612

Color distribution in stats_A: [('b', 2029230), ('w', 1108563)]

num_draws in stats_A - Min: 0, Max: 480, Avg: 0.45


In [4]:
# Execute PROPER database shrinking - export and re-import to a new DB file.
#
# Phases:
#   1. Export: write renumbered player/opening tables (with old->new ID mappings)
#      and the non-empty stats rows of every partition to Parquet files.
#   2. Import: build a brand-new database file with the narrower column types and
#      load the Parquet files, translating old IDs to new IDs via the mappings.
#   3. Cleanup: remove the temporary Parquet directory.
#
# Everything, including the cleanup, runs only when the source database was found
# (db_size_mb > 0); otherwise the cell reports that there is nothing to do.
import shutil

if db_size_mb > 0:
    # Define paths for the new database and temporary export directory
    db_path_new = db_path.with_name(f"{db_path.stem}_shrunk.db")
    export_dir = project_root / "data" / "processed" / "temp_export"
    export_dir.mkdir(exist_ok=True)

    # Parquet files used to hand data from the old database to the new one
    player_parquet = export_dir / "player_mapping.parquet"
    opening_parquet = export_dir / "opening_mapping.parquet"
    stats_parquets = {letter: export_dir / f"stats_{letter}.parquet" for letter in partitions}

    print(f"New database will be created at: {db_path_new}")
    print(f"Temporary export directory: {export_dir}")

    with get_db_connection(db_path) as con:
        print("\n=== ANALYZING SOURCE DATABASE ===")
        player_count = con.execute('SELECT COUNT(*) FROM player').fetchone()[0]
        opening_count = con.execute('SELECT COUNT(*) FROM opening').fetchone()[0]

        print(f'Players: {player_count:,} records')
        print(f'Openings: {opening_count:,} records')

        # Determine optimal integer types. New IDs are ROW_NUMBER() values, i.e.
        # 1..row_count, so the row count is also the largest new ID.
        player_id_type = "SMALLINT" if player_count <= 32767 else "INTEGER"
        opening_id_type = "SMALLINT" if opening_count <= 32767 else "INTEGER"
        print(f'\nPlayer IDs can use SMALLINT: {player_id_type == "SMALLINT"} -> Using {player_id_type}')
        print(f'Opening IDs can use SMALLINT: {opening_id_type == "SMALLINT"} -> Using {opening_id_type}')

        # --- EXPORT PHASE ---
        print("\n=== EXPORTING AND TRANSFORMING DATA ===")

        # Export compacted player table (new_id alongside old_id forms the mapping)
        print("Exporting compacted player table...")
        con.execute(f"""
            COPY (
                SELECT 
                    ROW_NUMBER() OVER (ORDER BY name) as new_id,
                    name,
                    title,
                    id as old_id
                FROM player
            ) TO '{player_parquet}' (FORMAT PARQUET);
        """)

        # Export compacted opening table (new_id alongside old_id forms the mapping)
        print("Exporting compacted opening table...")
        con.execute(f"""
            COPY (
                SELECT 
                    ROW_NUMBER() OVER (ORDER BY eco, name) as new_id,
                    eco,
                    name,
                    id as old_id
                FROM opening
            ) TO '{opening_parquet}' (FORMAT PARQUET);
        """)

        # Export and transform stats tables; rows with no games at all are skipped
        print("Exporting and transforming stats tables...")
        for letter in partitions:
            old_table = f"player_opening_stats_{letter}"
            export_path = stats_parquets[letter]
            print(f"  - Exporting {old_table} to {export_path.name}...")

            con.execute(f"""
                COPY (
                    SELECT 
                        s.player_id as old_player_id,
                        s.opening_id as old_opening_id,
                        s.color,
                        s.num_wins,
                        s.num_draws,
                        s.num_losses
                    FROM {old_table} s
                    WHERE s.num_wins > 0 OR s.num_draws > 0 OR s.num_losses > 0
                ) TO '{export_path}' (FORMAT PARQUET);
            """)

    print("\n=== EXPORT COMPLETE ===")

    # --- IMPORT PHASE ---
    # Delete the new DB file if it exists to ensure a fresh start
    if db_path_new.exists():
        db_path_new.unlink()
        print(f"\nDeleted existing new database file: {db_path_new}")

    with get_db_connection(db_path_new) as con_new:
        print("\n=== CREATING NEW DATABASE AND IMPORTING DATA ===")

        # Create new tables
        print("Creating new tables with optimized schema...")
        con_new.execute(f"""
            CREATE TABLE player (
                id      {player_id_type} PRIMARY KEY,
                name    VARCHAR NOT NULL,
                title   VARCHAR
            );
        """)
        con_new.execute(f"""
            CREATE TABLE opening (
                id      {opening_id_type} PRIMARY KEY,
                eco     VARCHAR(3) NOT NULL,
                name    VARCHAR NOT NULL
            );
        """)
        con_new.execute("CREATE TYPE color_enum AS ENUM ('w', 'b');")

        # Import player and opening data
        print("Importing player and opening data...")
        con_new.execute(f"INSERT INTO player (id, name, title) SELECT new_id, name, title FROM read_parquet('{player_parquet}');")
        con_new.execute(f"INSERT INTO opening (id, eco, name) SELECT new_id, eco, name FROM read_parquet('{opening_parquet}');")

        # Create and import stats tables
        print("Creating and importing stats tables...")
        for letter in partitions:
            new_table = f"player_opening_stats_{letter}"
            stats_parquet = stats_parquets[letter]
            print(f"  - Creating and populating {new_table}...")

            con_new.execute(f"""
                CREATE TABLE {new_table} (
                    player_id   {player_id_type} NOT NULL,
                    opening_id  {opening_id_type} NOT NULL,
                    color       color_enum NOT NULL,
                    num_wins    INTEGER DEFAULT 0,
                    num_draws   SMALLINT DEFAULT 0,
                    num_losses  INTEGER DEFAULT 0,
                    PRIMARY KEY (player_id, opening_id, color)
                );
            """)

            # Translate old IDs to new IDs through the exported mappings
            con_new.execute(f"""
                INSERT INTO {new_table}
                WITH player_mapping AS (
                    SELECT old_id, new_id FROM read_parquet('{player_parquet}')
                ),
                opening_mapping AS (
                    SELECT old_id, new_id FROM read_parquet('{opening_parquet}')
                )
                SELECT 
                    pm.new_id AS player_id,
                    om.new_id AS opening_id,
                    s.color::color_enum AS color,
                    s.num_wins,
                    s.num_draws::SMALLINT,
                    s.num_losses
                FROM read_parquet('{stats_parquet}') s
                JOIN player_mapping pm ON s.old_player_id = pm.old_id
                JOIN opening_mapping om ON s.old_opening_id = om.old_id;
            """)

            # The inner joins above discard stats rows whose player or opening has
            # no mapping entry; surface that instead of losing rows silently.
            exported_rows = con_new.execute(f"SELECT COUNT(*) FROM read_parquet('{stats_parquet}')").fetchone()[0]
            imported_rows = con_new.execute(f"SELECT COUNT(*) FROM {new_table}").fetchone()[0]
            if imported_rows != exported_rows:
                print(
                    f"    WARNING: exported {exported_rows:,} rows but imported {imported_rows:,} "
                    f"into {new_table} (rows without a player/opening mapping are dropped by the join)"
                )

        # Recreate the view
        print("Recreating the consolidated view...")
        union_selects = "\nUNION ALL\n".join([
            f"SELECT * FROM player_opening_stats_{letter}"
            for letter in partitions
        ])
        con_new.execute(f"CREATE OR REPLACE VIEW player_opening_stats AS {union_selects};")

        print("\n=== COMPACTION AND RE-IMPORT COMPLETE ===")
        final_player_stats = con_new.execute('SELECT MIN(id), MAX(id), COUNT(*) FROM player').fetchone()
        final_opening_stats = con_new.execute('SELECT MIN(id), MAX(id), COUNT(*) FROM opening').fetchone()

        print(f'Final player IDs: {final_player_stats[0]:,} to {final_player_stats[1]:,} ({final_player_stats[2]:,} records)')
        print(f'Final opening IDs: {final_opening_stats[0]:,} to {final_opening_stats[1]:,} ({final_opening_stats[2]:,} records)')

        total_final_records = 0
        for letter in partitions:
            count = con_new.execute(f'SELECT COUNT(*) FROM player_opening_stats_{letter}').fetchone()[0]
            total_final_records += count
        print(f'\nTotal optimized records: {total_final_records:,}')

    # Clean up temporary files. This lives inside the guard because export_dir is
    # only defined when the source database exists.
    shutil.rmtree(export_dir)
    print(f"\nTemporary export directory cleaned up: {export_dir}")
else:
    print("Source database not found or empty; nothing to shrink.")

New database will be created at: /Users/a/Documents/personalprojects/chess-opening-recommender/data/processed/chess_games_shrunk.db
Temporary export directory: /Users/a/Documents/personalprojects/chess-opening-recommender/data/processed/temp_export

=== ANALYZING SOURCE DATABASE ===
Players: 44,459 records
Openings: 3,132 records

Player IDs can use SMALLINT: False -> Using INTEGER
Opening IDs can use SMALLINT: True -> Using SMALLINT

=== EXPORTING AND TRANSFORMING DATA ===
Exporting compacted player table...
Exporting compacted opening table...
Exporting and transforming stats tables...
  - Exporting player_opening_stats_A to stats_A.parquet...

=== ANALYZING SOURCE DATABASE ===
Players: 44,459 records
Openings: 3,132 records

Player IDs can use SMALLINT: False -> Using INTEGER
Opening IDs can use SMALLINT: True -> Using SMALLINT

=== EXPORTING AND TRANSFORMING DATA ===
Exporting compacted player table...
Exporting compacted opening table...
Exporting and transforming stats tables...


In [5]:
# Report the size of the rebuilt database, run VACUUM on it, then report the size again.
# NOTE(review): the recorded output shows the same size before and after, so VACUUM
# does not appear to reclaim any space here - confirm whether this step is needed.
try:
    db_path_new = db_path.with_name(db_path.stem + "_shrunk.db")

    # Size before VACUUM
    db_size_bytes = os.path.getsize(db_path_new)
    db_size_mb = db_size_bytes / 1024 ** 2
    print(f"New shrunken database file size: {db_size_mb:.2f} MB")

    with get_db_connection(db_path_new) as con:
        print("Vacuuming the new database to optimize storage...")
        con.execute("VACUUM;")
        print("Vacuum complete.")

    # Size after VACUUM
    db_size_bytes = os.path.getsize(db_path_new)
    db_size_mb = db_size_bytes / 1024 ** 2
    print(f"Post-vacuum shrunken database file size: {db_size_mb:.2f} MB")
except FileNotFoundError:
    print("New database file not found.")

New shrunken database file size: 655.51 MB
Vacuuming the new database to optimize storage...
Vacuum complete.
Post-vacuum shrunken database file size: 655.51 MB


In [6]:
# Final check: report the on-disk size of the shrunken database
try:
    db_path_new = db_path.with_name(db_path.stem + "_shrunk.db")
    db_size_bytes = os.path.getsize(db_path_new)
    db_size_mb = db_size_bytes / 1024 ** 2
    print(f"Final shrunken database file size: {db_size_mb:.2f} MB")
except FileNotFoundError:
    print("New database file not found.")

Final shrunken database file size: 655.51 MB
