In [1]:
# Configuration
from pathlib import Path
from utils.database.db_utils import get_db_connection
 
# Define the path to the DuckDB database file
project_root = Path.cwd().parent if "notebooks" in str(Path.cwd()) else Path.cwd()
db_path = project_root / "data" / "processed" / "chess_games.db"
 
# Define partitions for player_opening_stats
partitions = list("ABCDE") + ["other"]
 
# Print configuration details
print(f"Database path: {db_path}")
print(f"Partitions: {partitions}")

Database path: /Users/a/Documents/personalprojects/chess-opening-recommender/data/processed/chess_games.db
Partitions: ['A', 'B', 'C', 'D', 'E', 'other']


# Purpose

To decrease the size of our opening stats DB, making queries faster and more efficient.

# Methods:

    1. Change INTs to SMALLINT
        - player_id and opening_id
        - num_draws
            - Not doing this on num_wins and num_losses because those may exceed the limit of this data type (~32,000)
        - Not doing this with player_opening_stats ID because that's a composite string, not an int 
        - Saves 2 bytes per entry
    
    2. Change `color` VARCHAR to w/b enum
        - 1b
        - Compared to the 12-16bytes the previous varchar took up

# Considerations
    - Saving a copy of the DB in case I bork this

In [2]:
# Print current database size
import os
try:
    db_size_bytes = os.path.getsize(db_path)
    db_size_mb = db_size_bytes / (1024 * 1024)
    print(f"Current database file size: {db_size_mb:.2f} MB")
except FileNotFoundError:
    print("Database file not found.")
    db_size_mb = 0

Current database file size: 1385.51 MB


In [None]:
# Execute migration pipeline
if db_size_mb > 0:
    with get_db_connection(db_path) as con:
        print("Creating new schema for partitioned tables...")
        con.execute("CREATE TYPE IF NOT EXISTS color_enum AS ENUM ('w', 'b');")

        # Create new player and opening tables with appropriate ID types
        print("Creating new player and opening tables...")
        con.execute("""
            CREATE TABLE player_new (
                id INTEGER PRIMARY KEY,
                name VARCHAR
            );
        """)
        con.execute("""
            CREATE TABLE opening_new (
                id INTEGER PRIMARY KEY,
                eco VARCHAR,
                name VARCHAR
            );
        """)

        # Migrate data to new player and opening tables
        print("Migrating data to new player and opening tables...")
        con.execute("INSERT INTO player_new SELECT id, name FROM player;")
        con.execute("INSERT INTO opening_new SELECT id, eco, name FROM opening;")

        # Create new player_opening_stats tables with optimized schema (no foreign keys initially)
        print("Creating new player_opening_stats tables...")
        for letter in partitions:
            new_table = f"player_opening_stats_{letter}_new"
            con.execute(f"""
                CREATE TABLE {new_table} (
                    player_id   INTEGER NOT NULL,
                    opening_id  INTEGER NOT NULL,
                    color       color_enum NOT NULL,
                    num_wins    INTEGER DEFAULT 0,
                    num_draws   SMALLINT DEFAULT 0,
                    num_losses  INTEGER DEFAULT 0,
                    PRIMARY KEY (player_id, opening_id, color)
                );
            """)

        # Migrate data from old player_opening_stats tables to new ones
        print("Migrating player_opening_stats data...")
        for letter in partitions:
            old_table = f"player_opening_stats_{letter}"
            new_table = f"player_opening_stats_{letter}_new"
            print(f"Migrating data from {old_table} to {new_table}...")
            con.execute(f"""
                INSERT INTO {new_table} (player_id, opening_id, color, num_wins, num_draws, num_losses)
                SELECT player_id, opening_id, color, num_wins, num_draws, num_losses
                FROM {old_table}
                WHERE num_wins <> 0 OR num_draws <> 0 OR num_losses <> 0;
            """)

        # Now drop old tables and rename new ones
        print("Swapping old tables with new tables...")
        for letter in partitions:
            old_table = f"player_opening_stats_{letter}"
            new_table = f"player_opening_stats_{letter}_new"
            con.execute(f"DROP TABLE {old_table};")
            con.execute(f"ALTER TABLE {new_table} RENAME TO {old_table};")

        # Drop old player and opening tables
        con.execute("DROP TABLE player;")
        con.execute("DROP TABLE opening;")

        # Rename new tables
        con.execute("ALTER TABLE player_new RENAME TO player;")
        con.execute("ALTER TABLE opening_new RENAME TO opening;")

        # Now add foreign key constraints to the final tables
        print("Adding foreign key constraints...")
        for letter in partitions:
            table_name = f"player_opening_stats_{letter}"
            con.execute(f"ALTER TABLE {table_name} ADD CONSTRAINT fk_player_{letter} FOREIGN KEY (player_id) REFERENCES player(id);")
            con.execute(f"ALTER TABLE {table_name} ADD CONSTRAINT fk_opening_{letter} FOREIGN KEY (opening_id) REFERENCES opening(id);")

        print("Migration complete. All data has been preserved with optimized schema.")

Creating new schema for partitioned tables...
Creating new player and opening tables...
Migrating data to new player and opening tables...
Creating new player_opening_stats tables...
Migrating player_opening_stats data...
Migrating data from player_opening_stats_A to player_opening_stats_A_new...
Migrating data from player_opening_stats_B to player_opening_stats_B_new...
Migrating data from player_opening_stats_B to player_opening_stats_B_new...
Migrating data from player_opening_stats_C to player_opening_stats_C_new...
Migrating data from player_opening_stats_C to player_opening_stats_C_new...
Migrating data from player_opening_stats_D to player_opening_stats_D_new...
Migrating data from player_opening_stats_D to player_opening_stats_D_new...
Migrating data from player_opening_stats_E to player_opening_stats_E_new...
Migrating data from player_opening_stats_E to player_opening_stats_E_new...
Migrating data from player_opening_stats_other to player_opening_stats_other_new...
Swapping o

DependencyException: Dependency Error: Cannot alter entry "player_new" because there are entries that depend on it.

In [None]:
# Print new database size and vacuum
try:
    db_size_bytes = os.path.getsize(db_path)
    db_size_mb = db_size_bytes / (1024 * 1024)
    print(f"New database file size: {db_size_mb:.2f} MB")
 
    with get_db_connection(db_path) as con:
        print("Vacuuming database to optimize storage...")
        con.execute("VACUUM;")
        print("Vacuum complete.")
 
        db_size_bytes = os.path.getsize(db_path)
        db_size_mb = db_size_bytes / (1024 * 1024)
        print(f"Post-vacuum database file size: {db_size_mb:.2f} MB")
except FileNotFoundError:
    print("Database file not found.")