In [None]:
# Configuration
from pathlib import Path
from utils.database.db_utils import get_db_connection
 
# Locate the project root. Step up one level only when the notebook is run from
# a directory literally named "notebooks"; a substring test on the whole path
# would also fire for unrelated ancestors (e.g. ".../my_notebooks_repo/") and
# resolve the wrong root.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir

# Path to the DuckDB database file
db_path = project_root / "data" / "processed" / "chess_games.db"

# Suffixes of the player_opening_stats_<suffix> partition tables
partitions = list("ABCDE") + ["other"]

# Print configuration details
print(f"Database path: {db_path}")
print(f"Partitions: {partitions}")

# Purpose

To decrease the size of our opening stats DB, making queries faster and more efficient.

# Methods:

    1. Change INTs to SMALLINT
        - player_id and opening_id
        - num_draws
            - Not doing this on num_wins and num_losses because those may exceed the SMALLINT maximum (32,767)
        - Not doing this with player_opening_stats ID because that's a composite string, not an int 
        - Nominally saves 2 bytes per converted column per row (4-byte INTEGER vs. 2-byte SMALLINT)
    
    2. Change `color` VARCHAR to w/b enum
        - An enum value takes 1 byte
        - Compared to the 12-16 bytes the previous VARCHAR took up

# Considerations
    - Save a copy of the DB file before running the migration, in case I bork this (manual step; the cells below do not create a backup)

In [None]:
# Report the on-disk size of the database before migrating.
# db_size_mb doubles as the "database exists" flag for the migration cell:
# it is left at 0 when the file cannot be found.
import os

try:
    db_size_bytes = os.path.getsize(db_path)
except FileNotFoundError:
    print("Database file not found.")
    db_size_mb = 0
else:
    db_size_mb = db_size_bytes / (1024 * 1024)
    print(f"Current database file size: {db_size_mb:.2f} MB")

In [None]:
# Execute migration pipeline
#
# For every partition, rebuild player_opening_stats_<partition> with narrower
# column types (SMALLINT ids and draw counts, an ENUM for color), copy the rows
# across, then replace the old table with the rebuilt one.
#
# NOTE(review): the drop/rename swap below is not wrapped in an explicit
# transaction (get_db_connection may or may not manage one; confirm), so keep
# a backup copy of the database file before running this cell.
if db_size_mb > 0:
    with get_db_connection(db_path) as con:
        print("Creating new schema for partitioned tables...")
        con.execute("CREATE TYPE IF NOT EXISTS color_enum AS ENUM ('w', 'b');")

        for letter in partitions:
            table_name = f"player_opening_stats_{letter}_new"
            # A previous run that failed part-way can leave a populated staging
            # table behind. Reusing it would make the INSERT below collide on
            # the primary key, so always start from an empty staging table.
            con.execute(f"DROP TABLE IF EXISTS {table_name};")
            # DuckDB foreign keys do not support ON DELETE CASCADE (the
            # statement is rejected), so the constraints carry no cascade action.
            con.execute(f"""
                CREATE TABLE {table_name} (
                    player_id   SMALLINT NOT NULL,
                    opening_id  SMALLINT NOT NULL,
                    color       color_enum NOT NULL,
                    num_wins    INTEGER DEFAULT 0,
                    num_draws   SMALLINT DEFAULT 0,
                    num_losses  INTEGER DEFAULT 0,
                    PRIMARY KEY (player_id, opening_id, color),
                    FOREIGN KEY (player_id) REFERENCES player(id),
                    FOREIGN KEY (opening_id) REFERENCES opening(id)
                );
            """)
        print("New schema created successfully.")

        for letter in partitions:
            old_table = f"player_opening_stats_{letter}"
            new_table = f"{old_table}_new"
            print(f"Migrating data from {old_table} to {new_table}...")
            # Rows whose three counters are all zero are not copied across.
            # NOTE(review): assumes existing color values are exactly 'w'/'b'
            # and that ids / draw counts fit in SMALLINT; otherwise the INSERT
            # is expected to fail on the cast. Verify against the data.
            con.execute(f"""
                INSERT INTO {new_table} (player_id, opening_id, color, num_wins, num_draws, num_losses)
                SELECT player_id, opening_id, color, num_wins, num_draws, num_losses
                FROM {old_table}
                WHERE num_wins <> 0 OR num_draws <> 0 OR num_losses <> 0;
            """)
            print(f"Data migrated for partition {letter}.")

        # The swap only starts once every partition has been copied, so a
        # failed INSERT above leaves all original tables untouched.
        print("Swapping old tables with new tables...")
        for letter in partitions:
            old_table = f"player_opening_stats_{letter}"
            new_table = f"{old_table}_new"
            con.execute(f"DROP TABLE {old_table};")
            con.execute(f"ALTER TABLE {new_table} RENAME TO {old_table};")
        print("Table swap complete.")
else:
    # db_size_mb is 0 when the size cell could not find the file (or it is empty).
    print("Skipping migration: database file is missing or empty.")

In [None]:
# Print new database size and vacuum
#
# Reports the file size after the migration, runs VACUUM, then reports the
# size again.
# NOTE(review): DuckDB documents VACUUM as not reclaiming disk space, so the
# file may not shrink here; confirm whether a checkpoint or a copy into a
# fresh database file is needed to see the savings.
try:
    db_size_bytes = os.path.getsize(db_path)
    db_size_mb = db_size_bytes / (1024 * 1024)
    print(f"New database file size: {db_size_mb:.2f} MB")

    with get_db_connection(db_path) as con:
        print("Vacuuming database to optimize storage...")
        con.execute("VACUUM;")
        print("Vacuum complete.")

    # Measure only after leaving the `with` block: presumably the connection is
    # closed on exit, so any pending writes have reached the file by now.
    # Reading the size while the connection is still open can under-report
    # the effect of the statements above.
    db_size_bytes = os.path.getsize(db_path)
    db_size_mb = db_size_bytes / (1024 * 1024)
    print(f"Post-vacuum database file size: {db_size_mb:.2f} MB")
except FileNotFoundError:
    print("Database file not found.")