# Downloading Experiments

This notebook is a proof of concept.

Right now, we have some flaws in our raw data download system. We want to make sure we get this right because we will be downloading and processing hundreds or thousands of GBs of raw data parquet files.

So, we'll be messing around here with some implementations, and if they work, we'll be replacing parts of our original code with this.

In [1]:
%pip install huggingface_hub --quiet
import os

# --- Configuration ---
# Percent of the machine's logical CPUs to allocate (an approximation).
# The resulting thread count is exported to OpenMP/OpenBLAS below and is
# reused later in the notebook for DuckDB's `threads` setting.
cpu_allocation_percent = 85  # Adjust this value as needed

# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single CPU rather than failing with a TypeError.
detected_cpu_count = os.cpu_count() or 1

# Always keep at least one thread, even for tiny percentages / CPU counts.
allocated_threads = max(1, int(detected_cpu_count * (cpu_allocation_percent / 100)))
print(
    f"Allocating {allocated_threads} threads based on {cpu_allocation_percent}% CPU usage."
)


# Publish the thread budget through the environment variables that
# OpenMP- and OpenBLAS-backed libraries read for their thread pools.
os.environ["OMP_NUM_THREADS"] = str(allocated_threads)
os.environ["OPENBLAS_NUM_THREADS"] = str(allocated_threads)

# Echo the values back so the cell output records what was actually set.
print("OMP_NUM_THREADS:", os.environ.get("OMP_NUM_THREADS"))
print("OPENBLAS_NUM_THREADS:", os.environ.get("OPENBLAS_NUM_THREADS"))


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Allocating 10 threads based on 85% CPU usage.
OMP_NUM_THREADS: 10
OPENBLAS_NUM_THREADS: 10


## Getting file names

We want to use file names and other metadata to run duplicate ("dupe") checks against what we've already processed, and to see what we still need to download for a given month. Let's run the code below, which fetches all file names from the repo, to see what the data looks like.

TODO I wrote get_parquet_file_names for this; if we use this notebook again, convert this functionality to use that util.

In [2]:
# Here we get a list of raw data files for the given month/year

# TODO I wrote get_parquet_file_names for this; if we use this notebook again, convert this functionality to use that util.

# File names in the remote repo are structured like:
# data/year=2025/month=03/train-00001-of-00065.parquet
# Obviously, there will be different amounts of them so it won't always be -00065.parquet

from huggingface_hub import HfApi
from pathlib import Path
import os

# Month to inspect, plus an optional cap on how many files later cells handle.
year = 2024
month = 11
max_files_to_download = 250  # For testing; set to None to process all new files

# Ask the Hub for every file path in the dataset repository.
api = HfApi()
files = api.list_repo_files(repo_id="Lichess/standard-chess-games", repo_type="dataset")

# Keep only the paths that sit under this year/month partition.
target_prefix = f"data/year={year}/month={month:02d}/"
all_file_names_in_month = list(
    filter(lambda repo_path: repo_path.startswith(target_prefix), files)
)

# Report how many files matched, then preview the first 20 names.
print(len(all_file_names_in_month), f"files found for {year}-{month}")
preview_names = all_file_names_in_month[:20]
for preview_name in preview_names:
    print(preview_name)

385 files found for 2024-11
data/year=2024/month=11/train-00000-of-00385.parquet
data/year=2024/month=11/train-00001-of-00385.parquet
data/year=2024/month=11/train-00002-of-00385.parquet
data/year=2024/month=11/train-00003-of-00385.parquet
data/year=2024/month=11/train-00004-of-00385.parquet
data/year=2024/month=11/train-00005-of-00385.parquet
data/year=2024/month=11/train-00006-of-00385.parquet
data/year=2024/month=11/train-00007-of-00385.parquet
data/year=2024/month=11/train-00008-of-00385.parquet
data/year=2024/month=11/train-00009-of-00385.parquet
data/year=2024/month=11/train-00010-of-00385.parquet
data/year=2024/month=11/train-00011-of-00385.parquet
data/year=2024/month=11/train-00012-of-00385.parquet
data/year=2024/month=11/train-00013-of-00385.parquet
data/year=2024/month=11/train-00014-of-00385.parquet
data/year=2024/month=11/train-00015-of-00385.parquet
data/year=2024/month=11/train-00016-of-00385.parquet
data/year=2024/month=11/train-00017-of-00385.parquet
data/year=2024/mon

## Dupe checks

Now that we have the list of raw data file names for the given month and year, we'll perform our dupe checks. This parses through the list of file names to make sure we haven't already processed any of these files.

Each file is a 1GB download, so it's obviously in our best interest not to download a file we've already processed.

In [None]:
# Parse the list of raw data file names to make sure we haven't already processed any of these files, and skip downloading any dupes

from huggingface_hub import get_hf_file_metadata, hf_hub_url

import sys

# Current working directory (should be project root)
project_root = Path.cwd()
sys.path.insert(0, str(project_root))

from utils.file_processing.raw_data_file_dupe_checks import FileRegistry  # noqa: E402

# Registry that records which raw files have already been processed.
registry = FileRegistry()

# Month key passed to the registry. Note the month is NOT zero-padded here,
# unlike the file name built inside the loop below.
month_str = f"{year}-{month}"

# Collect every remote file the registry does not report as processed.
non_dupe_files = []
for repo_path in all_file_names_in_month:
    # Look up the remote file's size and etag; both are handed to the registry check.
    meta = get_hf_file_metadata(
        url=hf_hub_url(
            repo_id="Lichess/standard-chess-games",
            repo_type="dataset",
            filename=repo_path,
        )
    )

    # This is the filename format that will be saved in the registry
    registry_file_name = f"{year}-{month:02d}-{Path(repo_path).name}"

    already_processed = registry.is_file_processed(
        month_str, registry_file_name, meta.size, meta.etag
    )
    if already_processed:
        continue
    non_dupe_files.append(repo_path)

print(len(non_dupe_files), "new files to download")

## Config

Now to configure how we want our downloading and processing pipeline to operate.

In [None]:
# Config stuff
import importlib
from utils.database import db_utils, player_game_counts_db_utils
from utils.file_processing import process_game_batch
importlib.reload(db_utils)
importlib.reload(process_game_batch)
importlib.reload(player_game_counts_db_utils)


from utils.downloading_raw_parquet_data.raw_parquet_data_file_downloader import (
    download_single_parquet_file,
)
from utils.file_processing.process_parquet_file import (
    process_parquet_file,
)
from utils.file_processing.types_and_classes import ProcessingConfig
from utils.database.db_utils import get_db_connection, setup_database
from utils.database.player_game_counts_db_utils import get_eligible_player_usernames


# --- Configuration ---
# Scratch directory that receives the downloaded raw parquet files.
local_dir = Path("../data/raw/better_downloading_experiments")
local_dir.mkdir(parents=True, exist_ok=True)

# DuckDB database file for the processed output; make sure its folder exists.
db_path = Path("../data/processed/chess_games.db")
db_path.parent.mkdir(parents=True, exist_ok=True)

# DuckDB database file that holds our list of eligible players.
player_counts_db_path = Path(
    "../data/processed/find_most_active_players/player_game_counts.duckdb"
)


# Template processing config; `parquet_path` is a placeholder here and is
# filled in per file when the processing loop runs.
base_config = ProcessingConfig(
    parquet_path="",  # This will be set per-file
    db_path=db_path,
    batch_size=1_500_000,
    min_player_rating=1200,
    max_elo_difference_between_players=100,
    allowed_time_controls={"Blitz", "Rapid", "Classical"},
)

# --- Database Initialization ---
# Create the schema only when no `player_opening_stats_*` table exists yet;
# running setup against an already-populated DB is skipped to avoid
# schema/constraint errors.
with get_db_connection(db_path) as con:
    main_schema_rows = con.execute("SELECT table_name FROM information_schema.tables WHERE table_schema='main';").fetchall()
    table_names = {row[0] for row in main_schema_rows}
    stats_tables_exist = any(
        name.startswith("player_opening_stats_") for name in table_names
    )
    if not stats_tables_exist:
        setup_database(con)
    # NOTE(review): this thread setting is issued on a connection that is
    # presumably closed when the `with` block exits -- confirm it carries over
    # to the connections used during processing.
    con.execute(f"PRAGMA threads={allocated_threads}")

# --- Load Eligible Players ---
# Usernames of the players we want in the analysis; this collection is handed
# to the per-file processing step later in the notebook.
with get_db_connection(player_counts_db_path) as con:
    eligible_players = get_eligible_player_usernames(con)


# --- File Selection ---
# Start from the non-duplicate files found by the dupe-check cell and apply
# the optional testing cap.
files_to_download = (
    non_dupe_files
    if max_files_to_download is None
    else non_dupe_files[:max_files_to_download]
)

print(f"Prepared to download and process {len(files_to_download)} files.")

## Downloading, processing, deleting

Now, we'll download, process, and delete our raw data files one by one.

The workflow is:

1. Download a file
2. Process that file, extracting the game data we want
3. Delete that file

In [None]:
import time
import os
from collections import defaultdict

# --- Timing Stats ---
# Per-step wall-clock durations, keyed by step name
# ('download', 'process', 'register_delete').
timing_stats = defaultdict(list)
file_metrics = []  # To store metrics for each file
total_start_time = time.time()

for i, file_to_download in enumerate(files_to_download):
    # Buffer log lines so everything for one file is printed in a single block.
    log_buffer = []

    log_buffer.append(f"\n--- Downloading [{i+1}/{len(files_to_download)}] ---")
    step_start_time = time.time()

    # 1. Download the file
    downloaded_file_path = download_single_parquet_file(
        repo_id="Lichess/standard-chess-games",
        repo_type="dataset",
        file_to_download=file_to_download,
        local_dir=local_dir,
        year=year,
        month=month,
    )

    timing_stats['download'].append(time.time() - step_start_time)
    # A falsy return value is treated as a failed download: nothing to
    # process or delete, so move on to the next file.
    if not downloaded_file_path:
        log_buffer.append(f"DOWNLOAD FAILED for {file_to_download}. Skipping.")
        print("\n".join(log_buffer))  # Print all logs for this file at once
        continue

    log_buffer.append(f"Successfully downloaded: {downloaded_file_path.name}")

    # Fetch the remote file's metadata; its size and etag are what get
    # recorded in the registry on success.
    url = hf_hub_url(
        repo_id="Lichess/standard-chess-games",
        repo_type="dataset",
        filename=file_to_download,
    )
    meta = get_hf_file_metadata(url=url)

    # 2. Process the file
    step_start_time = time.time()
    file_config = base_config.replace(parquet_path=str(downloaded_file_path))
    file_context = {
        "current_file_num": i + 1,
        "total_files": len(files_to_download),
        "total_start_time": total_start_time,
    }
    is_processing_successful = process_parquet_file(
        config=file_config,
        eligible_players=eligible_players,
        file_context=file_context,
    )
    process_time = time.time() - step_start_time
    timing_stats['process'].append(process_time)

    # NOTE(review): this "games" figure is a rough proxy derived from the file
    # size in bytes (size // 1_000_000), not an actual count of games.
    games_processed = meta.size // 1_000_000  # Approximation based on file size
    games_per_second = games_processed / process_time if process_time > 0 else 0

    # 3. Register on success, then delete the local copy either way
    step_start_time = time.time()
    if is_processing_successful:
        log_buffer.append(f"PROCESSING SUCCESSFUL for {downloaded_file_path.name}")
        registry.mark_file_processed(
            month=f"{year}-{month}",
            filename=downloaded_file_path.name,
            size=meta.size,
            etag=meta.etag,
        )
        log_buffer.append("Registered file as processed.")
    else:
        log_buffer.append(f"PROCESSING FAILED for {downloaded_file_path.name}")
        # Failed files are deliberately NOT registered, so the log must not
        # claim that they were; the dupe check only skips registered files.
        log_buffer.append(
            "File was NOT registered as processed; it remains eligible for download on the next run."
        )
    # The raw file is removed in both outcomes to free disk space.
    os.remove(downloaded_file_path)
    log_buffer.append(f"Deleted local file: {downloaded_file_path.name}")
    timing_stats['register_delete'].append(time.time() - step_start_time)

    # Add file metrics to summary ("total_time" holds the processing time only).
    file_metrics.append({
        "file_name": downloaded_file_path.name,
        "total_time": process_time,
        "games_per_second": games_per_second,
    })

    # Log file-level summary
    log_buffer.append("\n--- File Summary ---")
    log_buffer.append(f"File: {downloaded_file_path.name}, Total Time: {process_time:.2f}s, Games Per Second: {games_per_second:.2f}")

    print("\n".join(log_buffer))  # Print all logs for this file at once

total_elapsed_time = time.time() - total_start_time
print(f"\nTotal elapsed time for all files: {total_elapsed_time:.2f}s") 