In [None]:
import zstandard as zstd

input_path = "../../lichess_db_standard_rated_2025-12.pgn.zst"
output_path = "../../lichess_db_standard_rated_2025-12.pgn"

CHUNK_SIZE = 1024 * 1024 * 64  # 64 MB

# Stream-decompress the archive to disk one CHUNK_SIZE read at a time, so only
# a single chunk of decompressed data is held in memory at once.
with open(input_path, "rb") as compressed, open(output_path, "wb") as decompressed:
    dctx = zstd.ZstdDecompressor()
    # The context manager closes the reader even if a read or write fails.
    # read_across_frames=True keeps decoding when the archive is made of
    # several concatenated zstd frames (by default reading stops after the
    # first frame); it changes nothing for a single-frame file.
    with dctx.stream_reader(compressed, read_across_frames=True) as reader:
        # read() returns b"" at end of stream, which terminates the iterator.
        for chunk in iter(lambda: reader.read(CHUNK_SIZE), b""):
            decompressed.write(chunk)

print("Extraction completed safely")


In [None]:
import zstandard as zstd
import chess.pgn
import io
import os
from tqdm import tqdm
import pandas as pd

def process_game(game):
    """Convert one parsed PGN game into a flat record, or reject it.

    Parameters
    ----------
    game : object or None
        A parsed game exposing ``headers`` (with a dict-like ``get``) and
        ``mainline()`` yielding nodes whose ``move`` has a ``uci()`` method.
        ``None`` signals that the reader has run out of games.

    Returns
    -------
    str, None or dict
        * ``"STOP"`` when ``game`` is ``None``.
        * ``None`` when the game is filtered out: an Elo header that is not an
          integer, either player rated below 2000, or a ``TimeControl`` whose
          base time is missing, malformed, or outside 600-1800 seconds.
        * Otherwise a dict of selected headers plus the mainline moves in UCI
          notation (``moves_uci``, space separated) and their count
          (``num_moves``).
    """
    if game is None:
        return "STOP"

    headers = game.headers

    # --- rating filter ---
    # Non-numeric ratings cannot be compared against the threshold, so the
    # game is skipped rather than allowed to abort the whole scan.
    try:
        white_elo = int(headers.get("WhiteElo", 0))
        black_elo = int(headers.get("BlackElo", 0))
    except (TypeError, ValueError):
        return None

    if white_elo < 2000 or black_elo < 2000:
        return None

    # --- time-control filter ---
    # The value is expected to look like "<base seconds>+<increment>"; a value
    # without "+" is treated as base time 0 and therefore rejected below.
    # NOTE(review): this filters on base time only. Lichess is understood to
    # classify speed from base + 40 * increment, so this range may not match
    # its "rapid" category exactly - confirm the intended definition.
    time_control = headers.get("TimeControl") or ""
    try:
        base_time = int(time_control.split("+")[0]) if "+" in time_control else 0
    except ValueError:
        # Malformed base time (e.g. "abc+0"): reject instead of raising.
        return None
    if not (600 <= base_time <= 1800):
        return None

    # --- collect moves ---
    # Done last so the mainline is only walked for games that pass the filters.
    moves = [node.move.uci() for node in game.mainline()]

    return {
        "link": headers.get("Site"),
        "white": headers.get("White"),
        "black": headers.get("Black"),
        "white_titled": headers.get("WhiteTitle"),
        "black_titled": headers.get("BlackTitle"),
        "white_elo": white_elo,
        "black_elo": black_elo,
        "result": headers.get("Result"),
        "white_rating_diff": headers.get("WhiteRatingDiff"),
        "black_rating_diff": headers.get("BlackRatingDiff"),
        "time_control": time_control,
        "termination": headers.get("Termination"),
        "eco": headers.get("ECO"),
        "opening": headers.get("Opening"),
        "moves_uci": " ".join(moves),
        "num_moves": len(moves),
    }



# NOTE(review): the extraction cell reads the same archive from "../../";
# confirm which relative location is correct for this notebook.
file_path = "../lichess_db_standard_rated_2025-12.pgn.zst"

TOTAL_GAMES = 94_847_276  # used only as the progress-bar total
TOTAL_CSV = 100_000       # stop after this many accepted games
BATCH_SIZE = 2_000        # accepted games buffered between CSV writes
CSV_FILE = "lichess_rapid_elo2000.csv"

rows = []
collected_count = 0


def _append_rows(batch, csv_path):
    """Append ``batch`` (a list of record dicts) to ``csv_path``.

    The header row is written only when the file does not exist yet.
    """
    pd.DataFrame(batch).to_csv(
        csv_path,
        mode="a",
        header=not os.path.exists(csv_path),
        index=False,
    )


# The scan always restarts from the first game of the archive, so rows left by
# a previous run would be duplicated by appending. Start from a clean file.
if os.path.exists(CSV_FILE):
    os.remove(CSV_FILE)

with open(file_path, "rb") as f:
    dctx = zstd.ZstdDecompressor()
    # read_across_frames=True keeps decoding if the archive consists of several
    # concatenated zstd frames; it changes nothing for a single-frame file.
    # Closing the text wrapper on exit also closes the decompression stream.
    with io.TextIOWrapper(
        dctx.stream_reader(f, read_across_frames=True), encoding="utf-8"
    ) as text_stream:

        with tqdm(total=TOTAL_GAMES, desc="Processing PGN", smoothing=0.05, miniters=100) as pbar:

            while True:
                game = chess.pgn.read_game(text_stream)
                result = process_game(game)

                if game is None or result == "STOP":
                    print("Game is None")
                    break

                pbar.update(1)

                if result is None:
                    # Game was filtered out.
                    continue

                rows.append(result)
                collected_count += 1

                # save checkpoint
                if len(rows) >= BATCH_SIZE:
                    _append_rows(rows, CSV_FILE)
                    rows.clear()

                if collected_count >= TOTAL_CSV:
                    print("Target CSV rows reached.")
                    break

# Write whatever is still buffered: the loop can end (end of archive, or a
# TOTAL_CSV that is not a multiple of BATCH_SIZE) with a partial batch.
if rows:
    _append_rows(rows, CSV_FILE)
    rows.clear()


if os.path.exists(CSV_FILE):
    df = pd.read_csv(CSV_FILE)
    print(df.head())
    print(f"\nTotal games collected: {len(df)}")
else:
    print("No games matched the filters; no CSV was written.")
