# Purpose:
Obtain data on Lichess games, sanitize and filter it, then group and sort it by username.

In [5]:
from typing import TypedDict, Optional

In [10]:
import chess.pgn
import zstandard as zstd
import io

# Location of the zstd-compressed Lichess PGN dump used throughout this notebook
pgn_path = "/Users/a/Documents/personalprojects/chess-opening-recommender/data/raw/lichess_db_standard_rated_2025-07.pgn.zst"

# Stream-decompress the dump and preview its first game as a sanity check
with open(pgn_path, 'rb') as f:
    text_stream = io.TextIOWrapper(
        zstd.ZstdDecompressor().stream_reader(f),
        encoding='utf-8',
    )

    # Only the first game is parsed here; the rest of the stream is left unread
    game = chess.pgn.read_game(text_stream)

    if not game:
        print("No game found in the file.")
    else:
        first_headers = game.headers
        # Event/White/Black/Result are indexed directly; the remaining tags fall back to 'N/A'
        print(f"Event: {first_headers['Event']}")
        print(f"White: {first_headers['White']} (Elo: {first_headers.get('WhiteElo', 'N/A')})")
        print(f"Black: {first_headers['Black']} (Elo: {first_headers.get('BlackElo', 'N/A')})")
        print(f"Result: {first_headers['Result']}")
        print(f"Opening: {first_headers.get('Opening', 'N/A')}")
        print(f"ECO: {first_headers.get('ECO', 'N/A')}")
        print("\nMoves:")
        print(game)

Event: Rated Bullet game
White: my_name_jeff (Elo: 1706)
Black: xxxgrishaxxx (Elo: 1671)
Result: 0-1
Opening: Benoni Defense: Old Benoni
ECO: A43

Moves:
[Event "Rated Bullet game"]
[Site "https://lichess.org/VsUqVhC2"]
[Date "2025.07.01"]
[Round "-"]
[White "my_name_jeff"]
[Black "xxxgrishaxxx"]
[Result "0-1"]
[UTCDate "2025.07.01"]
[UTCTime "00:00:31"]
[WhiteElo "1706"]
[BlackElo "1671"]
[WhiteRatingDiff "-6"]
[BlackRatingDiff "+6"]
[ECO "A43"]
[Opening "Benoni Defense: Old Benoni"]
[TimeControl "60+0"]
[Termination "Time forfeit"]

1. d4 { [%clk 0:01:00] } 1... c5 { [%clk 0:01:00] } 2. e3 { [%clk 0:01:00] } 2... e6 { [%clk 0:00:59] } 3. dxc5 { [%clk 0:00:59] } 3... Bxc5 { [%clk 0:00:58] } 4. Nf3 { [%clk 0:00:59] } 4... Nf6 { [%clk 0:00:57] } 5. c3 { [%clk 0:00:59] } 5... Nc6 { [%clk 0:00:56] } 6. Bb5 { [%clk 0:00:58] } 6... a6 { [%clk 0:00:55] } 7. Bxc6 { [%clk 0:00:57] } 7... bxc6 { [%clk 0:00:55] } 8. O-O { [%clk 0:00:57] } 8... d5 { [%clk 0:00:54] } 9. Nd4 { [%clk 0:00:56] } 9...

In [13]:
# Improved function to read and filter games as we go
def read_filtered_games(file_path, max_games=None, min_rating=1500, 
                        exclude_bullet=True, min_moves=15):
    """Stream a zstd-compressed PGN file and return the games that pass the filters.

    Filtering happens while reading, so rejected games are never accumulated.

    Args:
        file_path: Path to the zstd-compressed PGN file.
        max_games: Stop once this many games have been *kept*. ``None`` (or 0)
            means read until the end of the file.
        min_rating: Both the WhiteElo and BlackElo headers must be at least
            this value. Games whose Elo headers are missing (treated as 0) or
            not numeric are skipped.
        exclude_bullet: When true, skip games whose TimeControl initial time
            parses to fewer than 180 seconds. Unparseable time controls are kept.
        min_moves: Minimum number of mainline steps a game must contain.
            NOTE(review): the loop counts one per mainline node, which looks
            like a half-move count rather than full moves -- confirm intent.

    Returns:
        A list of the parsed game objects that passed every filter.
    """
    # Function-scope import so this cell does not rely on another cell
    # having already imported ``time``.
    import time

    games_read = 0
    games_kept = 0
    filtered_games = []

    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        stream_reader = dctx.stream_reader(f)
        text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8')

        start_time = time.time()

        while True:
            if max_games and games_kept >= max_games:
                break

            # Read the next game; None signals the end of the stream
            game = chess.pgn.read_game(text_stream)
            if game is None:
                break

            games_read += 1

            # Log progress
            if games_read % 5_000 == 0:
                elapsed = time.time() - start_time
                print(f"Read {games_read} games, kept {games_kept} in {elapsed:.2f} seconds")

            headers = dict(game.headers)

            # Filter by rating. A non-numeric Elo (e.g. "?") used to abort the
            # whole read with ValueError; such games are now skipped instead.
            try:
                white_elo = int(headers.get('WhiteElo', 0))
                black_elo = int(headers.get('BlackElo', 0))
            except ValueError:
                continue
            if white_elo < min_rating or black_elo < min_rating:
                continue

            # Filter bullet games (initial clock below 3 minutes)
            if exclude_bullet:
                time_control = headers.get('TimeControl', '')
                if time_control:
                    try:
                        # Common format: initial_time+increment, e.g. 60+1
                        initial_seconds = int(time_control.split('+')[0])
                    except ValueError:
                        # Unparseable time control: deliberately keep the game
                        initial_seconds = None
                    if initial_seconds is not None and initial_seconds < 180:
                        continue

            # Walk the mainline only as far as needed to prove the game is long enough
            move_count = 0
            node = game
            while not node.is_end():
                move_count += 1
                if move_count >= min_moves:
                    break
                node = node.variations[0]

            if move_count < min_moves:
                continue

            # The game passed all filters, keep it
            filtered_games.append(game)
            games_kept += 1

    elapsed = time.time() - start_time
    print(f"Finished reading {games_read} games, kept {games_kept} in {elapsed:.2f} seconds")
    return filtered_games

In [15]:
# Smoke-test the streaming filter, stopping after 50,000 kept games
filtered_games = read_filtered_games(pgn_path, max_games=50000)

# Summarise the first surviving game, if there is one
if filtered_games:
    first_headers = filtered_games[0].headers
    print("\nDetails of first filtered game:")
    print(f"White: {first_headers['White']} (Elo: {first_headers.get('WhiteElo', 'N/A')})")
    print(f"Black: {first_headers['Black']} (Elo: {first_headers.get('BlackElo', 'N/A')})")
    print(f"ECO: {first_headers.get('ECO', 'N/A')}")
    print(f"Opening: {first_headers.get('Opening', 'N/A')}")

Read 5000 games, kept 1764 in 12.00 seconds
Read 10000 games, kept 3520 in 23.73 seconds
Read 10000 games, kept 3520 in 23.73 seconds
Read 15000 games, kept 5256 in 36.64 seconds
Read 15000 games, kept 5256 in 36.64 seconds
Read 20000 games, kept 7026 in 48.89 seconds
Read 20000 games, kept 7026 in 48.89 seconds
Read 25000 games, kept 8817 in 73.96 seconds
Read 25000 games, kept 8817 in 73.96 seconds


KeyboardInterrupt: 

In [7]:
import time

# Function to read multiple games
def read_games(file_path, max_games: int):
    """Read up to ``max_games`` games from a zstd-compressed PGN file.

    No filtering is applied; every parsed game is kept in memory.

    Args:
        file_path: Path to the zstd-compressed PGN file.
        max_games: Upper bound on the number of games to read. Reading stops
            earlier if the file runs out of games.

    Returns:
        A list of the parsed game objects, in file order.
    """
    games = []

    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        stream_reader = dctx.stream_reader(f)
        text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8')

        # Tracking how long this takes because there are a lot of games
        start_time = time.time()

        for games_done in range(1, max_games + 1):
            game = chess.pgn.read_game(text_stream)
            if game is None:
                # End of file: stop before logging so the progress line never
                # counts a game that was not actually read.
                break
            games.append(game)

            if games_done % 5_000 == 0:
                elapsed = time.time() - start_time
                print(f"Processed {games_done} games in {elapsed:.2f} seconds")

    return games

# Read up to 30,000,000 games (all of them are held in memory; progress is printed every 5,000 games)
games = read_games(pgn_path, max_games=30_000_000)

KeyboardInterrupt: 

In [None]:
# Now to filter out games we don't want, and extract only the information we need
# We'll do this bit by bit to make sure we don't mess it up.

import io

# Pull just the header tags out of a parsed game
def extract_game_headers(game):
    """Return a fresh dict copy of ``game.headers``, or None when ``game`` is None."""
    return None if game is None else dict(game.headers)

# Extract headers from the first game as a test
# (indexing games[0] raises IndexError if no games were loaded)
game_headers = extract_game_headers(games[0])
print("Game headers only:")
print(game_headers)

# Process all games to get just the headers
all_game_headers = []
for i, game in enumerate(games, 1):  # walks every loaded game; i is a 1-based index used only by the commented-out debug prints below
    headers = extract_game_headers(game)
    all_game_headers.append(headers)
    # print(f"Game {i} headers:")
    # print(f"  White: {headers['White']} (Elo: {headers.get('WhiteElo', 'N/A')})")
    # print(f"  Black: {headers['Black']} (Elo: {headers.get('BlackElo', 'N/A')})")
    # print(f"  Result: {headers['Result']}")
    # print(f"  ECO: {headers.get('ECO', 'N/A')}")
    # print(f"  Opening: {headers.get('Opening', 'N/A')}")
    # print()
    # Prints one header dict per game, so output grows with len(games)
    print(headers)

# Let's write a function that takes in this list of game headers
# Now to only extract the information we need from the headers
# Event, Date, White, Black, Result, WhiteElo, BlackElo, WhiteRatingDiff, BlackRatingDiff (these two may be redundant), ECO, Opening, TimeControl, Termination (maybe)
# Make an object that extracts only these items and puts it in an object called... idk need a good name

# Return type of below function

class GameInfo(TypedDict):
    """Subset of a game's PGN header tags kept for analysis.

    Every field is typed as ``str``; no numeric conversion is declared here.
    """
    # Game identification
    Event: str
    Date: str
    # Players and outcome
    White: str
    Black: str
    Result: str
    # Ratings (declared as strings, not ints)
    WhiteElo: str
    BlackElo: str
    # Opening classification
    ECO: str
    Opening: str
    # Game conditions and how it ended
    TimeControl: str
    Termination: str
    # Rating changes (declared as strings, not ints)
    WhiteRatingDiff: str
    BlackRatingDiff: str


def extract_relevant_game_info(headers: dict[str, str] | None) -> Optional[GameInfo]:
    """Reduce one game's header dict to the fields this project keeps.

    Returns None when ``headers`` is None. Otherwise returns a dict with the
    thirteen tracked keys, using "N/A" for any key missing from ``headers``.
    """
    if headers is None:
        return None

    def header_or_na(tag: str) -> str:
        # Missing tags are reported as the literal placeholder "N/A"
        return headers.get(tag, "N/A")

    return {
        "Event": header_or_na("Event"),
        "Date": header_or_na("Date"),
        "White": header_or_na("White"),
        "Black": header_or_na("Black"),
        "Result": header_or_na("Result"),
        "WhiteElo": header_or_na("WhiteElo"),
        "BlackElo": header_or_na("BlackElo"),
        "ECO": header_or_na("ECO"),
        "Opening": header_or_na("Opening"),
        "TimeControl": header_or_na("TimeControl"),
        "Termination": header_or_na("Termination"),
        "WhiteRatingDiff": header_or_na("WhiteRatingDiff"),
        "BlackRatingDiff": header_or_na("BlackRatingDiff"),
    }

# Sanity check: report how many games are currently loaded
print("len(games)", len(games))


def extract_relevant_info_from_games(games: list[chess.pgn.Game]) -> list[GameInfo]:
    """Extract the tracked header fields from every game in ``games``.

    Args:
        games: Parsed games to summarise.

    Returns:
        One ``GameInfo`` per game, in input order. Games for which no info
        could be extracted (``extract_relevant_game_info`` returned None)
        are omitted.
    """
    relevant_info_list: list[GameInfo] = []

    # The per-iteration debug print of len(games) was removed: it emitted one
    # line per game, which is unusable on large inputs.
    for game in games:
        relevant_info = extract_relevant_game_info(extract_game_headers(game))
        if relevant_info is not None:
            relevant_info_list.append(relevant_info)

    return relevant_info_list

# Reduce every loaded game to its tracked header fields
games_with_only_relevant_info = extract_relevant_info_from_games(games)
print(f"Extracted relevant info from {len(games_with_only_relevant_info)} games.")
# Show the first entry as a spot check; guarded so an empty result does not raise IndexError
if games_with_only_relevant_info:
    print(games_with_only_relevant_info[0])

In [None]:
# Now to filter games we don't want.

## Next Steps

1. **Data Collection and Preprocessing**:
   - Filter games according to the specified criteria (rated games, not bullet/ultra-bullet, etc.)
   - Group games by username
   - Exclude users with insufficient number of games
   - Structure data for analysis

2. **Feature Engineering**:
   - Extract relevant features from games (opening choices, play style, etc.)
   - Create user profiles based on opening preferences
   - Identify patterns in user opening choices

3. **Model Development**:
   - Create a recommendation system that suggests openings based on user profiles
   - Evaluate model performance
   - Refine the model based on evaluation results

In [None]:
# Training data structure

# Note, I have a copilot chat in history about this

# Stuff we don't care about:
# Specific moves in the game
# Maybe time control? Adds a lot of complexity to training data
#

# Illustrative sketch of the intended per-player structure.
# The numbers are placeholders, not real data (e.g. wins + losses + draws
# does not add up to num_games).
players_stats = {
    "my_username": {
        # Need to decide which TC this is in
        "rating": 1750,
        # Openings played as Black, keyed by ECO code
        "black_games": {
            "opening_eco_code_1": {
                "opening_name": "French Defense",
                "results": {
                    "score_percentage_with_opening": 50,
                    "num_games": 74,
                    "num_wins": 15,
                    "num_losses": 25,
                    "num_draws": 7,
                },
            },
        },
        # Openings played as White, same shape as black_games
        "white_games": {
            "opening_eco_code_1": {
                "opening_name": "French Defense",
                "results": {
                    "score_percentage_with_opening": 50,
                    "num_games": 74,
                    "num_wins": 15,
                    "num_losses": 25,
                    "num_draws": 7,
                },
            },
        },
        "num_games_total": 100,
    },
    "another_username": {
        # ....
    },
}

In [None]:
# How to extract data from PGN

# Data needed:
# Usernames
# Result
# Opening name and ECO code
# Time control - still need to figure out what we do with this, if any

# Exclude:
# Games shorter than a certain number of moves
# Cheat detected - maybe don't bother with this filter, it's rare and adds complexity
# Bullet games, probably
# Correspondence games maybe?

# Possibly:
# Weight games higher if they're Classical etc? Players spend more time per game at slower time controls (and so play fewer of them). Though this adds complexity



In [None]:
# Process all games and organize by player

def _elo_or_zero(raw_elo):
    """Convert an Elo header value to int, falling back to 0 when it is not numeric."""
    try:
        return int(raw_elo)
    except (TypeError, ValueError):
        return 0


def _tally_game_for_player(players_data, player, rating, color_key,
                           eco_code, opening_name, counter_key):
    """Fold one game into ``player``'s stats for one colour and refresh the score."""
    if player not in players_data:
        players_data[player] = {
            # Rating taken from the first game in which the player appears;
            # it is not updated by later games.
            "rating": rating,
            "white_games": {},
            "black_games": {},
            "num_games_total": 0
        }
    entry = players_data[player]

    openings = entry[color_key]
    if eco_code not in openings:
        openings[eco_code] = {
            # Name recorded from the first game seen with this ECO code
            "opening_name": opening_name,
            "results": {
                "num_games": 0,
                "num_wins": 0,
                "num_losses": 0,
                "num_draws": 0,
                "score_percentage_with_opening": 0
            }
        }
    results = openings[eco_code]["results"]

    entry["num_games_total"] += 1
    results["num_games"] += 1
    # counter_key is None for results other than 1-0 / 0-1 / 1/2-1/2:
    # the game is still counted but adds no win, loss or draw.
    if counter_key is not None:
        results[counter_key] += 1

    # Score = (wins + half of the draws) / games, as a percentage
    score = (results["num_wins"] + results["num_draws"] * 0.5) / results["num_games"] * 100
    results["score_percentage_with_opening"] = round(score, 1)


def process_games_by_player(games, max_games=None):
    """Process games and organize them by player username.

    Args:
        games: Sequence of parsed games exposing a ``headers`` mapping.
            ``None`` entries are skipped.
        max_games: If truthy, only the first ``max_games`` games are used.

    Returns:
        Dict keyed by username. Each value holds ``rating`` (from the first
        game seen for that player; 0 when missing or non-numeric),
        ``white_games`` and ``black_games`` (per-ECO opening name and result
        counts with a score percentage), and ``num_games_total``.

    Raises:
        KeyError: If a game lacks the White, Black or Result header.
    """
    # Which counter a Result tag increments, from each side's point of view
    white_counter_for = {"1-0": "num_wins", "0-1": "num_losses", "1/2-1/2": "num_draws"}
    black_counter_for = {"0-1": "num_wins", "1-0": "num_losses", "1/2-1/2": "num_draws"}

    players_data = {}
    selected_games = games[:max_games] if max_games else games

    for game in selected_games:
        if game is None:
            continue
        headers = dict(game.headers)

        white_player = headers['White']
        black_player = headers['Black']
        result = headers['Result']
        eco_code = headers.get('ECO', 'Unknown')
        opening_name = headers.get('Opening', 'Unknown Opening')

        _tally_game_for_player(
            players_data, white_player, _elo_or_zero(headers.get('WhiteElo', 0)),
            "white_games", eco_code, opening_name, white_counter_for.get(result),
        )
        _tally_game_for_player(
            players_data, black_player, _elo_or_zero(headers.get('BlackElo', 0)),
            "black_games", eco_code, opening_name, black_counter_for.get(result),
        )

    return players_data

# Build per-player stats from the first 50 loaded games as a trial run
players_stats_sample = process_games_by_player(games, max_games=50)

# Spot-check one randomly chosen player
import random
if players_stats_sample:
    sample_player = random.choice(list(players_stats_sample))
    sample_stats = players_stats_sample[sample_player]
    print(f"Sample stats for player: {sample_player}")
    print(f"Rating: {sample_stats['rating']}")
    print(f"Total games: {sample_stats['num_games_total']}")
    # Same report for each colour: one line per ECO code the player has games in
    for heading, color_key in (("White", "white_games"), ("Black", "black_games")):
        print(f"\n{heading} openings:")
        for eco, data in sample_stats[color_key].items():
            results = data['results']
            print(f"  {eco} - {data['opening_name']}: {results['score_percentage_with_opening']}% score in {results['num_games']} games")

Sample stats for player: ghridjvw
Rating: 1588
Total games: 1

White openings:

Black openings:
  A00 - Hungarian Opening: Dutch Defense: 0.0% score in 1 games


In [None]:
import multiprocessing
import json
import os
import pickle
from typing import Dict, List, Set, Tuple, TypedDict, Optional, Any, Union
from datetime import datetime

# Define types for better IDE support.
# Declared innermost-first: each class body evaluates its annotations when it
# runs, so a type must exist before another TypedDict can reference it.
# (PlayerStats was previously declared before OpeningStats, which raises
# NameError on interpreters that evaluate annotations eagerly.)
class OpeningResults(TypedDict):
    """Type for storing results for a specific opening"""
    num_games: int
    num_wins: int
    num_losses: int
    num_draws: int
    score_percentage_with_opening: float

class OpeningStats(TypedDict):
    """Type for storing stats about an opening"""
    opening_name: str
    results: OpeningResults

class PlayerStats(TypedDict):
    """Type for storing a player's statistics"""
    rating: int
    # Both mappings are keyed by a string identifying the opening
    white_games: Dict[str, OpeningStats]
    black_games: Dict[str, OpeningStats]
    num_games_total: int

class ProcessingConfig(TypedDict):
    """Tunable filters and bookkeeping settings for the game-processing pass.

    All keys are required by this declaration; only ``max_players`` may be None.
    """
    min_rating: int        # Minimum player rating to include
    min_moves: int         # Minimum number of moves in a game
    exclude_bullet: bool   # Whether to exclude bullet games
    min_games_per_player: int  # Minimum number of games a player must have to be included
    save_interval: int     # How often to save intermediate results (in games processed)
    max_players: Optional[int]  # Maximum number of players to track (for memory management); None means no limit

# Default processing settings, used when the caller supplies no config of its own
config: ProcessingConfig = ProcessingConfig(
    # Both players must be rated at least 1500
    min_rating=1500,
    # Games with fewer than 15 moves are excluded
    min_moves=15,
    # Bullet games (typically < 3 min) are excluded
    exclude_bullet=True,
    # Players with fewer than 5 games are dropped
    min_games_per_player=5,
    # Intermediate results are saved every 50,000 games processed
    save_interval=50_000,
    # No cap on tracked players; set to e.g. 10000 to conserve memory
    max_players=None,
)

# Function to process games in chunks and continuously update player stats

# Which result counter a PGN Result tag increments, from each side's point of view.
# Any other Result value (e.g. "*") counts as a game but as no win/loss/draw.
_WHITE_RESULT_COUNTER = {"1-0": "num_wins", "0-1": "num_losses", "1/2-1/2": "num_draws"}
_BLACK_RESULT_COUNTER = {"0-1": "num_wins", "1-0": "num_losses", "1/2-1/2": "num_draws"}


def _ratings_if_eligible(headers: Dict[str, str], min_rating: int) -> Optional[Tuple[int, int]]:
    """Return (white_elo, black_elo), or None if either is non-numeric or below ``min_rating``."""
    try:
        white_elo = int(headers.get('WhiteElo', 0))
        black_elo = int(headers.get('BlackElo', 0))
    except ValueError:
        # Skip games with invalid ratings
        return None
    if white_elo < min_rating or black_elo < min_rating:
        return None
    return white_elo, black_elo


def _is_bullet_time_control(time_control: str) -> bool:
    """Return True when the initial clock time parses to fewer than 180 seconds."""
    if not time_control:
        return False
    try:
        # Common format: initial_time+increment, e.g. 60+1
        return int(time_control.split('+')[0]) < 180
    except ValueError:
        # If we can't parse the time control, keep the game
        return False


def _reaches_min_length(game, min_moves: int) -> bool:
    """Return True if the game's mainline has at least ``min_moves`` steps.

    Walks only as far as needed. NOTE(review): one step is counted per
    mainline node, which looks like a half-move count -- confirm intent.
    """
    move_count = 0
    node = game
    while not node.is_end():
        move_count += 1
        if move_count >= min_moves:
            break
        node = node.variations[0]
    return move_count >= min_moves


def _empty_player_entry(rating: int) -> PlayerStats:
    """Return a fresh stats entry for a player first seen with ``rating``."""
    return {
        "rating": rating,
        "white_games": {},
        "black_games": {},
        "num_games_total": 0
    }


def _record_result_for_color(player_entry: PlayerStats, color_key: str, eco_code: str,
                             opening_name: str, counter_key: Optional[str]) -> None:
    """Add one game to a player's per-opening stats for one colour and refresh the score."""
    openings = player_entry[color_key]
    if eco_code not in openings:
        openings[eco_code] = {
            "opening_name": opening_name,
            "results": {
                "num_games": 0,
                "num_wins": 0,
                "num_losses": 0,
                "num_draws": 0,
                "score_percentage_with_opening": 0
            }
        }
    results = openings[eco_code]["results"]

    player_entry["num_games_total"] += 1
    results["num_games"] += 1
    if counter_key is not None:
        results[counter_key] += 1

    # Score = (wins + half of the draws) / games, as a percentage
    score = (results["num_wins"] + results["num_draws"] * 0.5) / results["num_games"] * 100
    results["score_percentage_with_opening"] = round(score, 1)


def _write_progress_checkpoint(output_dir: str, save_data: Dict[str, Any]) -> None:
    """Write the resumable pickle checkpoint plus a JSON snapshot of ``save_data``."""
    checkpoint_path = os.path.join(output_dir, "players_data_checkpoint.pkl")
    # Write to a temp file and rename, so an interrupted write cannot leave a
    # truncated checkpoint in place of the previous good one.
    tmp_path = checkpoint_path + ".tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(save_data, f)
    os.replace(tmp_path, checkpoint_path)

    # Also save a JSON snapshot (more portable but larger)
    json_path = os.path.join(output_dir, f"players_data_snapshot_{save_data['games_kept']}.json")
    with open(json_path, 'w') as f:
        # Convert the set to a list for JSON serialization
        save_data_json = save_data.copy()
        save_data_json['processed_combinations'] = list(save_data_json['processed_combinations'])
        json.dump(save_data_json, f)


def process_filtered_games_to_players(file_path: str, 
                                      output_dir: str = "processed_data",
                                      config: ProcessingConfig = config,
                                      max_games: Optional[int] = None) -> Dict[str, PlayerStats]:
    """
    Process games from a PGN file, filtering as we go, and building player statistics.
    Periodically saves results to avoid losing progress.

    If ``output_dir`` contains ``players_data_checkpoint.pkl``, its state is
    loaded and the games it already counted are skipped in the input stream
    before processing continues, so resumed runs do not double-count.

    Args:
        file_path: Path to the zstd-compressed PGN file
        output_dir: Directory for checkpoints, JSON snapshots and the final pickle
        config: Configuration parameters. ``max_players`` is a soft cap: once
            reached, games between two unknown players are ignored, but a game
            with one known player still adds the other. A ``save_interval``
            of 0 disables periodic checkpoints.
        max_games: Maximum number of games to keep (None for all)

    Returns:
        Dictionary of player statistics, restricted to players with at least
        ``config['min_games_per_player']`` games. The same dictionary is
        pickled to ``players_data_final.pkl`` in ``output_dir``.

    Note:
        Any exception raised while reading or processing is caught, reported,
        and the state so far is written to ``players_data_error_<n>.pkl``; the
        function then continues to the final filtering and save steps.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Initialize counters and storage
    games_read = 0
    games_kept = 0
    players_data: Dict[str, PlayerStats] = {}

    # (player, color, eco_code) combinations seen so far. Collected and
    # checkpointed, but not used for filtering anywhere in this function.
    processed_combinations: Set[Tuple[str, str, str]] = set()

    # Check if we have a checkpoint to resume from
    checkpoint_path = os.path.join(output_dir, "players_data_checkpoint.pkl")
    if os.path.exists(checkpoint_path):
        try:
            print(f"Found checkpoint at {checkpoint_path}, attempting to load...")
            with open(checkpoint_path, 'rb') as f:
                # NOTE: pickle.load can execute arbitrary code; only point
                # output_dir at checkpoints this notebook wrote itself.
                checkpoint_data = pickle.load(f)
            # Read everything into temporaries first so a malformed
            # checkpoint cannot leave half-restored state behind.
            restored_players = checkpoint_data.get('players_data', {})
            restored_read = checkpoint_data.get('games_read', 0)
            restored_kept = checkpoint_data.get('games_kept', 0)
            restored_combinations = checkpoint_data.get('processed_combinations', set())
        except Exception as e:
            print(f"Error loading checkpoint: {e}. Starting fresh.")
        else:
            players_data = restored_players
            games_read = restored_read
            games_kept = restored_kept
            processed_combinations = restored_combinations
            print(f"Resumed from checkpoint: {games_read} games read, {games_kept} kept, {len(players_data)} players")

    # Start timing
    start_time = time.time()

    try:
        with open(file_path, 'rb') as f:
            # Set up decompression
            dctx = zstd.ZstdDecompressor()
            stream_reader = dctx.stream_reader(f)
            text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8')

            # Skip games we've already processed if resuming. The compressed
            # stream cannot be seeked, so the games are consumed one by one
            # without being fully parsed.
            if games_read > 0:
                print(f"Skipping {games_read} already processed games...")
                for _ in range(games_read):
                    if not chess.pgn.skip_game(text_stream):
                        break  # file is shorter than the checkpoint claims

            while True:
                # Check if we've reached max_games
                if max_games and games_kept >= max_games:
                    break

                # Read the next game
                game = chess.pgn.read_game(text_stream)

                # Break if no more games
                if game is None:
                    break

                games_read += 1

                # Log progress at regular intervals
                if games_read % 5_000 == 0:
                    elapsed = time.time() - start_time
                    rate = games_read / elapsed if elapsed > 0 else 0
                    print(f"Read {games_read} games, kept {games_kept} in {elapsed:.2f} seconds ({rate:.2f} games/sec)")

                # Apply filters to see if we should keep this game
                headers = dict(game.headers)

                ratings = _ratings_if_eligible(headers, config['min_rating'])
                if ratings is None:
                    continue
                white_elo, black_elo = ratings

                if config['exclude_bullet'] and _is_bullet_time_control(headers.get('TimeControl', '')):
                    continue

                if not _reaches_min_length(game, config['min_moves']):
                    continue

                # Game passed all filters, now extract the info we need
                white_player = headers['White']
                black_player = headers['Black']
                result = headers['Result']
                eco_code = headers.get('ECO', 'Unknown')
                opening_name = headers.get('Opening', 'Unknown Opening')

                # Apply player limits if configured
                if config['max_players'] and len(players_data) >= config['max_players']:
                    # Only add games that involve at least one player we already know
                    if white_player not in players_data and black_player not in players_data:
                        continue

                # Rating is recorded from the first kept game a player appears in
                if white_player not in players_data:
                    players_data[white_player] = _empty_player_entry(white_elo)
                if black_player not in players_data:
                    players_data[black_player] = _empty_player_entry(black_elo)

                _record_result_for_color(
                    players_data[white_player], "white_games", eco_code, opening_name,
                    _WHITE_RESULT_COUNTER.get(result),
                )
                _record_result_for_color(
                    players_data[black_player], "black_games", eco_code, opening_name,
                    _BLACK_RESULT_COUNTER.get(result),
                )

                # Track that we've processed this game
                games_kept += 1
                processed_combinations.add((white_player, "white", eco_code))
                processed_combinations.add((black_player, "black", eco_code))

                # Periodically save progress
                if config['save_interval'] and games_kept % config['save_interval'] == 0:
                    _write_progress_checkpoint(output_dir, {
                        'players_data': players_data,
                        'games_read': games_read,
                        'games_kept': games_kept,
                        'processed_combinations': processed_combinations,
                        'timestamp': datetime.now().isoformat()
                    })
                    print(f"Saved checkpoint after {games_kept} games")

    except Exception as e:
        print(f"Error during processing: {e}")
        # Save what we have so far
        save_data = {
            'players_data': players_data,
            'games_read': games_read,
            'games_kept': games_kept,
            'processed_combinations': processed_combinations,
            'timestamp': datetime.now().isoformat(),
            'error': str(e)
        }

        error_checkpoint_path = os.path.join(output_dir, f"players_data_error_{games_kept}.pkl")
        with open(error_checkpoint_path, 'wb') as f:
            pickle.dump(save_data, f)

        print(f"Saved error checkpoint to {error_checkpoint_path}")

    # Final stats
    elapsed_total = time.time() - start_time
    final_rate = games_read / elapsed_total if elapsed_total > 0 else 0
    print(f"Processing complete. Read {games_read} games, kept {games_kept} in {elapsed_total:.2f} seconds")
    print(f"Processing rate: {final_rate:.2f} games/sec")
    print(f"Total players tracked: {len(players_data)}")

    # Filter out players with too few games (optional)
    if config['min_games_per_player'] > 0:
        before_count = len(players_data)
        players_data = {player: data for player, data in players_data.items() 
                        if data['num_games_total'] >= config['min_games_per_player']}
        print(f"Filtered players by minimum {config['min_games_per_player']} games: {before_count} → {len(players_data)}")

    # Save final results
    final_path = os.path.join(output_dir, "players_data_final.pkl")
    with open(final_path, 'wb') as f:
        pickle.dump(players_data, f)

    print(f"Saved final results to {final_path}")
    return players_data

# Helper function to load saved results
def load_players_data(filepath: str) -> Dict[str, PlayerStats]:
    """Read pickled player statistics back from disk.

    Two layouts are accepted: a wrapper dict that keeps the statistics under
    a ``'players_data'`` key, or the statistics object pickled directly.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The value stored under ``'players_data'`` when the unpickled object
        is a dict containing that key; otherwise the unpickled object itself.
    """
    # pickle can execute arbitrary code while loading, so only point this at
    # files you trust.
    with open(filepath, 'rb') as handle:
        payload = pickle.load(handle)

    is_wrapped = isinstance(payload, dict) and 'players_data' in payload
    return payload['players_data'] if is_wrapped else payload

# Function to print a summary of player data that is already loaded in memory
def analyze_players_data(players_data: Dict[str, PlayerStats], top_n: int = 10) -> None:
    """Print summary statistics about the player data.

    Reports the number of players, the total game count, the ``top_n``
    players by number of games, and the ``top_n`` most played ECO codes for
    White and for Black.

    Args:
        players_data: Mapping of player name to that player's statistics.
            Each entry is read for ``num_games_total``, ``rating``,
            ``white_games`` and ``black_games``.
        top_n: Number of rows to print in each ranking.
    """
    print(f"Total players: {len(players_data)}")

    # Each game is expected to appear under both of its players, so the
    # per-player sum is halved.
    # NOTE(review): this is only exact when both players of every game are
    # present in players_data; confirm whether filtered-out players matter.
    total_games = sum(player_data['num_games_total'] for player_data in players_data.values()) // 2
    print(f"Total unique games: {total_games}")

    # Players with the most games first
    players_by_games = sorted(players_data.items(), key=lambda x: x[1]['num_games_total'], reverse=True)
    print(f"\nTop {top_n} players by number of games:")
    for i, (player, data) in enumerate(players_by_games[:top_n], 1):
        print(f"{i}. {player}: {data['num_games_total']} games, rating: {data['rating']}")

    # Both colors share one implementation so the two reports cannot drift apart
    _print_top_openings(players_data, 'white_games', 'White', top_n)
    _print_top_openings(players_data, 'black_games', 'Black', top_n)


def _print_top_openings(players_data: Dict[str, PlayerStats], color_key: str,
                        color_label: str, top_n: int) -> None:
    """Print the ``top_n`` most played ECO codes stored under ``color_key``."""
    game_counts: Dict[str, int] = {}
    opening_names: Dict[str, str] = {}
    for player_data in players_data.values():
        for eco, opening_data in player_data[color_key].items():
            game_counts[eco] = game_counts.get(eco, 0) + opening_data['results']['num_games']
            # Keep the name from the first player seen with this ECO code, so
            # no second scan over all players is needed when printing.
            if eco not in opening_names:
                opening_names[eco] = opening_data['opening_name']

    ranked = sorted(game_counts.items(), key=lambda item: item[1], reverse=True)
    print(f"\nTop {top_n} openings for {color_label}:")
    for i, (eco, count) in enumerate(ranked[:top_n], 1):
        print(f"{i}. {eco} - {opening_names[eco]}: {count} games")

In [None]:
# Smoke test: run the optimized processing function end to end on a small sample.

# Settings for the sample run
test_config: ProcessingConfig = {
    "min_rating": 1500,
    "min_moves": 15,
    "exclude_bullet": True,
    "min_games_per_player": 2,  # low threshold so a tiny sample still passes the filter
    "save_interval": 100,       # save more often than a full run would
    "max_players": None,
}

# Processed data is written here; create the directory when it is missing
output_dir = "/Users/a/Documents/personalprojects/chess-opening-recommender/data/processed"
os.makedirs(output_dir, exist_ok=True)

# Cap the run at 100 games, then print a summary of the result
test_players_data = process_filtered_games_to_players(
    pgn_path, output_dir=output_dir, config=test_config, max_games=100
)
analyze_players_data(test_players_data)

In [None]:
# Advanced: Parallel processing implementation
# This splits the file into chunks and processes them in parallel

def split_pgn_file(file_path: str, chunk_size: int = 1000000, output_dir: str = "split_pgn") -> List[str]:
    """
    Split a large compressed PGN file into smaller chunks for parallel processing.

    The input is streamed line by line. Every line that starts with
    ``[Event `` is treated as the first line of a new game, and a new chunk
    file is started once the current one already holds ``chunk_size`` games.
    Games are copied verbatim and never split across chunks. Chunk files are
    written uncompressed as ``chunk_<n>.pgn``.

    Args:
        file_path: Path to the large zstd-compressed PGN file
        chunk_size: Maximum number of games per chunk (must be at least 1)
        output_dir: Directory to store the chunks

    Returns:
        List of paths to the chunk files, in the order they were written.
        Empty when the input contains no games.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    os.makedirs(output_dir, exist_ok=True)
    chunk_files: List[str] = []

    chunk_file = None  # currently open output file, if any
    chunk_num = 0
    games_in_chunk = 0

    try:
        with open(file_path, 'rb') as f:
            dctx = zstd.ZstdDecompressor()
            # Closing the text wrapper also closes the decompression reader.
            with io.TextIOWrapper(dctx.stream_reader(f), encoding='utf-8') as text_stream:
                for line in text_stream:
                    if line.startswith('[Event '):
                        # A new game begins here: roll over first if the
                        # current chunk is already full, so the header lands
                        # in the same file as the rest of its game.
                        if chunk_file is not None and games_in_chunk >= chunk_size:
                            chunk_file.close()
                            chunk_file = None
                            print(f"Created chunk {chunk_num} with {games_in_chunk} games")
                            chunk_num += 1

                        if chunk_file is None:
                            chunk_path = os.path.join(output_dir, f"chunk_{chunk_num}.pgn")
                            chunk_file = open(chunk_path, 'w', encoding='utf-8')
                            chunk_files.append(chunk_path)
                            games_in_chunk = 0

                        games_in_chunk += 1
                        if games_in_chunk % 1000 == 0:
                            print(f"Added {games_in_chunk} games to chunk {chunk_num}")

                    # Lines that precede the very first game header belong to
                    # no game and are skipped.
                    if chunk_file is not None:
                        chunk_file.write(line)

        if chunk_file is not None:
            print(f"Created chunk {chunk_num} with {games_in_chunk} games")
    finally:
        # Make sure the last chunk is flushed and closed even on errors.
        if chunk_file is not None:
            chunk_file.close()

    return chunk_files

# Note: The splitting function is complex and might need refinement for production use
# Instead of implementing a full parallel system here, you could use a simpler approach:

def process_game_chunk(args):
    """
    Worker entry point: run the single-process pipeline for one chunk id.

    Designed to be used with multiprocessing, which is why all inputs arrive
    packed in a single tuple.

    Args:
        args: Tuple of ``(file_path, chunk_id, config, max_games)``.

    Returns:
        The value returned by ``process_filtered_games_to_players`` for this run.

    NOTE(review): ``chunk_id`` only selects the output directory
    (``chunk_<id>``). It does not select a region of the file, so workers
    given the same ``file_path`` are handed the same input. Confirm whether
    that is intended.
    """
    file_path, chunk_id, config, max_games = args

    return process_filtered_games_to_players(
        file_path,
        output_dir=f"chunk_{chunk_id}",
        config=config,
        max_games=max_games,
    )

def parallel_process_pgn(file_path: str, num_chunks: int = 4,
                         config: Optional[ProcessingConfig] = None,
                         max_games_per_chunk: Optional[int] = None) -> Dict[str, PlayerStats]:
    """
    Process a PGN file in parallel by dividing the work among multiple processes.

    Starts ``num_chunks`` worker processes that each call
    ``process_game_chunk``, merges the per-worker player statistics into one
    mapping, pickles that mapping to ``combined_results.pkl`` in the current
    working directory, and returns it.

    This is a simplified approach - in a production system, you would want to
    actually split the file or use more sophisticated methods to divide the work.

    NOTE(review): every worker receives the same ``file_path`` and the same
    ``max_games_per_chunk``; nothing here assigns workers different parts of
    the file, so the merged counts likely include the same games once per
    worker. Confirm before relying on the totals.
    NOTE(review): ``config`` is passed to the workers unchanged, so any
    ``min_games_per_player`` filtering happens per worker, before merging.
    NOTE(review): with the "spawn" start method, functions defined in a
    notebook cell may not be importable by worker processes; verify that
    ``process_game_chunk`` can be pickled in your environment.

    Args:
        file_path: Path to the PGN file
        num_chunks: Number of parallel processes to use
        config: Processing configuration; a default is used when None
        max_games_per_chunk: Max games to process per chunk

    Returns:
        Combined player statistics
    """
    if config is None:
        config = {
            "min_rating": 1500,
            "min_moves": 15,
            "exclude_bullet": True,
            "min_games_per_player": 5,
            "save_interval": 10000,
            "max_players": None
        }

    # Create arguments for each worker
    args_list = [(file_path, i, config, max_games_per_chunk) for i in range(num_chunks)]

    # Process in parallel
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.map(process_game_chunk, args_list)

    # Merge results: the first occurrence of a player is adopted as-is, later
    # occurrences are folded into it.
    combined_data: Dict[str, PlayerStats] = {}
    for chunk_result in results:
        for player, player_data in chunk_result.items():
            merged = combined_data.get(player)
            if merged is None:
                combined_data[player] = player_data
                continue

            merged["num_games_total"] += player_data["num_games_total"]
            for color_key in ("white_games", "black_games"):
                _merge_opening_results(merged[color_key], player_data[color_key])

    # Save combined results
    with open("combined_results.pkl", 'wb') as f:
        pickle.dump(combined_data, f)

    return combined_data


def _merge_opening_results(target: dict, incoming: dict) -> None:
    """Fold the per-ECO results in ``incoming`` into ``target`` in place."""
    for eco, eco_data in incoming.items():
        if eco not in target:
            target[eco] = eco_data
            continue

        totals = target[eco]["results"]
        for field in ("num_games", "num_wins", "num_losses", "num_draws"):
            totals[field] += eco_data["results"][field]

        # Recalculate score percentage from the merged counts (a draw counts
        # as half a win).
        total = totals["num_games"]
        score = (totals["num_wins"] + (totals["num_draws"] * 0.5)) / total * 100 if total > 0 else 0
        totals["score_percentage_with_opening"] = round(score, 1)

# Note: Parallelization is powerful but adds complexity. For initial development, the single-process
# approach with checkpointing might be simpler and sufficient.

In [None]:
# Full production run configuration
# Uncomment and run this when you're ready for a full processing run

# # Configure for a production run
# production_config: ProcessingConfig = {
#     "min_rating": 1800,        # Only include games with both players rated at least 1800
#     "min_moves": 15,          # Exclude very short games
#     "exclude_bullet": True,    # Exclude bullet games
#     "min_games_per_player": 10, # Only include players with at least 10 games
#     "save_interval": 50000,    # Save checkpoints every 50K games
#     "max_players": 50000      # Limit memory usage by tracking at most 50K players
# }

# # Process the entire file
# players_data = process_filtered_games_to_players(
#     pgn_path, 
#     output_dir="/Users/a/Documents/personalprojects/chess-opening-recommender/data/processed",
#     config=production_config
# )

# # Or run with parallel processing (advanced)
# # players_data = parallel_process_pgn(
# #     pgn_path,
# #     num_chunks=4,  # Use 4 parallel processes
# #     config=production_config
# # )

# # Analyze the final results
# analyze_players_data(players_data, top_n=20)