# Purpose:
Obtain data on Lichess games, sanitize it, and group and sort it by username.

In [8]:
import chess.pgn
import zstandard as zstd
import io

# Location of the zstd-compressed PGN file (machine-specific absolute path)
pgn_path = "/Users/a/Documents/personalprojects/chess-opening-recommender/data/raw/lichess_db_standard_rated_2025-07.pgn.zst"

# Stream-decompress the file and parse only its first game as a smoke test
with open(pgn_path, "rb") as compressed_file:
    decompressor = zstd.ZstdDecompressor()
    # Wrap the binary decompression stream so the PGN parser receives text
    pgn_text = io.TextIOWrapper(
        decompressor.stream_reader(compressed_file),
        encoding="utf-8",
    )

    # Parse exactly one game from the start of the stream
    game = chess.pgn.read_game(pgn_text)

    if not game:
        print("No game found in the file.")
    else:
        # Show the key headers first, then the full PGN text of the game
        headers = game.headers
        print(f"Event: {headers['Event']}")
        print(f"White: {headers['White']} (Elo: {headers.get('WhiteElo', 'N/A')})")
        print(f"Black: {headers['Black']} (Elo: {headers.get('BlackElo', 'N/A')})")
        print(f"Result: {headers['Result']}")
        print(f"Opening: {headers.get('Opening', 'N/A')}")
        print(f"ECO: {headers.get('ECO', 'N/A')}")
        print("\nMoves:")
        print(game)

Event: Rated Bullet game
White: my_name_jeff (Elo: 1706)
Black: xxxgrishaxxx (Elo: 1671)
Result: 0-1
Opening: Benoni Defense: Old Benoni
ECO: A43

Moves:
[Event "Rated Bullet game"]
[Site "https://lichess.org/VsUqVhC2"]
[Date "2025.07.01"]
[Round "-"]
[White "my_name_jeff"]
[Black "xxxgrishaxxx"]
[Result "0-1"]
[UTCDate "2025.07.01"]
[UTCTime "00:00:31"]
[WhiteElo "1706"]
[BlackElo "1671"]
[WhiteRatingDiff "-6"]
[BlackRatingDiff "+6"]
[ECO "A43"]
[Opening "Benoni Defense: Old Benoni"]
[TimeControl "60+0"]
[Termination "Time forfeit"]

1. d4 { [%clk 0:01:00] } 1... c5 { [%clk 0:01:00] } 2. e3 { [%clk 0:01:00] } 2... e6 { [%clk 0:00:59] } 3. dxc5 { [%clk 0:00:59] } 3... Bxc5 { [%clk 0:00:58] } 4. Nf3 { [%clk 0:00:59] } 4... Nf6 { [%clk 0:00:57] } 5. c3 { [%clk 0:00:59] } 5... Nc6 { [%clk 0:00:56] } 6. Bb5 { [%clk 0:00:58] } 6... a6 { [%clk 0:00:55] } 7. Bxc6 { [%clk 0:00:57] } 7... bxc6 { [%clk 0:00:55] } 8. O-O { [%clk 0:00:57] } 8... d5 { [%clk 0:00:54] } 9. Nd4 { [%clk 0:00:56] } 9...

In [9]:
# Function to read multiple games
def read_games(file_path, max_games=10):
    """Read games from the start of a zstd-compressed PGN file.

    Args:
        file_path: Path to the ``.pgn.zst`` file to read.
        max_games: Maximum number of games to read. ``None`` reads until the
            end of the file; zero or a negative value yields an empty list.

    Returns:
        A list of the objects returned by ``chess.pgn.read_game``, in file
        order. Every game is kept in memory, so use ``max_games=None`` with
        care on large files.
    """
    games = []

    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        stream_reader = dctx.stream_reader(f)
        # Closing the text wrapper on exit also closes the decompression
        # stream it wraps, instead of leaving that to garbage collection.
        with io.TextIOWrapper(stream_reader, encoding='utf-8') as text_stream:
            while max_games is None or len(games) < max_games:
                game = chess.pgn.read_game(text_stream)
                if game is None:
                    # The parser signals end of input with None.
                    break
                games.append(game)

    return games

# Load a handful of games to sanity-check the reader
games = read_games(pgn_path, max_games=5)

# Summarize players, ratings, result and ECO code, one numbered entry per game
for i, game in enumerate(games, start=1):
    headers = game.headers
    print(f"Game {i}:")
    print(f"  White: {headers['White']} (Elo: {headers.get('WhiteElo', 'N/A')})")
    print(f"  Black: {headers['Black']} (Elo: {headers.get('BlackElo', 'N/A')})")
    print(f"  Result: {headers['Result']}")
    print(f"  ECO: {headers.get('ECO', 'N/A')}")
    # Blank line between entries
    print()

Game 1:
  White: my_name_jeff (Elo: 1706)
  Black: xxxgrishaxxx (Elo: 1671)
  Result: 0-1
  ECO: A43

Game 2:
  White: Lostratega (Elo: 2262)
  Black: abdo0diab2000 (Elo: 2191)
  Result: 1-0
  ECO: A46

Game 3:
  White: YarnHugen (Elo: 2279)
  Black: LateralusMind (Elo: 2339)
  Result: 0-1
  ECO: A46

Game 4:
  White: Timon_01 (Elo: 971)
  Black: tbruins82 (Elo: 1040)
  Result: 0-1
  ECO: A00

Game 5:
  White: birddead (Elo: 1752)
  Black: jay623 (Elo: 1737)
  Result: 0-1
  ECO: C41



## ECO Codes

ECO (Encyclopaedia of Chess Openings) codes are a classification system for chess openings. They provide a standardized way to categorize and identify openings. Each code consists of a letter (A-E) followed by two digits (00-99). A single code can cover several named variations, and a single opening can span several codes.

- **A**: Flank openings (e.g., English Opening, Réti Opening)
- **B**: Semi-open games other than the French Defense (e.g., Sicilian Defense, Caro-Kann Defense)
- **C**: Open games and the French Defense (e.g., Ruy Lopez, Italian Game)
- **D**: Closed games and semi-closed defenses (e.g., Queen's Gambit, Grünfeld Defense)
- **E**: Indian defenses (e.g., King's Indian, Nimzo-Indian)

As mentioned in the project instructions, we'll be grouping openings by ECO code rather than by name to avoid duplicates and ensure consistency.

## Next Steps

1. **Data Collection and Preprocessing**:
   - Filter games according to the specified criteria (rated games, not bullet/ultra-bullet, etc.)
   - Group games by username
   - Exclude users with insufficient number of games
   - Structure data for analysis

2. **Feature Engineering**:
   - Extract relevant features from games (opening choices, play style, etc.)
   - Create user profiles based on opening preferences
   - Identify patterns in user opening choices

3. **Model Development**:
   - Create a recommendation system that suggests openings based on user profiles
   - Evaluate model performance
   - Refine the model based on evaluation results

In [None]:
# Training data structure
#
# Illustrative sketch of the per-player record we want to build. All values
# below are placeholders, kept arithmetically consistent so the meaning of
# each field is unambiguous.

# Stuff we don't care about:
# Specific moves in the game
# Maybe time control? Adds a lot of complexity to training data

players_stats = {
    "my_username": {
        # TODO: decide which time control (TC) this rating refers to
        "rating": 1750,
        # Games played with the black pieces, keyed by ECO code
        "black_games": {
            "opening_eco_code_1": {
                "opening_name": "French Defense",
                "results": {
                    # Assumed definition (confirm):
                    # 100 * (num_wins + num_draws / 2) / num_games
                    "score_percentage_with_opening": 50,
                    # num_games == num_wins + num_losses + num_draws
                    "num_games": 74,
                    "num_wins": 30,
                    "num_losses": 30,
                    "num_draws": 14,
                },
            },
        },
        # Games played with the white pieces, keyed by ECO code
        "white_games": {
            "opening_eco_code_1": {
                "opening_name": "French Defense",
                "results": {
                    "score_percentage_with_opening": 50,
                    "num_games": 74,
                    "num_wins": 30,
                    "num_losses": 30,
                    "num_draws": 14,
                },
            },
        },
        # Sum of num_games over every opening in black_games and white_games
        "num_games_total": 148,
    },
    "another_username": {
        # ....
    },
}

In [None]:
# How to extract data from PGN

# Data needed:
# Usernames
# Result
# Opening name and ECO code
# Time control - still need to figure out what we do with this, if any

# Exclude:
# Games shorter than a certain number of moves
# Cheat detected - maybe don't bother with this filter, it's rare and adds complexity
# Bullet games, probably
# Correspondence games maybe?

# Possibly:
# Weight games higher if they're Classical etc.? Players spend more time thinking in slower games than in faster ones. Though this adds complexity



In [None]:
-