In [5]:
# import chess.pgn

# pgn = open("lichess_db_standard_rated_2014-09.pgn")

# # Read just 1 game for testing
# first_game = chess.pgn.read_game(pgn)
# print(first_game.headers)
# print(first_game.mainline_moves())

In [25]:
# Start by activating the virtual environment: source chess-nlp-env/bin/activate

import chess.pgn  # (Portable Game Notation)
import pandas as pd
from tqdm import tqdm  # For a progress bar, useful in case of large files
import csv
from tabulate import tabulate

pgn_path = "../data/processed/lichess_db_standard_rated_2014-09.pgn"
output_csv = "../data/processed/chess_games.csv"

num_games = 500  # First 5000 games from the dataset
min_plies_required = 6

with open(output_csv, "w", newline="", encoding="utf-8") as fout:  # write mode ('w') with newline='' to prevent extra blank rows.
    writer = csv.writer(fout)  # writer object is responsible for handling the CSV formatting.
    writer.writerow(["white", "black", "result", "date", "site", "moves"])

    with open(pgn_path, encoding="utf-8") as pgn_file:  # with block ensures the file is automatically closed when done (or on error)
        for _ in tqdm(range(num_games)):
            game = chess.pgn.read_game(pgn_file)  # reads the next PGN entry from the open file and returns a Game object
            if game is None:
                break  # end of file
    
            # 'read_game' reads whole PGN blocks (headers + move text)
            try:
                # All the features in the data
                # game.headers: a dict-like mapping of PGN tag names to values
                # .get(key, default): returns value if present, else default '?' prevents KeyError and preserves placeholder if metadata missing.
                white = game.headers.get("White", "?")
                black = game.headers.get("Black", "?")
                result = game.headers.get("Result", "?")
                date = game.headers.get("Date", "?")
                site = game.headers.get("Site", "?")
    
                board = game.board()  # An actual chess board
                moves = []
                for move in game.mainline_moves():
                    try:
                        san = board.san(move)  # board.san(move): Python-Chess verifies that the move is legal from the current position.
                        moves.append(san)
                        board.push(move)  # updates board state
                    except Exception:
                        # Skip illegal move: illegal move raises an Assertion error
                        moves = []
                        break
    
                if not moves or len(moves) < min_plies_required:
                    continue  # skip this game and minimum number of plies/tokens for a checkmate is 6: The fool's mate
    
                writer.writerow([white, black, result, date, site, " ".join(moves)])  # Instead of storing game as a list, saving memory
                                                                                      # and writing directly to csv

            except Exception as e:
                # Skip the whole game if anything else fails
                continue

print(f"\nFinished processing {num_games} games.")
print(f"Clean data written to: {output_csv}")

data = pd.read_csv(output_csv)
print(f"Number of valid games: {len(data)}.\n")
print(data.head())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 334.46it/s]


Finished processing 500 games.
Clean data written to: ../data/processed/chess_games.csv
Number of valid games: 496.

         white       black result        date                          site  \
0       KACHAL  justplaybi    0-1  ????.??.??  https://lichess.org/1Vimq9SL   
1     mustroll       pelao    1-0  ????.??.??  https://lichess.org/PK5H93NR   
2  luciano2000     amnezia    1-0  ????.??.??  https://lichess.org/br0KfpXd   
3      martinz       HighP    1-0  ????.??.??  https://lichess.org/4CnlXUB0   
4      Thoth33  JaiSkiesNY    1-0  ????.??.??  https://lichess.org/X9Q96D1l   

                                               moves  
0  e4 d5 Nf3 dxe4 Ne5 Nf6 d4 exd3 Bxd3 e6 Nc3 Bd6...  
1  b4 e5 Bb2 d6 c3 Bf5 d3 Nf6 e4 Bg6 Be2 Be7 Nf3 ...  
2  e4 d5 exd5 Qxd5 Nc3 Qa5 Nf3 Nf6 d4 Bg4 Bd2 Nc6...  
3  e4 e5 Nf3 Nf6 Nc3 d6 Bc4 Be6 Qe2 Nbd7 d4 Bxc4 ...  
4  d4 e6 c4 c6 Nc3 d5 cxd5 cxd5 e4 Nc6 e5 Qb6 Nge...  





In [None]:
# Next steps
'''
Next logical steps (after CSV)

Build a vocabulary from the moves column: move -> id.

Convert sequences into integer token lists and save as .npy or PyTorch tensors.

Create a Dataset class yielding (input_seq, next_move) pairs for training.

Implement and train LSTM baseline; then implement Transformer from scratch.
'''