In [5]:
# import chess.pgn

# pgn = open("lichess_db_standard_rated_2014-09.pgn")

# # Read just 1 game for testing
# first_game = chess.pgn.read_game(pgn)
# print(first_game.headers)
# print(first_game.mainline_moves())

In [2]:
# Start by activating the virtual environment: source chess-nlp-env/bin/activate

import chess.pgn
import pandas as pd

pgn_path = "../data/processed/lichess_db_standard_rated_2014-09.pgn"
output_csv = "../data/processed/chess_games.csv"

num_games = 5000  # Maximum number of games to read from the start of the dataset

# Column order of the exported CSV; passed to the DataFrame constructor so the
# header row is written even when no game could be extracted.
record_columns = ["white", "black", "result", "date", "site", "moves"]


def _extract_game_record(game):
    """Flatten one parsed PGN game into a dict, or return None if unusable.

    Parameters
    ----------
    game : chess.pgn.Game
        A game as returned by ``chess.pgn.read_game``.

    Returns
    -------
    dict or None
        A record with the keys listed in ``record_columns`` ("moves" is the
        mainline in SAN, space-separated), or ``None`` when the parser
        reported errors for the game or the game has no mainline moves.

    Raises
    ------
    Exception
        Whatever ``board.san`` / ``board.push`` raise for a move that is not
        legal in the current position; the caller treats that as "skip".
    """
    # python-chess does not raise on malformed movetext: it records the problem
    # in ``game.errors`` and drops the rest of the line. Such a game would
    # otherwise be exported with a silently truncated move list.
    if game.errors:
        return None

    board = game.board()
    moves = []
    for move in game.mainline_moves():
        # SAN depends on the position *before* the move, so render first, push after.
        moves.append(board.san(move))
        board.push(move)

    if not moves:
        return None  # nothing to export for a game without moves

    headers = game.headers
    return {
        "white": headers.get("White", "?"),
        "black": headers.get("Black", "?"),
        "result": headers.get("Result", "?"),
        "date": headers.get("Date", "?"),
        "site": headers.get("Site", "?"),
        "moves": " ".join(moves),
    }


games_data = []
skipped_games = 0  # games read from the file but not exported

with open(pgn_path, encoding="utf-8") as pgn_file:
    for _ in range(num_games):
        game = chess.pgn.read_game(pgn_file)
        if game is None:
            break  # end of file

        try:
            record = _extract_game_record(game)
        except Exception:
            # Best effort: one bad game must not abort the whole export,
            # but it is counted so the loss is visible below.
            record = None

        if record is None:
            skipped_games += 1
            continue

        games_data.append(record)

df = pd.DataFrame(games_data, columns=record_columns)
df.to_csv(output_csv, index=False)

print(f"✅ Extracted {len(df)} valid games to {output_csv}")
if skipped_games:
    print(f"Skipped {skipped_games} games (parser errors, illegal moves, or no moves)")
print(df.head())

✅ Extracted 5000 valid games to ../data/processed/chess_games.csv
         white       black result        date                          site  \
0       KACHAL  justplaybi    0-1  ????.??.??  https://lichess.org/1Vimq9SL   
1     mustroll       pelao    1-0  ????.??.??  https://lichess.org/PK5H93NR   
2  luciano2000     amnezia    1-0  ????.??.??  https://lichess.org/br0KfpXd   
3      martinz       HighP    1-0  ????.??.??  https://lichess.org/4CnlXUB0   
4      Thoth33  JaiSkiesNY    1-0  ????.??.??  https://lichess.org/X9Q96D1l   

                                               moves  
0  e4 d5 Nf3 dxe4 Ne5 Nf6 d4 exd3 Bxd3 e6 Nc3 Bd6...  
1  b4 e5 Bb2 d6 c3 Bf5 d3 Nf6 e4 Bg6 Be2 Be7 Nf3 ...  
2  e4 d5 exd5 Qxd5 Nc3 Qa5 Nf3 Nf6 d4 Bg4 Bd2 Nc6...  
3  e4 e5 Nf3 Nf6 Nc3 d6 Bc4 Be6 Qe2 Nbd7 d4 Bxc4 ...  
4  d4 e6 c4 c6 Nc3 d5 cxd5 cxd5 e4 Nc6 e5 Qb6 Nge...  


In [None]:
# Preprocessing