# Chess BPT data cleansing

Downloaded [Lichess elite database](https://database.nikonoel.fr/)

In [9]:
import chess.pgn
import os
from schemas import GameState

EMPTY = '.'

def parse_game(game: str, n: int) -> list[GameState]:
    g = game.split(' ')
    moves = g[1:]
    board = chess.Board()
    result = g[0]
    
    game_states: list[GameState] = []
    for i, move in enumerate(moves):
        first_move_idx = i - n if i - n > 0 else 0
        board.push_san(move)
        # Create an 8x8 board representation
        board_state = []
        for rank in range(8):
            rank_squares = []
            for file in range(8):
                square = chess.square(file, 7-rank)  # chess library uses 0-7 for ranks from bottom to top
                piece = board.piece_at(square)
                if piece is None:
                    rank_squares.append(EMPTY)
                else:
                    piece_color = "white" if piece.color == chess.WHITE else "black"
                    piece_type = chess.piece_name(piece.piece_type)
                    rank_squares.append(f"{piece_color} {piece_type}")
            board_state.append(rank_squares)
            
        m = [EMPTY] * (n + 1)
        for j, move_text in enumerate(moves[first_move_idx:i+1]):
            # Determine color based on move index - white moves first in chess
            move_index = first_move_idx + j
            color = "w" if move_index % 2 == 0 else "b"
            m[j] = f"{move_text}:{color}"
            
            
        # Create a GameState with the current board state
        game_state = GameState(
            board=board_state,
            result=result,
            moves=reversed(m)
        )
        game_states.append(game_state)
    
    return game_states

def extract_formatted_games_from_pgn(pgn_file:str, possible_moves: set[str], sample: int | None = None) -> list[str]:
    formatted_games = []
    count = 0
    
    with open(pgn_file, "r", encoding="utf-8") as file:
        while True:
            if sample is not None and count >= sample:
                break
            if count % 100 == 0:
                print(f'{count} games')
            game = chess.pgn.read_game(file)
            if game is None:
                break  # End of file

            # Extract the result
            result = game.headers.get("Result", "*")
            
            # Skip games with unknown results
            if result == "*":
                continue
                
            winner = None
            if result == "1-0":
                winner = WHITE
            elif result == "0-1":
                winner = BLACK
            elif result == "1/2-1/2":
                winner = "draw"
            else:
                continue  # Skip any other unexpected result format

            board = game.board()
            moves = []
            for move in game.mainline_moves():
                moves.append(board.san(move))
                board.push(move)

            # Format as: winner START moves END
            game_text = f"{winner} {' '.join(moves)}"
            games_states = parse_game(game_text, 10)
            formatted_games.extend(games_states)
            possible_moves.update(moves)
            count += 1
    
    return formatted_games, possible_moves

WHITE = "white"
BLACK = "black"
START = "<start>"
END = "<end>"
# Example usage
pgn_file = "data/raw/lichess_elite_2022-02.pgn"  # Replace with your file
games, possible_moves = extract_formatted_games_from_pgn(pgn_file, set(), 3)

# Print first 5 games' moves
for i, game_moves in enumerate(games):
    print(f"Game {i+1}: {game_moves}\n")

0 games
Game 1: board=[['black rook', 'black knight', 'black bishop', 'black queen', 'black king', 'black bishop', 'black knight', 'black rook'], ['black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn'], ['.', '.', '.', '.', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', '.', '.', '.'], ['.', '.', '.', 'white pawn', '.', '.', '.', '.'], ['.', '.', '.', '.', '.', '.', '.', '.'], ['white pawn', 'white pawn', 'white pawn', '.', 'white pawn', 'white pawn', 'white pawn', 'white pawn'], ['white rook', 'white knight', 'white bishop', 'white queen', 'white king', 'white bishop', 'white knight', 'white rook']] moves=['.', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'd4:w'] result='draw'

Game 2: board=[['black rook', 'black knight', 'black bishop', 'black queen', 'black king', 'black bishop', '.', 'black rook'], ['black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn', 'black pawn'], ['.', '.', '.'

In [11]:
len(possible_moves)

154

In [12]:
games: list[GameState] = []
files = os.listdir("data/raw")
possible_moves = set()
for file in files[:1]:
    print(file)
    games, possible_moves = extract_formatted_games_from_pgn(f"data/raw/{file}", possible_moves, 1_000)
    games.extend(games)
    print(f'{len(games)} games')
# Save games to a JSON file

# Create the output directory if it doesn't exist
json_output_file = "data/processed/games.json"
os.makedirs(os.path.dirname(json_output_file), exist_ok=True)

# Write games to the JSON file using Pydantic's json_dumps method
with open(json_output_file, 'w', encoding='utf-8') as f:
    # Convert list of GameState objects to JSON using Pydantic's json method
    games_json = [game.model_dump_json() for game in games]
    # Write as a JSON array
    f.write("[\n")
    f.write(",\n".join(games_json))
    f.write("\n]")

print(f"Saved {len(games)} games to {json_output_file}")



lichess_elite_2022-04.pgn
0 games
100 games
200 games
300 games
400 games
500 games
600 games
700 games
800 games
900 games
163674 games
Saved 163674 games to data/processed/games.json


In [16]:
# Save possible moves to a text file
moves_output_file = "data/processed/possible_moves.txt"
os.makedirs(os.path.dirname(moves_output_file), exist_ok=True)

# Write moves to the text file with space delimiter
with open(moves_output_file, 'w', encoding='utf-8') as f:
    f.write(' '.join(possible_moves))

print(f"Saved {len(possible_moves)} possible moves to {moves_output_file}")
len(possible_moves)

# Read the saved possible moves from the text file
def read_possible_moves_from_txt(txt_file_path):
    """
    Read possible moves from a text file
    """
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Split by space to get individual moves
    possible_moves = set(content.split())
    return possible_moves

# Example usage
loaded_moves = read_possible_moves_from_txt("data/processed/possible_moves.txt")
print(f"Loaded {len(loaded_moves)} possible moves from text file")



Saved 2007 possible moves to data/processed/possible_moves.txt
Loaded 2007 possible moves from text file


In [None]:
from typing import Callable

EMPTY = '.'

# Read the processed game file
with open("data/processed/chess_games.txt", 'r', encoding='utf-8') as f:
    processed_games = f.read().splitlines()

# Create character-to-index mapping from the processed games
tokens = ' '.join(processed_games).split(' ')
stoi = {ch: i for i, ch in enumerate(set(tokens))}
itos = {i: ch for ch, i in stoi.items()}

encode: Callable[[str], list[int]] = lambda s: [stoi[c] for c in s]
decode: Callable[[list[int]], str] = lambda l: ''.join([itos[i] for i in l])
vocab_size = len(stoi)
context_length = max([len(g) for g in processed_games])
print(f'{context_length=}')
print(f'{len(stoi)=}')
stoi


In [8]:
def get_chess_elements():
    # All possible pieces (6 types × 2 colors + empty square)
    pieces = {
        'white pawn', 'white rook', 'white knight', 'white bishop', 'white queen', 'white king',
        'black pawn', 'black rook', 'black knight', 'black bishop', 'black queen', 'black king',
        '.'  # empty square
    }
    
    # Standard algebraic notation + color suffix patterns
    # Real games will have specific instances of these patterns
    move_patterns = {
        # Basic piece moves: e4:w, Nf3:b, etc.
        f"{file}{rank}:{color}" for file in 'abcdefgh' for rank in '12345678' for color in 'wb'
    } | {
        f"{piece}{file}{rank}:{color}" for piece in 'NBRQK' for file in 'abcdefgh' 
        for rank in '12345678' for color in 'wb'
    } | {
        # Castling: O-O:w, O-O-O:b
        f"{castle}:{color}" for castle in ['O-O', 'O-O-O'] for color in 'wb'
    }
    
    return pieces, move_patterns

# Get the elements
pieces, moves = get_chess_elements()

print(f"Pieces ({len(pieces)}): {sorted(pieces)}")
print(f"Number of basic move patterns: {len(moves)} {moves}")

Pieces (13): ['.', 'black bishop', 'black king', 'black knight', 'black pawn', 'black queen', 'black rook', 'white bishop', 'white king', 'white knight', 'white pawn', 'white queen', 'white rook']
Number of basic move patterns: 772 {'Kb2:w', 'Bc5:b', 'Bd1:b', 'Kg2:b', 'Bd2:b', 'Rf5:w', 'Qe8:w', 'Bh7:b', 'h5:b', 'Ke3:w', 'Ke5:b', 'Rf2:b', 'f3:b', 'Kf4:b', 'f5:w', 'Ka1:b', 'Bh4:b', 'Re1:w', 'f5:b', 'Bc4:w', 'Nf2:b', 'g7:w', 'Re6:b', 'Qb6:w', 'Be4:b', 'Nd5:w', 'Qh1:b', 'Kf2:b', 'b8:b', 'c2:w', 'Bf3:b', 'Bg3:b', 'Nf6:w', 'Rh1:b', 'b3:b', 'Na8:w', 'Nf8:w', 'Bg2:b', 'Kb7:w', 'Nb1:w', 'Nc1:b', 'g5:w', 'Kf5:w', 'd1:b', 'Bf8:w', 'Qc7:b', 'Bg4:w', 'Bg2:w', 'Ka2:b', 'Qg1:w', 'f3:w', 'Qg7:b', 'Rd4:w', 'Kg6:w', 'Kb1:b', 'Bg5:w', 'a6:b', 'Bc2:w', 'Rc3:b', 'Bd6:w', 'Rb6:w', 'Kb3:w', 'Qa7:w', 'e8:w', 'Be3:w', 'Rg7:w', 'Ne4:b', 'Kh3:b', 'Re8:b', 'Rb1:b', 'Qa8:b', 'Bd7:w', 'Qd3:w', 'Rf5:b', 'Rf8:b', 'Ra7:w', 'a3:w', 'Re3:b', 'Rh8:w', 'Qe5:w', 'h3:b', 'Bh3:w', 'a2:w', 'e7:b', 'Qc1:b', 'Nc5:w', 'Ng2:w', '

In [24]:
import chess
import numpy as np
from typing import List, Dict, Tuple, Union, Optional

class ChessFeatureEncoder:
    """
    Encodes chess moves as 12-dimensional feature vectors.
    
    Feature dimensions:
    0: piece type (0-5, representing pawn=0, knight=1, bishop=2, rook=3, queen=4, king=5)
    1: from_file (0-7, representing a-h)
    2: from_rank (0-7, representing 1-8)
    3: to_file (0-7, representing a-h)
    4: to_rank (0-7, representing 1-8)
    5: is_capture (0 or 1)
    6: is_check (0 or 1)
    7: is_checkmate (0 or 1)
    8: is_promotion (0 or 1)
    9: promotion_piece (0-3, representing queen=0, rook=1, bishop=2, knight=3, or -1 if not a promotion)
    10: is_castle_kingside (0 or 1)
    11: is_castle_queenside (0 or 1)
    """
    
    def __init__(self):
        # Mapping from piece type to integer (0-5)
        self.piece_to_int = {
            chess.PAWN: 0,
            chess.KNIGHT: 1,
            chess.BISHOP: 2,
            chess.ROOK: 3, 
            chess.QUEEN: 4,
            chess.KING: 5
        }
        
        # Mapping from integer to piece type
        self.int_to_piece = {v: k for k, v in self.piece_to_int.items()}
        
        # Mapping from promotion piece to integer (0-3)
        self.promotion_to_int = {
            chess.QUEEN: 0,
            chess.ROOK: 1,
            chess.BISHOP: 2,
            chess.KNIGHT: 3
        }
        
        # Mapping from integer to promotion piece
        self.int_to_promotion = {v: k for k, v in self.promotion_to_int.items()}
    
    def encode_move(self, board: chess.Board, move: chess.Move) -> np.ndarray:
        """
        Encode a chess move as a 12-dimensional feature vector.
        
        Args:
            board: The current chess board
            move: The move to encode
            
        Returns:
            np.ndarray: 12-dimensional feature vector
        """
        # Initialize feature vector with zeros
        features = np.zeros(12, dtype=np.int8)
        
        # Get piece type
        piece = board.piece_at(move.from_square)
        if piece is None:
            raise ValueError(f"No piece at source square {chess.square_name(move.from_square)}")
        
        features[0] = self.piece_to_int[piece.piece_type]
        
        # Get source square (file and rank)
        features[1] = chess.square_file(move.from_square)  # 0-7 for a-h
        features[2] = chess.square_rank(move.from_square)  # 0-7 for 1-8
        
        # Get destination square (file and rank)
        features[3] = chess.square_file(move.to_square)    # 0-7 for a-h
        features[4] = chess.square_rank(move.to_square)    # 0-7 for 1-8
        
        # Check if the move is a capture
        features[5] = int(board.is_capture(move))
        
        # We need to make the move temporarily to check if it results in check or checkmate
        board.push(move)
        features[6] = int(board.is_check())
        features[7] = int(board.is_checkmate() if features[6] else 0)  # Checkmate implies check
        board.pop()
        
        # Check if the move is a promotion
        features[8] = int(move.promotion is not None)
        features[9] = self.promotion_to_int.get(move.promotion, -1)  # -1 if not a promotion
        
        # Check if the move is castling
        # Kingside castling
        if piece.piece_type == chess.KING and move.from_square == chess.E1 and move.to_square == chess.G1:
            features[10] = 1
        elif piece.piece_type == chess.KING and move.from_square == chess.E8 and move.to_square == chess.G8:
            features[10] = 1
        # Queenside castling
        elif piece.piece_type == chess.KING and move.from_square == chess.E1 and move.to_square == chess.C1:
            features[11] = 1
        elif piece.piece_type == chess.KING and move.from_square == chess.E8 and move.to_square == chess.C8:
            features[11] = 1
        
        return features
    
    def encode_uci(self, board: chess.Board, uci: str) -> np.ndarray:
        """
        Encode a UCI move string as a 12-dimensional feature vector.
        
        Args:
            board: The current chess board
            uci: UCI move string (e.g., "e2e4", "e7e8q")
            
        Returns:
            np.ndarray: 12-dimensional feature vector
        """
        move = chess.Move.from_uci(uci)
        return self.encode_move(board, move)
    
    def encode_san(self, board: chess.Board, san: str) -> np.ndarray:
        """
        Encode a SAN move string as a 12-dimensional feature vector.
        
        Args:
            board: The current chess board
            san: SAN move string (e.g., "e4", "Nf3", "O-O")
            
        Returns:
            np.ndarray: 12-dimensional feature vector
        """
        move = board.parse_san(san)
        return self.encode_move(board, move)
    
    def decode_to_uci(self, board: chess.Board, features: np.ndarray) -> Optional[str]:
        """
        Attempt to decode a feature vector back to a UCI move string.
        
        Args:
            board: The current chess board
            features: 12-dimensional feature vector
            
        Returns:
            str: UCI move string, or None if no valid move could be found
        """
        # Extract basic move information
        piece_type = self.int_to_piece[features[0]]
        from_file = features[1]
        from_rank = features[2]
        to_file = features[3]
        to_rank = features[4]
        is_promotion = bool(features[8])
        
        # Calculate source and destination squares
        from_square = chess.square(from_file, from_rank)
        to_square = chess.square(to_file, to_rank)
        
        # Handle promotion
        promotion = None
        if is_promotion:
            promotion_idx = features[9]
            if promotion_idx >= 0:
                promotion = self.int_to_promotion[promotion_idx]
        
        # Create the move
        move = chess.Move(from_square, to_square, promotion)
        
        # Verify that this is a legal move
        if move in board.legal_moves:
            return move.uci()
        else:
            # Try to find a similar legal move
            for legal_move in board.legal_moves:
                if (legal_move.from_square == from_square and 
                    legal_move.to_square == to_square):
                    return legal_move.uci()
        
        return None
    
    def get_all_legal_moves_encoded(self, board: chess.Board) -> Dict[str, np.ndarray]:
        """
        Get all legal moves from the current position, encoded as feature vectors.
        
        Args:
            board: The current chess board
            
        Returns:
            Dict[str, np.ndarray]: Dictionary mapping UCI strings to feature vectors
        """
        encoded_moves = {}
        
        for move in board.legal_moves:
            uci = move.uci()
            features = self.encode_move(board, move)
            encoded_moves[uci] = features
        
        return encoded_moves

def demonstrate_encoding():
    """
    Demonstrate the feature encoding on various chess positions.
    """
    encoder = ChessFeatureEncoder()
    
    # Create a board with the starting position
    board = chess.Board()
    
    # Get all legal moves and their encodings
    encoded_moves = encoder.get_all_legal_moves_encoded(board)
    
    print(f"Number of legal moves from starting position: {len(encoded_moves)}")
    print("\nSample encodings:")
    
    # Show a few examples
    for uci, features in list(encoded_moves.items())[:5]:
        move = chess.Move.from_uci(uci)
        san = board.san(move)
        print(f"Move: {san} (UCI: {uci})")
        print(f"Features: {features}")
        print(f"- Piece: {['Pawn', 'Knight', 'Bishop', 'Rook', 'Queen', 'King'][features[0]]}")
        print(f"- From: {chess.square_name(chess.square(features[1], features[2]))}")
        print(f"- To: {chess.square_name(chess.square(features[3], features[4]))}")
        print(f"- Capture: {bool(features[5])}")
        print(f"- Check: {bool(features[6])}")
        print(f"- Checkmate: {bool(features[7])}")
        print(f"- Promotion: {bool(features[8])}")
        if features[8]:
            promotion_type = ['Queen', 'Rook', 'Bishop', 'Knight'][features[9]]
            print(f"- Promotion piece: {promotion_type}")
        print(f"- Castling: {['None', 'Kingside', 'Queenside', 'Both'][int(features[10]) + 2*int(features[11])]}")
        print()
    
    # Demonstrate on a more complex position with special moves
    print("\nDemonstrating on a position with special moves:")
    # A position with potential captures, promotions, and checks
    board = chess.Board("r1bqkb1r/pPpp1ppp/2n2n2/4p3/2B1P3/5N2/PPPP1PPP/RNBQK2R w KQkq - 0 5")
    print(board)
    
    encoded_moves = encoder.get_all_legal_moves_encoded(board)
    
    # Look for interesting moves to demonstrate
    special_moves = []
    for uci, features in encoded_moves.items():
        # Look for captures, promotions, or castling
        if features[5] or features[8] or features[10] or features[11]:
            special_moves.append((uci, features))
    
    print("\nInteresting moves from this position:")
    for uci, features in special_moves[:5]:  # Show up to 5
        move = chess.Move.from_uci(uci)
        san = board.san(move)
        print(f"Move: {san} (UCI: {uci})")
        print(f"Features: {features}")
        
        # Describe the move
        description = []
        if features[10]:
            description.append("kingside castling")
        elif features[11]:
            description.append("queenside castling")
        
        if features[5]:
            description.append("capture")
            
        if features[8]:
            promotion_type = ['Queen', 'Rook', 'Bishop', 'Knight'][features[9]]
            description.append(f"promotion to {promotion_type}")
            
        if features[6]:
            description.append("check")
        if features[7]:
            description.append("checkmate")
            
        print(f"This is a {', '.join(description)} move")
        print()

demonstrate_encoding()

Number of legal moves from starting position: 20

Sample encodings:
Move: Nh3 (UCI: g1h3)
Features: [ 1  6  0  7  2  0  0  0  0 -1  0  0]
- Piece: Knight
- From: g1
- To: h3
- Capture: False
- Check: False
- Checkmate: False
- Promotion: False
- Castling: None

Move: Nf3 (UCI: g1f3)
Features: [ 1  6  0  5  2  0  0  0  0 -1  0  0]
- Piece: Knight
- From: g1
- To: f3
- Capture: False
- Check: False
- Checkmate: False
- Promotion: False
- Castling: None

Move: Nc3 (UCI: b1c3)
Features: [ 1  1  0  2  2  0  0  0  0 -1  0  0]
- Piece: Knight
- From: b1
- To: c3
- Capture: False
- Check: False
- Checkmate: False
- Promotion: False
- Castling: None

Move: Na3 (UCI: b1a3)
Features: [ 1  1  0  0  2  0  0  0  0 -1  0  0]
- Piece: Knight
- From: b1
- To: a3
- Capture: False
- Check: False
- Checkmate: False
- Promotion: False
- Castling: None

Move: h3 (UCI: h2h3)
Features: [ 0  7  1  7  2  0  0  0  0 -1  0  0]
- Piece: Pawn
- From: h2
- To: h3
- Capture: False
- Check: False
- Checkmate: False
- 