# Chess Reinforcement Learning (AlphaZero-like Implementation)

In [None]:
pip install torch numpy chess tqdm

## Imports and Setup

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import chess
import chess.pgn
import io
import os
import random
from tqdm import tqdm
import math
from collections import deque, namedtuple
import pickle

# Pick the compute device once, at import time: CUDA when PyTorch reports it
# as available, otherwise the CPU. The rest of the file reads this global when
# creating tensors and placing the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Chess Environment and Utilities

In [None]:
class ChessGame:
    """Small gym-like facade over a ``chess.Board``.

    The underlying board is exposed as ``self.board``; every method simply
    delegates to it.
    """

    def __init__(self):
        self.board = chess.Board()

    def reset(self):
        """Put the board back to the starting position and return it."""
        self.board.reset()
        return self.board

    def step(self, move):
        """Play ``move`` and report ``(board, reward, done, info)``.

        ``move`` may be a move object or a UCI string. The reward is
        White-centric: 1 for a "1-0" result, -1 for "0-1", and 0 for
        anything else (including games that are still running).
        ``info`` is always an empty dict.
        """
        parsed = chess.Move.from_uci(move) if isinstance(move, str) else move
        self.board.push(parsed)

        finished = self.board.is_game_over()
        reward = 0
        if finished:
            # Only decisive results carry a non-zero reward.
            reward = {"1-0": 1, "0-1": -1}.get(self.board.result(), 0)

        return self.board, reward, finished, {}

    def legal_moves(self):
        """Return the legal moves of the current position as a list."""
        return list(self.board.legal_moves)

    def to_fen(self):
        """Return the FEN string of the current position."""
        return self.board.fen()

    def is_game_over(self):
        """Return whether the board reports the game as finished."""
        return self.board.is_game_over()

    def current_player(self):
        """Return the side to move (the board's ``turn`` attribute)."""
        return self.board.turn

    def copy(self):
        """Return an independent ``ChessGame`` holding a copy of the board."""
        clone = ChessGame()
        clone.board = self.board.copy()
        return clone

## Neural Network Architecture

In [None]:
class ChessNet(nn.Module):
    """Convolutional policy/value network for 8x8 board encodings.

    ``forward`` takes a ``(batch, 14, 8, 8)`` tensor and returns
    ``(policy_logits, value)``: unnormalised logits over 4672 move slots and
    one tanh-squashed scalar per position.
    """

    def __init__(self):
        super().__init__()

        # Shared trunk: four 3x3 convolutions that keep the 8x8 spatial size.
        # The first one consumes the 14 input planes.
        self.conv1 = nn.Conv2d(14, 256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, padding=1)

        # Policy head: 1x1 conv down to 2 planes, then a linear layer to the
        # 4672 policy slots (8 * 8 * 73, the size of AlphaZero's chess move
        # encoding -- not the number of legal moves in any one position).
        self.policy_conv = nn.Conv2d(256, 2, kernel_size=1)
        self.policy_fc = nn.Linear(2*8*8, 4672)

        # Value head: 1x1 conv down to a single plane, then two linear layers.
        self.value_conv = nn.Conv2d(256, 1, kernel_size=1)
        self.value_fc1 = nn.Linear(8*8, 256)
        self.value_fc2 = nn.Linear(256, 1)

    def forward(self, x):
        """Return ``(policy_logits, value)`` for the encoded positions ``x``."""
        features = x
        for trunk_layer in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = F.relu(trunk_layer(features))

        # Policy branch: logits are returned raw; callers apply the softmax.
        policy = F.relu(self.policy_conv(features)).view(-1, 2*8*8)
        policy = self.policy_fc(policy)

        # Value branch: tanh keeps the estimate inside [-1, 1].
        value = F.relu(self.value_conv(features)).view(-1, 8*8)
        value = F.relu(self.value_fc1(value))
        value = torch.tanh(self.value_fc2(value))

        return policy, value

def board_to_tensor(board):
    """Encode ``board`` as a ``(1, 14, 8, 8)`` tensor on the global device.

    Planes 0-5 hold White's pieces and planes 6-11 Black's, ordered by
    ``piece_type - 1``. Plane 12 is filled with 1 when White is to move and 0
    otherwise. Plane 13 is filled with ``fullmove_number / 100``.
    """
    planes = torch.zeros(14, 8, 8, device=device)

    for square in chess.SQUARES:
        piece = board.piece_at(square)
        if piece is None:
            continue
        # Black pieces live six planes above the matching White ones.
        colour_offset = 0 if piece.color == chess.WHITE else 6
        plane = colour_offset + (piece.piece_type - 1)
        planes[plane, square // 8, square % 8] = 1

    # Constant-valued meta planes.
    white_to_move = board.turn == chess.WHITE
    planes[12] = 1 if white_to_move else 0
    planes[13] = board.fullmove_number / 100.0

    # Leading axis of size 1 so the result can be fed to the network directly.
    return planes.unsqueeze(0)

## Monte Carlo Tree Search (MCTS)

In [None]:
class Node:
    """One position in the search tree together with its visit statistics."""

    def __init__(self, game_state, parent=None, move=None):
        # Position held by this node and how it was reached.
        self.game_state = game_state
        self.parent = parent
        self.move = move
        # Search statistics; all start empty/zero.
        self.children = []
        self.visit_count = 0
        self.value_sum = 0
        self.prior = 0

    def expanded(self):
        """Return True once at least one child has been attached."""
        return bool(self.children)

    def value(self):
        """Return the mean backed-up value, or 0 for an unvisited node."""
        return self.value_sum / self.visit_count if self.visit_count != 0 else 0

class MCTS:
    """Monte Carlo Tree Search guided by a policy/value network.

    Value convention: the value stored on a node is always expressed from the
    point of view of the player to move *at that node*. ``backpropagate``
    flips the sign at every ply, and ``select_child`` therefore scores a child
    with the negated child value (what is good for the opponent is bad for
    the player choosing the move).
    """

    def __init__(self, model, num_simulations=800, c_puct=1.0):
        """Store the network, the simulation budget and the exploration weight."""
        self.model = model
        self.num_simulations = num_simulations
        self.c_puct = c_puct

    def search(self, root_state):
        """Run the configured number of simulations from ``root_state``.

        Returns:
            ``(action_probs, root)`` where ``action_probs`` is a NumPy array
            aligned with ``root.children`` (normalised visit counts) and
            ``root`` is the root ``Node`` of the search tree. The array is
            empty when the root has no children.
        """
        root = Node(root_state)

        for _ in range(self.num_simulations):
            node = root
            search_path = [node]

            # Selection: walk down until reaching a node without children.
            while node.expanded():
                node = self.select_child(node)
                search_path.append(node)

            # Evaluation: exact result for finished games, network otherwise.
            if node.game_state.is_game_over():
                value = self._terminal_value(node.game_state.board)
            else:
                value = self._expand(node)

            self.backpropagate(search_path, value)

        return self._root_distribution(root), root

    def _expand(self, node):
        """Attach one child per legal move and return the network's value."""
        # Inference only: no autograd graph is needed during search.
        with torch.no_grad():
            policy_logits, value = self.model(board_to_tensor(node.game_state.board))
        logits = policy_logits.squeeze(0).detach().cpu().numpy()
        num_actions = logits.shape[0]

        legal_moves = node.game_state.legal_moves()
        move_indices = [move_to_index(move) for move in legal_moves]

        # A softmax over the legal moves' logits is the same as masking the
        # illegal ones out. A move whose index falls outside the policy
        # vector gets a neutral logit instead of raising IndexError.
        known = [logits[i] for i in move_indices if 0 <= i < num_actions]
        neutral = float(np.mean(known)) if known else 0.0
        move_logits = np.array(
            [logits[i] if 0 <= i < num_actions else neutral for i in move_indices],
            dtype=np.float64,
        )
        move_logits -= move_logits.max()  # numerical stability
        priors = np.exp(move_logits)
        priors /= priors.sum()

        for move, prior in zip(legal_moves, priors):
            child_state = node.game_state.copy()
            child_state.step(move)
            child = Node(child_state, node, move)
            child.prior = float(prior)
            node.children.append(child)

        return value.item()

    @staticmethod
    def _terminal_value(board):
        """Return the final result from the side-to-move's point of view."""
        result = board.result()
        if result == "1-0":
            white_score = 1
        elif result == "0-1":
            white_score = -1
        else:
            white_score = 0
        # Convert the White-centric score to the player to move so that it
        # matches the convention used for network values.
        return white_score if board.turn == chess.WHITE else -white_score

    @staticmethod
    def _root_distribution(root):
        """Turn the root children's visit counts into probabilities."""
        visit_counts = np.array(
            [child.visit_count for child in root.children], dtype=np.float64
        )
        total = visit_counts.sum()
        if total > 0:
            return visit_counts / total

        # No child was visited (e.g. a single simulation only expands the
        # root): fall back to the priors rather than dividing by zero.
        priors = np.array([child.prior for child in root.children], dtype=np.float64)
        prior_total = priors.sum()
        if prior_total > 0:
            return priors / prior_total
        return priors

    def select_child(self, node):
        """Return the child with the highest PUCT score (None if no children).

        score = -Q(child) + c_puct * P(child) * sqrt(N + 1) / (1 + n(child))
        where N is the total number of visits over all children. Q is negated
        because it is stored from the child's side-to-move perspective.
        """
        total_visits = sum(child.visit_count for child in node.children)
        # +1 keeps the exploration term non-zero before any child is visited,
        # so the priors decide the very first selections.
        sqrt_total = math.sqrt(total_visits + 1)

        def puct_score(child):
            exploit = -child.value()
            explore = self.c_puct * child.prior * sqrt_total / (child.visit_count + 1)
            return exploit + explore

        # max() keeps the first of several equally scored children.
        return max(node.children, key=puct_score, default=None)

    def backpropagate(self, search_path, value):
        """Add ``value`` along ``search_path`` from leaf to root.

        ``value`` is from the perspective of the player to move at the last
        node of the path; the sign alternates at every step towards the root.
        """
        for node in reversed(search_path):
            node.visit_count += 1
            node.value_sum += value
            value = -value

def move_to_index(move):
    """Map a move to a slot in the 4672-wide policy vector.

    Layout:
      * 0..4095     -- non-promotion moves, ``from_square * 64 + to_square``.
      * 4096..4287  -- promotions, one slot per
        (promotion piece, promoting side, origin file, direction).

    The previous ``promotion * 4096`` offset produced indices of 8192 and
    above for every promotion, which is outside the policy vector.

    Args:
        move: object exposing ``from_square``, ``to_square`` (0..63) and
            ``promotion`` (``None``/0, or a piece type 2..5 for knight..queen).

    Returns:
        int: index in ``[0, 4672)`` for any move of the shapes listed above.
    """
    from_square = move.from_square
    to_square = move.to_square

    if not move.promotion:
        return from_square * 64 + to_square

    from_file = from_square % 8
    # 0 = capture towards the a-file, 1 = straight push, 2 = capture towards
    # the h-file.
    direction = (to_square % 8) - from_file + 1
    # Promotions landing on the top half of the board belong to the side
    # moving up the board; the other side lands on the bottom rank.
    side = 0 if to_square >= 32 else 1
    piece_slot = move.promotion - 2  # knight=0 .. queen=3

    # 8 files * 3 directions = 24 slots per (piece, side) pair.
    return 4096 + piece_slot * 48 + side * 24 + from_file * 3 + direction

## Self-Play and Training

In [None]:
class SelfPlayDataset(Dataset):
    """In-memory collection of ``(board_tensor, policy, value)`` samples."""

    def __init__(self):
        self.data = []

    def add_game(self, game_data):
        """Append every sample of one game to the dataset."""
        self.data.extend(game_data)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(board_tensor, (policy, value))``."""
        sample = self.data[idx]
        board_tensor, targets = sample[0], tuple(sample[1:])
        # Samples are triples; unpacking enforces that just like a 3-way
        # tuple assignment would.
        policy, value = targets
        return board_tensor, (policy, value)

def self_play(model, num_games=100, num_simulations=100):
    """Play ``num_games`` games of the model against itself and collect samples.

    Every position visited contributes one ``(board_tensor, policy, value)``
    sample:
      * ``board_tensor`` -- ``(14, 8, 8)`` CPU tensor. The batch axis added by
        ``board_to_tensor`` is dropped so that a ``DataLoader`` collates
        samples to ``(batch, 14, 8, 8)`` rather than a 5-D tensor the
        network cannot consume.
      * ``policy`` -- length-4672 NumPy vector holding the MCTS probabilities.
      * ``value`` -- final game result from the perspective of the player to
        move in that position (1 win, -1 loss, 0 draw).

    Returns:
        SelfPlayDataset: dataset containing the samples of all games.
    """
    dataset = SelfPlayDataset()
    mcts = MCTS(model, num_simulations=num_simulations)

    for _ in tqdm(range(num_games), desc="Self-play"):
        game = ChessGame()
        positions = []  # (board_tensor, policy, white_to_move)

        while not game.is_game_over():
            action_probs, root = mcts.search(game)
            moves = [child.move for child in root.children]

            board_tensor = board_to_tensor(game.board).squeeze(0).cpu()
            policy = np.zeros(4672)
            for move, prob in zip(moves, action_probs):
                move_idx = move_to_index(move)
                # Moves whose index falls outside the policy vector are
                # skipped rather than aborting the whole self-play run.
                if 0 <= move_idx < policy.shape[0]:
                    policy[move_idx] = prob

            # Remember who is to move instead of relying on ply parity.
            white_to_move = game.current_player() == chess.WHITE
            positions.append((board_tensor, policy, white_to_move))

            # Sample an index (not the move objects themselves) so NumPy
            # never has to coerce move objects into an array.
            chosen = np.random.choice(len(moves), p=action_probs)
            game.step(moves[chosen])

        result = game.board.result()
        if result == "1-0":
            white_score = 1
        elif result == "0-1":
            white_score = -1
        else:
            white_score = 0

        game_data = [
            (board_tensor, policy, white_score if white_to_move else -white_score)
            for board_tensor, policy, white_to_move in positions
        ]
        dataset.add_game(game_data)

    return dataset

def train_model(model, dataset, epochs=100, batch_size=32):
    """Fit ``model`` to self-play samples and return it.

    Each dataset item is ``(board_tensor, (policy, value))``. The loss is the
    cross-entropy between the predicted policy logits and the target policy
    distribution plus the mean squared error of the value prediction.

    Args:
        model: network returning ``(policy_logits, value)``.
        dataset: sized, indexable collection of samples.
        epochs: number of passes over the dataset.
        batch_size: mini-batch size.

    Returns:
        The same ``model`` instance (unchanged when ``dataset`` is empty).
    """
    if len(dataset) == 0:
        # Nothing to learn from; previously this ended in a division by zero
        # (or a sampler error) instead of a clean no-op.
        print("No training data available; skipping training.")
        return model

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(dataloader)
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        policy_loss_total = 0.0
        value_loss_total = 0.0

        for board_tensors, (policies, values) in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
            board_tensors = board_tensors.to(device).float()
            # Samples stored with a leading singleton batch axis collate to
            # (B, 1, C, H, W); fold any extra leading axes into the batch so
            # the convolutions always receive (B, C, H, W).
            board_tensors = board_tensors.reshape(-1, *board_tensors.shape[-3:])
            policies = policies.to(device).float()
            values = values.to(device).float().unsqueeze(1)

            optimizer.zero_grad()

            policy_pred, value_pred = model(board_tensors)

            # Targets are probability distributions, not class indices.
            policy_loss = F.cross_entropy(policy_pred, policies)
            value_loss = F.mse_loss(value_pred, values)
            loss = policy_loss + value_loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            policy_loss_total += policy_loss.item()
            value_loss_total += value_loss.item()

        avg_loss = total_loss / num_batches
        avg_policy_loss = policy_loss_total / num_batches
        avg_value_loss = value_loss_total / num_batches

        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f} (Policy={avg_policy_loss:.4f}, Value={avg_value_loss:.4f})")

    return model

## Training Loop

In [None]:
def train_chess_ai(num_iterations=100, num_self_play_games=20, num_epochs=5, num_simulations=100):
    """Alternate self-play and training, checkpointing after every round.

    A fresh ``ChessNet`` is created on the global device. Each iteration
    generates a new self-play dataset, trains on it, and writes the weights to
    ``chess_ai_iteration_<i>.pth`` (``i`` is zero-based). The trained model is
    returned.
    """
    model = ChessNet().to(device)

    for iteration in range(num_iterations):
        round_number = iteration + 1
        print(f"\n=== Iteration {round_number}/{num_iterations} ===")

        # Phase 1: generate data with the current network.
        print("Generating self-play games...")
        dataset = self_play(
            model,
            num_games=num_self_play_games,
            num_simulations=num_simulations,
        )

        # Phase 2: fit the network to that data.
        print("Training model...")
        model = train_model(model, dataset, epochs=num_epochs)

        # Phase 3: checkpoint the weights for this round.
        checkpoint_path = f"chess_ai_iteration_{iteration}.pth"
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Model saved to {checkpoint_path}")

    return model

## Evaluation and ELO Calculation

In [None]:
def evaluate_model(model, opponent, num_games=100, num_simulations=100):
    """Play ``model`` (via MCTS) against ``opponent`` and tally the results.

    Colours alternate: the model plays White in even-numbered games and Black
    in odd-numbered ones.

    Args:
        model: network used by the MCTS player.
        opponent: the string ``"random"`` for a uniformly random mover, or an
            object exposing ``make_move(game)`` that returns a move. Passing
            the model itself makes the MCTS player move for both sides.
        num_games: number of games to play.
        num_simulations: MCTS simulations per model move.

    Returns:
        tuple: ``(wins, losses, draws)`` from the model's point of view.
    """
    mcts = MCTS(model, num_simulations=num_simulations)
    wins = 0
    losses = 0
    draws = 0

    for game_num in tqdm(range(num_games), desc="Evaluation"):
        game = ChessGame()
        model_is_white = game_num % 2 == 0

        while not game.is_game_over():
            white_to_move = game.current_player() == chess.WHITE
            model_to_move = white_to_move == model_is_white or opponent is model

            if model_to_move:
                # Pair the probabilities with the moves of the very children
                # they were computed for, and sample an index rather than the
                # move objects themselves.
                action_probs, root = mcts.search(game)
                candidates = [child.move for child in root.children]
                chosen = np.random.choice(len(candidates), p=action_probs)
                move = candidates[chosen]
            elif isinstance(opponent, str) and opponent == "random":
                move = random.choice(game.legal_moves())
            else:
                # Any other opponent decides for itself.
                move = opponent.make_move(game)

            game.step(move)

        # Record the result from the model's perspective.
        result = game.board.result()
        if result not in ("1-0", "0-1"):
            draws += 1
        elif (result == "1-0") == model_is_white:
            wins += 1
        else:
            losses += 1

    # Guard the rates so that evaluating zero games does not divide by zero.
    played = wins + losses + draws
    win_rate = wins / played if played else 0.0
    loss_rate = losses / played if played else 0.0
    draw_rate = draws / played if played else 0.0

    print(f"Results against opponent: Wins={wins}, Losses={losses}, Draws={draws}")
    print(f"Win rate: {win_rate:.2%}, Loss rate: {loss_rate:.2%}, Draw rate: {draw_rate:.2%}")

    return wins, losses, draws

def estimate_elo(wins, losses, draws, opponent_elo=1500):
    """Estimate a rating from results against an opponent of known rating.

    The score fraction ``(wins + draws / 2) / games`` is inverted through the
    logistic Elo expectation formula: ``diff = -400 * log10(1 / score - 1)``.

    Args:
        wins, losses, draws: game counts from the rated player's perspective.
        opponent_elo: rating of the opponent.

    Returns:
        The estimated rating. With no games played there is no evidence of a
        rating difference, so ``opponent_elo`` is returned unchanged (this
        case previously raised ``ZeroDivisionError``).
    """
    total_games = wins + losses + draws
    if total_games <= 0:
        return opponent_elo

    win_prob = (wins + 0.5 * draws) / total_games

    # A perfect or zero score corresponds to an infinite rating gap; nudge it
    # inside (0, 1) so the logarithm below stays finite.
    if win_prob == 1:
        win_prob = 0.999
    elif win_prob == 0:
        win_prob = 0.001

    elo_difference = -400 * math.log10(1 / win_prob - 1)
    return opponent_elo + elo_difference

## Main Execution

In [None]:
# Script entry point: train a tiny model, evaluate it against a random mover
# and print an Elo estimate. The settings below are deliberately minimal so
# the whole pipeline finishes quickly.
if __name__ == "__main__":
    # Train the model
    model = train_chess_ai(
        num_iterations=1,  # demo value: one self-play/train round
        num_self_play_games=1,  # demo value: a single self-play game per round
        num_epochs=5,  # demo value: passes over each round's data
        num_simulations=2  # demo value: MCTS simulations per move during self-play
    )
    
    # Evaluate against random opponent (evaluate_model's default number of
    # simulations per move applies here, not the value used for training)
    print("\nEvaluating against random opponent...")
    wins, losses, draws = evaluate_model(model, opponent="random", num_games=10)
    
    # Estimate the rating, taking the random opponent's strength to be 800
    # (an assumption made by this script, not a measured value)
    elo = estimate_elo(wins, losses, draws, opponent_elo=800)
    print(f"Estimated ELO rating: {elo:.0f}")