# Training Othello Agents with Ray RLlib

This notebook demonstrates how to train reinforcement learning agents to play Othello using Ray RLlib, including:
- Setting up RLlib with the Othello environment
- Configuring the PPO algorithm
- Creating custom CNN models
- Training agents with self-play
- Monitoring training progress
- Saving and loading checkpoints
- Vectorized environments for faster training

## Setup

First, let's import the necessary libraries.

In [None]:
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
import torch
import torch.nn as nn
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import aip_rl.othello

print("Libraries imported successfully!")

## 1. Basic PPO Training

Let's start with a simple PPO training setup using default models.

In [None]:
# Initialize Ray. `ignore_reinit_error=True` lets this cell be re-run in the
# same kernel without raising when Ray is already running.
ray.init(ignore_reinit_error=True)

# Configure PPO
config = (
    PPOConfig()
    .environment(
        env="Othello-v0",
        env_config={
            "opponent": "self",
            "reward_mode": "sparse",
            "invalid_move_mode": "penalty",
        },
    )
    .framework("torch")
    .env_runners(
        num_env_runners=2,  # Number of parallel workers
    )
    .resources(
        num_gpus=0,  # Set to 1 if GPU available
    )
    .training(
        train_batch_size=4000,
        minibatch_size=128,
        # `num_epochs` is the current name of the deprecated `num_sgd_iter`;
        # it goes together with `minibatch_size` (formerly
        # `sgd_minibatch_size`) used above.
        num_epochs=10,
        lr=0.0003,
        gamma=0.99,
        lambda_=0.95,
        clip_param=0.2,
    )
)

print("PPO configuration created!")

In [None]:
# Instantiate the algorithm described by `config`
algo = config.build()

# Report success and show the algorithm's string representation
print("Algorithm built successfully!", f"Algorithm: {algo}", sep="\n")

In [None]:
# Run a short training session and keep every result dict for later plotting
num_iterations = 10
results = []

print(f"Training for {num_iterations} iterations...\n")

for iteration in range(1, num_iterations + 1):
    result = algo.train()
    results.append(result)

    # Results without env-runner metrics only get a placeholder line
    if "env_runners" not in result:
        print(f"Iteration {iteration:2d}: Training...")
        continue

    runner_stats = result["env_runners"]
    mean_return = runner_stats.get("episode_return_mean", 0)
    mean_length = runner_stats.get("episode_len_mean", 0)
    print(f"Iteration {iteration:2d}: Reward={mean_return:6.2f}, Length={mean_length:5.1f}")

print("\nTraining complete!")

### Visualizing Training Progress

In [None]:
# Pull the per-iteration means out of the stored results; results that carry
# no "env_runners" entry are skipped.
runner_stats = [r["env_runners"] for r in results if "env_runners" in r]
episode_returns = [stats.get("episode_return_mean", 0) for stats in runner_stats]
episode_lengths = [stats.get("episode_len_mean", 0) for stats in runner_stats]

# One panel per metric, side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 4))

panels = (
    (episode_returns, 'Episode Return Mean', 'Return'),
    (episode_lengths, 'Episode Length Mean', 'Length'),
)
for ax, (series, title, y_label) in zip(axes, panels):
    ax.plot(series)
    ax.set_title(title)
    ax.set_xlabel('Iteration')
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Custom CNN Model

For better performance with the (3, 8, 8) observation space, let's create a custom CNN model.

In [None]:
class OthelloCNN(TorchModelV2, nn.Module):
    """
    Custom CNN model for Othello board observations.

    The input layout (C, H, W) is read from ``obs_space.shape`` when that is
    a 3-tuple and falls back to (3, 8, 8) otherwise.

    Architecture:
    - Conv2d(C, 64, 3x3, padding=1) + ReLU
    - Conv2d(64, 128, 3x3, padding=1) + ReLU
    - Conv2d(128, 128, 3x3, padding=1) + ReLU
    - Flatten
    - Linear(128*H*W, 512) + ReLU
    - Linear(512, num_outputs) for policy
    - Linear(512, 1) for value function
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                             model_config, name)
        nn.Module.__init__(self)

        # Derive the input layout from the observation space instead of
        # hard-coding it; keep (3, 8, 8) as the fallback so behaviour is
        # unchanged for the standard board.
        obs_shape = getattr(obs_space, "shape", None)
        if obs_shape is None or len(obs_shape) != 3:
            obs_shape = (3, 8, 8)
        in_channels, height, width = (int(dim) for dim in obs_shape)

        # CNN layers; 3x3 kernels with padding=1 keep the H x W size
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=1)

        # Fully connected layers
        self.fc1 = nn.Linear(128 * height * width, 512)
        self.fc2 = nn.Linear(512, num_outputs)

        # Value function head
        self.value_fc = nn.Linear(512, 1)

        # Output of fc1 from the latest forward() call, read by value_function()
        self._features = None

    def forward(self, input_dict, state, seq_lens):
        """Compute policy logits from ``input_dict["obs"]``.

        Returns the logits together with ``state``, which is passed through
        untouched; ``seq_lens`` is not used.
        """
        # assumes obs is laid out as (batch, C, H, W) — TODO confirm against the env
        x = input_dict["obs"].float()

        # CNN forward pass
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))

        # Flatten everything except the leading (batch) dimension
        x = x.reshape(x.size(0), -1)

        # FC layers; cache the shared features for the value head
        x = torch.relu(self.fc1(x))
        self._features = x

        # Policy logits
        logits = self.fc2(x)

        return logits, state

    def value_function(self):
        """Return the value estimate for the features cached by forward().

        Raises:
            RuntimeError: if forward() has not been called yet.
        """
        if self._features is None:
            raise RuntimeError("forward() must be called before value_function()")
        return self.value_fc(self._features).squeeze(1)

# Register custom model
ModelCatalog.register_custom_model("othello_cnn", OthelloCNN)

print("Custom CNN model registered!")

### Training with Custom Model

In [None]:
# Stop previous algorithm
algo.stop()

# Configure PPO with custom model
config_cnn = (
    PPOConfig()
    # "othello_cnn" is a TorchModelV2 registered through ModelCatalog, which
    # belongs to RLlib's old API stack, so pin that stack explicitly.
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .environment(
        env="Othello-v0",
        env_config={
            "opponent": "self",
            "reward_mode": "sparse",
        },
    )
    .framework("torch")
    .env_runners(num_env_runners=2)
    .resources(num_gpus=0)
    .training(
        train_batch_size=4000,
        lr=0.0003,
        # The model dict is a `training()` setting; the config object has no
        # callable `.model(...)` method.
        model={
            "custom_model": "othello_cnn",
        },
    )
)

# Build and train
algo_cnn = config_cnn.build()

print("Training with custom CNN model...\n")

results_cnn = []
for i in range(10):
    result = algo_cnn.train()
    results_cnn.append(result)

    if "env_runners" in result:
        episode_return = result["env_runners"].get("episode_return_mean", 0)
        episode_len = result["env_runners"].get("episode_len_mean", 0)
        print(f"Iteration {i+1:2d}: Reward={episode_return:6.2f}, Length={episode_len:5.1f}")

print("\nTraining with CNN complete!")

## 3. Vectorized Environments

Use multiple parallel environments for faster training.

In [None]:
# Stop previous algorithm
algo_cnn.stop()

# Configure with vectorized environments
config_vec = (
    PPOConfig()
    # "othello_cnn" is a TorchModelV2 registered through ModelCatalog, which
    # belongs to RLlib's old API stack, so pin that stack explicitly.
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .environment(env="Othello-v0")
    .framework("torch")
    .env_runners(
        num_env_runners=4,           # 4 parallel workers
        num_envs_per_env_runner=4,   # 4 environments per worker
    )
    .resources(num_gpus=0)
    .training(
        train_batch_size=8000,
        lr=0.0003,
        # The model dict is a `training()` setting; the config object has no
        # callable `.model(...)` method.
        model={"custom_model": "othello_cnn"},
    )
)

print("Configuration with 4 * 4 = 16 parallel environments")
print("This significantly speeds up training!\n")

algo_vec = config_vec.build()

print("Training with vectorized environments...\n")

results_vec = []
for i in range(10):
    result = algo_vec.train()
    results_vec.append(result)

    if "env_runners" in result:
        episode_return = result["env_runners"].get("episode_return_mean", 0)
        episode_len = result["env_runners"].get("episode_len_mean", 0)
        print(f"Iteration {i+1:2d}: Reward={episode_return:6.2f}, Length={episode_len:5.1f}")

print("\nVectorized training complete!")

## 4. Checkpointing

Save and load model checkpoints.

In [None]:
# Save checkpoint.
# Recent Ray releases return a result object from save() whose location is
# at `.checkpoint.path`; older releases returned the path string directly.
# Normalise to a plain path so the message below shows where the checkpoint
# lives and restore() receives a simple string.
save_result = algo_vec.save()
saved_checkpoint = getattr(save_result, "checkpoint", None)
checkpoint_path = saved_checkpoint.path if saved_checkpoint is not None else save_result
print(f"Checkpoint saved to: {checkpoint_path}")

In [None]:
# Tear down the algorithm that produced the checkpoint
algo_vec.stop()

# Build a fresh instance from the same config and load the saved checkpoint
algo_restored = config_vec.build()
algo_restored.restore(checkpoint_path)

print("Algorithm restored from checkpoint!")
print("Continuing training...\n")

# Five more iterations on top of the restored state
for iteration in range(1, 6):
    result = algo_restored.train()

    if "env_runners" not in result:
        continue
    mean_return = result["env_runners"].get("episode_return_mean", 0)
    print(f"Iteration {iteration}: Reward={mean_return:.2f}")

print("\nContinued training complete!")

## 5. Testing the Trained Agent

Let's test the trained agent in the environment.

In [None]:
# Create test environment
env = gym.make("Othello-v0", render_mode="ansi")

# Run a single test episode. The try/finally makes sure the environment is
# closed even if the policy or the environment raises part-way through.
try:
    observation, info = env.reset()
    done = False
    total_reward = 0
    step_count = 0

    print("Testing trained agent...\n")
    print("Initial board:")
    print(env.render())

    # The step cap stops the loop after 60 steps even if the episode has not
    # terminated or been truncated by then.
    while not done and step_count < 60:
        # Get action from trained policy
        action = algo_restored.compute_single_action(observation)

        # Take step
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        step_count += 1
        done = terminated or truncated

        # Print every 10 steps
        if step_count % 10 == 0:
            print(f"\nAfter {step_count} steps:")
            print(env.render())

    print(f"\nFinal board:")
    print(env.render())
    print(f"\nEpisode finished!")
    print(f"Steps: {step_count}")
    print(f"Total reward: {total_reward:.2f}")
    print(f"Final score - Black: {info['black_count']}, White: {info['white_count']}")
finally:
    env.close()

## 6. Different Reward Modes

Compare training with sparse vs dense rewards.

In [None]:
# Stop current algorithm
algo_restored.stop()

# Train with dense rewards
config_dense = (
    PPOConfig()
    # "othello_cnn" is a TorchModelV2 registered through ModelCatalog, which
    # belongs to RLlib's old API stack, so pin that stack explicitly.
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .environment(
        env="Othello-v0",
        env_config={
            "opponent": "self",
            "reward_mode": "dense",  # Dense rewards
        },
    )
    .framework("torch")
    .env_runners(num_env_runners=2)
    .resources(num_gpus=0)
    .training(
        train_batch_size=4000,
        lr=0.0003,
        # The model dict is a `training()` setting; the config object has no
        # callable `.model(...)` method.
        model={"custom_model": "othello_cnn"},
    )
)

algo_dense = config_dense.build()

print("Training with dense rewards...\n")

results_dense = []
for i in range(10):
    result = algo_dense.train()
    results_dense.append(result)

    if "env_runners" in result:
        episode_return = result["env_runners"].get("episode_return_mean", 0)
        print(f"Iteration {i+1:2d}: Reward={episode_return:6.2f}")

print("\nDense reward training complete!")

# Compare learning curves.
# The sparse baseline is the section 2 run (`results_cnn`): it uses the same
# model, runner count, batch size and learning rate as the dense run and
# differs only in `reward_mode`. The vectorized run (`results_vec`) used the
# default env config and a different batch size, so it is not a like-for-like
# baseline.
sparse_returns = [r["env_runners"].get("episode_return_mean", 0) for r in results_cnn if "env_runners" in r]
dense_returns = [r["env_runners"].get("episode_return_mean", 0) for r in results_dense if "env_runners" in r]

plt.figure(figsize=(10, 5))
plt.plot(sparse_returns, label='Sparse Rewards', marker='o')
plt.plot(dense_returns, label='Dense Rewards', marker='s')
plt.xlabel('Iteration')
plt.ylabel('Episode Return Mean')
plt.title('Sparse vs Dense Rewards')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

algo_dense.stop()

## 7. Evaluation

Evaluate the trained agent over multiple episodes.

In [None]:
def evaluate_agent(algo, num_episodes=20):
    """Evaluate agent over multiple episodes.

    Plays ``num_episodes`` episodes of "Othello-v0", choosing every action
    with ``algo.compute_single_action(observation)``, and tallies the outcome
    of each episode from the final ``info`` dict.

    Args:
        algo: Object exposing ``compute_single_action(observation)``.
        num_episodes: Number of episodes to play. Values <= 0 play nothing.

    Returns:
        dict with the keys 'wins', 'losses', 'draws' (episode counts) and
        'mean_reward', 'std_reward', 'mean_length' (statistics over the
        episodes; NaN when no episode was played).
    """
    if num_episodes <= 0:
        # Nothing to play: skip building an environment and report NaN
        # statistics, which is the value of a mean over no samples.
        nan = float("nan")
        return {
            'wins': 0,
            'losses': 0,
            'draws': 0,
            'mean_reward': nan,
            'std_reward': nan,
            'mean_length': nan,
        }

    env = gym.make("Othello-v0")

    wins = 0
    losses = 0
    draws = 0
    total_rewards = []
    episode_lengths = []

    try:
        for _ in range(num_episodes):
            observation, info = env.reset()
            done = False
            episode_reward = 0
            steps = 0

            while not done:
                action = algo.compute_single_action(observation)
                observation, reward, terminated, truncated, info = env.step(action)
                episode_reward += reward
                steps += 1
                done = terminated or truncated

            total_rewards.append(episode_reward)
            episode_lengths.append(steps)

            # Count results from the final disc counts.
            # NOTE(review): a win is counted when black has more discs, which
            # presumes the evaluated agent plays black — confirm against the env.
            if info['black_count'] > info['white_count']:
                wins += 1
            elif info['white_count'] > info['black_count']:
                losses += 1
            else:
                draws += 1
    finally:
        # Release the environment even if the policy or the env raises
        env.close()

    return {
        'wins': wins,
        'losses': losses,
        'draws': draws,
        'mean_reward': np.mean(total_rewards),
        'std_reward': np.std(total_rewards),
        'mean_length': np.mean(episode_lengths),
    }

# Note: This requires a trained algorithm that has not been stopped.
# `algo_restored` was stopped in section 6, so rebuild it and restore the
# checkpoint first, then uncomment and run:
# results = evaluate_agent(algo_restored, num_episodes=20)
# print(f"Evaluation Results:")
# print(f"  Wins: {results['wins']}/20")
# print(f"  Losses: {results['losses']}/20")
# print(f"  Draws: {results['draws']}/20")
# print(f"  Mean Reward: {results['mean_reward']:.2f} ± {results['std_reward']:.2f}")
# print(f"  Mean Episode Length: {results['mean_length']:.1f}")

## Cleanup

Shutdown Ray when done.

In [None]:
# Shutdown Ray.
# Run this last: the cells above rely on the Ray runtime started by
# ray.init() in section 1.
ray.shutdown()
print("Ray shutdown complete!")

## Summary

In this notebook, we covered:
1. Basic PPO training with default models
2. Creating and using custom CNN models
3. Vectorized environments for faster training
4. Saving and loading checkpoints
5. Testing trained agents
6. Comparing sparse vs dense rewards
7. Evaluating agent performance

Key takeaways:
- Custom CNN models improve performance for board games
- Vectorized environments significantly speed up training
- Dense rewards can provide faster initial learning
- Regular checkpointing is important for long training runs

Next steps:
- See `03_evaluating_trained_agents.ipynb` for detailed evaluation and analysis
- Experiment with different hyperparameters
- Try other RLlib algorithms (DQN, APPO, etc.)
- Implement curriculum learning or opponent diversity