In [None]:
from checkers_env import CheckersEnv
from Agent import PPOAgent
from Memory import Memory
from checkers_game import BLUE, RED

import torch
import pandas as pd
import numpy as np
import random
import os

In [None]:
# --- Environment and agents -------------------------------------------------
# Build the checkers environment and two PPO agents that will play each other.
env = CheckersEnv()
input_shape = (1, 8, 8)  # presumably (channels, rows, cols) of the board -- TODO confirm against PPOAgent
n_actions = env.action_space.n

# Create Agents 1 and 2 to play checkers
agent1 = PPOAgent(input_shape, n_actions)
agent2 = PPOAgent(input_shape, n_actions)

# Side assignment for the first episode (agent1 as BLUE initially).
# The training loop swaps these two values after every episode.
agent1_side = BLUE
agent2_side = RED

# --- Per-epoch metrics ------------------------------------------------------
# One entry is appended to every list at the end of each epoch; the dict is
# converted to a DataFrame and written to CSV by the training cell.
training_data = {
    "epoch": [],
    "episode_reward": [],
    "win_rate_agent1": [],
    "win_rate_agent2": [],
    "average_episode_length": []
}

# --- Output directories -----------------------------------------------------
# `exist_ok=True` makes creation idempotent and race-free, and fails right
# here (FileExistsError) if the path exists but is not a directory, instead
# of failing later at the first save.

# Directory for saving models
model_dir = "PPO_saved_models"
os.makedirs(model_dir, exist_ok=True)

# Directory for saving random games
game_info_dir = "saved_games"
os.makedirs(game_info_dir, exist_ok=True)

# --- Training parameters ----------------------------------------------------
num_epochs = 100  # Number of epochs for training
num_episodes = 1000  # Number of episodes per epoch
save_interval = 1  # Save model every epoch (can adjust)

In [None]:
# Main training loop
#
# Every epoch plays `num_episodes` games between agent1 and agent2. After each
# game both agents are updated from their own Memory and the two agents swap
# sides. Per-epoch averages are appended to `training_data`, and model weights
# are saved every `save_interval` epochs.
#
# The loop is wrapped in try/finally so that the metrics gathered so far are
# still written to "training_progress.csv" and the environment is still closed
# when the run is interrupted or raises part-way through.
training_completed = False  # Becomes True only if every epoch finished
try:
    for epoch in range(num_epochs):
        total_rewards = 0  # Sum of every step reward in this epoch (both agents)
        total_steps = 0
        agent1_wins, agent2_wins = 0, 0

        for episode in range(num_episodes):
            state = env.reset()  # Reset environment for each episode
            done = False  # Flag to check if game is over
            memory1 = Memory()  # Memory object for agent1
            memory2 = Memory()  # Memory object for agent2

            # Track game info if sampling condition is met
            game_info = [] if random.random() < 0.01 else None  # Log moves for ~1% of games

            # The agent currently assigned to BLUE acts first in the episode.
            # NOTE(review): assumes the environment gives BLUE the first move -- confirm.
            current_agent, opponent_agent = (agent1, agent2) if agent1_side == BLUE else (agent2, agent1)
            current_memory, opponent_memory = (memory1, memory2) if agent1_side == BLUE else (memory2, memory1)

            episode_reward = 0
            episode_steps = 0

            while not done:
                # Get action and log probabilities from the current agent
                action, log_prob, _ = current_agent.select_action(state)

                # Step the environment
                next_state, reward, done, info = env.step(action)

                # Track rewards
                episode_reward += reward
                total_rewards += reward
                episode_steps += 1

                # Record the transition in the acting agent's memory.
                # NOTE(review): `info[action]` indexes the info dict with the
                # action itself, while every other access to `info` uses string
                # keys. This may have been meant to be `action` -- confirm
                # against Memory.add and the env's info contents.
                current_memory.add(state, info[action], reward, log_prob)

                # Save selected games stats
                if game_info is not None:
                    game_info.append({
                        "epoch": epoch,
                        "episode": episode,
                        # Read after env.step(), so this is the env's turn value
                        # following the move.
                        "turn": env.checkers.turn,
                        "move": info["legal_moves"][action] if info["move_success"] else None,
                        "reward": reward,
                        "success": info["move_success"],
                        "winner": info["winner"]
                    })

                # Track win/loss for agents; any other `winner` value counts
                # for neither agent.
                if done:
                    if info["winner"] == "Blue Wins!":
                        agent1_wins += 1 if agent1_side == BLUE else 0
                        agent2_wins += 1 if agent2_side == BLUE else 0
                    elif info["winner"] == "Red Wins!":
                        agent1_wins += 1 if agent1_side == RED else 0
                        agent2_wins += 1 if agent2_side == RED else 0

                # Alternate agents after every step.
                # NOTE(review): this also alternates when info["move_success"]
                # is false -- confirm the env passes the turn in that case.
                current_agent, opponent_agent = opponent_agent, current_agent
                current_memory, opponent_memory = opponent_memory, current_memory

                # Update state for next step
                state = next_state

            # Save game information if sampled
            if game_info:
                game_info_df = pd.DataFrame(game_info)
                # Built with os.path.join, consistent with the model paths below.
                game_info_df.to_csv(
                    os.path.join(game_info_dir, f"game_info_epoch_{epoch}_episode_{episode}.csv"),
                    index=False,
                )

            # Store episode data
            total_steps += episode_steps

            # After episode ends, update each agent using its memory
            agent1.update(memory1)
            agent2.update(memory2)

            # Swap sides after each episode
            agent1_side, agent2_side = agent2_side, agent1_side

        # Calculate epoch statistics
        avg_reward = total_rewards / num_episodes
        avg_steps = total_steps / num_episodes
        win_rate1 = agent1_wins / num_episodes
        win_rate2 = agent2_wins / num_episodes

        # Append data for this epoch (the stored epoch index is 0-based; the
        # printed epoch and the model file names are 1-based)
        training_data["epoch"].append(epoch)
        training_data["episode_reward"].append(avg_reward)
        training_data["win_rate_agent1"].append(win_rate1)
        training_data["win_rate_agent2"].append(win_rate2)
        training_data["average_episode_length"].append(avg_steps)

        print(f"Epoch {epoch + 1}/{num_epochs} - Avg Reward: {avg_reward:.2f}, Win Rate Agent1: {win_rate1:.2%}, Win Rate Agent2: {win_rate2:.2%}, Avg Steps: {avg_steps:.2f}")

        # Save model after every save_interval epochs
        if (epoch + 1) % save_interval == 0:
            torch.save(agent1.policy.state_dict(), os.path.join(model_dir, f"agent1_epoch_{epoch + 1}.pt"))
            torch.save(agent2.policy.state_dict(), os.path.join(model_dir, f"agent2_epoch_{epoch + 1}.pt"))

    training_completed = True
finally:
    try:
        # Save training data as a CSV for later analysis. This also runs after
        # an interruption, so completed epochs are never lost.
        training_df = pd.DataFrame(training_data)
        training_df.to_csv("training_progress.csv", index=False)

        if training_completed:
            print("Training complete.")
    finally:
        # Close the environment even if writing the CSV failed.
        env.close()