# Sokoban RL Notebook

In [1]:
%cd ..
%ls
!pip install -r requirements.txt

c:\Users\canid\OneDrive\Masaüstü\Python\CS_175\sokobanRL\sokobanRL


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


 Volume in drive C is Windows
 Volume Serial Number is A45C-F748

 Directory of c:\Users\canid\OneDrive\Masa�st�\Python\CS_175\sokobanRL\sokobanRL

12/05/2025  02:57 PM    <DIR>          .
12/05/2025  02:15 PM    <DIR>          ..
12/03/2025  07:28 PM               952 check_import_by_can.py
12/05/2025  02:14 PM    <DIR>          checkpoints
12/05/2025  02:14 PM    <DIR>          logs
12/03/2025  06:25 PM                 8 README.md
12/05/2025  02:58 PM                86 requirements.txt
12/05/2025  03:08 PM    <DIR>          src
12/05/2025  02:08 PM    <DIR>          test
               3 File(s)          1,046 bytes
               6 Dir(s)  63,032,307,712 bytes free


In [2]:
import gym
import gym_sokoban
import pygame
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import matplotlib.pyplot as plt
import os

# Check if GPU is available
device = torch.device("cpu")
print(f"Using device: {device}")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using device: cpu


## Actor-Critic Network Architecture

In [3]:
class ActorCritic(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(ActorCritic, self).__init__()
        
        # Convolutional layers for processing the game state
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        
        conv_out_size = self._get_conv_out(input_shape)
        
        # Actor head (policy)
        self.actor = nn.Sequential(
            nn.Linear(conv_out_size, 256),
            nn.ReLU(),
            nn.Linear(256, n_actions)
        )
        
        # Critic head (value function)
        self.critic = nn.Sequential(
            nn.Linear(conv_out_size, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
    
    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))
    
    def forward(self, x):
        conv_out = self.conv(x).view(x.size()[0], -1)
        return self.actor(conv_out), self.critic(conv_out)
    
    def get_action_probs(self, x):
        logits, _ = self.forward(x)
        return torch.softmax(logits, dim=-1)
    
    def get_value(self, x):
        _, value = self.forward(x)
        return value

## PPO Agent

In [4]:
class PPOAgent:
    def __init__(self, env, lr=3e-4, gamma=0.99, eps_clip=0.2, K_epochs=4, gae_lambda=0.95):
        self.env = env
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.gae_lambda = gae_lambda
        
        # Get observation shape
        obs = env.reset()
        if len(obs.shape) == 3:
            obs = np.transpose(obs, (2, 0, 1))
        
        self.input_shape = obs.shape
        self.n_actions = env.action_space.n
        
        self.device = torch.device("cpu")
        self.policy = ActorCritic(self.input_shape, self.n_actions).to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        
        self.policy_old = ActorCritic(self.input_shape, self.n_actions).to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = nn.MSELoss()
    
    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action_probs = self.policy_old.get_action_probs(state)
        
        dist = torch.distributions.Categorical(action_probs)
        action = dist.sample()
        action_logprob = dist.log_prob(action)
        
        return action.item(), action_logprob.item()
    
    def compute_gae(self, rewards, values, dones):
        advantages = []
        gae = 0
        
        for t in reversed(range(len(rewards))):
            if t == len(rewards) - 1:
                next_value = 0
            else:
                next_value = values[t + 1]
            
            delta = rewards[t] + self.gamma * next_value * (1 - dones[t]) - values[t]
            gae = delta + self.gamma * self.gae_lambda * (1 - dones[t]) * gae
            advantages.insert(0, gae)
        
        return advantages
    
    def update(self, memory):
        states = torch.FloatTensor(np.array(memory['states'])).to(self.device)
        actions = torch.LongTensor(memory['actions']).to(self.device)
        old_logprobs = torch.FloatTensor(memory['logprobs']).to(self.device)
        
        rewards = memory['rewards']
        dones = memory['dones']
        
        # Compute values and advantages
        with torch.no_grad():
            values = self.policy_old.get_value(states).squeeze().cpu().numpy()
        
        advantages = self.compute_gae(rewards, values, dones)
        advantages_tensor = torch.FloatTensor(advantages).to(self.device)
        
        # Store raw advantage statistics before normalization
        raw_adv_mean = advantages_tensor.mean().item()
        raw_adv_std = advantages_tensor.std().item()
        raw_adv_max = advantages_tensor.max().item()
        raw_adv_min = advantages_tensor.min().item()
        
        # Normalize advantages
        advantages_tensor = (advantages_tensor - advantages_tensor.mean()) / (advantages_tensor.std() + 1e-8)
        
        returns = advantages_tensor + torch.FloatTensor(values).to(self.device)
        
        # Optimize policy for K epochs
        total_grad_norm = 0.0
        for _ in range(self.K_epochs):
            logits, state_values = self.policy(states)
            dist = torch.distributions.Categorical(logits=logits)
            action_logprobs = dist.log_prob(actions)
            dist_entropy = dist.entropy()
            
            ratios = torch.exp(action_logprobs - old_logprobs)
            
            surr1 = ratios * advantages_tensor
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages_tensor
            
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = self.MseLoss(state_values.squeeze(), returns)
            entropy_loss = -0.01 * dist_entropy.mean()
            
            loss = actor_loss + 0.5 * critic_loss + entropy_loss
            
            self.optimizer.zero_grad()
            loss.backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
            total_grad_norm += grad_norm.item()
            self.optimizer.step()
        
        avg_grad_norm = total_grad_norm / self.K_epochs
        
        # Calculate policy ratio statistics
        with torch.no_grad():
            final_logits, _ = self.policy(states)
            final_dist = torch.distributions.Categorical(logits=final_logits)
            final_logprobs = final_dist.log_prob(actions)
            final_ratios = torch.exp(final_logprobs - old_logprobs)
            
            ratio_mean = final_ratios.mean().item()
            ratio_std = final_ratios.std().item()
            ratio_max = final_ratios.max().item()
            ratio_min = final_ratios.min().item()
        
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        # Return comprehensive metrics
        metrics = {
            'actor_loss': actor_loss.item(),
            'critic_loss': critic_loss.item(),
            'entropy': -entropy_loss.item() / 0.01,  # Undo the scaling to get raw entropy
            'grad_norm': avg_grad_norm,
            'advantage_mean': raw_adv_mean,
            'advantage_std': raw_adv_std,
            'advantage_max': raw_adv_max,
            'advantage_min': raw_adv_min,
            'ratio_mean': ratio_mean,
            'ratio_std': ratio_std,
            'ratio_max': ratio_max,
            'ratio_min': ratio_min,
            'value_mean': np.mean(values),
            'value_std': np.std(values),
        }
        
        return metrics
    
    def save(self, path):
        torch.save(self.policy.state_dict(), path)
    
    def load(self, path):
        self.policy.load_state_dict(torch.load(path))
        self.policy_old.load_state_dict(torch.load(path))

## Training Loop

In [5]:
def train(env_name='Sokoban-v0', max_episodes=10000, max_timesteps=300, update_timestep=2048, save_freq=100):
    import datetime
    env = gym.make(env_name)
    agent = PPOAgent(env)
    
    os.makedirs('checkpoints', exist_ok=True)
    os.makedirs('logs', exist_ok=True)
    
    # Create log file with timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f'logs/training_log_{timestamp}.txt'
    
    # Write header to log file
    with open(log_file, 'w') as f:
        f.write("=" * 100 + "\n")
        f.write(f"SOKOBAN PPO TRAINING LOG - Started at {datetime.datetime.now()}\n")
        f.write("=" * 100 + "\n")
        f.write(f"Environment: {env_name}\n")
        f.write(f"Max Episodes: {max_episodes}\n")
        f.write(f"Max Timesteps per Episode: {max_timesteps}\n")
        f.write(f"Update Timestep: {update_timestep}\n")
        f.write(f"Save Frequency: {save_freq}\n")
        f.write("=" * 100 + "\n\n")
        f.write("METRICS EXPLANATION:\n")
        f.write("- Episode: Episode number\n")
        f.write("- Reward: Total reward for this episode\n")
        f.write("- Running Reward: Exponential moving average of rewards (smoothed trend)\n")
        f.write("- Steps: Number of steps taken in this episode\n")
        f.write("- Timestep: Total timesteps so far\n")
        f.write("- Actor Loss: Policy improvement metric (more negative = more improvement)\n")
        f.write("- Critic Loss: Value estimation error (lower = better predictions)\n")
        f.write("- Entropy: Action randomness (higher = more exploration)\n")
        f.write("- Grad Norm: Gradient magnitude (watch for explosion/vanishing)\n")
        f.write("- Adv Mean/Std: Advantage statistics (measures action quality)\n")
        f.write("- Ratio Mean: Policy change magnitude (should stay near 1.0)\n")
        f.write("- Value Mean: Average predicted state value\n")
        f.write("=" * 100 + "\n\n")
    
    print(f"Logging to: {log_file}\n")
    
    episode_rewards = []
    episode_steps = []
    running_reward = 0
    timestep = 0
    
    # Track latest update metrics
    latest_metrics = None
    
    memory = {
        'states': [],
        'actions': [],
        'logprobs': [],
        'rewards': [],
        'dones': []
    }
    
    for episode in range(1, max_episodes + 1):
        state = env.reset()
        if len(state.shape) == 3:
            state = np.transpose(state, (2, 0, 1))
        
        episode_reward = 0
        
        for t in range(max_timesteps):
            timestep += 1
            
            action, action_logprob = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            
            if len(next_state.shape) == 3:
                next_state = np.transpose(next_state, (2, 0, 1))
            
            memory['states'].append(state)
            memory['actions'].append(action)
            memory['logprobs'].append(action_logprob)
            memory['rewards'].append(reward)
            memory['dones'].append(done)
            
            state = next_state
            episode_reward += reward
            
            if timestep % update_timestep == 0:
                latest_metrics = agent.update(memory)
                memory = {
                    'states': [],
                    'actions': [],
                    'logprobs': [],
                    'rewards': [],
                    'dones': []
                }
                print(f"[UPDATE] Timestep {timestep} - Actor Loss: {latest_metrics['actor_loss']:.4f}, "
                      f"Critic Loss: {latest_metrics['critic_loss']:.4f}, "
                      f"Entropy: {latest_metrics['entropy']:.4f}")
            
            if done:
                break
        
        episode_rewards.append(episode_reward)
        episode_steps.append(t + 1)
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        
        # Console output
        print(f"Episode {episode:5d} | Reward: {episode_reward:7.2f} | "
              f"Running: {running_reward:7.2f} | Steps: {t+1:3d}")
        
        # Write to log file after EVERY episode
        with open(log_file, 'a') as f:
            f.write(f"\n{'='*100}\n")
            f.write(f"EPISODE {episode} (Timestep: {timestep})\n")
            f.write(f"{'='*100}\n")
            f.write(f"  Reward:         {episode_reward:10.4f}\n")
            f.write(f"  Running Reward: {running_reward:10.4f}\n")
            f.write(f"  Steps:          {t+1:10d}\n")
            f.write(f"  Total Timestep: {timestep:10d}\n")
            
            # Add update metrics if available (will be None for first few episodes)
            if latest_metrics is not None:
                f.write(f"\n  --- Latest Update Metrics (from timestep {(timestep // update_timestep) * update_timestep}) ---\n")
                f.write(f"  Actor Loss:      {latest_metrics['actor_loss']:10.6f}  (policy improvement)\n")
                f.write(f"  Critic Loss:     {latest_metrics['critic_loss']:10.6f}  (value prediction error)\n")
                f.write(f"  Entropy:         {latest_metrics['entropy']:10.6f}  (exploration level)\n")
                f.write(f"  Grad Norm:       {latest_metrics['grad_norm']:10.6f}  (gradient magnitude)\n")
                f.write(f"  \n")
                f.write(f"  Advantage Mean:  {latest_metrics['advantage_mean']:10.6f}\n")
                f.write(f"  Advantage Std:   {latest_metrics['advantage_std']:10.6f}\n")
                f.write(f"  Advantage Max:   {latest_metrics['advantage_max']:10.6f}\n")
                f.write(f"  Advantage Min:   {latest_metrics['advantage_min']:10.6f}\n")
                f.write(f"  \n")
                f.write(f"  Ratio Mean:      {latest_metrics['ratio_mean']:10.6f}  (policy change, should be ~1.0)\n")
                f.write(f"  Ratio Std:       {latest_metrics['ratio_std']:10.6f}\n")
                f.write(f"  Ratio Max:       {latest_metrics['ratio_max']:10.6f}\n")
                f.write(f"  Ratio Min:       {latest_metrics['ratio_min']:10.6f}\n")
                f.write(f"  \n")
                f.write(f"  Value Mean:      {latest_metrics['value_mean']:10.6f}  (avg predicted value)\n")
                f.write(f"  Value Std:       {latest_metrics['value_std']:10.6f}\n")
        
        # Save checkpoints
        if episode % save_freq == 0:
            agent.save(f'checkpoints/ppo_sokoban_ep{episode}.pth')
            print(f"[CHECKPOINT] Model saved at episode {episode}")
            
            with open(log_file, 'a') as f:
                f.write(f"\n  >>> CHECKPOINT SAVED: checkpoints/ppo_sokoban_ep{episode}.pth\n")
    
    env.close()
    
    # Final summary
    with open(log_file, 'a') as f:
        f.write(f"\n\n{'='*100}\n")
        f.write(f"TRAINING COMPLETED - {datetime.datetime.now()}\n")
        f.write(f"{'='*100}\n")
        f.write(f"Total Episodes:     {max_episodes}\n")
        f.write(f"Total Timesteps:    {timestep}\n")
        f.write(f"Final Running Reward: {running_reward:.4f}\n")
        f.write(f"Best Episode Reward:  {max(episode_rewards):.4f} (Episode {episode_rewards.index(max(episode_rewards)) + 1})\n")
        f.write(f"Worst Episode Reward: {min(episode_rewards):.4f} (Episode {episode_rewards.index(min(episode_rewards)) + 1})\n")
        f.write(f"Average Reward:       {np.mean(episode_rewards):.4f}\n")
        f.write(f"Average Steps:        {np.mean(episode_steps):.2f}\n")
        f.write(f"{'='*100}\n")
    
    print(f"\nTraining complete! Log saved to: {log_file}")
    
    return episode_rewards

## Start Training

Run the cell below to start training. You can adjust the parameters:
- `max_episodes`: Total number of episodes to train
- `max_timesteps`: Maximum steps per episode
- `update_timestep`: How often to update the policy
- `save_freq`: How often to save checkpoints

In [None]:
# Start training
episode_rewards = train(
    env_name='Sokoban-v0',
    max_episodes=10000,
    max_timesteps=300,
    update_timestep=2048,
    save_freq=100
)

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Logging to: logs/training_log_20251205_151025.txt



  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):


Episode     1 | Reward:  -20.00 | Running:   -1.00 | Steps: 200
Episode     2 | Reward:  -20.00 | Running:   -1.95 | Steps: 200
Episode     3 | Reward:  -20.00 | Running:   -2.85 | Steps: 200
Episode     4 | Reward:  -20.00 | Running:   -3.71 | Steps: 200
Episode     5 | Reward:  -20.00 | Running:   -4.52 | Steps: 200
Episode     6 | Reward:  -20.00 | Running:   -5.30 | Steps: 200
Episode     7 | Reward:  -20.00 | Running:   -6.03 | Steps: 200
Episode     8 | Reward:  -20.00 | Running:   -6.73 | Steps: 200
Episode     9 | Reward:  -19.00 | Running:   -7.35 | Steps: 200
Episode    10 | Reward:  -20.00 | Running:   -7.98 | Steps: 200
[UPDATE] Timestep 2048 - Actor Loss: 0.4347, Critic Loss: 96.3063, Entropy: 0.0000
Episode    11 | Reward:  -20.00 | Running:   -8.58 | Steps: 200
Episode    12 | Reward:  -20.00 | Running:   -9.15 | Steps: 200
Episode    13 | Reward:  -20.00 | Running:   -9.69 | Steps: 200
Episode    14 | Reward:  -20.00 | Running:  -10.21 | Steps: 200
Episode    15 | Rewar

## Visualize Training Progress

In [None]:
# Plot training rewards
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(episode_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Episode Rewards')
plt.grid(True)

plt.subplot(1, 2, 2)
# Plot moving average
window_size = 100
if len(episode_rewards) >= window_size:
    moving_avg = np.convolve(episode_rewards, np.ones(window_size)/window_size, mode='valid')
    plt.plot(moving_avg)
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.title(f'Moving Average (window={window_size})')
    plt.grid(True)

plt.tight_layout()
plt.show()

## Test Trained Agent

In [None]:
def test_agent(checkpoint_path, num_episodes=5, render=False):
    """Test a trained agent"""
    env = gym.make('Sokoban-v0')
    agent = PPOAgent(env)
    agent.load(checkpoint_path)
    
    total_rewards = []
    
    for episode in range(num_episodes):
        state = env.reset()
        if len(state.shape) == 3:
            state = np.transpose(state, (2, 0, 1))
        
        episode_reward = 0
        done = False
        steps = 0
        
        while not done and steps < 300:
            if render:
                env.render()
            
            action, _ = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            
            if len(next_state.shape) == 3:
                next_state = np.transpose(next_state, (2, 0, 1))
            
            state = next_state
            episode_reward += reward
            steps += 1
        
        total_rewards.append(episode_reward)
        print(f"Test Episode {episode + 1}: Reward = {episode_reward:.2f}, Steps = {steps}")
    
    env.close()
    print(f"\nAverage Reward: {np.mean(total_rewards):.2f}")
    return total_rewards