In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import copy
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
import os

class WavefrontEnv:
    """Binary block-mask wavefront-shaping environment.

    The SLM has ``slm_dim1 * slm_dim2`` pixels.  The flattened pixel indices are
    split into consecutive blocks of ``num_pix_per_block`` pixels, and every
    block is either on (1.0) or off (0.0).  The state is the block mask, an
    action is a collection of block indices to toggle, and the reward is
    ``tanh(k * I)`` where ``I`` is the noisy intensity returned by
    :meth:`_intensity`.

    NOTE: ``I_max`` is re-initialised on every :meth:`reset`, so ``best_mask``
    is updated whenever a step beats the *current episode's* best intensity.
    """

    # Probability that reset() perturbs the stored best mask before restarting.
    RESET_PERTURB_PROB = 0.2
    # Number of blocks toggled when the best mask is perturbed on reset.
    RESET_NUM_FLIPS = 5

    def __init__(self, slm_dim1=64, slm_dim2=64, eng_size=1,
                 num_pix_per_block=32, alpha=0.3, noise_sigma=0.05, k=0.45, phi=None):
        """Build the environment, estimate ``I0_mean`` and reset the mask.

        Args:
            slm_dim1, slm_dim2: SLM height and width in pixels.
            eng_size: stored as ``self.eng_size``; not used inside this class.
            num_pix_per_block: number of flattened pixels per block.
            alpha: stored as ``self.alpha``; not used inside this class.
            noise_sigma: standard deviation of the additive Gaussian noise
                applied to every intensity measurement.
            k: gain inside the ``tanh`` reward.
            phi: optional ``(slm_dim1, slm_dim2)`` array; it is multiplied by
                ``2 * pi`` to form the phase.  Drawn from ``np.random.rand``
                when omitted.
        """
        # geometry
        self.slm_dim1, self.slm_dim2 = slm_dim1, slm_dim2
        self.n_pix = slm_dim1 * slm_dim2
        self.eng_size = eng_size

        # phase mask
        if phi is None:
            self.phi = np.random.rand(slm_dim1, slm_dim2)
        else:
            assert phi.shape == (slm_dim1, slm_dim2), "Phase mask shape mismatch."
            self.phi = phi

        # block grid
        self.blocks = self._make_blocks(num_pix_per_block)
        self.num_blocks = len(self.blocks)

        # RL bookkeeping
        self.state_dim = self.num_blocks
        self.action_space = self.num_blocks
        self.alpha = alpha
        self.sigma = noise_sigma
        self.I0_mean = 0.0
        self.I_max = 0.0
        self.I_t = 0.0

        self.k = k

        self.best_mask = None

        self.set_I0_mean()

        self.reset()

    # utils
    def _make_blocks(self, p_per_block):
        """Return list of numpy arrays, each array holds pixel indices of one block."""
        idx = np.arange(self.n_pix)
        return [idx[start:start + p_per_block]
                for start in range(0, self.n_pix, p_per_block)]

    def _blocks_to_pixels(self):
        """Convert block-level mask → 2-D pixel mask (float32, values 0/1)."""
        pixel = np.zeros(self.n_pix, dtype=np.float32)
        for bid, bit in enumerate(self.block_mask):
            if bit:
                pixel[self.blocks[bid]] = 1.0
        return pixel.reshape(self.slm_dim1, self.slm_dim2)

    def reset(self):
        """Start a new episode and return the initial state.

        If a best mask is known the episode restarts from it; with probability
        ``RESET_PERTURB_PROB`` a few randomly chosen blocks are toggled first.
        Otherwise a uniformly random binary mask is drawn.  ``I_prev`` and
        ``I_max`` are both set to the (noisy) intensity of the starting mask.
        """
        if self.best_mask is not None:
            self.block_mask = self.best_mask.copy()
            if np.random.rand() < self.RESET_PERTURB_PROB:
                # Never request more distinct blocks than exist; sampling
                # without replacement would raise ValueError otherwise.
                n_flips = min(self.RESET_NUM_FLIPS, self.num_blocks)
                flip_indices = np.random.choice(self.num_blocks, size=n_flips, replace=False)
                # Indices are unique, so one vectorised toggle is equivalent
                # to toggling them one by one.
                self.block_mask[flip_indices] = 1.0 - self.block_mask[flip_indices]
        else:
            self.block_mask = np.random.choice([0.0, 1.0], size=self.num_blocks).astype(np.float32)

        self.I_prev = self._intensity()
        self.I_max = self.I_prev
        return self._state()

    def tanh_reward(self, I_1):
        """Squash an intensity into a bounded reward: ``tanh(k * I_1)``."""
        return np.tanh(self.k * I_1)

    def step(self, action):
        """Toggle the given blocks, measure the intensity and return ``(state, reward)``.

        ``action`` is an iterable of block indices.  Indices are toggled one at
        a time, so an index that appears twice is toggled twice.
        """
        for a in action:
            self.block_mask[a] = 1.0 - self.block_mask[a]

        self.I_t = self._intensity()
        reward = self.tanh_reward(self.I_t)

        # Bookkeeping best mask
        if self.I_t > self.I_max:
            self.I_max = self.I_t
            self.best_mask = self.block_mask.copy()

        return self._state(), reward

    # optics
    def _intensity(self):
        """Return the noisy intensity for the current block mask.

        The masked field ``exp(2j*pi*phi) * mask`` is Fourier transformed and
        shifted; the squared magnitude of the element at
        ``(slm_dim1 // 2, slm_dim2 // 2)`` is divided by the number of pixels,
        and Gaussian noise with standard deviation ``self.sigma`` is added.
        """
        mask2d = self._blocks_to_pixels()
        field = np.exp(1j * 2 * np.pi * self.phi) * mask2d
        spec = np.fft.fftshift(np.fft.fft2(field))
        I = np.abs(spec[self.slm_dim1 // 2, self.slm_dim2 // 2])**2 / spec.size
        I += self.sigma * np.random.randn()         # additive Gaussian noise
        return float(I)

    # state
    def _state(self):
        """Return a float32 copy of the block mask (``astype`` copies)."""
        return self.block_mask.astype(np.float32)

    def set_I0_mean(self, n_samples=1000):
        """Estimate the mean intensity of uniformly random masks into ``I0_mean``.

        The estimate is recomputed from scratch on every call, and a block mask
        that already exists is restored afterwards so the call does not disturb
        an episode in progress.

        Raises:
            ValueError: if ``n_samples`` is not positive.
        """
        if n_samples <= 0:
            raise ValueError("n_samples must be a positive integer.")

        saved_mask = getattr(self, "block_mask", None)
        total = 0.0
        for _ in range(n_samples):
            self.block_mask = np.random.choice([0.0, 1.0], size=self.num_blocks).astype(np.float32)
            total += self._intensity()
        self.I0_mean = total / n_samples

        if saved_mask is not None:
            self.block_mask = saved_mask

class ActorCritic(nn.Module):
    """Actor-critic network with two independent MLP heads.

    The actor maps a state to ``action_dim`` logits, one independent Bernoulli
    per action component.  The critic maps the same state to a scalar value.
    The heads share no parameters.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        # Actor head
        self.actor_head = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim)
        )

        # Critic head
        self.critic_head = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return ``(action_logits, state_value)`` for a batch of states."""
        action_logits = self.actor_head(x)

        # Critic: State value
        state_value = self.critic_head(x)

        return action_logits, state_value

    def get_action(self, state, deterministic=False):
        """Choose which action components are switched on for a single state.

        Args:
            state: 1-D array-like of length ``state_dim``.
            deterministic: when True, a component is on iff its sigmoid
                probability exceeds 0.5; otherwise every component is sampled
                from its Bernoulli distribution.

        Returns:
            ``(flip_indices, action)``: the list of indices whose component is
            1, and the full 0/1 integer numpy vector of length ``action_dim``.
        """
        state = torch.FloatTensor(state).unsqueeze(0)

        # Rollouts never need gradients: the result is converted to numpy
        # immediately, so skip building the autograd graph.
        with torch.no_grad():
            action_logits, _ = self.forward(state)         # [1, action_dim]

            if deterministic:
                # Using threshold 0.5 to binarize
                action = (torch.sigmoid(action_logits) > 0.5).float()
            else:
                action = torch.distributions.Bernoulli(logits=action_logits).sample()

        # Convert to list of indices where action is 1
        action = action.squeeze(0).detach().cpu().numpy().astype(int)
        flip_indices = np.where(action == 1)[0].tolist()

        return flip_indices, action

    def evaluate(self, states, actions):
        """Score a batch of actions under the current policy.

        Args:
            states: tensor of shape ``[T, state_dim]``.
            actions: 0/1 tensor of shape ``[T, action_dim]``.

        Returns:
            ``(log_probs, state_values, dist_entropy)`` where ``log_probs`` and
            ``dist_entropy`` have shape ``[T]`` (summed over the action
            components) and ``state_values`` is the raw critic output of shape
            ``[T, 1]``.
        """
        action_logits, state_values = self.forward(states)
        dist = torch.distributions.Bernoulli(logits=action_logits)

        # Element-wise log-probabilities, shape [T, action_dim], summed to [T]
        log_probs = dist.log_prob(actions).sum(dim=1)

        # Entropy per sample (sum over the action components): shape [T]
        dist_entropy = dist.entropy().sum(dim=1)

        return log_probs, state_values, dist_entropy

# PPO Agent
class PPOAgent:
    """PPO agent with a clipped surrogate objective and GAE advantages.

    The actor and critic heads of the policy are optimised by two separate
    Adam optimisers.  ``old_policy`` is a frozen snapshot of the policy that
    provides the reference log-probabilities and value estimates of an update.
    """

    def __init__(self, state_dim, action_dim, lr=3e-4, actor_lr=3e-4, critic_lr=3e-4, gamma=0.99,
                 clip_ratio=0.2, value_coef=0.5, entropy_coef=0.01, gae_lambda=0.95):
        """Create the policy, its snapshot and the optimisers.

        Args:
            state_dim, action_dim: sizes forwarded to ``ActorCritic``.
            lr: learning rate of the joint optimiser ``self.optimizer``.
            actor_lr, critic_lr: learning rates of the per-head optimisers.
            gamma: discount factor.
            clip_ratio: PPO probability-ratio clipping range.
            value_coef: weight of the value loss.
            entropy_coef: weight of the entropy bonus.
            gae_lambda: GAE smoothing parameter.
        """
        self.gamma = gamma
        self.clip_ratio = clip_ratio
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.gae_lambda = gae_lambda

        self.policy = ActorCritic(state_dim, action_dim)
        # Joint optimiser: update() does not step it; it is kept so the
        # attribute remains available to existing callers.
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.actor_optimizer = optim.Adam(self.policy.actor_head.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.policy.critic_head.parameters(), lr=critic_lr)

        self.old_policy = copy.deepcopy(self.policy)
        self.old_policy.eval()  # Set to evaluation mode

    def _compute_gae(self, states, next_states, rewards, values, next_values):
        """Return GAE advantages for a batch that may hold several trajectories.

        Step ``t`` is treated as continuing into step ``t + 1`` only when
        ``next_states[t]`` equals ``states[t + 1]``.  At every other step (the
        last one included) the trajectory is cut: the target bootstraps from
        ``V(next_states[t])`` and the accumulated advantage is reset, so that
        nothing leaks from one episode into the previous one.
        """
        num_steps = len(rewards)
        advantages = torch.zeros_like(rewards)

        continues = torch.zeros(num_steps, dtype=torch.bool)
        if num_steps > 1:
            continues[:-1] = (next_states[:-1] == states[1:]).all(dim=1)

        gae = 0.0
        for t in reversed(range(num_steps)):
            if not continues[t]:
                gae = 0.0
            delta = rewards[t] + self.gamma * next_values[t] - values[t]
            gae = delta + self.gamma * self.gae_lambda * gae
            advantages[t] = gae

        return advantages

    def update(self, states, actions, rewards, next_states, steps_per_epoch=128, epochs=10):
        """Run PPO optimisation on a batch of transitions.

        Args:
            states, actions, rewards, next_states: equally long sequences of
                transitions in time order; several episodes may be
                concatenated.
            steps_per_epoch: mini-batch size.
            epochs: number of passes over the batch.

        Returns:
            ``(policy_loss, value_loss, entropy_loss, advantages, old_values,
            new_values, returns, average_kl)``.  The three losses are floats
            taken from the last mini-batch, ``advantages`` is the normalised
            advantage tensor, ``old_values`` / ``new_values`` are the critic
            estimates before / after the update, and ``average_kl`` is the
            mean of ``old_log_prob - new_log_prob`` over all mini-batches.

        Raises:
            ValueError: if the batch is empty.
        """
        if len(states) == 0:
            raise ValueError("update() requires at least one transition.")

        # Convert to tensors
        states = torch.FloatTensor(np.array(states))
        actions = torch.FloatTensor(np.array(actions))
        rewards = torch.FloatTensor(np.array(rewards))
        next_states = torch.FloatTensor(np.array(next_states))

        kl_divergences = []

        with torch.no_grad():
            old_action_log_probs, old_values, _ = self.old_policy.evaluate(states, actions)
            old_values = old_values.squeeze(-1)
            next_values = self.old_policy.forward(next_states)[1].squeeze(-1)

        # Calculate advantages using GAE (Generalized Advantage Estimation)
        advantages = self._compute_gae(states, next_states, rewards, old_values, next_values)

        # Compute returns (used for value function loss)
        returns = advantages + old_values.detach()

        # Normalize advantages; std() of a single sample is NaN, so in that
        # case only centre the value.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        else:
            advantages = advantages - advantages.mean()

        # PPO update loop
        for _ in range(epochs):
            # Create random indices for minibatches
            indices = np.random.permutation(len(states))

            # Iterate through mini-batches
            for start in range(0, len(states), steps_per_epoch):
                batch_indices = indices[start:start + steps_per_epoch]

                # Get batch data
                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_advantages = advantages[batch_indices]
                batch_returns = returns[batch_indices]
                batch_old_action_log_probs = old_action_log_probs[batch_indices]

                # Evaluate current policy
                action_log_probs, values, entropy = self.policy.evaluate(batch_states, batch_actions)

                # Calculate ratios
                ratios = torch.exp(action_log_probs - batch_old_action_log_probs)

                kl = (batch_old_action_log_probs - action_log_probs).mean().item()
                kl_divergences.append(kl)

                # Compute surrogate losses
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * batch_advantages

                # Calculate loss components
                policy_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(values.squeeze(-1), batch_returns)
                entropy_loss = -entropy.mean()

                actor_loss = policy_loss + self.entropy_coef * entropy_loss
                critic_loss = self.value_coef * value_loss

                # Update actor and critic; the heads share no parameters, so
                # the two backward passes use disjoint graphs.
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                nn.utils.clip_grad_norm_(self.policy.actor_head.parameters(), 0.5)
                self.actor_optimizer.step()

                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                nn.utils.clip_grad_norm_(self.policy.critic_head.parameters(), 0.5)
                self.critic_optimizer.step()

        with torch.no_grad():
            new_values = self.policy.forward(states)[1].squeeze(-1)

        # Update old policy
        self.old_policy.load_state_dict(self.policy.state_dict())

        average_kl = np.mean(kl_divergences)

        return policy_loss.item(), value_loss.item(), entropy_loss.item(), advantages, old_values, new_values, returns, average_kl

    def get_action(self, state, deterministic=False):
        """Delegate action selection to the current policy."""
        return self.policy.get_action(state, deterministic)

    def save(self, path):
        """Write the policy's state dict to ``path``."""
        torch.save(self.policy.state_dict(), path)

    def load(self, path):
        """Load a state dict from ``path`` into both the policy and its snapshot."""
        state_dict = torch.load(path)
        self.policy.load_state_dict(state_dict)
        self.old_policy.load_state_dict(state_dict)

# Training function
def train_ppo(env, agent, max_episodes=1000, steps_per_episode=128,
              update_interval=10, eval_interval=50, verbose=True):
    """Train ``agent`` on ``env`` and return the collected training statistics.

    Every episode runs exactly ``steps_per_episode`` steps.  Transitions are
    buffered across episodes and passed to ``agent.update`` every
    ``update_interval`` episodes, after which the buffer is cleared.  Every
    ``eval_interval`` episodes the agent is evaluated with ``evaluate_agent``
    on the same environment.

    Returns:
        dict with the keys ``episode_rewards``, ``eval_rewards``,
        ``best_intensity``, ``best_mask``, ``all_advantage_means``,
        ``value_losses``, ``old_values_all``, ``new_values_all``,
        ``returns_all``, ``policy_losses``, ``entropy_losses`` and
        ``all_kl_divergences``.
    """
    # Initialize logging variables
    episode_rewards = []
    eval_rewards = []
    best_intensity = 0
    best_mask = None

    # Storage for the all states, actions, rewards, and next_states
    states = []
    actions = []
    rewards = []
    next_states = []

    all_advantage_means = []

    value_losses = []
    policy_losses = []
    entropy_losses = []
    all_old_values = []
    all_new_values = []
    all_returns = []
    all_kl_divergences = []

    # Training loop
    for episode in tqdm(range(1, max_episodes + 1)):

        episode_reward = 0
        state = env.reset()

        # Generate trajectory
        for _ in range(steps_per_episode):
            # Select action
            action, action_mask = agent.get_action(state)

            # Execute action
            next_state, reward = env.step(action)

            # Store transition
            states.append(state)
            actions.append(action_mask)
            rewards.append(reward)
            next_states.append(next_state)

            # Update state and reward
            state = next_state
            episode_reward += reward

        # Store episode reward
        episode_rewards.append(episode_reward)

        # Update agent
        if episode % update_interval == 0:
            policy_loss, value_loss, entropy_loss, advantages, old_values, new_values, returns, kl_divergence = agent.update(
                states, actions, rewards, next_states
            )

            advantages = advantages.detach().cpu().numpy().reshape(update_interval, steps_per_episode)
            per_episode_advantage_means = advantages.mean(axis=1)

            all_advantage_means.extend(per_episode_advantage_means)

            # Append scalar losses
            value_losses.append(value_loss)
            policy_losses.append(policy_loss)
            entropy_losses.append(entropy_loss)

            # Detach and move tensors to CPU
            all_old_values.append(old_values.detach().cpu().numpy())
            all_new_values.append(new_values.detach().cpu().numpy())
            all_returns.append(returns.detach().cpu().numpy())
            all_kl_divergences.append(kl_divergence)

            if verbose:
                print(f"Episode {episode}, Reward: {episode_reward:.4f}")
                print(f"Policy Loss: {policy_loss:.4f}, Value Loss: {value_loss:.4f}, Entropy Loss: {entropy_loss:.4f}")
                # env.I_t is the intensity measured by the most recent step;
                # env.I_prev is only refreshed by reset().
                print(f"Current Intensity: {env.I_t:.4f}, Max Intensity: {env.I_max:.4f}")
                print("---")

            states.clear()
            actions.clear()
            rewards.clear()
            next_states.clear()

        # Track best solution found
        if env.I_max > best_intensity:
            best_intensity = env.I_max
            # Use the mask the environment recorded as best rather than the
            # mask left at the end of the episode; fall back to the current
            # mask only when the environment has not recorded one yet.
            source_mask = env.best_mask if env.best_mask is not None else env.block_mask
            best_mask = source_mask.copy()

        # Evaluate performance
        if episode % eval_interval == 0:
            eval_reward = evaluate_agent(env, agent, num_episodes=5)
            eval_rewards.append(eval_reward)

            if verbose:
                print(f"Evaluation at episode {episode}: Average Reward = {eval_reward:.4f}")
                print("===================================")

    # np.concatenate raises on an empty list, which happens when training
    # stops before the first update.
    empty = np.empty(0, dtype=np.float32)
    old_values_all = np.concatenate(all_old_values) if all_old_values else empty
    new_values_all = np.concatenate(all_new_values) if all_new_values else empty
    returns_all = np.concatenate(all_returns) if all_returns else empty

    # Return results
    return {
        'episode_rewards': episode_rewards,
        'eval_rewards': eval_rewards,
        'best_intensity': best_intensity,
        'best_mask': best_mask,
        'all_advantage_means': all_advantage_means,
        'value_losses': value_losses,
        'old_values_all': old_values_all,
        'new_values_all': new_values_all,
        'returns_all': returns_all,
        'policy_losses': policy_losses,
        'entropy_losses': entropy_losses,
        'all_kl_divergences': all_kl_divergences,
    }

# Evaluation function
def evaluate_agent(env, agent, num_episodes=5, steps_per_episode=128):
    """Return the mean episode reward of the deterministic policy.

    Runs ``num_episodes`` episodes of ``steps_per_episode`` steps each on
    ``env``, always asking the agent for its deterministic action, and
    averages the summed per-episode rewards.
    """
    reward_sum = 0

    for _episode in range(num_episodes):
        current = env.reset()
        collected = 0

        for _step in range(steps_per_episode):
            # Deterministic policy for evaluation; the action mask is unused.
            chosen, _ = agent.get_action(current, deterministic=True)
            current, gained = env.step(chosen)
            collected += gained

        reward_sum += collected

    return reward_sum / num_episodes

# Visualization functions 
def _plot_panel(position, values, label, xlabel, ylabel, title, color=None, x=None):
    """Draw one labelled line plot into row ``position`` of the 7-row figure."""
    plt.subplot(7, 1, position)
    style = {'label': label}
    if color is not None:
        style['color'] = color
    if x is None:
        plt.plot(values, **style)
    else:
        plt.plot(x, values, **style)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.legend()


def plot_training_curve(results, run):
    """Save the training diagnostics of one run as PNG files under ``results/``.

    Writes ``results/run_{run}_training_curve.png`` (up to seven stacked
    panels; a panel is skipped when its series is missing or empty) and, when
    value estimates and returns are available,
    ``results/run_{run}_value_vs_returns.png``.

    Args:
        results: dictionary as returned by ``train_ppo``.
        run: identifier embedded in the output file names.
    """
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)

    num_episodes = len(results['episode_rewards'])

    plt.figure(figsize=(16, 16))

    # Plot episode rewards
    _plot_panel(1, results['episode_rewards'], 'Episode Reward',
                'Episode', 'Reward', 'Training Rewards')

    # Plot evaluation rewards if available
    eval_rewards = results.get('eval_rewards', [])
    if len(eval_rewards) > 0:
        eval_x = np.linspace(0, num_episodes, len(eval_rewards))
        _plot_panel(2, eval_rewards, 'Evaluation Reward',
                    'Episode', 'Average Reward', 'Evaluation Rewards',
                    color='orange', x=eval_x)

    # Plot mean advantage per episode
    advantage_means = results.get('all_advantage_means', [])
    if len(advantage_means) > 0:
        update_x = np.linspace(0, num_episodes, len(advantage_means))
        _plot_panel(3, advantage_means, 'Mean Advantage',
                    'Episode', 'Mean Advantage', 'Advantage Trend Over Updates',
                    color='green', x=update_x)

    # Per-update series: (row, results key, label, colour)
    per_update_panels = [
        (4, 'value_losses', 'Value Loss', 'red'),
        (5, 'policy_losses', 'Policy Loss', 'blue'),
        (6, 'entropy_losses', 'Entropy Loss', 'purple'),
        (7, 'all_kl_divergences', 'KL Divergence', 'brown'),
    ]
    for position, key, label, color in per_update_panels:
        series = results.get(key, [])
        if len(series) > 0:
            _plot_panel(position, series, label,
                        'Update Step', label, f'{label} Trend Over Updates',
                        color=color)

    plt.tight_layout()
    plt.savefig(f'results/run_{run}_training_curve.png')
    plt.close()

    # Plot value estimates vs returns
    if 'old_values_all' in results and 'returns_all' in results:
        plt.figure(figsize=(8, 6))
        plt.scatter(results['returns_all'], results['old_values_all'], alpha=0.5, label='Old Value Estimates')
        # The post-update estimates are optional; do not fail when absent.
        if 'new_values_all' in results:
            plt.scatter(results['returns_all'], results['new_values_all'], alpha=0.5, label='New Value Estimates')
        plt.plot(results['returns_all'], results['returns_all'], 'k--', label='Ideal Match')
        plt.xlabel('Returns')
        plt.ylabel('Value Estimates')
        plt.title('Value Estimates vs Returns')
        plt.legend()
        plt.grid(True)
        plt.savefig(f'results/run_{run}_value_vs_returns.png')
        plt.close()


def visualize_best_mask(env, best_mask, run):
    """Save an image of ``best_mask`` to ``results/run_{run}_best_mask.png``.

    The mask is installed into ``env`` temporarily to obtain its pixel layout
    and one intensity measurement for the title; the environment's own mask
    is restored afterwards, even if plotting fails.

    Args:
        env: environment providing ``block_mask``, ``_blocks_to_pixels`` and
            ``_intensity``.
        best_mask: block mask to display; nothing is drawn when it is None.
        run: identifier embedded in the output file name.
    """
    if best_mask is None:
        print("No best mask available; skipping best-mask visualization.")
        return

    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)

    # Store original mask
    original_mask = env.block_mask.copy()

    # Set the best mask
    env.block_mask = best_mask.copy()

    try:
        # Get pixel mask
        pixel_mask = env._blocks_to_pixels()

        # Get intensity
        intensity = env._intensity()

        # Plot mask
        plt.figure(figsize=(10, 8))
        plt.imshow(pixel_mask, cmap='viridis')
        plt.colorbar(label='Mask Value')
        plt.title(f'Best Mask (Intensity: {intensity:.4f})')
        plt.savefig(f'results/run_{run}_best_mask.png')
        plt.close()
    finally:
        # Restore original mask
        env.block_mask = original_mask

def main(run):
    """Train a PPO agent on the 64x64 wavefront environment and save the plots.

    The phase mask is loaded from ``phase_mask/phi_64.npy`` when that file
    exists; otherwise a random one is generated and saved there so later runs
    reuse it.

    Args:
        run: identifier embedded in the names of the files written under
            ``results/``.
    """
    # Seed both RNGs: NumPy drives the environment, torch drives the weight
    # initialisation and the Bernoulli action sampling.
    np.random.seed(42)
    torch.manual_seed(42)

    phi_path = "phase_mask/phi_64.npy"

    if os.path.exists(phi_path):
        phi = np.load(phi_path)
    else:
        phi = np.random.rand(64, 64)
        os.makedirs(os.path.dirname(phi_path), exist_ok=True)
        np.save(phi_path, phi)

    # Create the output directory up front so a missing folder cannot discard
    # the results of a long training run.
    os.makedirs("results", exist_ok=True)

    # Create environment
    env = WavefrontEnv(slm_dim1=64, slm_dim2=64, num_pix_per_block=32, alpha=0.3, noise_sigma=0.05, k=0.45, phi=phi)

    # Create PPO agent
    agent = PPOAgent(
        state_dim=env.state_dim,
        action_dim=env.action_space,
        lr=3e-4,
        actor_lr=3e-4,
        critic_lr=1e-4,
        gamma=0.95,
        clip_ratio=0.2,
        value_coef=0.7,
        entropy_coef=0.001,
        gae_lambda=0.95
    )

    # Train agent
    print("Starting training...")
    results = train_ppo(
        env=env,
        agent=agent,
        max_episodes=3000,  # Adjust based on your time constraints
        steps_per_episode=128,
        update_interval=5,
        eval_interval=25
    )

    # Visualize results
    plot_training_curve(results, run)
    visualize_best_mask(env, results['best_mask'], run)

    print(f"Training completed. Best intensity achieved: {results['best_intensity']:.4f}")


# Run only when executed as a script or notebook cell, not when imported.
if __name__ == "__main__":
    main(1)

Starting training...


  1%|          | 5/500 [00:01<03:39,  2.26it/s]

Episode 5, Reward: 17.1293
Policy Loss: -0.0118, Value Loss: 0.0716, Entropy Loss: -88.6023
Current Intensity: 1.7904, Max Intensity: 1.7904
---


  2%|▏         | 10/500 [00:03<03:41,  2.21it/s]

Episode 10, Reward: 15.2031
Policy Loss: -0.2264, Value Loss: 0.0446, Entropy Loss: -88.5742
Current Intensity: 2.2056, Max Intensity: 2.2056
---


  3%|▎         | 15/500 [00:05<04:08,  1.95it/s]

Episode 15, Reward: 15.4198
Policy Loss: -0.0277, Value Loss: 0.0574, Entropy Loss: -88.5366
Current Intensity: 2.2647, Max Intensity: 2.2647
---


  4%|▍         | 20/500 [00:07<03:37,  2.20it/s]

Episode 20, Reward: 17.0469
Policy Loss: -0.1616, Value Loss: 0.0583, Entropy Loss: -88.5086
Current Intensity: 2.1818, Max Intensity: 2.1818
---


  5%|▍         | 24/500 [00:08<02:14,  3.55it/s]

Episode 25, Reward: 17.9176
Policy Loss: -0.0926, Value Loss: 0.0572, Entropy Loss: -88.4832
Current Intensity: 2.3377, Max Intensity: 2.8476
---


  5%|▌         | 25/500 [00:10<06:39,  1.19it/s]

Evaluation at episode 25: Average Reward = 21.7776


  6%|▌         | 30/500 [00:13<05:28,  1.43it/s]

Episode 30, Reward: 18.3720
Policy Loss: -0.0705, Value Loss: 0.0535, Entropy Loss: -88.4475
Current Intensity: 2.8325, Max Intensity: 2.8325
---


  7%|▋         | 35/500 [00:15<04:59,  1.55it/s]

Episode 35, Reward: 18.6310
Policy Loss: -0.1603, Value Loss: 0.0389, Entropy Loss: -88.4033
Current Intensity: 2.7246, Max Intensity: 2.7246
---


  8%|▊         | 40/500 [00:18<04:47,  1.60it/s]

Episode 40, Reward: 16.4556
Policy Loss: -0.1825, Value Loss: 0.0380, Entropy Loss: -88.4088
Current Intensity: 2.7023, Max Intensity: 2.7023
---


  9%|▉         | 45/500 [00:20<04:14,  1.79it/s]

Episode 45, Reward: 19.5596
Policy Loss: -0.1621, Value Loss: 0.0642, Entropy Loss: -88.3584
Current Intensity: 2.8558, Max Intensity: 2.8558
---


 10%|▉         | 49/500 [00:21<02:44,  2.75it/s]

Episode 50, Reward: 19.8273
Policy Loss: -0.1142, Value Loss: 0.0493, Entropy Loss: -88.3643
Current Intensity: 2.8833, Max Intensity: 2.8833
---


 10%|█         | 50/500 [00:24<08:05,  1.08s/it]

Evaluation at episode 50: Average Reward = 23.4539


 11%|█         | 55/500 [00:26<04:41,  1.58it/s]

Episode 55, Reward: 15.9919
Policy Loss: -0.1915, Value Loss: 0.0382, Entropy Loss: -88.3165
Current Intensity: 2.7465, Max Intensity: 2.7465
---


 12%|█▏        | 60/500 [00:29<04:22,  1.68it/s]

Episode 60, Reward: 19.0705
Policy Loss: -0.2364, Value Loss: 0.0425, Entropy Loss: -88.2977
Current Intensity: 2.8145, Max Intensity: 2.8145
---


 13%|█▎        | 65/500 [00:31<03:52,  1.87it/s]

Episode 65, Reward: 17.2090
Policy Loss: -0.2152, Value Loss: 0.0374, Entropy Loss: -88.2656
Current Intensity: 2.8400, Max Intensity: 2.8400
---


 14%|█▍        | 70/500 [00:33<04:32,  1.58it/s]

Episode 70, Reward: 20.0959
Policy Loss: -0.1173, Value Loss: 0.0473, Entropy Loss: -88.2551
Current Intensity: 2.8135, Max Intensity: 2.8135
---


 15%|█▍        | 74/500 [00:34<02:34,  2.75it/s]

Episode 75, Reward: 17.3560
Policy Loss: -0.1285, Value Loss: 0.0361, Entropy Loss: -88.2160
Current Intensity: 2.5865, Max Intensity: 2.5865
---


 15%|█▌        | 75/500 [00:37<07:05,  1.00s/it]

Evaluation at episode 75: Average Reward = 46.5794


 16%|█▌        | 80/500 [00:39<04:18,  1.62it/s]

Episode 80, Reward: 19.2606
Policy Loss: -0.2288, Value Loss: 0.0350, Entropy Loss: -88.1267
Current Intensity: 2.8495, Max Intensity: 2.8495
---


 17%|█▋        | 85/500 [00:41<04:06,  1.69it/s]

Episode 85, Reward: 16.9741
Policy Loss: -0.2844, Value Loss: 0.0358, Entropy Loss: -88.0909
Current Intensity: 2.7527, Max Intensity: 2.7527
---


 18%|█▊        | 90/500 [00:44<04:09,  1.64it/s]

Episode 90, Reward: 18.0818
Policy Loss: -0.1039, Value Loss: 0.0412, Entropy Loss: -88.0575
Current Intensity: 2.9149, Max Intensity: 2.9149
---


 19%|█▉        | 95/500 [00:46<03:49,  1.76it/s]

Episode 95, Reward: 17.0100
Policy Loss: -0.0623, Value Loss: 0.0378, Entropy Loss: -88.0281
Current Intensity: 2.8956, Max Intensity: 2.8956
---


 20%|█▉        | 99/500 [00:47<02:34,  2.60it/s]

Episode 100, Reward: 17.2791
Policy Loss: -0.2186, Value Loss: 0.0328, Entropy Loss: -88.0239
Current Intensity: 2.8494, Max Intensity: 2.8494
---


 20%|██        | 100/500 [00:50<06:57,  1.04s/it]

Evaluation at episode 100: Average Reward = 50.2470


 21%|██        | 105/500 [00:52<04:29,  1.47it/s]

Episode 105, Reward: 20.0831
Policy Loss: -0.1104, Value Loss: 0.0427, Entropy Loss: -88.0404
Current Intensity: 2.7885, Max Intensity: 2.7885
---


 22%|██▏       | 110/500 [00:55<03:54,  1.66it/s]

Episode 110, Reward: 17.4164
Policy Loss: -0.0802, Value Loss: 0.0235, Entropy Loss: -88.0415
Current Intensity: 2.8481, Max Intensity: 2.8481
---


 23%|██▎       | 115/500 [00:57<03:38,  1.76it/s]

Episode 115, Reward: 17.8198
Policy Loss: -0.3242, Value Loss: 0.0401, Entropy Loss: -87.9797
Current Intensity: 2.8215, Max Intensity: 2.8215
---


 24%|██▍       | 120/500 [00:59<04:01,  1.57it/s]

Episode 120, Reward: 17.8801
Policy Loss: -0.2460, Value Loss: 0.0310, Entropy Loss: -87.9351
Current Intensity: 2.8322, Max Intensity: 2.8322
---


 25%|██▍       | 124/500 [01:01<02:26,  2.57it/s]

Episode 125, Reward: 19.3985
Policy Loss: -0.0360, Value Loss: 0.0369, Entropy Loss: -87.8049
Current Intensity: 3.0190, Max Intensity: 3.0190
---


 25%|██▌       | 125/500 [01:03<06:33,  1.05s/it]

Evaluation at episode 125: Average Reward = 48.3532


 26%|██▌       | 130/500 [01:06<03:57,  1.56it/s]

Episode 130, Reward: 18.6569
Policy Loss: -0.1291, Value Loss: 0.0464, Entropy Loss: -87.7052
Current Intensity: 2.9539, Max Intensity: 2.9539
---


 27%|██▋       | 135/500 [01:08<03:40,  1.65it/s]

Episode 135, Reward: 18.0965
Policy Loss: -0.1492, Value Loss: 0.0305, Entropy Loss: -87.7103
Current Intensity: 3.6703, Max Intensity: 3.6703
---


 28%|██▊       | 140/500 [01:10<03:28,  1.73it/s]

Episode 140, Reward: 20.2840
Policy Loss: -0.1628, Value Loss: 0.0411, Entropy Loss: -87.6336
Current Intensity: 3.5825, Max Intensity: 3.5825
---


 29%|██▉       | 145/500 [01:13<03:32,  1.67it/s]

Episode 145, Reward: 19.1486
Policy Loss: -0.2094, Value Loss: 0.0307, Entropy Loss: -87.6304
Current Intensity: 3.6275, Max Intensity: 3.6275
---


 30%|██▉       | 149/500 [01:14<02:13,  2.63it/s]

Episode 150, Reward: 19.4361
Policy Loss: -0.1325, Value Loss: 0.0311, Entropy Loss: -87.5797
Current Intensity: 3.6144, Max Intensity: 3.6144
---


 30%|███       | 150/500 [01:17<06:04,  1.04s/it]

Evaluation at episode 150: Average Reward = 56.1487


 31%|███       | 155/500 [01:19<03:46,  1.53it/s]

Episode 155, Reward: 20.4945
Policy Loss: -0.2447, Value Loss: 0.0511, Entropy Loss: -87.5118
Current Intensity: 3.0174, Max Intensity: 3.0174
---


 32%|███▏      | 160/500 [01:21<03:19,  1.70it/s]

Episode 160, Reward: 18.2292
Policy Loss: -0.0624, Value Loss: 0.0356, Entropy Loss: -87.4259
Current Intensity: 3.1203, Max Intensity: 3.1203
---


 33%|███▎      | 165/500 [01:24<03:17,  1.70it/s]

Episode 165, Reward: 22.1879
Policy Loss: -0.2744, Value Loss: 0.0515, Entropy Loss: -87.2120
Current Intensity: 3.0565, Max Intensity: 3.0565
---


 34%|███▍      | 170/500 [01:26<03:40,  1.50it/s]

Episode 170, Reward: 17.3716
Policy Loss: -0.1785, Value Loss: 0.0343, Entropy Loss: -87.2152
Current Intensity: 3.0466, Max Intensity: 3.0466
---


 35%|███▍      | 174/500 [01:27<02:12,  2.46it/s]

Episode 175, Reward: 19.2408
Policy Loss: -0.2110, Value Loss: 0.0372, Entropy Loss: -87.0809
Current Intensity: 2.9347, Max Intensity: 2.9347
---


 35%|███▌      | 175/500 [01:30<05:44,  1.06s/it]

Evaluation at episode 175: Average Reward = 52.4522


 36%|███▌      | 180/500 [01:32<03:32,  1.51it/s]

Episode 180, Reward: 18.7881
Policy Loss: -0.0715, Value Loss: 0.0394, Entropy Loss: -86.9975
Current Intensity: 3.4527, Max Intensity: 3.4527
---


 37%|███▋      | 185/500 [01:35<03:02,  1.73it/s]

Episode 185, Reward: 20.5636
Policy Loss: -0.2174, Value Loss: 0.0434, Entropy Loss: -87.0756
Current Intensity: 3.3348, Max Intensity: 3.3348
---


 38%|███▊      | 190/500 [01:37<03:04,  1.68it/s]

Episode 190, Reward: 19.4714
Policy Loss: -0.2501, Value Loss: 0.0299, Entropy Loss: -87.0197
Current Intensity: 3.4182, Max Intensity: 3.4182
---


 39%|███▉      | 195/500 [01:40<03:05,  1.65it/s]

Episode 195, Reward: 18.4446
Policy Loss: -0.1120, Value Loss: 0.0304, Entropy Loss: -86.8039
Current Intensity: 3.4497, Max Intensity: 3.4497
---


 40%|███▉      | 199/500 [01:41<01:58,  2.55it/s]

Episode 200, Reward: 21.5685
Policy Loss: -0.1236, Value Loss: 0.0328, Entropy Loss: -86.7445
Current Intensity: 3.0586, Max Intensity: 3.0586
---


 40%|████      | 200/500 [01:45<07:19,  1.46s/it]

Evaluation at episode 200: Average Reward = 61.1654


 41%|████      | 205/500 [01:48<03:45,  1.31it/s]

Episode 205, Reward: 22.9438
Policy Loss: -0.1974, Value Loss: 0.0396, Entropy Loss: -86.7450
Current Intensity: 3.7045, Max Intensity: 3.7045
---


 42%|████▏     | 210/500 [01:50<02:48,  1.72it/s]

Episode 210, Reward: 18.5450
Policy Loss: -0.2396, Value Loss: 0.0286, Entropy Loss: -86.7035
Current Intensity: 3.5418, Max Intensity: 3.5418
---


 43%|████▎     | 215/500 [01:52<03:00,  1.58it/s]

Episode 215, Reward: 22.3000
Policy Loss: -0.2115, Value Loss: 0.0444, Entropy Loss: -86.7188
Current Intensity: 3.5930, Max Intensity: 3.5930
---


 44%|████▍     | 220/500 [01:55<02:41,  1.73it/s]

Episode 220, Reward: 18.1943
Policy Loss: -0.1526, Value Loss: 0.0379, Entropy Loss: -86.6723
Current Intensity: 3.6321, Max Intensity: 3.6321
---


 45%|████▍     | 224/500 [01:56<01:38,  2.80it/s]

Episode 225, Reward: 17.7896
Policy Loss: -0.1334, Value Loss: 0.0415, Entropy Loss: -86.7454
Current Intensity: 3.6285, Max Intensity: 3.6285
---


 45%|████▌     | 225/500 [01:58<04:57,  1.08s/it]

Evaluation at episode 225: Average Reward = 66.3391


 46%|████▌     | 230/500 [02:01<02:48,  1.60it/s]

Episode 230, Reward: 19.9874
Policy Loss: -0.0959, Value Loss: 0.0377, Entropy Loss: -86.6095
Current Intensity: 3.6557, Max Intensity: 3.6557
---


 47%|████▋     | 235/500 [02:03<02:47,  1.58it/s]

Episode 235, Reward: 16.6955
Policy Loss: -0.0819, Value Loss: 0.0464, Entropy Loss: -86.5739
Current Intensity: 3.8413, Max Intensity: 3.8413
---


 48%|████▊     | 240/500 [02:05<02:22,  1.82it/s]

Episode 240, Reward: 20.2072
Policy Loss: -0.1127, Value Loss: 0.0403, Entropy Loss: -86.7493
Current Intensity: 3.8070, Max Intensity: 3.8070
---


 49%|████▉     | 245/500 [02:08<02:33,  1.66it/s]

Episode 245, Reward: 19.8992
Policy Loss: -0.0482, Value Loss: 0.0409, Entropy Loss: -86.5787
Current Intensity: 3.7555, Max Intensity: 3.7555
---


 50%|████▉     | 249/500 [02:09<01:35,  2.62it/s]

Episode 250, Reward: 17.4508
Policy Loss: -0.1020, Value Loss: 0.0498, Entropy Loss: -86.5661
Current Intensity: 3.8289, Max Intensity: 3.8289
---


 50%|█████     | 250/500 [02:12<04:17,  1.03s/it]

Evaluation at episode 250: Average Reward = 85.6176


 51%|█████     | 255/500 [02:14<02:45,  1.48it/s]

Episode 255, Reward: 19.7826
Policy Loss: -0.1714, Value Loss: 0.0288, Entropy Loss: -86.5529
Current Intensity: 4.2124, Max Intensity: 4.2124
---


 52%|█████▏    | 260/500 [02:16<02:19,  1.71it/s]

Episode 260, Reward: 19.3929
Policy Loss: -0.1716, Value Loss: 0.0329, Entropy Loss: -86.6951
Current Intensity: 4.1368, Max Intensity: 4.1368
---


 53%|█████▎    | 265/500 [02:19<02:33,  1.53it/s]

Episode 265, Reward: 21.7368
Policy Loss: -0.1485, Value Loss: 0.0541, Entropy Loss: -86.6419
Current Intensity: 4.2293, Max Intensity: 4.2293
---


 54%|█████▍    | 270/500 [02:21<02:07,  1.80it/s]

Episode 270, Reward: 21.3629
Policy Loss: -0.2820, Value Loss: 0.0434, Entropy Loss: -86.6062
Current Intensity: 4.1676, Max Intensity: 4.1676
---


 55%|█████▍    | 274/500 [02:22<01:27,  2.60it/s]

Episode 275, Reward: 19.6557
Policy Loss: -0.1980, Value Loss: 0.0461, Entropy Loss: -86.7293
Current Intensity: 4.1937, Max Intensity: 4.1937
---


 55%|█████▌    | 275/500 [02:25<03:56,  1.05s/it]

Evaluation at episode 275: Average Reward = 81.0835


 56%|█████▌    | 280/500 [02:27<02:17,  1.60it/s]

Episode 280, Reward: 19.6007
Policy Loss: -0.0806, Value Loss: 0.0426, Entropy Loss: -86.6025
Current Intensity: 2.9554, Max Intensity: 2.9554
---


 57%|█████▋    | 285/500 [02:30<02:10,  1.65it/s]

Episode 285, Reward: 20.2280
Policy Loss: -0.3308, Value Loss: 0.0463, Entropy Loss: -86.7120
Current Intensity: 2.9985, Max Intensity: 2.9985
---


 58%|█████▊    | 290/500 [02:32<01:54,  1.83it/s]

Episode 290, Reward: 20.9783
Policy Loss: -0.1720, Value Loss: 0.0405, Entropy Loss: -86.7123
Current Intensity: 2.9535, Max Intensity: 2.9535
---


 59%|█████▉    | 295/500 [02:34<02:04,  1.65it/s]

Episode 295, Reward: 17.6845
Policy Loss: -0.2106, Value Loss: 0.0525, Entropy Loss: -86.5376
Current Intensity: 2.8947, Max Intensity: 2.8947
---


 60%|█████▉    | 299/500 [02:35<01:11,  2.83it/s]

Episode 300, Reward: 17.9329
Policy Loss: -0.1964, Value Loss: 0.0399, Entropy Loss: -86.5023
Current Intensity: 3.0021, Max Intensity: 3.0021
---


 60%|██████    | 300/500 [02:38<03:25,  1.03s/it]

Evaluation at episode 300: Average Reward = 79.0618


 61%|██████    | 305/500 [02:40<02:06,  1.55it/s]

Episode 305, Reward: 18.5409
Policy Loss: -0.0363, Value Loss: 0.0506, Entropy Loss: -86.5553
Current Intensity: 3.6215, Max Intensity: 3.6215
---


 62%|██████▏   | 310/500 [02:43<01:49,  1.74it/s]

Episode 310, Reward: 20.9036
Policy Loss: -0.1506, Value Loss: 0.0378, Entropy Loss: -86.3521
Current Intensity: 3.0287, Max Intensity: 3.0287
---


 63%|██████▎   | 315/500 [02:45<01:54,  1.61it/s]

Episode 315, Reward: 21.5049
Policy Loss: -0.1154, Value Loss: 0.0562, Entropy Loss: -86.2765
Current Intensity: 3.7652, Max Intensity: 3.7652
---


 64%|██████▍   | 320/500 [02:48<01:58,  1.52it/s]

Episode 320, Reward: 20.1053
Policy Loss: -0.2511, Value Loss: 0.0377, Entropy Loss: -86.1793
Current Intensity: 3.7162, Max Intensity: 3.7162
---


 65%|██████▍   | 324/500 [02:49<01:15,  2.32it/s]

Episode 325, Reward: 23.4513
Policy Loss: -0.1268, Value Loss: 0.0547, Entropy Loss: -86.1018
Current Intensity: 2.9616, Max Intensity: 2.9616
---


 65%|██████▌   | 325/500 [02:52<02:59,  1.02s/it]

Evaluation at episode 325: Average Reward = 88.2245


 66%|██████▌   | 330/500 [02:54<02:01,  1.40it/s]

Episode 330, Reward: 20.3847
Policy Loss: -0.1534, Value Loss: 0.0496, Entropy Loss: -86.1527
Current Intensity: 4.0907, Max Intensity: 4.0907
---


 67%|██████▋   | 335/500 [02:57<01:35,  1.73it/s]

Episode 335, Reward: 20.9594
Policy Loss: -0.0203, Value Loss: 0.0368, Entropy Loss: -86.1170
Current Intensity: 4.1639, Max Intensity: 4.1639
---


 68%|██████▊   | 340/500 [02:59<01:35,  1.68it/s]

Episode 340, Reward: 21.5021
Policy Loss: -0.1108, Value Loss: 0.0628, Entropy Loss: -86.0788
Current Intensity: 4.2231, Max Intensity: 4.2231
---


 69%|██████▉   | 345/500 [03:01<01:32,  1.68it/s]

Episode 345, Reward: 21.2726
Policy Loss: -0.2169, Value Loss: 0.0395, Entropy Loss: -86.0216
Current Intensity: 4.2347, Max Intensity: 4.2347
---


 70%|██████▉   | 349/500 [03:03<00:56,  2.69it/s]

Episode 350, Reward: 24.3631
Policy Loss: -0.2222, Value Loss: 0.0511, Entropy Loss: -86.0762
Current Intensity: 4.1489, Max Intensity: 4.1489
---


 70%|███████   | 350/500 [03:07<03:49,  1.53s/it]

Evaluation at episode 350: Average Reward = 92.4545


 71%|███████   | 355/500 [03:09<01:53,  1.28it/s]

Episode 355, Reward: 23.7425
Policy Loss: -0.1397, Value Loss: 0.0533, Entropy Loss: -86.0160
Current Intensity: 4.6986, Max Intensity: 4.6986
---


 72%|███████▏  | 360/500 [03:12<01:26,  1.61it/s]

Episode 360, Reward: 21.3717
Policy Loss: -0.0113, Value Loss: 0.0486, Entropy Loss: -86.0521
Current Intensity: 4.5595, Max Intensity: 4.5595
---


 73%|███████▎  | 365/500 [03:14<01:24,  1.59it/s]

Episode 365, Reward: 21.1826
Policy Loss: -0.1379, Value Loss: 0.0455, Entropy Loss: -86.0514
Current Intensity: 4.5728, Max Intensity: 4.5728
---


 74%|███████▍  | 370/500 [03:17<01:20,  1.62it/s]

Episode 370, Reward: 21.3997
Policy Loss: -0.1835, Value Loss: 0.0634, Entropy Loss: -85.9051
Current Intensity: 4.6061, Max Intensity: 4.6061
---


 75%|███████▍  | 374/500 [03:18<00:47,  2.63it/s]

Episode 375, Reward: 20.2349
Policy Loss: -0.2040, Value Loss: 0.0440, Entropy Loss: -85.7569
Current Intensity: 4.6474, Max Intensity: 4.6474
---


 75%|███████▌  | 375/500 [03:21<02:17,  1.10s/it]

Evaluation at episode 375: Average Reward = 92.5660


 76%|███████▌  | 380/500 [03:23<01:18,  1.52it/s]

Episode 380, Reward: 20.6712
Policy Loss: -0.1624, Value Loss: 0.0410, Entropy Loss: -85.7601
Current Intensity: 4.6534, Max Intensity: 4.6534
---


 77%|███████▋  | 385/500 [03:26<01:07,  1.70it/s]

Episode 385, Reward: 22.4509
Policy Loss: -0.1823, Value Loss: 0.0494, Entropy Loss: -85.8204
Current Intensity: 4.6837, Max Intensity: 4.6837
---


 78%|███████▊  | 390/500 [03:28<01:05,  1.68it/s]

Episode 390, Reward: 20.7485
Policy Loss: -0.1924, Value Loss: 0.0351, Entropy Loss: -85.6642
Current Intensity: 3.7299, Max Intensity: 3.7299
---


 79%|███████▉  | 395/500 [03:30<01:03,  1.64it/s]

Episode 395, Reward: 23.2286
Policy Loss: 0.0330, Value Loss: 0.0537, Entropy Loss: -85.6256
Current Intensity: 4.7516, Max Intensity: 4.7516
---


 80%|███████▉  | 399/500 [03:32<00:41,  2.42it/s]

Episode 400, Reward: 20.3909
Policy Loss: -0.0917, Value Loss: 0.0523, Entropy Loss: -85.6933
Current Intensity: 4.2272, Max Intensity: 4.2272
---


 80%|████████  | 400/500 [03:35<01:48,  1.09s/it]

Evaluation at episode 400: Average Reward = 99.6503


 81%|████████  | 405/500 [03:37<01:15,  1.26it/s]

Episode 405, Reward: 22.7137
Policy Loss: -0.2840, Value Loss: 0.0722, Entropy Loss: -85.5734
Current Intensity: 5.2835, Max Intensity: 5.2835
---


 82%|████████▏ | 410/500 [03:40<01:09,  1.29it/s]

Episode 410, Reward: 20.5536
Policy Loss: -0.1140, Value Loss: 0.0437, Entropy Loss: -85.6281
Current Intensity: 5.2722, Max Intensity: 5.2722
---


 83%|████████▎ | 415/500 [03:43<01:09,  1.23it/s]

Episode 415, Reward: 19.9573
Policy Loss: -0.1173, Value Loss: 0.0357, Entropy Loss: -85.6969
Current Intensity: 5.1851, Max Intensity: 5.1851
---


 84%|████████▍ | 420/500 [03:46<00:58,  1.37it/s]

Episode 420, Reward: 21.9525
Policy Loss: -0.1971, Value Loss: 0.0587, Entropy Loss: -85.5119
Current Intensity: 5.1492, Max Intensity: 5.1492
---


 85%|████████▍ | 424/500 [03:48<00:37,  2.02it/s]

Episode 425, Reward: 21.6329
Policy Loss: -0.2196, Value Loss: 0.0710, Entropy Loss: -85.5231
Current Intensity: 5.2070, Max Intensity: 5.2070
---


 85%|████████▌ | 425/500 [03:51<01:43,  1.37s/it]

Evaluation at episode 425: Average Reward = 99.9848


 86%|████████▌ | 430/500 [03:55<01:02,  1.11it/s]

Episode 430, Reward: 24.5527
Policy Loss: -0.1630, Value Loss: 0.0294, Entropy Loss: -85.1733
Current Intensity: 5.2217, Max Intensity: 5.2217
---


 87%|████████▋ | 435/500 [03:57<00:47,  1.37it/s]

Episode 435, Reward: 19.7714
Policy Loss: -0.2293, Value Loss: 0.0565, Entropy Loss: -85.2968
Current Intensity: 5.1850, Max Intensity: 5.1850
---


 88%|████████▊ | 440/500 [04:00<00:44,  1.33it/s]

Episode 440, Reward: 20.6203
Policy Loss: -0.1586, Value Loss: 0.0483, Entropy Loss: -85.5396
Current Intensity: 5.2630, Max Intensity: 5.2630
---


 89%|████████▉ | 445/500 [04:03<00:40,  1.36it/s]

Episode 445, Reward: 23.1243
Policy Loss: -0.0725, Value Loss: 0.0536, Entropy Loss: -85.3825
Current Intensity: 5.1871, Max Intensity: 5.1871
---


 90%|████████▉ | 449/500 [04:05<00:24,  2.10it/s]

Episode 450, Reward: 24.8271
Policy Loss: -0.1354, Value Loss: 0.0820, Entropy Loss: -85.4465
Current Intensity: 5.2167, Max Intensity: 5.2167
---


 90%|█████████ | 450/500 [04:08<01:01,  1.22s/it]

Evaluation at episode 450: Average Reward = 102.6954


 91%|█████████ | 455/500 [04:11<00:38,  1.18it/s]

Episode 455, Reward: 23.0717
Policy Loss: 0.0506, Value Loss: 0.0632, Entropy Loss: -85.3723
Current Intensity: 5.3338, Max Intensity: 5.3338
---


 92%|█████████▏| 460/500 [04:14<00:31,  1.28it/s]

Episode 460, Reward: 25.1200
Policy Loss: -0.1543, Value Loss: 0.0605, Entropy Loss: -85.4982
Current Intensity: 5.1754, Max Intensity: 5.1754
---


 93%|█████████▎| 465/500 [04:17<00:25,  1.36it/s]

Episode 465, Reward: 22.4024
Policy Loss: -0.0989, Value Loss: 0.0444, Entropy Loss: -85.4529
Current Intensity: 5.1969, Max Intensity: 5.1969
---


 94%|█████████▍| 470/500 [04:20<00:24,  1.23it/s]

Episode 470, Reward: 22.1612
Policy Loss: -0.3041, Value Loss: 0.0518, Entropy Loss: -85.3164
Current Intensity: 5.3112, Max Intensity: 5.3112
---


 95%|█████████▍| 474/500 [04:22<00:12,  2.09it/s]

Episode 475, Reward: 22.1897
Policy Loss: -0.1130, Value Loss: 0.0432, Entropy Loss: -85.1451
Current Intensity: 5.3003, Max Intensity: 5.3003
---


 95%|█████████▌| 475/500 [04:25<00:32,  1.32s/it]

Evaluation at episode 475: Average Reward = 109.4737


 96%|█████████▌| 480/500 [04:28<00:16,  1.24it/s]

Episode 480, Reward: 22.1540
Policy Loss: -0.2179, Value Loss: 0.0574, Entropy Loss: -85.1801
Current Intensity: 5.3350, Max Intensity: 5.3350
---


 97%|█████████▋| 485/500 [04:31<00:11,  1.27it/s]

Episode 485, Reward: 23.0255
Policy Loss: -0.0496, Value Loss: 0.0350, Entropy Loss: -85.1322
Current Intensity: 5.3013, Max Intensity: 5.3013
---


 98%|█████████▊| 490/500 [04:34<00:07,  1.39it/s]

Episode 490, Reward: 21.5783
Policy Loss: -0.2035, Value Loss: 0.0547, Entropy Loss: -84.9415
Current Intensity: 5.2498, Max Intensity: 5.2498
---


 99%|█████████▉| 495/500 [04:38<00:04,  1.07it/s]

Episode 495, Reward: 22.1507
Policy Loss: -0.1889, Value Loss: 0.0408, Entropy Loss: -84.8075
Current Intensity: 4.9643, Max Intensity: 4.9643
---


100%|█████████▉| 499/500 [04:39<00:00,  1.71it/s]

Episode 500, Reward: 22.8180
Policy Loss: -0.3219, Value Loss: 0.0555, Entropy Loss: -84.9236
Current Intensity: 5.1715, Max Intensity: 5.1715
---


100%|██████████| 500/500 [04:44<00:00,  1.76it/s]

Evaluation at episode 500: Average Reward = 109.7120





Training completed. Best intensity achieved: 5.5385
