In [69]:
from apad_env import APADEnv
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import BaseCallback
import time
import numpy as np

In [70]:
class TimerLoggingCallback(BaseCallback):
   def __init__(self):
       super().__init__()
       self.start_time = time.time()
   
   def _on_step(self):
       if self.num_timesteps % 1000 == 0:
           elapsed = time.time() - self.start_time
           rate = self.num_timesteps / elapsed
           remaining = (self.locals['total_timesteps'] - self.num_timesteps) / rate
           print(f"Step {self.num_timesteps}, {elapsed:.0f}s elapsed, {remaining:.0f}s remaining")
           
           # Add logging for tensorboard
           self.logger.record("time/elapsed_seconds", elapsed)
           self.logger.record("time/steps_per_second", rate)
           self.logger.record("time/remaining_seconds", remaining)
           
           # Force write to tensorboard
           self.logger.dump(self.num_timesteps)
       return True

In [71]:
class ComprehensiveCallback(BaseCallback):
   def __init__(self):
       super().__init__()
       self.start_time = time.time()
       self.episode_rewards = []
       self.episode_lengths = []
       self.step_rewards = []
       self.recent_q_values = []
       
   def _on_step(self):
       # Collect step-level data
       if 'rewards' in self.locals and len(self.locals['rewards']) > 0:
           self.step_rewards.append(self.locals['rewards'][0])
       
       # Collect Q-values if available
       if hasattr(self.model, 'q_net') and 'obs_tensor' in self.locals:
           with torch.no_grad():
               q_vals = self.model.q_net(self.locals['obs_tensor'])
               self.recent_q_values.append(q_vals.mean().item())
       
       # Check for episode completion
       if self.locals.get('dones', [False])[0]:
           if len(self.step_rewards) > 0:
               episode_reward = sum(self.step_rewards)
               episode_length = len(self.step_rewards)
               self.episode_rewards.append(episode_reward)
               self.episode_lengths.append(episode_length)
               self.step_rewards = []  # Reset for next episode
       
       # Log every 1000 steps
       if self.num_timesteps % 1000 == 0:
           elapsed = time.time() - self.start_time
           rate = self.num_timesteps / elapsed
           remaining = (self.locals.get('total_timesteps', 100000) - self.num_timesteps) / rate
           
           print(f"Step {self.num_timesteps}, {elapsed:.0f}s elapsed, {remaining:.0f}s remaining")
           
           # Time metrics
           self.logger.record("time/elapsed_seconds", elapsed)
           self.logger.record("time/steps_per_second", rate)
           self.logger.record("time/remaining_seconds", remaining)
           
           # Episode metrics
           if self.episode_rewards:
               self.logger.record("episode/mean_reward", np.mean(self.episode_rewards[-10:]))
               self.logger.record("episode/mean_length", np.mean(self.episode_lengths[-10:]))
               self.logger.record("episode/total_episodes", len(self.episode_rewards))
           
           # Step-level metrics
           if self.step_rewards:
               self.logger.record("step/recent_avg_reward", np.mean(self.step_rewards[-1000:]))
           
           # Q-value metrics
           if self.recent_q_values:
               self.logger.record("q_values/mean", np.mean(self.recent_q_values[-1000:]))
               self.logger.record("q_values/std", np.std(self.recent_q_values[-1000:]))
           
           self.logger.dump(self.num_timesteps)
       
       return True

# Train

In [74]:
env = APADEnv()
model = DQN(
    "MlpPolicy", 
    env, 
    exploration_initial_eps=1.0,    # Start with 100% random
    exploration_final_eps=0.1,      # End with 10% random  
    exploration_fraction=0.5,       # Take half of training to decay
    learning_rate=1e-3,             # Slightly higher learning rate
    tensorboard_log="./dqn_logs/",
    verbose=1
)

model.learn(total_timesteps=80000, callback=ComprehensiveCallback())

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./dqn_logs/DQN_11
Step 1000, 0s elapsed, 34s remaining
-----------------------------------
| rollout/             |          |
|    exploration_rate  | 0.978    |
| step/                |          |
|    recent_avg_reward | -6.991   |
| time/                |          |
|    elapsed_seconds   | 0.432    |
|    remaining_seconds | 34.1     |
|    steps_per_second  | 2.31e+03 |
| train/               |          |
|    learning_rate     | 0.001    |
|    loss              | 2.23     |
|    n_updates         | 224      |
-----------------------------------
Step 2000, 1s elapsed, 35s remaining
-----------------------------------
| rollout/             |          |
|    exploration_rate  | 0.955    |
| step/                |          |
|    recent_avg_reward | -7.876   |
| time/                |          |
|    elapsed_seconds   | 0.89     |
|    remaining_seconds | 34.7     |
|    steps_

<stable_baselines3.dqn.dqn.DQN at 0x39e494fa0>

In [75]:
model.save("apad_dqn_model")