In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# One-time setup: uncomment to install sb3-contrib. Prefer %pip over !pip so
# the package is installed into the environment this kernel is running in.
# %pip install sb3-contrib

In [5]:
from sb3_contrib import MaskablePPO
from apad_env import APADEnv
from stable_baselines3.common.callbacks import BaseCallback
import time

In [3]:
class TimerCallback(BaseCallback):
    """Print elapsed wall-clock time and a rough ETA while ``learn()`` runs.

    A progress line is printed each time the timestep counter crosses a
    multiple of ``log_interval``. The remaining time is extrapolated from
    the average speed observed since the current ``learn()`` call started.

    Args:
        log_interval: Number of timesteps between progress lines. Must be
            positive. Defaults to 1000.

    Raises:
        ValueError: If ``log_interval`` is not positive.
    """

    def __init__(self, log_interval=1000):
        super().__init__()
        if log_interval <= 0:
            raise ValueError("log_interval must be a positive number of timesteps")
        self.log_interval = log_interval
        # Provisional values only: everything below is re-initialised in
        # _on_training_start so that time spent between constructing the
        # callback and actually starting to train is not counted.
        self.start_time = time.time()
        self._start_timesteps = 0
        self._next_log = log_interval

    def _next_threshold(self):
        """Return the first multiple of ``log_interval`` above the current step count."""
        return (self.num_timesteps // self.log_interval + 1) * self.log_interval

    def _on_training_start(self):
        """Restart the clock and remember the timestep count training begins at."""
        self.start_time = time.time()
        # Non-zero when training is resumed without resetting the counter;
        # the rate must then be computed from the steps done in *this* run.
        self._start_timesteps = self.num_timesteps
        self._next_log = self._next_threshold()

    def _on_step(self):
        """Emit a progress line when the next threshold is reached; never stops training."""
        # Compare against a threshold rather than testing `% log_interval == 0`:
        # the counter may advance by more than one per call (e.g. several
        # parallel environments) and skip exact multiples entirely.
        if self.num_timesteps >= self._next_log:
            self._next_log = self._next_threshold()
            elapsed = time.time() - self.start_time
            steps_done = self.num_timesteps - self._start_timesteps
            steps_left = self.locals['total_timesteps'] - self.num_timesteps
            if elapsed > 0 and steps_done > 0:
                # Same estimate as steps_left / (steps_done / elapsed), written
                # so that no intermediate rate of zero can be divided by.
                remaining = steps_left * elapsed / steps_done
            else:
                # No measurable progress yet, so no meaningful ETA.
                remaining = float("inf")
            print(f"Step {self.num_timesteps}, {elapsed:.0f}s elapsed, {remaining:.0f}s remaining")
        return True

In [6]:
# Fixed seed so that a Restart & Run All reproduces the same training run
# (as far as the environment itself is deterministic).
SEED = 42

env = APADEnv()

# Weight of the entropy term in the PPO loss. A larger value discourages
# overly confident (low-entropy) action distributions. The intent here is to
# keep action probabilities away from the extremes (close to 0/1), where
# numerical errors accumulate.
entropy_coeff = 0.05

model = MaskablePPO(
    "MlpPolicy",
    env,
    learning_rate=1e-4,
    tensorboard_log="./maskable_ppo_logs/",
    verbose=2,
    ent_coef=entropy_coeff,
    batch_size=32,
    seed=SEED,
)

# First 50,000 training steps, then persist the weights to disk.
model.learn(total_timesteps=50000, callback=TimerCallback())
model.save("checkpoint_50k")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./maskable_ppo_logs/PPO_25
Step 1000, 23s elapsed, 1125s remaining
Step 2000, 46s elapsed, 1110s remaining
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.47     |
|    ep_rew_mean     | 30.3     |
| time/              |          |
|    fps             | 43       |
|    iterations      | 1        |
|    time_elapsed    | 47       |
|    total_timesteps | 2048     |
---------------------------------
Step 3000, 73s elapsed, 1140s remaining
Step 4000, 96s elapsed, 1109s remaining
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.72        |
|    ep_rew_mean          | 35.1        |
| time/                   |             |
|    fps                  | 41          |
|    iterations           | 2           |
|    time_elapsed         | 98          |
|    total_timesteps      | 4096

In [7]:
# Train the already-built in-memory model for another 50,000 steps with a
# fresh timer, then write a second checkpoint.
model.learn(
    callback=TimerCallback(),
    total_timesteps=50000,
)
model.save("checkpoint_100k")

Logging to ./maskable_ppo_logs/PPO_26
Step 1000, 19s elapsed, 949s remaining
Step 2000, 39s elapsed, 932s remaining
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.66     |
|    ep_rew_mean     | 54.4     |
| time/              |          |
|    fps             | 51       |
|    iterations      | 1        |
|    time_elapsed    | 39       |
|    total_timesteps | 2048     |
---------------------------------
Step 3000, 61s elapsed, 953s remaining
Step 4000, 80s elapsed, 923s remaining
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.72        |
|    ep_rew_mean          | 55.8        |
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 2           |
|    time_elapsed         | 82          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012096993 |
|   

In [9]:
# A further 50,000 steps on the same in-memory model, followed by a third
# checkpoint.
# NOTE(review): the captured output below shows this run aborting with a
# ValueError (MaskableCategorical probs failing the Simplex() constraint),
# so the save on the last line was presumably never reached - confirm
# before relying on "checkpoint_150k".
model.learn(
    callback=TimerCallback(),
    total_timesteps=50000,
)
model.save("checkpoint_150k")

Logging to ./maskable_ppo_logs/PPO_28
Step 1000, 15s elapsed, 721s remaining
Step 2000, 29s elapsed, 703s remaining
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 6.91     |
|    ep_rew_mean     | 112      |
| time/              |          |
|    fps             | 68       |
|    iterations      | 1        |
|    time_elapsed    | 30       |
|    total_timesteps | 2048     |
---------------------------------


ValueError: Expected parameter probs (Tensor of shape (32, 2752)) of distribution MaskableCategorical(probs: torch.Size([32, 2752]), logits: torch.Size([32, 2752])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[5.9730e-04, 7.5872e-06, 1.2978e-05,  ..., 3.1609e-05, 3.1682e-05,
         3.1722e-05],
        [3.2597e-04, 3.7507e-06, 8.0732e-06,  ..., 1.7081e-05, 1.7139e-05,
         1.7166e-05],
        [5.8032e-04, 7.2927e-06, 1.2572e-05,  ..., 3.0533e-05, 3.0606e-05,
         3.0644e-05],
        ...,
        [5.0151e-04, 5.9770e-06, 1.0925e-05,  ..., 2.5611e-05, 2.5675e-05,
         2.5711e-05],
        [5.5374e-04, 6.9121e-06, 1.1983e-05,  ..., 2.9096e-05, 2.9167e-05,
         2.9205e-05],
        [5.0690e-04, 6.2201e-06, 1.1232e-05,  ..., 2.6509e-05, 2.6581e-05,
         2.6617e-05]], grad_fn=<SoftmaxBackward0>)

Let's keep going