In [128]:
# imports 
from stable_baselines3 import DQN, PPO
import gymnasium as gym
import gym_gridworlds
import matplotlib.pyplot as plt
import pandas as pd
from stable_baselines3.common.evaluation import evaluate_policy
import os

In [129]:
# vars -- run configuration shared by the cells below
env_name = "DangerMaze-6x6-v0"   # environment id, looked up as "Gym-Gridworlds/<env_name>"
save_model_name = "dangermaze"   # file stem used when saving / loading the model

# keyword arguments forwarded to gym.make
start_pos = (0, 0)
random_goals = False
no_stay = True
distance_reward = True

In [130]:
# make environment (training env, configured from the vars cell)
env = gym.make(
    f"Gym-Gridworlds/{env_name}",
    no_stay=no_stay,
    distance_reward=distance_reward,
    start_pos=start_pos,
    random_goals=random_goals,
)

# output directories
# The model directory must match the path used by the save / load cells
# ("test_envs/trained_models/"); creating a bare "trained_models" directory
# left an unused folder behind and the real target directory missing.
os.makedirs("logs/", exist_ok=True)
os.makedirs("test_envs/trained_models", exist_ok=True)

In [131]:
# Alternative algorithm (DQN), kept commented out for reference.
# model = DQN(
#     "MlpPolicy",
#     env,
#     learning_rate=1e-4,
#     # buffer_size=50000,
#     # learning_starts=1000,
#     # batch_size=32,
#     # tau=1.0,
#     gamma=0.99,
#     # train_freq=(1, "step"),
#     # gradient_steps=1,
#     # target_update_interval=250,
#     # exploration_fraction=0.1,
#     # exploration_final_eps=0.05,
#     verbose=1,
# )

# Fixed seed so that re-running the notebook from a fresh kernel repeats the
# same training run (on the same machine / library versions) instead of
# producing a different policy every time.
seed = 0

# PPO agent with an MLP policy trained on `env`.
model = PPO(
    "MlpPolicy",
    env,
    learning_rate=1e-4,
    gamma=0.999,
    ent_coef=0.01,
    seed=seed,
    verbose=1,  # print the rollout / train tables during learn()
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [132]:
# train the model; the tqdm progress bar is switched off (progress_bar=False)
n_timesteps = 100_000
print("--- Starting Training ---")
model.learn(total_timesteps=n_timesteps, progress_bar=False)
print("--- Training Finished ---")

--- Starting Training ---
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.4      |
|    ep_rew_mean     | -102     |
| time/              |          |
|    fps             | 6001     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.61         |
|    ep_rew_mean          | -104         |
| time/                   |              |
|    fps                  | 3933         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0061922176 |
|    clip_fraction        | 0.0414       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | -0.0

In [133]:
# write the trained model to test_envs/trained_models/<save_model_name>
saved_model_path = f"test_envs/trained_models/{save_model_name}"
model.save(saved_model_path)

In [134]:
# Evaluate the saved model on a fresh environment.
# The evaluation env is built with the same options as the training env
# (including start_pos and random_goals) so the reported reward measures the
# task the agent was actually trained on.
eval_env = gym.make(
    f"Gym-Gridworlds/{env_name}",
    no_stay=no_stay,
    distance_reward=distance_reward,
    start_pos=start_pos,
    random_goals=random_goals,
)
trained_model = PPO.load(f"test_envs/trained_models/{save_model_name}")
mean_reward, std_reward = evaluate_policy(trained_model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")




Mean reward: -27.17 +/- 0.00


## Notes
- Default number of timesteps = 500 per run