In [2]:
import gymnasium as gym
import gym_gridworlds
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

## training on 4x4 quicksand gridworld (`Full-4x5-v0` env with `grid="4x4_quicksand"`)

In [3]:
# Build the quicksand gridworld: the Full-4x5 env id with its layout overridden
# by the "4x4_quicksand" grid. Add render_mode="human" to gym.make to watch it.
env = gym.make("Gym-Gridworlds/Full-4x5-v0", grid="4x4_quicksand")

# Configure the DQN agent. A fixed seed is passed so that a fresh
# "Restart Kernel -> Run All" reproduces the same training run.
model = DQN(
    "MlpPolicy",
    env,
    learning_rate=5e-4,
    buffer_size=50000,           # replay buffer capacity (transitions)
    learning_starts=1000,        # collect this many steps before the first update
    batch_size=32,
    tau=1.0,                     # 1.0 = hard copy into the target network
    gamma=0.85,                  # discount factor
    train_freq=(1, "step"),      # one training round per environment step
    gradient_steps=1,
    target_update_interval=250,  # env steps between target-network updates
    exploration_fraction=0.1,    # epsilon decays over the first 10% of training
    exploration_final_eps=0.05,  # epsilon value after the decay
    verbose=1,
    seed=0,                      # seeds the agent's RNGs and the env for reproducibility
)

# Show the action/observation spaces as a sanity check on the loaded grid.
print(env.action_space)
print(env.observation_space)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Discrete(5)
Discrete(16)


In [4]:
# Train the agent. The tqdm progress bar is switched off here
# (progress_bar=False).
n_train_steps = 50000
checkpoint_path = "dqn_gridworld_quicksand"

print("--- Starting Training ---")
model.learn(total_timesteps=n_train_steps, progress_bar=False)
print("--- Training Finished ---")

# Persist the trained agent so the evaluation cell can reload it from disk.
model.save(checkpoint_path)

--- Starting Training ---
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 27.2     |
|    ep_rew_mean      | -16.8    |
|    exploration_rate | 0.979    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 15733    |
|    time_elapsed     | 0        |
|    total_timesteps  | 109      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 32       |
|    ep_rew_mean      | -33      |
|    exploration_rate | 0.951    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 13443    |
|    time_elapsed     | 0        |
|    total_timesteps  | 256      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 36.5     |
|    ep_rew_mean      | -30.2    |
|    exploration_rate | 0.917    |
| time/               |      

### loading and evaluating model

In [None]:
# Fresh quicksand gridworld for evaluation (add render_mode="human" to watch).
eval_env = gym.make("Gym-Gridworlds/Full-4x5-v0", grid="4x4_quicksand")

# Reload the saved agent; DQN.load wraps eval_env in Monitor + DummyVecEnv.
trained_model = DQN.load("dqn_gridworld_quicksand", env=eval_env)

# Evaluate on the model's own Monitor-wrapped env instead of the raw eval_env:
# evaluate_policy warns when the env it receives has no Monitor wrapper, since
# episode rewards/lengths may then be misreported.
mean_reward, std_reward = evaluate_policy(
    trained_model,
    trained_model.get_env(),
    n_eval_episodes=10,
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Mean reward: 1.00 +/- 0.00




## training on cliffwalk

In [13]:
# Build the 4x12 cliff-walk gridworld. Add render_mode="human" to gym.make to
# watch it.
env = gym.make("Gym-Gridworlds/CliffWalk-4x12-v0")

# Configure the DQN agent. A fixed seed is passed so that a fresh
# "Restart Kernel -> Run All" reproduces the same training run.
model = DQN(
    "MlpPolicy",
    env,
    learning_rate=1e-4,
    buffer_size=50000,           # replay buffer capacity (transitions)
    learning_starts=1000,        # collect this many steps before the first update
    batch_size=32,
    tau=1.0,                     # 1.0 = hard copy into the target network
    gamma=0.9,                   # discount factor
    train_freq=(1, "step"),      # one training round per environment step
    gradient_steps=1,
    target_update_interval=250,  # env steps between target-network updates
    exploration_fraction=0.1,    # epsilon decays over the first 10% of training
    exploration_final_eps=0.05,  # epsilon value after the decay
    verbose=1,
    seed=0,                      # seeds the agent's RNGs and the env for reproducibility
)

# Show the action/observation spaces as a sanity check on the loaded grid.
print(env.action_space)
print(env.observation_space)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Discrete(5)
Discrete(48)


In [None]:
# Train the agent. The tqdm progress bar is switched off here
# (progress_bar=False).
n_train_steps = 50000
checkpoint_path = "dqn_gridworld_cliffwalk"

print("--- Starting Training ---")
model.learn(total_timesteps=n_train_steps, progress_bar=False)
print("--- Training Finished ---")

# Persist the trained agent so the evaluation cell can reload it from disk.
model.save(checkpoint_path)

--- Starting Training ---
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5.5      |
|    ep_rew_mean      | -100     |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3042     |
|    time_elapsed     | 0        |
|    total_timesteps  | 22       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 6.5      |
|    ep_rew_mean      | -100     |
|    exploration_rate | 0.995    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4672     |
|    time_elapsed     | 0        |
|    total_timesteps  | 52       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5.42     |
|    ep_rew_mean      | -100     |
|    exploration_rate | 0.994    |
| time/               |      

### loading and evaluating model


In [12]:
# Fresh cliff-walk gridworld for evaluation (add render_mode="human" to watch).
eval_env = gym.make("Gym-Gridworlds/CliffWalk-4x12-v0")

# Reload the saved agent; DQN.load wraps eval_env in Monitor + DummyVecEnv.
trained_model = DQN.load("dqn_gridworld_cliffwalk", env=eval_env)

# Evaluate on the model's own Monitor-wrapped env instead of the raw eval_env:
# evaluate_policy warns when the env it receives has no Monitor wrapper, since
# episode rewards/lengths may then be misreported.
mean_reward, std_reward = evaluate_policy(
    trained_model,
    trained_model.get_env(),
    n_eval_episodes=10,
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Mean reward: 0.00 +/- 0.00


## Notes
- Default timestep = 500 per run