In [5]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback

_________________________________________________________________________________
---------------
| ENVIRONMENT |
---------------
_________________________________________________________________________________

In [6]:
env_args = {
    "id" : "FrozenLake-v1",
    "map_name" : "4x4",
    "is_slippery" : False,
    "render_mode" : "human"
}
env = Monitor(gym.make(**env_args))

_________________________________________________________________________________
-------------------
| HYPERPARAMETERS |
-------------------

The Deep Q-Network hyperparameters determine how the agent behaves and how the 
policy is updated.

Many of these values come from Q-tables. Many others were introduced for the 
neural network approach.

For more information, visit:

https://stable-baselines3.readthedocs.io/en/master/modules/dqn.html#
_________________________________________________________________________________

In [7]:

dqn_args =  {
    "policy" : "MlpPolicy", 
    "learning_rate" : 0.0007,
    "buffer_size" : 10_000,
    "learning_starts" : 100,
    "target_update_interval" : 1_000,
    "gamma" : 0.99,
    "train_freq" : 4,
    "tau" : 1.0,
    "gradient_steps" : 1,
    "exploration_fraction" : 0.1,
    "exploration_initial_eps" : 1.0,
    "exploration_final_eps" : 0.05,
    "batch_size" : 32,
    "verbose" : 1

}
model = DQN(env= env, **dqn_args)


Using cpu device
Wrapping the env in a DummyVecEnv.


_________________________________________________________________________________
------------
| TRAINING |
------------

Parameters
----------
`eval_freq` : int
    Number of training timesteps before evaluating the agent's performance
    on a fresh environment without updating the policy.

`deterministic` : bool
    Indicates whether the agent should ever act randomly or not during evaluation.

`n_eval_episodes` : int
    Number of episodes to perform evaluation before calculating the mean reward.

`total_timesteps` : int
    Determines the number of steps the agent will take before training ends.

`callback` : EvalCallback
    A callback to intermittently perform agent evaluation

`log_interval` : int 
    Determines the number of timesteps before printing training stats to 
    stdout.
_________________________________________________________________________________

In [None]:
callback_args = {
    "eval_freq" : 10_000, 
    "deterministic" : True,
    "n_eval_episodes" : 25
}
eval_callback = EvalCallback(env, **callback_args)

train_args = {
    "total_timesteps" : 1_000,
    "callback" : eval_callback,
    "log_interval" : 100
}
model.learn(**train_args)


2024-12-11 19:13:31.342 Python[81639:31890251] +[IMKClient subclass]: chose IMKClient_Modern
2024-12-11 19:13:31.342 Python[81639:31890251] +[IMKInputSession subclass]: chose IMKInputSession_Modern


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 9.83     |
|    ep_rew_mean      | 0.59     |
|    exploration_rate | 0.147    |
| time/               |          |
|    episodes         | 500      |
|    fps              | 3        |
|    time_elapsed     | 1255     |
|    total_timesteps  | 4491     |
| train/              |          |
|    learning_rate    | 0.0007   |
|    loss             | 8.22e-06 |
|    n_updates        | 1097     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 13.2     |
|    ep_rew_mean      | 0.79     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 1000     |
|    fps              | 3        |
|    time_elapsed     | 2548     |
|    total_timesteps  | 9128     |
| train/              |          |
|    learning_rate    | 0.0007   |
|    loss             | 3.79e-07 |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x16b4ea7e0>

_________________________________________________________________________________
--------------
| EVALUATION |
--------------

Performs the final model evaluation after training is complete.
_________________________________________________________________________________

In [9]:
n_eval_eps = 100
mean_rwd, rwd_std = evaluate_policy(model= model,
                                    env= env,
                                    deterministic= True,
                                    n_eval_episodes= n_eval_eps,
                                    render= True)

print(f"Final Evaluation:\n> Mean Reward: {mean_rwd}\n> Reward STD: {rwd_std}")

Final Evaluation:
> Mean Reward: 1.0
> Reward STD: 0.0


: 