In [3]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Train a PPO agent on LunarLander, save it, then roll the trained policy
# forward for a fixed number of vectorized steps as a smoke test.

# Run configuration, gathered here so it can be tuned in one place.
ENV_ID = "LunarLander-v2"
N_ENVS = 4
SEED = 0  # fixed so that a Restart & Run All is reproducible
TOTAL_TIMESTEPS = 1_000_000  # an int (the original passed the float 1e6)
ROLLOUT_STEPS = 1000
MODEL_PATH = "lunar_lander"

# Parallel environments
vec_env = make_vec_env(ENV_ID, n_envs=N_ENVS, seed=SEED)

try:
    model = PPO(
        policy="MlpPolicy",
        env=vec_env,
        n_steps=1024,
        batch_size=64,
        n_epochs=4,
        gamma=0.999,
        gae_lambda=0.98,
        ent_coef=0.01,
        verbose=1,
        seed=SEED,
    )
    model.learn(total_timesteps=TOTAL_TIMESTEPS)
    model.save(MODEL_PATH)

    # del model # remove to demonstrate saving and loading
    # model = PPO.load("lunar_lander")

    # Step the trained policy through the vectorized environment; the step
    # results are read but not otherwise used.
    obs = vec_env.reset()
    for step in range(1, ROLLOUT_STEPS + 1):  # leaves step == ROLLOUT_STEPS, as before
        action, _states = model.predict(obs)
        obs, rewards, dones, info = vec_env.step(action)
        # vec_env.render("human")
finally:
    # Always release the environments, even if training or the rollout fails.
    vec_env.close()

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93       |
|    ep_rew_mean     | -211     |
| time/              |          |
|    fps             | 7100     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 4096     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 93.5         |
|    ep_rew_mean          | -195         |
| time/                   |              |
|    fps                  | 5385         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 8192         |
| train/                  |              |
|    approx_kl            | 0.0027297675 |
|    clip_fraction        | 0.00128      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0.00232      

In [2]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym

# Evaluate the trained policy for 10 deterministic episodes on a fresh,
# Monitor-wrapped LunarLander environment.
# NOTE(review): `model` is not defined in this cell; it is created by the
# training cell, which must be executed first on a fresh kernel.
eval_env = Monitor(gym.make("LunarLander-v2"))
try:
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=10,
        deterministic=True,
    )
finally:
    # Release the evaluation environment even if evaluation raises.
    eval_env.close()
mean_reward

245.0336957