In [2]:
import gym

from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import (
    notebook_login,
)

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [4]:
# Sanity-check the environment by driving it with randomly sampled actions.
env = gym.make("LunarLander-v2")
observation = env.reset()

n_random_steps = 20
for _ in range(n_random_steps):
    # Draw an action from the action space, echo it, and advance one step.
    action = env.action_space.sample()
    print(action)
    observation, reward, done, info = env.step(action)

    if not done:
        continue

    # The episode ended: report it and start a new one.
    print('done.')
    observation = env.reset()

3
2
0
3
0
3
1
0
3
0
3
0
0
0
2
3
1
1
0
1


In [5]:
# Print basic facts about the environment's observation and action spaces.
env.reset()

obs_space = env.observation_space
act_space = env.action_space

print('Observation space shape:', obs_space.shape)
print('Sample observation: ', obs_space.sample())

# NOTE(review): the label says "shape" but the value printed is `.n`
# (presumably the number of discrete actions) - confirm the wording is intended.
print('Action space shape:', act_space.n)
print('Sample action: ', act_space.sample())

Observation space shape: (8,)
Sample observation:  [ 0.57194936  0.09561267  0.26689327  1.054416    0.2426307   1.8458557
 -0.3834373   1.3069929 ]
Action space shape: 4
Sample action:  3


In [12]:
# Train a PPO agent on a vectorized LunarLander environment (4 copies),
# then save the trained model under `model_name`.
env = make_vec_env("LunarLander-v2", n_envs=4)

# Hyperparameters gathered in one place so they are easy to read and tune.
ppo_hyperparams = dict(
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.001,
)

model = PPO(policy='MlpPolicy', env=env, verbose=1, **ppo_hyperparams)

model.learn(total_timesteps=1_000_000)

model_name = "ppo-lunarlander-v2"
model.save(model_name)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.7     |
|    ep_rew_mean     | -172     |
| time/              |          |
|    fps             | 2923     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 92.9        |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 2212        |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.006750088 |
|    clip_fraction        | 0.0286      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | 0.00148     |
|    learnin

In [13]:
from stable_baselines3.common.monitor import Monitor

# Evaluate the trained agent on a separate environment instance.
# The environment is wrapped in Monitor so that evaluate_policy reads episode
# rewards and lengths from the Monitor records rather than from raw step
# outputs; without it SB3 warns that other wrappers may distort the results.
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 episodes with deterministic action selection and collect the mean
# and standard deviation of the per-episode reward.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward}")

mean_reward:267.43 +/- 15.407416146036882


In [12]:
# Log in to the Hugging Face Hub from inside the notebook; this renders an
# interactive login widget as the cell output.
# NOTE(review): presumably required before a model can be pushed to the Hub - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import gym
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor

from huggingface_sb3 import package_to_hub

# Package the trained agent and upload it to the Hugging Face Hub.
# Relies on `model` and `model_name` already existing in the notebook namespace.

# Target repository on the Hub, in "<user>/<repo>" form.
repo_id = "akghxhs55/ppo-lunarlander-v2"

# Environment the agent was trained on.
env_id = "LunarLander-v2"

# Single-environment vectorized wrapper passed to package_to_hub as `eval_env`.
# The inner environment is wrapped in Monitor so that episode rewards and
# lengths come from the Monitor records; SB3's evaluate_policy warns when the
# evaluation environment is not wrapped this way.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id))])

# Name of the algorithm, passed to package_to_hub as `model_architecture`.
model_architecture = "PPO"

commit_message = "example result"

package_to_hub(model=model,
               model_name=model_name,
               model_architecture=model_architecture,
               env_id=env_id,
               eval_env=eval_env,
               repo_id=repo_id,
               commit_message=commit_message,
)