In [1]:
%%capture
import sys
if 'google.colab' in sys.modules:
    !apt install python-opengl ffmpeg xvfb
    !pip install pyvirtualdisplay
    !pip install gym[box2d] stable-baselines3[extra] huggingface_sb3 pyglet
    !pip install ale-py==0.7.4 # To overcome an issue with gym (https://github.com/DLR-RM/stable-baselines3/issues/875)
    !pip install wandb

In [2]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7fd37808cdd0>

In [12]:
import gym

from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CallbackList, EvalCallback

import wandb
from wandb.integration.sb3 import WandbCallback

In [4]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
notebook_login()
!git config --global credential.helper store

Login successful
Your token has been saved to /root/.huggingface/token


In [10]:
ENV_NAME = 'LunarLander-v2'
max_steps = int(2e6)

config = dict(
    policy_type = "MlpPolicy",
    total_timesteps = max_steps,
    seed = 124,
    n_epochs=10,
    lr = 1e-3,
    clip_range = 0.2,
    decay = True,
    gamma = 0.99,
    gae_lambda = 0.95,
    batch_size = 64,
    target_kl = None,
    ent_coef = 0.01,
    vf_coef = 0.5,
)

In [7]:
def linear_decay_sched(x, factor):
    return lambda pct: ((1/factor - 1)*(1-pct) + 1)*x

In [27]:
from stable_baselines3.common.callbacks import BaseCallback

class ReduceEntCoef(BaseCallback):

    def __init__(self, step):
        super().__init__()
        self._step = step

    def _on_step(self) -> bool:
        if self.n_calls == self._step:
            self.model.ent_coef = 0.
        return True

In [33]:
_d = "linear" if config['decay'] else 'no'
experiment_name = f"{ENV_NAME}-ppo-{config['lr']:.0e}-{_d}-decay-{config['total_timesteps']:.0e}steps"
with wandb.init(
        project="hf-deep-rl-class",
        name=f"{experiment_name}",
        group=experiment_name,
        config=config,
        sync_tensorboard=True,
        monitor_gym=True,
        save_code=False,
    ) as run:


    env = make_vec_env(ENV_NAME, n_envs=16)
    env = VecVideoRecorder(env, f"videos/{run.id}", record_video_trigger=lambda x: x>max_steps)

    model = PPO(
        config["policy_type"], 
        env,
        n_steps=1024,
        n_epochs=config["n_epochs"],
        learning_rate=linear_decay_sched(config['lr'], 10) if config["decay"] else config['lr'],
        clip_range=linear_decay_sched(config['clip_range'], 10) if config["decay"] else config['clip_range'],
        verbose=1,
        tensorboard_log=f"runs/{run.id}",
        batch_size=config["batch_size"],
        gamma = config["gamma"],
        gae_lambda = config["gae_lambda"],
        target_kl = config["target_kl"],
        ent_coef=config["ent_coef"],
        vf_coef = config["vf_coef"],
    )
    model_name = f"{ENV_NAME}-ppo"

    wandb_cb = WandbCallback(
        gradient_save_freq=100,
        model_save_path=model_name,
        verbose=2,
    )
    eval_env = DummyVecEnv([lambda: Monitor(gym.make(ENV_NAME))])
    eval_cb = EvalCallback(
        eval_env=eval_env,
        eval_freq=int(1e4),
        log_path="./log",
        best_model_save_path=model_name
    )
    ent_coef_cb = ReduceEntCoef(15e5)

    model.learn(
        total_timesteps=config["total_timesteps"],
        callback=CallbackList([wandb_cb, eval_cb, ent_coef_cb]),
    )
    model.load(f'{model_name}/best_model.zip', env)
    package_to_hub(
        model=model,
        model_name=model_name,
        model_architecture="PPO",
        env_id=ENV_NAME,
        eval_env=eval_env,
        repo_id=f"arampacha/{model_name}-2",
        commit_message=f"trained model {config['total_timesteps']:.0e} steps"
    )

Using cpu device




Logging to runs/27c28ti0/PPO_1




Saving video to /content/videos/27c28ti0/rl-video-step-0-to-step-200.mp4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.3     |
|    ep_rew_mean     | -181     |
| time/              |          |
|    fps             | 393      |
|    iterations      | 1        |
|    time_elapsed    | 41       |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 94.1        |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 572         |
|    iterations           | 2           |
|    time_elapsed         | 57          |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.010106336 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.199       |
|    entropy_loss         | -1.38       |

/content/hub/LunarLander-v2-ppo-2 is already a clone of https://huggingface.co/arampacha/LunarLander-v2-ppo-2. Make sure you pull the latest changes with `repo.git_pull()`.


Saving video to /content/-step-0-to-step-1000.mp4
[38;5;4mℹ Pushing repo LunarLander-v2-ppo-2 to the Hugging Face Hub[0m


Upload file replay.mp4:   2%|1         | 3.34k/196k [00:00<?, ?B/s]

Upload file LunarLander-v2-ppo/policy.optimizer.pth:   4%|4         | 3.34k/82.7k [00:00<?, ?B/s]

Upload file LunarLander-v2-ppo.zip:   2%|2         | 3.34k/141k [00:00<?, ?B/s]

Upload file LunarLander-v2-ppo/policy.pth:   8%|7         | 3.34k/42.1k [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/arampacha/LunarLander-v2-ppo-2
   ae67035..c2322fb  main -> main



[38;5;4mℹ Your model is pushed to the hub. You can view your model here:
https://huggingface.co/arampacha/LunarLander-v2-ppo-2[0m


VBox(children=(Label(value='1.036 MB of 1.036 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/mean_ep_length,█▅▂▂▃▁▁▁▁▁▁▁
eval/mean_reward,▁▆██▇███████
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▂▄▇█▇▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
rollout/ep_rew_mean,▁▂▃▄▅▆▆▇█▇██████████████████████████████
time/fps,▁▆▇▅▅▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████
train/approx_kl,▆█▇▃▄▃▄▆▄▃▄▄▃▃▃▄▂▃▃▂▂▂▂▂▂▂▂▂▂▃▁▁▁▁▁▁▁▁▁▁
train/clip_fraction,▂█▅▁▂▁▂▃▃▂▂▃▂▁▂▃▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▃▂▂▂▂▂▃
train/clip_range,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/entropy_loss,▁▂▃▄▄▅▆▅▅▆▆▆▇▇▇▇▇▇▇▇▇████████▇██████████

0,1
eval/mean_ep_length,170.8
eval/mean_reward,278.57394
global_step,2015232.0
rollout/ep_len_mean,169.69
rollout/ep_rew_mean,282.46021
time/fps,863.0
train/approx_kl,0.00011
train/clip_fraction,0.1169
train/clip_range,0.0201
train/entropy_loss,-0.46241
