<a href="https://colab.research.google.com/github/amitdamri/HuggingFace-DRL/blob/main/ppo_LunarLander_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Downloads

In [None]:
!apt install swig cmake

In [None]:
# Install the Python packages listed in the remote requirements-unit1.txt file.
!pip install -r https://huggingface.co/spaces/ThomasSimonini/temp-space-requirements/raw/main/requirements/requirements-unit1.txt

In [None]:
!sudo apt-get update
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

In [None]:
!pip install importlib-metadata==4.12.0 # To overcome an issue with importlib-metadata https://stackoverflow.com/questions/73929564/entrypoints-object-has-no-attribute-get-digital-ocean
!pip install gym[box2d]
!pip install gym[Atari]
!pip install stable-baselines3[extra]
!pip install huggingface_sb3
!pip install pyglet
!pip install ale-py==0.7.4 # To overcome an issue with gym (https://github.com/DLR-RM/stable-baselines3/issues/875)

!pip install pickle5

In [None]:
# Install gym and pyvirtualdisplay; stdout and stderr are both discarded,
# so a failed install will not be visible here.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# Install the display/rendering system packages non-interactively (-y),
# again with all output discarded.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# Pinned versions of the Colab recorder helper and imageio.
!pip install colabgymrender==1.0.2
!pip install imageio==2.4.1

In [None]:
import os

# Send signal 9 to this kernel's own process, which terminates it.
# Presumably done so the runtime restarts and picks up the packages
# installed above -- confirm.
kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [2]:
# Virtual display
from pyvirtualdisplay import Display

# Start a non-visible 1400x900 display for rendering.
DISPLAY_SIZE = (1400, 900)
virtual_display = Display(visible=0, size=DISPLAY_SIZE)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7fa507ad86a0>

In [3]:
import gym

from huggingface_hub import notebook_login
from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv


# PPO

### Train the PPO model

In [None]:
# Create a vectorized environment made of 16 LunarLander-v2 copies.
env = make_vec_env('LunarLander-v2', n_envs=16)

# Create the PPO model with an MLP policy.
# With n_steps=1024 and 16 envs, each iteration of the training log
# advances total_timesteps by 16 * 1024 = 16384.
model = PPO(
    policy='MlpPolicy',
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)

# Train the model. total_timesteps is a step count, so pass an int
# (the literal 1e6 is a float) -- consistent with the DQN cell's int(1e5).
model.learn(total_timesteps=int(1e6), progress_bar=True)

# Save the trained model under this name in the working directory.
model_name = "ppo-LunarLander-v2"
model.save(model_name)

Using cuda device


Output()

  self.pbar = tqdm(total=self.locals["total_timesteps"] - self.model.num_timesteps)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.3     |
|    ep_rew_mean     | -172     |
| time/              |          |
|    fps             | 2105     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 92.8         |
|    ep_rew_mean          | -147         |
| time/                   |              |
|    fps                  | 1713         |
|    iterations           | 2            |
|    time_elapsed         | 19           |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0053383047 |
|    clip_fraction        | 0.0463       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.00102      |
|    learning_r

### Train PPO Model Optimized

In [29]:
from stable_baselines3.common.callbacks import EvalCallback

# Create the training environment (16 copies of LunarLander-v2).
n_envs = 16

env = make_vec_env('LunarLander-v2', n_envs=n_envs)

# Create the evaluation envs (kept separate from the training envs).
eval_envs = make_vec_env('LunarLander-v2', n_envs=5)

# Adjust evaluation interval depending on the number of envs:
# the target of 1e5 is divided by n_envs, with a floor of 1.
eval_freq = int(1e5)
eval_freq = max(eval_freq // n_envs, 1)

# Create evaluation callback to save best model
# and monitor agent performance. The best model is written under ./logs/.
eval_callback = EvalCallback(
    eval_envs,
    best_model_save_path="./logs/",
    eval_freq=eval_freq,
    n_eval_episodes=10,
)

# Create the PPO model (same hyperparameters as the first run except n_epochs=8).
model = PPO(
    policy='MlpPolicy',
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=8,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)

# Train the model. total_timesteps is a step count, so pass an int
# (the literal 5e6 is a float).
model.learn(total_timesteps=int(5e6), callback=eval_callback)

# Save the final model. This is the model at the end of training, which is
# not necessarily the best one kept by the callback. The file name matches
# the one used by the first PPO run, so that file is overwritten.
model_name = "ppo-LunarLander-v2"
model.save(model_name)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 262          |
|    ep_rew_mean          | 273          |
| time/                   |              |
|    fps                  | 806          |
|    iterations           | 85           |
|    time_elapsed         | 1726         |
|    total_timesteps      | 1392640      |
| train/                  |              |
|    approx_kl            | 0.0049238214 |
|    clip_fraction        | 0.0537       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.667       |
|    explained_variance   | 0.995        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.974        |
|    n_updates            | 672          |
|    policy_gradient_loss | -0.000784    |
|    value_loss           | 4.66         |
--------------------------------

### Evaluate the model

In [13]:
# Build a fresh, single LunarLander-v2 environment used only for evaluation.
eval_env = gym.make('LunarLander-v2')

# Run 10 deterministic evaluation episodes and unpack the reward statistics.
evaluation_stats = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
mean_reward, std_reward = evaluation_stats

# Report mean and standard deviation of the episode reward.
print("Mean reward %0.2f, Std reward %0.2f" % (mean_reward, std_reward))



Mean reward 288.08, Std reward 35.44


### Publish the trained model to the hub

In [4]:
# Log this notebook in to the Hugging Face Hub (asks for an access token).
notebook_login()
# Configure git to use the "store" credential helper globally.
!git config --global credential.helper store

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [7]:
# List the contents of the current working directory.
!ls

rl-baselines3-zoo  sample_data


In [11]:
import os

# Load the best checkpoint kept by the EvalCallback used during training,
# which was configured with best_model_save_path="./logs/".
# The path relative to the working directory is tried first so the notebook
# works when run top to bottom; the original absolute Colab path is kept as
# a fallback for sessions where training ran inside rl-baselines3-zoo/.
best_model_candidates = [
    "./logs/best_model.zip",
    "/content/rl-baselines3-zoo/logs/best_model.zip",
]
best_model_path = next(
    (path for path in best_model_candidates if os.path.exists(path)),
    best_model_candidates[0],  # let PPO.load report the missing file
)
model = PPO.load(best_model_path)

In [14]:
# package_to_hub saves and evaluates the model, generates a model card,
# records a replay video of the agent, and then pushes the repo to the hub.
package_to_hub(
    # Our trained model
    model=model,
    # The name of our trained model
    model_name="ppo-LunarLander-v2",
    # The model architecture we used: in our case PPO
    model_architecture="PPO",
    # Name of the environment
    env_id="LunarLander-v2",
    # Evaluation environment
    eval_env=eval_env,
    # Id of the model repository on the Hugging Face Hub, in the form
    # {organization}/{repo_name} (for instance ThomasSimonini/ppo-LunarLander-v2)
    repo_id="LuniLand/ppo-LunarLander-v2",
    commit_message="Push to Hub",
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m
Saving video to /tmp/tmpjx48o1ze/-step-0-to-step-1000.mp4
[38;5;4mℹ Pushing repo LuniLand/ppo-LunarLander-v2 to the Hugging Face Hub[0m
[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/LuniLand/ppo-LunarLander-v2/tree/main/[0m


'https://huggingface.co/LuniLand/ppo-LunarLander-v2/tree/main/'

### Load a trained model

In [None]:
# Repository and file of the pretrained agent to download from the Hub.
repo_id = "ThomasSimonini/ppo-LunarLander-v2"
filename = "ppo-LunarLander-v2.zip"

# Entries handed to PPO.load as custom_objects: each one is a function that
# ignores its argument and returns 0.0.
custom_objects = dict(
    learning_rate=lambda _: 0.0,
    lr_schedule=lambda _: 0.0,
    clip_range=lambda _: 0.0,
)

# Fetch the checkpoint, then load it and print system information.
checkpoint = load_from_hub(repo_id, filename)
model = PPO.load(
    checkpoint,
    custom_objects=custom_objects,
    print_system_info=True,
)

Evaluate this model

In [None]:
# Evaluate the downloaded agent for 10 deterministic episodes on a fresh env.
eval_env = gym.make('LunarLander-v2')
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean reward {mean_reward:.2f} +/- {std_reward:.2f}")




mean reward 261.28 +/- 21.97


Watch the agent in action

In [None]:
from stable_baselines3.common.vec_env import VecFrameStack
from colabgymrender.recorder import Recorder

# Single-environment vectorized env, wrapped in a frame stack of size one.
env = VecFrameStack(make_vec_env('LunarLander-v2', n_envs=1), n_stack=1)

# Wrap the env in the recorder, which is given ./video as its directory.
directory = './video'
env = Recorder(env, directory)

# Step the agent until the environment reports done, then play the recording.
obs = env.reset()
done = False
while True:
    action, _state = model.predict(obs)
    obs, reward, done, info = env.step(action)
    if done:
        break
env.play()

# DQN

In [None]:
from stable_baselines3 import DQN

# Vectorized training environment with 16 copies of LunarLander-v2.
env = make_vec_env('LunarLander-v2', n_envs=16)

# DQN agent with an MLP policy; exploration_final_eps and
# target_update_interval are overridden, everything else uses defaults.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    exploration_final_eps=0.1,
    target_update_interval=250,
)

# Train for 100k timesteps with a progress bar, then save the model.
dqn_training_steps = int(1e5)
model.learn(total_timesteps=dqn_training_steps, progress_bar=True)
model_name = "dqn-LunarLander-v2"
model.save(model_name)

  self.pbar = tqdm(total=self.locals["total_timesteps"] - self.model.num_timesteps)



Output()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.42     |
|    n_updates        | 687      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 117      |
|    ep_rew_mean      | -218     |
|    exploration_rate | 0.103    |
| time/               |          |
|    episodes         | 920      |
|    fps              | 3262     |
|    time_elapsed     | 28       |
|    total_timesteps  | 94464    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.83     |
|    n_updates        | 694      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 115      |
|    ep_rew_mean      | -206     |
|    exploration_rate | 0.1      |
| time/               |          |
|    episodes         | 9

In [None]:
# Fresh, single environment used only for evaluation.
eval_env = gym.make('LunarLander-v2')

# Evaluate on eval_env rather than on the 16-env training `env`, so the
# evaluation environment created above is actually used and the training
# environments are left untouched (same pattern as the PPO evaluation cells).
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"mean reward {mean_reward:.2f} +/- {std_reward:.2f}")

DQN RL ZOO Optimized

In [None]:
# Install the rl_zoo3 package (used by the push_to_hub command further down).
!pip install rl_zoo3

In [None]:
# Clone the RL Baselines3 Zoo repository into the current directory.
!git clone https://github.com/DLR-RM/rl-baselines3-zoo

In [None]:
# Change the notebook's working directory to the cloned repository.
# NOTE: every relative path used after this cell resolves inside rl-baselines3-zoo/.
%cd rl-baselines3-zoo/

In [None]:
# Train DQN on LunarLander-v2 with the zoo's train.py script.
# Presumably evaluates for 10 episodes every 10000 steps -- confirm against the zoo docs.
!python train.py --algo dqn --env LunarLander-v2 --eval-episodes 10 --eval-freq 10000

In [None]:
# Run the zoo's enjoy.py script for the DQN LunarLander-v2 agent, pointing it
# at the logs/ folder with experiment id 0.
!python enjoy.py --algo dqn --env LunarLander-v2 -f logs/ --exp-id 0

In [None]:
# Log in to the Hugging Face Hub before pushing the zoo-trained model.
notebook_login()

In [None]:
# Push the DQN LunarLander-v2 agent found under logs/ to the Hub via rl_zoo3,
# using organization LuniLand, repo name dqn-LunarLander-v2 and the given commit message.
!python -m rl_zoo3.push_to_hub --algo dqn --env LunarLander-v2 -f logs/ -orga LuniLand -m "Upload LunarLander-v2 optimized" --repo-name dqn-LunarLander-v2