In [1]:
from pyvirtualdisplay import Display

# Start a hidden virtual display (presumably so frames can be rendered on a
# machine without a physical screen — confirm against the runtime used).
virtual_display = Display(
    visible=0,         # 0 keeps the display invisible
    size=(1400, 900),  # size of the virtual screen
)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f6620d29340>

In [2]:
import gym

from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor

In [3]:
import gym

# Short random-action rollout in LunarLander-v2, used only to show the
# reset/step interaction loop. The environment is closed at the end so its
# resources are released even if a step raises.
env = gym.make("LunarLander-v2")

try:
    # reset() hands back the first observation of a new episode
    observation = env.reset()

    for _ in range(20):
        # Take a random action
        action = env.action_space.sample()
        print("Action taken:", action)

        # Apply the action; step() returns the next observation, the reward,
        # the episode-finished flag and an info object
        observation, reward, done, info = env.step(action)

        # If the episode is over (landed, crashed or timed out), start a new one
        if done:
            print("Environment is reset")
            observation = env.reset()
finally:
    # This env is only used for the demo above; later cells create their own
    env.close()

Action taken: 0
Action taken: 2
Action taken: 1
Action taken: 2
Action taken: 0
Action taken: 2
Action taken: 2
Action taken: 3
Action taken: 3
Action taken: 0
Action taken: 1
Action taken: 0
Action taken: 1
Action taken: 3
Action taken: 0
Action taken: 1
Action taken: 3
Action taken: 1
Action taken: 3
Action taken: 0


In [4]:
# Build a fresh LunarLander-v2 environment (gym.make takes the environment
# name) and look at what an observation looks like.
env = gym.make("LunarLander-v2")
env.reset()

print("_____OBSERVATION SPACE_____ \n")
observation_space = env.observation_space
print("Observation Space Shape", observation_space.shape)
# sample() draws a random observation from the space, not from the simulator
print("Sample observation", observation_space.sample())

_____OBSERVATION SPACE_____ 

Observation Space Shape (8,)
Sample observation [ 1.0956166   0.08822614 -0.27322415 -0.9062121   1.4270515  -0.6515286
  0.03129386  0.44468513]


In [5]:
# Inspect the action space of the environment created in the previous cell.
print("\n _____ACTION SPACE_____ \n")
action_space = env.action_space
print("Action Space Shape", action_space.n)  # number of available actions
print("Action Space Sample", action_space.sample())  # one randomly chosen action


 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 2


In [6]:
# Training environment: 16 LunarLander-v2 instances wrapped into a single
# vectorized environment. This rebinds `env` from the single env used above.
env = make_vec_env(env_id="LunarLander-v2", n_envs=16)

In [7]:
# PPO agent with an MLP policy, bound to the vectorized training environment.
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,     # with 16 envs the log shows 16384 timesteps per iteration
    batch_size=64,
    n_epochs=4,       # the log's n_updates grows by 4 per iteration
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,        # print the training log tables
)

Using cuda device


In [8]:
# Name under which the trained agent is written to disk
model_name = "ppo-LunarLander-v2"

# Train for 1,000,000 timesteps, then persist the agent
model.learn(total_timesteps=1_000_000)
model.save(model_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.3     |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 3267     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 94.2        |
|    ep_rew_mean          | -154        |
| time/                   |             |
|    fps                  | 2423        |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.009077182 |
|    clip_fraction        | 0.0432      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -0.0013     |
|    learning_rate        | 0.

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 208          |
|    ep_rew_mean          | -26.3        |
| time/                   |              |
|    fps                  | 1897         |
|    iterations           | 11           |
|    time_elapsed         | 94           |
|    total_timesteps      | 180224       |
| train/                  |              |
|    approx_kl            | 0.0084845815 |
|    clip_fraction        | 0.0562       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.2         |
|    explained_variance   | -0.00131     |
|    learning_rate        | 0.0003       |
|    loss                 | 278          |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.00211     |
|    value_loss           | 591          |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 538          |
|    ep_rew_mean          | 39.3         |
| time/                   |              |
|    fps                  | 1337         |
|    iterations           | 21           |
|    time_elapsed         | 257          |
|    total_timesteps      | 344064       |
| train/                  |              |
|    approx_kl            | 0.0042040916 |
|    clip_fraction        | 0.0327       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.18        |
|    explained_variance   | 0.876        |
|    learning_rate        | 0.0003       |
|    loss                 | 38.6         |
|    n_updates            | 80           |
|    policy_gradient_loss | -0.000901    |
|    value_loss           | 99.2         |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 851          |
|    ep_rew_mean          | 91           |
| time/                   |              |
|    fps                  | 1166         |
|    iterations           | 31           |
|    time_elapsed         | 435          |
|    total_timesteps      | 507904       |
| train/                  |              |
|    approx_kl            | 0.0046630716 |
|    clip_fraction        | 0.0346       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | 0.975        |
|    learning_rate        | 0.0003       |
|    loss                 | 13.7         |
|    n_updates            | 120          |
|    policy_gradient_loss | -0.00165     |
|    value_loss           | 25.2         |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 926          |
|    ep_rew_mean          | 131          |
| time/                   |              |
|    fps                  | 1100         |
|    iterations           | 41           |
|    time_elapsed         | 610          |
|    total_timesteps      | 671744       |
| train/                  |              |
|    approx_kl            | 0.0044086524 |
|    clip_fraction        | 0.0371       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.98        |
|    explained_variance   | 0.986        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.33         |
|    n_updates            | 160          |
|    policy_gradient_loss | 0.000208     |
|    value_loss           | 15.9         |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 928         |
|    ep_rew_mean          | 146         |
| time/                   |             |
|    fps                  | 1077        |
|    iterations           | 51          |
|    time_elapsed         | 775         |
|    total_timesteps      | 835584      |
| train/                  |             |
|    approx_kl            | 0.004552711 |
|    clip_fraction        | 0.0155      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.885      |
|    explained_variance   | 0.965       |
|    learning_rate        | 0.0003      |
|    loss                 | 7.54        |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00108    |
|    value_loss           | 46.1        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 926   

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 799          |
|    ep_rew_mean          | 206          |
| time/                   |              |
|    fps                  | 1075         |
|    iterations           | 61           |
|    time_elapsed         | 929          |
|    total_timesteps      | 999424       |
| train/                  |              |
|    approx_kl            | 0.0035351098 |
|    clip_fraction        | 0.043        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.66        |
|    explained_variance   | 0.904        |
|    learning_rate        | 0.0003       |
|    loss                 | 44.8         |
|    n_updates            | 240          |
|    policy_gradient_loss | -0.00163     |
|    value_loss           | 182          |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

In [9]:
#@title
# Evaluate the current `model` on a separate, single LunarLander-v2 environment.
# The env is wrapped in Monitor so evaluate_policy reads the episode rewards
# and lengths recorded by the wrapper (SB3 warns when the wrapper is missing).
eval_env = Monitor(gym.make("LunarLander-v2"))

# 10 episodes with deterministic actions; returns mean and std of episode reward
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")



mean_reward=255.69 +/- 20.244327934762797


In [10]:
# Log in to the Hugging Face account so models can be uploaded to the Hub.
notebook_login()
# Shell escape: set git's global credential helper to "store".
!git config --global credential.helper store

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/ankdesh/.huggingface/token
Login successful


In [11]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

# ---- Settings for publishing the trained agent to the Hugging Face Hub ----

# Name of the environment the agent was trained on
env_id = "LunarLander-v2"

# Model architecture that was used
model_architecture = "PPO"

# Id of the target model repository on the Hub, in the form
# {organization}/{repo_name} (for instance ThomasSimonini/ppo-LunarLander-v2).
# Replace it with your own repo id; you cannot push to someone else's.
repo_id = "ankdesh/ppo-LunarLander-v2"

# Commit message attached to the upload
commit_message = "Upload PPO LunarLander-v2 trained agent"

# Evaluation environment: one env instance wrapped in a DummyVecEnv
eval_env = DummyVecEnv([lambda: gym.make(env_id)])

# Per its own banner, package_to_hub saves and evaluates the agent, records a
# video, creates a model card and pushes everything to the Hub. It is kept as
# the last expression so the returned URL is displayed.
package_to_hub(
    model=model,                            # the trained model
    model_name=model_name,                  # name of the trained model
    model_architecture=model_architecture,  # architecture used (PPO here)
    env_id=env_id,                          # name of the environment
    eval_env=eval_env,                      # evaluation environment
    repo_id=repo_id,                        # Hub repository id
    commit_message=commit_message,
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m




Saving video to /tmp/tmp0y4nbezv/-step-0-to-step-1000.mp4


ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

[38;5;4mℹ Pushing repo ankdesh/ppo-LunarLander-v2 to the Hugging Face Hub[0m
[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/ankdesh/ppo-LunarLander-v2/tree/main/[0m


'https://huggingface.co/ankdesh/ppo-LunarLander-v2/tree/main/'

In [12]:
from huggingface_sb3 import load_from_hub
# Hub repository and file to download.
# NOTE: this rebinds `repo_id` (previously the upload target) and, below,
# `model` (previously the locally trained agent).
repo_id = "Classroom-workshop/assignment2-omar"
filename = "ppo-LunarLander-v2.zip"

# A model trained on Python 3.8 is pickled with protocol 5, while
# Python 3.6 and 3.7 use protocol 4. To stay compatible:
#   1. pickle5 is installed (done at the beginning of the colab), and
#   2. the overrides below are passed to PPO.load() as custom_objects.
custom_objects = dict(
    learning_rate=0.0,
    lr_schedule=lambda _: 0.0,
    clip_range=lambda _: 0.0,
)

# Download the checkpoint file, then restore the agent from it
checkpoint = load_from_hub(repo_id, filename)
model = PPO.load(checkpoint, custom_objects=custom_objects, print_system_info=True)

Downloading:   0%|          | 0.00/146k [00:00<?, ?B/s]

== CURRENT SYSTEM INFO ==
OS: Linux-5.4.0-135-generic-x86_64-with-glibc2.29 #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022
Python: 3.8.10
Stable-Baselines3: 1.6.2
PyTorch: 1.13.0+cu117
GPU Enabled: True
Numpy: 1.23.5
Gym: 0.21.0

== SAVED MODEL SYSTEM INFO ==
OS: Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic #1 SMP Sun Apr 24 10:03:06 PDT 2022
Python: 3.7.13
Stable-Baselines3: 1.5.0
PyTorch: 1.11.0+cu113
GPU Enabled: True
Numpy: 1.21.6
Gym: 0.21.0



In [13]:
#@title
# Evaluate the current `model` on a separate, single LunarLander-v2 environment.
# The env is wrapped in Monitor so evaluate_policy reads the episode rewards
# and lengths recorded by the wrapper (SB3 warns when the wrapper is missing).
eval_env = Monitor(gym.make("LunarLander-v2"))

# 10 episodes with deterministic actions; returns mean and std of episode reward
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")



mean_reward=302.82 +/- 19.110658853012456
