In [None]:
# Load (or reload) the autoreload extension and select mode 2, which picks up
# edits to imported modules without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from os import getenv
# Append /usr/X11/bin/ to the PATH environment variable of this kernel.
# NOTE(review): presumably needed so X11 binaries used for rendering are found - confirm.
# Re-running this cell appends the same entry to PATH again.
%env PATH={getenv("PATH") + ":/usr/X11/bin/"}

### Step 2: Import the packages 📦



In [None]:
import gym
import pickle

from tqdm import tqdm
from pathlib import Path
from colabgymrender.recorder import Recorder
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
# Gym environment id passed to the gym.make / make_vec_env calls in this notebook.
env_name = "LunarLander-v2"

In [None]:
# Build the environment and grab the initial observation.
env = gym.make(env_name)
observation = env.reset()

# Drive the environment with five randomly sampled actions, remembering each one.
actions = []
for _ in range(5):
    action = env.action_space.sample()
    actions.append(action)

    # step() hands back the next observation, the reward, the done flag and an info object.
    observation, reward, done, info = env.step(action)

    if not done:
        continue

    # The episode is over (landed, crashed or timed out): start a new one.
    print("Environment is reset")
    observation = env.reset()

print("Actions taken:", actions)

### Docs

https://www.gymlibrary.ml/environments/box2d/lunar_lander/


In [None]:
# Environments are built with gym.make("<name_of_the_environment>") and reset before use.
env = gym.make(env_name)
env.reset()

# Report the shape of the observation space and one randomly drawn observation.
observation_space = env.observation_space
print("Observation Space Shape", observation_space.shape)
print("Sample observation", observation_space.sample())

We see from `Observation Space Shape (8,)` that the observation is a vector of size 8, where each value holds a different piece of information about the lander:
- Horizontal pad coordinate (x)
- Vertical pad coordinate (y)
- Horizontal speed (x)
- Vertical speed (y)
- Angle
- Angular speed
- Whether the left leg's contact point has touched the ground
- Whether the right leg's contact point has touched the ground


In [None]:
# Report the number of discrete actions and one randomly sampled action.
action_space = env.action_space
print("Action Space Shape", action_space.n)
print("Action Space Sample", action_space.sample())

#### Vectorized Environment
- We create a vectorized environment (a method for stacking multiple independent environments into a single environment) of 16 environments. This way, **we'll have more diverse experiences during the training.**

In [None]:
# Create the vectorized training environment: 16 independent copies of the
# environment stacked behind a single interface.
# `env` must not be rebound to a plain gym.make() instance afterwards, otherwise
# the vectorized environment is discarded without being closed and PPO trains
# on a single environment only.
env = make_vec_env(env_name, n_envs=16)

In [None]:
class TqdmCallback(BaseCallback):
    """Stable-Baselines3 callback that displays training progress in a tqdm bar.

    The bar's total is the ``total_timesteps`` value of the running ``learn()``
    call. Each callback step advances the bar by the number of parallel
    environments, so the bar counts timesteps for both single and vectorized
    training environments.
    """

    def __init__(self):
        super().__init__()
        # Created when training starts; None while no training is running.
        self.progress_bar = None

    def _on_training_start(self):
        """Open a progress bar sized to the requested number of timesteps."""
        self.progress_bar = tqdm(total=self.locals['total_timesteps'])

    def _on_step(self):
        """Advance the bar by the timesteps consumed since the previous call.

        Returns:
            True, so that training always continues.
        """
        # One callback step corresponds to one step in every parallel
        # environment, i.e. `num_envs` timesteps (1 for a single environment).
        self.progress_bar.update(self.training_env.num_envs)
        return True

    def _on_training_end(self):
        """Close the bar, tolerating the case where it was never opened."""
        if self.progress_bar is not None:
            self.progress_bar.close()
            self.progress_bar = None

In [None]:
# Settings for the PPO agent, collected in one place and passed on unchanged.
ppo_settings = dict(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=False,
)
model = PPO(**ppo_settings)

# Restore previously saved parameters into the freshly built model.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
checkpoint_path = Path('data', 'models', "263_44.pkl")
with open(checkpoint_path, "rb") as f:
    model_params = pickle.load(f)
model.set_parameters(model_params)


### Step 6: Train the PPO agent 🏃
- Let's train our agent for 500,000 timesteps; don't forget to use a GPU on Colab. It will take approximately 10 minutes, but you can use fewer timesteps if you just want to try it out.
- During the training, take a ☕ break, you deserve it 🤗

In [None]:
# Short training run of 5000 timesteps, reported through a tqdm progress bar.
# Full-length alternative: model.learn(total_timesteps=1000000)
progress_callback = TqdmCallback()
model.learn(total_timesteps=5000, callback=progress_callback)

### Step 7: Evaluate the agent 📈

When you evaluate your agent, you should not use your training environment but create an evaluation environment.

In [None]:
# Evaluate on a freshly created environment rather than the training one.
eval_env = gym.make(env_name)
evaluation = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
mean_reward, std_reward = evaluation
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

# Target: `200.20 +/- 20.80` after training for 1 million steps

In [None]:
# Alternative: model.save("data/models/{}_{}.bin".format(int(mean_reward), int(std_reward)))

# The file name encodes the integer-truncated mean and std of the evaluation reward.
params_filename = "{}_{}.pkl".format(int(mean_reward), int(std_reward))
params_path = Path("data", "models", params_filename)
with open(params_path, "wb") as f:
    serialized_params = pickle.dumps(model.get_parameters())
    f.write(serialized_params)
print("Model saved")


In [None]:
# Only wrap a new environment in a Recorder when `env` is not one already;
# this avoids creating multiple screens when the cell is re-run.
env_is_recorder = type(env).__name__ == "Recorder"
if not env_is_recorder:
    directory = "data/video"
    env = Recorder(gym.make("LunarLander-v2"), directory)

# Play a single episode, letting the model pick every action.
obs, done = env.reset(), False
while not done:
    action, _state = model.predict(obs)
    obs, reward, done, info = env.step(action)

# env.play()
