In [None]:
# Enable autoreload so edited project modules are re-imported automatically
# before each cell runs.
%reload_ext autoreload
%autoreload 2

from os import getenv
from platform import system

# On macOS, append the X11 binaries directory to PATH for this kernel
# (presumably needed for rendering through XQuartz -- confirm).
if system() == "Darwin":
    %env PATH={getenv("PATH") + ":/usr/X11/bin/"}

# Step up one directory unless the current working directory already ends
# with "deep-rl-class", so that the relative "data/..." paths used later
# resolve from there.
current_jupyter_path = %pwd # type: ignore
if not current_jupyter_path.endswith("deep-rl-class"):
    %cd ..
# type: ignore

### Step 2: Import the packages 📦



In [None]:
import gym
import pickle
import numpy
from tqdm import tqdm
from multiprocessing import cpu_count
from pathlib import Path
import matplotlib.pyplot as plt
from colabgymrender.recorder import Recorder

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, X_EPISODES, window_func
from stable_baselines3.common.vec_env import SubprocVecEnv

from logic.callbacks import TqdmCallback, SaveOnBestTrainingRewardCallback

In [None]:
# Gym environment id; reused below for gym.make, make_vec_env, the save callback and the plot title.
env_name = "LunarLander-v2"

In [None]:
# Smoke-test the environment by playing a handful of random actions.
env = gym.make(env_name)
observation = env.reset()

actions = []
while len(actions) < 5:
    # Pick a random action and remember it
    action = env.action_space.sample()
    actions.append(action)

    # Advance the environment by one step; it hands back
    # the next state, the reward, the done flag and an info dict
    observation, reward, done, info = env.step(action)

    # The episode is over (landed, crashed or timed out): start a new one
    if done:
        print("Environment is reset")
        observation = env.reset()

print("Actions taken:", actions)

### Docs

https://www.gymlibrary.ml/environments/box2d/lunar_lander/


In [None]:
# We create our environment with gym.make("<name_of_the_environment>")
env = gym.make(env_name)
env.reset()

print("Observation Space Shape", env.observation_space.shape)
print("Sample observation", env.observation_space.sample()) # Get a random observation

We see with `Observation Space Shape (8,)` that the observation is a vector of size 8, where each value holds a different piece of information about the lander:
- Horizontal pad coordinate (x)
- Vertical pad coordinate (y)
- Horizontal speed (x)
- Vertical speed (y)
- Angle
- Angular speed
- Whether the left leg is in contact with the ground
- Whether the right leg is in contact with the ground


In [None]:
# Report the `n` attribute of the action space, then draw one random action
action_space = env.action_space
print("Action Space Shape", action_space.n)
print("Action Space Sample", action_space.sample())

## Create Environment
- More envs, more diverse experiences during the training
- One environment is created per CPU core; `SubprocVecEnv` runs them in separate processes whenever more than one core is available

In [None]:
# Folder that receives the Monitor log files (read back later for plotting)
log_dir = Path("data", "monitor")
log_dir.mkdir(parents=True, exist_ok=True)
monitor_dir = log_dir.as_posix()

# One environment per CPU core (set n_envs = 1 to force a single environment)
n_envs = cpu_count()
if n_envs != 1:
    # Several environments: vectorise them with SubprocVecEnv
    env = make_vec_env(
        env_name,
        n_envs=n_envs,
        vec_env_cls=SubprocVecEnv,
        monitor_dir=monitor_dir,
    )
else:
    # Single environment: a plain Monitor-wrapped env is enough
    env = Monitor(gym.make(env_name), monitor_dir)

print("Using {} envs".format(n_envs))

In [None]:
# PPO hyper-parameters, grouped so they are easy to spot and tweak
ppo_hyperparameters = dict(
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)
model = PPO(policy="MlpPolicy", env=env, verbose=False, **ppo_hyperparameters)

# Warm-start from a previously saved parameter file when it is present.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
best_model_path = Path("data", "models", "303_14.pkl")
if best_model_path.exists():
    print("Loading best model")
    with best_model_path.open("rb") as f:
        model_params = pickle.load(f)
    model.set_parameters(model_params)


## Train model

In [None]:
# Training budget: ten rollouts' worth of steps
total_timesteps = 10 * model.n_steps

# Progress bar callback plus a callback that saves on best training reward.
# NOTE(review): check_freq (20 * n_steps) is larger than total_timesteps
# (10 * n_steps) -- confirm the save callback can trigger in a run this short.
c1 = TqdmCallback(n_envs=n_envs)
best_reward_callback = SaveOnBestTrainingRewardCallback(
    check_freq=20 * model.n_steps,
    env_name=env_name,
    verbose=0,
)
callbacks = [c1, best_reward_callback]

results = model.learn(total_timesteps=total_timesteps, callback=callbacks)

## Plot progress

In [None]:
# Read the Monitor logs back and keep only the episodes whose cumulative
# length (column `l`) still fits inside the training budget.
monitor_frame = load_results(log_dir)
within_budget = monitor_frame["l"].cumsum() <= total_timesteps
data_frame = monitor_frame[within_budget]

data_frames = [data_frame]

In [None]:
# Number of episodes averaged by the rolling mean drawn over the raw rewards
rolling_window_size = 100  # total_timesteps//1000

# One (x, y) pair per data frame: x is the episode index, y the episode reward
xy_list = [ts2xy(data_frame, X_EPISODES) for data_frame in data_frames]

plt.figure(env_name, figsize=(8, 2))
min_x = 0
# Skip empty series and fall back to 0, so the cell does not raise an
# IndexError when no episode has been logged yet
max_x = max((x[-1] for x, _ in xy_list if len(x)), default=0)
for x, y in xy_list:
    # Raw reward of every episode
    plt.scatter(x, y, s=2)
    # The rolling mean needs at least one full window of episodes
    if x.shape[0] >= rolling_window_size:
        x, y_mean = window_func(x, y, rolling_window_size, numpy.mean)
        plt.plot(x, y_mean, "r")
plt.xlim(min_x, max_x)
plt.title(env_name)
# The x axis comes from X_EPISODES, i.e. it counts episodes, not epochs
plt.xlabel("Episodes")
plt.ylabel("Rewards")
plt.tight_layout()

## Evaluate model

When you evaluate your agent, you should not use your training environment but create an evaluation environment.

In [None]:
# Evaluate on a dedicated environment, separate from the training one.
# It is wrapped in Monitor so that evaluate_policy reads the true episode
# rewards/lengths and does not warn about an unwrapped evaluation env.
eval_env = Monitor(gym.make(env_name))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Persist the trained parameters as "<mean_reward>_<std_reward>.pkl"
# (both truncated to integers) under data/models.
models_dir = Path("data", "models")
# Create the folder first: opening a file for writing fails if it is missing
models_dir.mkdir(parents=True, exist_ok=True)

model_path = models_dir / "{}_{}.pkl".format(int(mean_reward), int(std_reward))
with model_path.open("wb") as f:
    pickle.dump(model.get_parameters(), f)
print("Model saved")


## Record a test run

If this triggers an error, please make sure that you have launched XQuartz from the terminal.

In [None]:
export_test_run = False

# avoids creating multiple screens
if type(env).__name__ != "Recorder":
    env = gym.make(env_name)
    directory = Path("data", "video")
    env = Recorder(env, directory.as_posix())

obs = env.reset()
done = False
while not done:
    action, _state = model.predict(obs)
    obs, reward, done, info = env.step(action)

# This exports makes sure that the video is encoded properly (that is why is take more time)
if export_test_run:
    env.play()
    !mv "__temp__.mp4" "data/video/test_run_recording.mp4"