# Project - Deep Reinforcement Learning - LunarLander

In [1]:
import os
import time
import gym
import numpy as np
from stable_baselines3 import PPO, A2C, SAC, TD3, DQN
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib import QRDQN, TQC

### Visualizing the agent obtained after the hyperparameter search and training with PPO

In [5]:
# Replay the saved PPO agent on LunarLander for 10 rendered episodes.
env_id = "LunarLander-v2"
env = gym.make(env_id)
try:
    # "PPO_Best" is the checkpoint expected in the current working directory.
    model = PPO.load("PPO_Best")
    evaluate_policy(model, env, n_eval_episodes=10, render=True)
finally:
    # Always release the environment, even if loading the checkpoint or the
    # evaluation itself raises.
    env.close()

### Hyperparameter search with Optuna

In [None]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

In [None]:
# --- Study budget ---
N_TRIALS = 200  # upper bound on the number of trials
N_STARTUP_TRIALS = 5  # random sampling stops after this many trials
N_JOBS = 10  # number of parallel jobs
TIMEOUT = 15 * 60  # seconds (15 minutes)

# --- Per-trial training and evaluation budget ---
N_TIMESTEPS = 100_000  # training budget
N_EVALUATIONS = 2  # number of evaluations during training
EVAL_FREQ = N_TIMESTEPS // N_EVALUATIONS  # steps between two evaluations
N_EVAL_ENVS = 5
N_EVAL_EPISODES = 10

ENV_ID = "LunarLander-v2"

# Keyword arguments shared by every trial; sampled values are merged on top.
DEFAULT_HYPERPARAMS = dict(policy="MlpPolicy", env=ENV_ID)

In [None]:
from typing import Any, Dict
import torch
import torch.nn as nn

def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Draw one PPO hyperparameter configuration from ``trial``.

    Search space:
      * ``gamma``: ``1 - x`` with ``x`` log-uniform in [1e-4, 0.1], i.e. a
        discount factor between 0.9 and 0.9999.
      * ``n_epochs``: integer in [2, 10], log scale.
      * ``n_steps``: a power of two, 2**3 .. 2**10 (8 .. 1024).
      * ``lr``: log-uniform in [1e-5, 1].
      * ``archi``: one of "tiny", "small", "medium", "big".
      * ``activ fun``: "tanh" or "relu".

    The derived values of gamma, n_steps and n_epochs are also stored as user
    attributes on the trial, because the raw sampled parameters ("gamma",
    "exponent_n_steps") are not the values actually handed to PPO.

    Returns:
        Keyword arguments for the ``PPO`` constructor: ``n_steps``, ``gamma``,
        ``learning_rate``, ``n_epochs`` and ``policy_kwargs``.
    """
    # Name -> hidden layer sizes.
    arch_by_name = {
        "tiny": [32],
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }
    # Name -> torch activation module class.
    activation_by_name = {"tanh": nn.Tanh, "relu": nn.ReLU}

    # Sample the raw parameters (same names and order as before).
    one_minus_gamma = trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    epochs = trial.suggest_int("n_epochs", 2, 10, log=True)
    steps_exponent = trial.suggest_int("exponent_n_steps", 3, 10)
    lr = trial.suggest_float("lr", 1e-5, 1, log=True)
    arch_name = trial.suggest_categorical("archi", list(arch_by_name))
    activation_name = trial.suggest_categorical("activ fun", list(activation_by_name))

    # Convert raw samples into the values PPO consumes.
    discount = 1.0 - one_minus_gamma
    rollout_steps = 2 ** steps_exponent

    # Record the true values so they show up in the study report.
    for attr_name, attr_value in (
        ("gamma_", discount),
        ("n_steps", rollout_steps),
        ("n_epochs", epochs),
    ):
        trial.set_user_attr(attr_name, attr_value)

    policy_kwargs = {
        "net_arch": arch_by_name[arch_name],
        "activation_fn": activation_by_name[activation_name],
    }
    return {
        "n_steps": rollout_steps,
        "gamma": discount,
        "learning_rate": lr,
        "n_epochs": epochs,
        "policy_kwargs": policy_kwargs,
    }

In [None]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
    """Evaluation callback that reports results to an Optuna trial.

    On every evaluation it forwards ``last_mean_reward`` to the trial and
    records in ``is_pruned`` whether the trial asked to be pruned.
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 10,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        """Store the trial and configure the parent ``EvalCallback``.

        Args:
            eval_env: Environment used for the periodic evaluations.
            trial: Optuna trial that receives the intermediate results.
            n_eval_episodes: Episodes run per evaluation.
            eval_freq: Evaluate every ``eval_freq`` calls of the callback.
            deterministic: Forwarded to ``EvalCallback``.
            verbose: Forwarded to ``EvalCallback``.
        """
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        # Counts the evaluations done so far; used as the step in trial.report().
        self.eval_idx = 0
        # Set to True once the trial requests pruning.
        self.is_pruned = False

    def _on_step(self) -> bool:
        """Evaluate when due, report to the trial, and stop if pruned.

        Returns:
            ``False`` when the trial should be pruned (asking the caller to
            stop training), ``True`` otherwise.
        """
        evaluation_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if not evaluation_due:
            return True

        # Let the parent run the evaluation; it refreshes last_mean_reward.
        super()._on_step()
        self.eval_idx += 1
        self.trial.report(self.last_mean_reward, self.eval_idx)

        if not self.trial.should_prune():
            return True

        self.is_pruned = True
        return False

In [None]:
def objective(trial: optuna.Trial) -> float:
    """Train PPO with sampled hyperparameters and return its evaluation score.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate evaluation results.

    Returns:
        The mean reward of the last evaluation, or ``nan`` when training was
        aborted by an ``AssertionError``/``ValueError`` (typically caused by
        NaNs produced by extreme hyperparameters).

    Raises:
        optuna.exceptions.TrialPruned: If the pruner stopped the trial early.
    """
    kwargs = DEFAULT_HYPERPARAMS.copy()

    # Sample hyperparameters and update the keyword arguments
    kwargs.update(sample_ppo_params(trial))

    # Create the RL model
    model = PPO(**kwargs)

    # Create the evaluation envs for THIS trial. They must be local: trials may
    # run in parallel and each one closes its envs when it finishes.
    vec_env = make_vec_env(ENV_ID, n_envs=N_EVAL_ENVS)

    # Periodically evaluate and report the performance to the trial:
    # `N_EVAL_EPISODES` episodes every `EVAL_FREQ` steps.
    eval_callback = TrialEvalCallback(
        vec_env,
        trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
    )

    nan_encountered = False
    try:
        # Train the model
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except (AssertionError, ValueError) as e:
        # Sometimes, random hyperparams can generate NaN. Handle it here so
        # a single bad trial does not abort the whole study.
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        vec_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

In [None]:
import torch as th

# Set pytorch num threads to 1 for faster training
th.set_num_threads(1)
# Select the sampler, can be random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Warm-up is one third of the evaluations per trial (integer division).
# NOTE(review): with N_EVALUATIONS = 2 this evaluates to 0, i.e. no warm-up.
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS,
    n_warmup_steps=N_EVALUATIONS // 3,
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    # Deliberate: a manual interrupt ends the search early, and the results
    # gathered so far are still reported below.
    pass
except (AssertionError, ValueError) as e:
    # Sometimes, random hyperparams can generate NaN
    print(e)

print("Number of finished trials: ", len(study.trials))

# `study.best_trial` raises ValueError when no trial has completed (for
# example if every trial failed or was pruned); report that instead of crashing.
try:
    trial = study.best_trial
except ValueError:
    trial = None
    print("No completed trial: there is no best trial to report.")

if trial is not None:
    print("Best trial:")
    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print(f"    {key}: {value}")

# Write report. The file is named after what was actually tuned (PPO on
# LunarLander); the previous name referred to A2C on CartPole.
study.trials_dataframe().to_csv("study_results_ppo_lunarlander.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

### Training PPO with the previously found hyperparameters

In [None]:
# Setup for the final PPO training run on LunarLander.
env_id = "LunarLander-v2"
# Single (non-vectorized) environment, used later for the rendered evaluation.
env = gym.make(env_id)
# PPO settings for this run (per the section heading, the values found by the
# hyperparameter search).
n_steps = 1024
gamma = 0.9994783509590176
n_epochs = 7
# Total number of training timesteps. Kept as an int, like N_TIMESTEPS above,
# because it is passed as `total_timesteps` to `model.learn`.
budget = int(1e6)

In [None]:
# Directory for training logs; created up front if it does not exist yet.
log_dir = "data/save/"
os.makedirs(log_dir, exist_ok=True)
# Log location for this run: "<log_dir>PPO_<env id>". It is handed to PPO as
# `tensorboard_log` in the training cell.
file_name = f"PPO_{env_id}"
log_file_name = f"{log_dir}{file_name}"

In [None]:
# Measure the wall-clock time of environment creation plus training.
start_time_vec_env = time.time()

# Train on 16 environments at once: every call to `env.step()` collects
# 16 transitions.
# NOTE(review): `n_steps` is passed unchanged here; an earlier variant divided
# it by 16 to compensate for the parallel envs - confirm which one is intended.
vec_env = make_vec_env(env_id, n_envs=16)

model = PPO(
    "MlpPolicy",
    vec_env,
    n_steps=n_steps,
    gamma=gamma,
    n_epochs=n_epochs,
    tensorboard_log=log_file_name,
)
model.learn(total_timesteps=budget)

time_vec_env = time.time() - start_time_vec_env
print(f"Took {time_vec_env:.2f}s")

In [None]:
# Watch the freshly trained agent for 3 rendered episodes.
try:
    evaluate_policy(model, env, n_eval_episodes=3, render=True)
finally:
    # Release the environment even if the evaluation raises.
    env.close()

In [None]:
# Persist the trained PPO model under the name "ppo_lunarlander_opti".
# NOTE(review): the visualization cell at the top loads "PPO_Best" - presumably
# this file is renamed by hand afterwards; confirm.
model.save("ppo_lunarlander_opti")