In [None]:
# Compare the learning curves of VPG and VPG-GAE(gamma, lambda) for different values of lambda

In [None]:
# Imports
import logging
import os
from pathlib import Path
logging.basicConfig(level=logging.INFO)
# Working directory for the notebook. Set the DQN_SMB_ROOT environment variable
# to point at your own checkout; when it is unset, the original machine-specific
# path is used so existing setups keep working.
# NOTE(review): presumably the chdir is what lets the local `agent` and
# `wrappers` modules imported below resolve -- confirm.
ROOT = Path(
    os.environ.get(
        "DQN_SMB_ROOT",
        "/Users/debugneantoine/Documents/personal/dqn-super-mario-bros/",
    )
)
os.chdir(ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render ticks, axis labels, axes edges and titles in white ("w").
# NOTE(review): presumably chosen for a dark notebook theme -- confirm.
params = dict.fromkeys(
    (
        "ytick.color",
        "xtick.color",
        "axes.labelcolor",
        "axes.edgecolor",
        "axes.titlecolor",
    ),
    "w",
)
plt.rcParams.update(params)
import torch
import gym
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from agent import VPGAgent, VPGGAEAgent, CategoricalMLP, CategoricalCNN
from wrappers import make_env


In [None]:
# Build the Gym environment and read the sizes the agents are configured with:
# the length of the first observation axis and the action count reported by `.n`.
env = gym.make("LunarLander-v2")
observation_space, action_space = env.observation_space.shape[0], env.action_space.n


In [None]:
# Experiment grid. The three lists are aligned by position: entry k of each list
# describes run k.
#   run 0 (1a)  VPG,     undiscounted (gamma = 1)
#   run 1 (1b)  VPG,     gamma = 0.99
#   run 2 (2a)  VPG-GAE, gamma = 0.99, lambda = 0
#   run 3 (2b)  VPG-GAE, gamma = 0.99, lambda = 0.9
#   run 4 (2c)  VPG-GAE, gamma = 0.99, lambda = 0.95
#   run 5 (2d)  VPG-GAE, gamma = 0.99, lambda = 0.97
#   run 6 (2e)  VPG-GAE, gamma = 0.99, lambda = 0.99

agent_classes = [
    VPGAgent,
    VPGAgent,
    VPGGAEAgent,
    VPGGAEAgent,
    VPGGAEAgent,
    VPGGAEAgent,
    VPGGAEAgent,
]
gammas = [1.0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
# None marks the plain VPG runs, which take no lambda.
lambdas = [None] * 2 + [0, 0.9, 0.95, 0.97, 0.99]

def get_base_args_vpg_gae(observation_space, action_space) -> dict:
    """Build the base keyword arguments for a VPG-GAE agent.

    Args:
        observation_space: Stored under "state_space" and used as the
            "input_shape" of both the policy and the value-function network.
        action_space: Stored under "action_space" and used as the
            "n_actions" of the policy network.

    Returns:
        A freshly built dict (new nested dicts and lists on every call) with
        the environment sizes, the policy-network settings and the
        value-function-network settings.
    """
    policy_kwargs = dict(input_shape=observation_space, n_actions=action_space, hidden_sizes=[32])
    # The value function uses the same CategoricalMLP class, with n_actions=1.
    value_kwargs = dict(input_shape=observation_space, n_actions=1, hidden_sizes=[32])

    args = {}
    # Environment params
    args["state_space"] = observation_space
    args["action_space"] = action_space
    # Policy net params
    args["lr"] = 0.02
    args["policy_net"] = CategoricalMLP
    args["policy_net_kwargs"] = policy_kwargs
    # Value func net params
    args["value_func_net"] = CategoricalMLP
    args["value_func_net_kwargs"] = value_kwargs
    args["value_func_lr"] = 0.02
    return args

def get_base_args_vpg(observation_space, action_space) -> dict:
    """Build the base keyword arguments for a plain VPG agent.

    Args:
        observation_space: Stored under "state_space" and used as the
            "input_shape" of the policy network.
        action_space: Stored under "action_space" and used as the
            "n_actions" of the policy network.

    Returns:
        A freshly built dict (new nested dict and list on every call) with the
        environment sizes and the policy-network settings.
    """
    return dict(
        # Environment params
        state_space=observation_space,
        action_space=action_space,
        # Policy net params
        lr=0.02,
        policy_net=CategoricalMLP,
        policy_net_kwargs=dict(
            input_shape=observation_space,
            n_actions=action_space,
            hidden_sizes=[32],
        ),
    )
    
# Choose the argument builder from each run's agent class, so this list follows
# `agent_classes` automatically when runs are added, removed or reordered
# (it was previously a hand-maintained parallel list).
_ARGS_FUNC_BY_AGENT = {
    VPGAgent: get_base_args_vpg,
    VPGGAEAgent: get_base_args_vpg_gae,
}
args_funcs = [_ARGS_FUNC_BY_AGENT[agent_class] for agent_class in agent_classes]


In [None]:
# Train every configured agent in turn and collect one results frame per run.
SEED = 0
run_results = []  # per-run frames; concatenated into `avg_returns` at the end

for i, (agent_class, get_args, gamma, _lambda) in enumerate(zip(agent_classes, args_funcs, gammas, lambdas)):

    print(f"Run {i}: (gamma, lambda) = ({gamma}, {_lambda})")

    # Re-seed the global numpy and torch generators as well as the environment
    # before each run. Seeding only the environment leaves the RNG state of the
    # previous run in place, so results would depend on run order and differ
    # between executions.
    # NOTE(review): the agent code is not visible here; presumably it draws
    # actions and initialises weights through torch (and possibly numpy). If it
    # also uses the stdlib `random` module, seed that too.
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Reset environment
    env.seed(SEED)
    env.reset()

    # Instantiate agent; lambda is only passed to the runs that define one
    _args = get_args(observation_space, action_space)
    _args["gamma"] = gamma
    if _lambda is not None:
        _args["value_func_lambda"] = _lambda
    _agent = agent_class(**_args)

    # Train agent on 200 epochs
    # NOTE(review): save_after_epochs=9999 exceeds num_epochs, presumably to
    # disable checkpointing -- confirm against the agent implementation.
    _infos = _agent.run(
        env, num_epochs=200, steps_per_epoch=500, save_after_epochs=9999, print_progress_after=50,
    )

    # Save avg. return per epoch. The loss column is named "loss_policy" or
    # "loss" depending on the agent; normalise it to "loss_policy" and tag the
    # rows with the run index.
    loss_col = "loss_policy" if "loss_policy" in _infos.columns else "loss"
    _res = (
        _infos[["epoch", "average_return", loss_col]]
        .drop_duplicates()
        .rename(columns={loss_col: "loss_policy"})
        .assign(exp=i)
    )
    run_results.append(_res)

# One long frame with columns: epoch, average_return, loss_policy, exp
avg_returns = pd.concat(run_results)


In [None]:
# Plot average return (left) and policy loss (right) against epoch, one line per run.
f, axes = plt.subplots(figsize=(14, 5), nrows=1, ncols=2)
legends = []

for i, (agent_class, gamma, _lambda) in enumerate(zip(agent_classes, gammas, lambdas)):
    # Name the algorithm from the run's agent class instead of its position in
    # the list, so labels stay correct if the experiment grid is edited.
    _class = "VPG-GAE" if agent_class is VPGGAEAgent else "VPG"
    legends.append(f"{_class}, (gamma, lambda) = ({gamma}, {_lambda})")

    _returns = avg_returns.loc[avg_returns["exp"] == i]
    # Attach the label to the line itself so the legend cannot be mismatched
    # with the plotting order.
    axes[0].plot(_returns["epoch"], _returns["average_return"], label=legends[-1])
    axes[1].plot(_returns["epoch"], _returns["loss_policy"], label=legends[-1])

axes[0].set_title("Avg. returns per episode")
axes[0].set_xlabel("Epoch")
axes[0].grid(axis="y", color="0.9")
axes[1].set_title("Policy loss per episode")
axes[1].set_xlabel("Epoch")
axes[1].grid(axis="y", color="0.9")
axes[1].legend()
plt.show()
