In [151]:
# Auxiliary imports (standard library, plotting, numerics)
import sys, os, time
import matplotlib.pyplot as plt
import numpy as np

# Gym imports
import gym
from gym.vector import SyncVectorEnv

# PyTorch imports
import torch
from torch import nn, optim

# Custom imports: the project modules are looked up in the parent directory
sys.path.append(os.path.abspath('..')) # Make the parent directory importable
import ppo_network, ppo_wrapper, importlib
# Each module is reloaded before its class is imported so that edits made to
# the .py files are picked up without restarting the kernel.
importlib.reload(ppo_network) # Refresh the module cached by the kernel
from ppo_network import PPONetwork
importlib.reload(ppo_wrapper) # Refresh the module cached by the kernel
from ppo_wrapper import PPOWrapper

import hp_optimizer
importlib.reload(hp_optimizer) # Refresh the module cached by the kernel
from hp_optimizer import HPOptimizer

In [152]:
# LunarLander environment settings
env_id = 'LunarLander-v2'
max_episode_steps = 1024
num_envs = 16

# Keyword arguments forwarded to gym.make for every environment copy
env_kwargs = dict(id=env_id, max_episode_steps=max_episode_steps)

# Create vectorized environment from num_envs factories, each one calling
# gym.make(**env_kwargs), then fetch the initial observations
envs_vector = SyncVectorEnv(
    [lambda: gym.make(**env_kwargs) for _ in range(num_envs)]
)
states, infos = envs_vector.reset()

In [153]:
# Policy-Value Network configuration

# Input/output widths
# (presumably the observation size and the number of discrete actions of the
# environment -- verify against the env spaces)
input_dims, output_dims = 8, 4

# Hidden layer widths for the shared trunk and for each head
shared_hidden_dims = [512, 256, 128]
policy_hidden_dims = [128, 64]
value_hidden_dims = [128, 64]

# Normalization and activation layers, handed over as classes (not instances)
shared_norm = nn.LayerNorm
policy_norm = nn.LayerNorm
value_norm = nn.LayerNorm
activation = nn.ReLU

# Bundle everything so the same settings can be reused to build more networks
network_kwargs = dict(
    input_dims=input_dims,
    output_dims=output_dims,
    shared_hidden_dims=shared_hidden_dims,
    shared_norm=shared_norm,
    policy_hidden_dims=policy_hidden_dims,
    policy_norm=policy_norm,
    value_hidden_dims=value_hidden_dims,
    value_norm=value_norm,
    activation=activation,
)

network = PPONetwork(**network_kwargs)

# Test forward pass: step the vectorized environment 100 times with actions
# sampled from the network's policy output, and time the whole loop.
now = time.perf_counter()  # monotonic clock, appropriate for measuring intervals

# No gradients are needed for this smoke test, so skip autograd bookkeeping
with torch.no_grad():
    for _ in range(100):
        states_tensor = torch.tensor(states, dtype=torch.float32)
        policy, value = network(states_tensor)

        # The policy output is used as unnormalized logits over the actions
        actions_dist = torch.distributions.Categorical(logits=policy)
        actions = actions_dist.sample().numpy()

        states, rewards, dones, truncateds, infos = envs_vector.step(actions)

# The reported figure is the total loop time divided by num_envs
print(
    f'Elapsed time: per vectorized env: {(time.perf_counter() - now)/num_envs:.2f} s'
    )

Elapsed time: per vectorized env: 0.01 s


In [154]:
# PPO Wrapper hyperparameters
# Values with a `final_` counterpart come in (start, final) pairs; how they are
# scheduled is decided inside PPOWrapper -- verify there.
lr, final_lr = 3e-4, 1e-6

gamma, lam = 0.99, 0.975

clip_eps, final_clip_eps = 0.25, 0.01

value_coef = 0.7

entropy_coef, final_entropy_coef = 0.1, 0.01

batch_size, batch_epochs, batch_shuffle = 256, 5, True

iterations = 2048

truncated_reward = -300

debug_prints = False

# Bundle the settings so they can be reused when building other wrappers
ppo_kwargs = dict(
    num_envs=num_envs,
    lr=lr,
    final_lr=final_lr,
    gamma=gamma,
    lam=lam,
    clip_eps=clip_eps,
    final_clip_eps=final_clip_eps,
    value_coef=value_coef,
    entropy_coef=entropy_coef,
    final_entropy_coef=final_entropy_coef,
    batch_size=batch_size,
    batch_epochs=batch_epochs,
    batch_shuffle=batch_shuffle,
    iterations=iterations,
    truncated_reward=truncated_reward,
    debug_prints=debug_prints,
)

# NOTE: this rebinds `ppo_wrapper`, which the import cell binds to the module
# of the same name; re-run the import cell before reloading that module.
ppo_wrapper = PPOWrapper(envs_vector, network, **ppo_kwargs)

# Direct training call, currently disabled:
#ppo_wrapper.train(generations=50)

In [None]:
# Hyperparameter optimization
# NOTE: this rebinds `hp_optimizer`, which the import cell binds to the module
# of the same name; re-run the import cell before reloading that module.
hp_optimizer = HPOptimizer(
    env_kwargs=env_kwargs,
    num_envs=num_envs,
    network_class=PPONetwork,
    network_kwargs=network_kwargs,
    ppo_class=PPOWrapper,
    ppo_kwargs=ppo_kwargs,
)

# Names of the ppo_kwargs entries selected for the (currently disabled) search
parameters = ['clip_eps', 'value_coef', 'entropy_coef', 'batch_size', 'batch_epochs']

# Hyperparameter search, currently disabled:
# evolutions = hp_optimizer.optimize_hyperparameters(
#      parameters, generations=250, num_trials = 8,
#      )

# Run the evolution and record videos into the 'videos' folder
hp_optimizer.evolution_video(
    generations=100,
    video_folder='videos',
    increments=10,
    max_frames=max_episode_steps,
)

Running evolution with save generations: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
Generation 0	- Reward: -1082.34,	w/o trunc.: -1082.34
Generation 1	- Reward: -716.93,	w/o trunc.: -716.93
Generation 2	- Reward: -1242.10,	w/o trunc.: -1242.10
Generation 3	- Reward: -492.57,	w/o trunc.: -248.82
Generation 4	- Reward: -349.24,	w/o trunc.: -86.74
Generation 5	- Reward: -216.67,	w/o trunc.: -122.92
Generation 6	- Reward: -371.95,	w/o trunc.: -90.70
Generation 7	- Reward: -478.63,	w/o trunc.: -216.13
Generation 8	- Reward: -177.92,	w/o trunc.: -27.92
Generation 9	- Reward: 12.10,	w/o trunc.: 49.60
Generation 10	- Reward: -60.55,	w/o trunc.: 70.70
Generation 11	- Reward: 77.50,	w/o trunc.: 152.50
Generation 12	- Reward: -89.02,	w/o trunc.: 60.98
Generation 13	- Reward: -119.54,	w/o trunc.: -25.79
Generation 14	- Reward: 84.73,	w/o trunc.: 103.48
Generation 15	- Reward: -35.03,	w/o trunc.: 21.22
Generation 16	- Reward: 68.65,	w/o trunc.: 106.15
Generation 17	- Reward: -2.12,	w/o trunc.: -2