In [7]:
# Auxiliary imports
import sys, os, time, importlib
import matplotlib.pyplot as plt
import numpy as np

# Gym imports
import gym
from gym.vector import SyncVectorEnv

# PyTorch imports
import torch
from torch import nn, optim

# Custom imports
# The parent of the current working directory is appended to sys.path so the
# modules below can be imported (presumably ppo_network.py, ppo.py and
# hp_tuner.py live there — TODO confirm). Note that '..' is resolved against
# the working directory of the kernel, not against the notebook file.
sys.path.append(os.path.abspath('..')) # Add parent directory to path

# Each module follows the same three steps, and the order matters:
#   1. import the module, 2. reload it so edits made on disk are picked up,
#   3. only then bind the class name, so it comes from the reloaded module.
import ppo_network
importlib.reload(ppo_network) # Prevents caching issues with notebooks
from ppo_network import PPONetworkContinuous

import ppo
importlib.reload(ppo) # Prevents caching issues with notebooks
from ppo import PPOContinuous

import hp_tuner # NOTE(review): the earlier TODO said "Rename to hp_tuner", but the module already has that name — confirm nothing else still needs renaming
importlib.reload(hp_tuner) # Prevents caching issues with notebooks
from hp_tuner import HPOptimizer

In [8]:
# BipedalWalker environment
# Settings shared by later cells: `num_envs` sizes the vectorized environment,
# `max_episode_steps` is also reused as the frame limit for video export.
env_id = 'BipedalWalker-v3'
max_episode_steps = 1024
num_envs = 16

# Keyword arguments that are unpacked into gym.make(...).
env_kwargs = dict(
    id=env_id,
    max_episode_steps=max_episode_steps,
)


# Create vectorized environment
# One factory per environment slot; SyncVectorEnv calls each factory to build
# its own gym environment from the shared env_kwargs.
env_fns = [lambda: gym.make(**env_kwargs) for _ in range(num_envs)]
envs_vector = SyncVectorEnv(env_fns)

# Initial batch of observations (and info dicts) for all environments.
states, infos = envs_vector.reset()

In [9]:
# Policy-Value Network
# TODO - Move to PPO-kwargs
# Configuration for PPONetworkContinuous: a shared trunk followed by three
# heads (mean, log-variance, value). Each part gets its hidden layer sizes,
# a normalization layer class and an activation class.
input_dims = 24
output_dims = 4

# Shared trunk
shared_hidden_dims = [1024, 1024, 512]
shared_norm = nn.LayerNorm
shared_activation = nn.ReLU

# Mean head
mean_hidden_dims = [512, 256, 128, 64]
mean_norm = nn.LayerNorm
mean_activation = nn.ReLU

# Log-variance head
log_var_hidden_dims = [512, 256, 128, 64]
log_var_norm = nn.LayerNorm
log_var_activation = nn.ReLU

# Value head
value_hidden_dims = [512, 256, 128, 64]
value_norm = nn.LayerNorm
value_activation = nn.ReLU

# Collected keyword arguments; also handed to the hyperparameter optimizer.
network_kwargs = dict(
    input_dims=input_dims,
    output_dims=output_dims,
    shared_hidden_dims=shared_hidden_dims,
    shared_norm=shared_norm,
    shared_activation=shared_activation,
    mean_hidden_dims=mean_hidden_dims,
    mean_norm=mean_norm,
    mean_activation=mean_activation,
    log_var_hidden_dims=log_var_hidden_dims,
    log_var_norm=log_var_norm,
    log_var_activation=log_var_activation,
    value_hidden_dims=value_hidden_dims,
    value_norm=value_norm,
    value_activation=value_activation,
)

# Instantiate the network from network_kwargs. Its forward pass is used below
# as `mean, log_var, value = network(states_tensor)`.
network = PPONetworkContinuous(**network_kwargs)

In [10]:
# Test forward passes
# Smoke test: sample actions from the untrained policy for 10 vectorized steps
# and report timing. Nothing is learned here, so the forward pass runs under
# torch.no_grad() to avoid building an autograd graph that is never used.
now = time.time()
with torch.no_grad():
    for _ in range(10):
        states_tensor = torch.tensor(states, dtype=torch.float32)
        mean, log_var, value = network(states_tensor)
        # The network outputs log-variance: std = exp(log_var / 2).
        std_dev = torch.exp(log_var / 2)

        actions_dist = torch.distributions.Normal(mean, std_dev)
        actions = actions_dist.sample()

        # Gym's vector API is documented in terms of NumPy arrays, so hand it
        # a NumPy batch rather than a torch.Tensor.
        actions_np = actions.detach().cpu().numpy()

        states, rewards, dones, truncateds, infos = envs_vector.step(actions_np)

# The reported number is the total wall time of all 10 iterations divided by
# the number of environments.
print(
    f'Elapsed time: per vectorized env: {(time.time() - now)/num_envs:.2f} s'
    )

Elapsed time: per vectorized env: 0.01 s


  logger.warn(


In [11]:
# PPO
# Hyperparameters for PPOContinuous. Every value is bound to a module-level
# name first and then collected into ppo_kwargs, which is also handed to the
# hyperparameter optimizer.
action_dims = 4

# Values with a `final_` counterpart are presumably scheduled from the first
# value towards the second during training — TODO confirm in ppo.py.
lr, final_lr = 3e-4, 5e-6
clip_eps, final_clip_eps = 0.25, 0.025
entropy_coef, final_entropy_coef = 0.05, 0.025

gamma, lam = 0.99, 0.95
value_coef = 0.7

batch_size = 512 # TODO - rename to mini_batch
batch_epochs = 8
batch_shuffle = True
# The spelling "seperate" is kept on purpose: it is the keyword name that is
# passed to PPOContinuous.
seperate_envs_shuffle = True

iterations = 2048  # TODO - rename to batch

reward_normalize = True
truncated_reward = 50

debug_prints = False

ppo_kwargs = dict(
    action_dims=action_dims,
    num_envs=num_envs,
    lr=lr,
    final_lr=final_lr,
    gamma=gamma,
    lam=lam,
    clip_eps=clip_eps,
    final_clip_eps=final_clip_eps,
    value_coef=value_coef,
    entropy_coef=entropy_coef,
    final_entropy_coef=final_entropy_coef,
    batch_size=batch_size,
    batch_epochs=batch_epochs,
    batch_shuffle=batch_shuffle,
    seperate_envs_shuffle=seperate_envs_shuffle,
    iterations=iterations,
    reward_normalize=reward_normalize,
    truncated_reward=truncated_reward,
    debug_prints=debug_prints,
)

# Build the PPOContinuous instance on the vectorized environments and the
# network, configured by ppo_kwargs.
# NOTE(review): this rebinds the name `ppo`, which the import cell bound to the
# `ppo` module; after this line `ppo` refers to the instance, not the module.
ppo = PPOContinuous(envs_vector, network, **ppo_kwargs)

# The direct training call is currently commented out.
#ppo.train(generations=500)


In [12]:
# Hyperparameter optimization
# The optimizer receives classes and keyword-argument dicts (not instances),
# so it can construct its own environments, networks and PPO objects.
hp_optimizer = HPOptimizer(
    env_kwargs=env_kwargs,
    num_envs=num_envs,
    network_class=PPONetworkContinuous,
    network_kwargs=network_kwargs,
    ppo_class=PPOContinuous,
    ppo_kwargs=ppo_kwargs,
)

# Search space as (hyperparameter name, candidate values) pairs. It is only
# consumed by the optimize_hyperparameters call, which is commented out below.
# NOTE(review): 'entropy_coef' includes a negative candidate (-0.1) — confirm
# this is intended.
parameters = [
    ('entropy_coef', [0.1, -0.1]),
    ('batch_size', [64, 128, 256, 512]),
    ('batch_epochs', [2, 4, 8, 16]),
]

# evolutions = hp_optimizer.optimize_hyperparameters(
#     parameters, generations=50, num_trials = 16,
#     )

# Run an evolution and save videos into 'videos'; the captured log below shows
# snapshots taken at generations 0, 10, ..., 100 for these arguments.
hp_optimizer.evolution_video(
    generations=100,
    video_folder='videos',
    increments=10,
    max_frames=max_episode_steps,
)

Running evolution with save generations: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
Generation    0 - Reward:   -30.98, w/o trunc.:   -80.98
Generation    1 - Reward:   -37.33, w/o trunc.:   -87.33
Generation    2 - Reward:   -36.50, w/o trunc.:   -86.50
Generation    3 - Reward:   -38.95, w/o trunc.:   -88.95
Generation    4 - Reward:   -37.70, w/o trunc.:   -87.70
Generation    5 - Reward:   -37.84, w/o trunc.:   -87.84
Generation    6 - Reward:   -37.67, w/o trunc.:   -87.67
Generation    7 - Reward:   -37.68, w/o trunc.:   -87.68
Generation    8 - Reward:   -37.69, w/o trunc.:   -87.69
Generation    9 - Reward:   -37.57, w/o trunc.:   -87.57
Generation   10 - Reward:   -37.63, w/o trunc.:   -87.63
Generation   11 - Reward:   -37.64, w/o trunc.:   -87.64
Generation   12 - Reward:   -37.84, w/o trunc.:   -87.84
Generation   13 - Reward:   -37.85, w/o trunc.:   -87.85
Generation   14 - Reward:   -37.75, w/o trunc.:   -87.75
Generation   15 - Reward:   -37.78, w/o trunc.:   -87.78
Ge