In [None]:
# Enable autoreload for development.
# get_ipython() returns None when no interactive shell is registered (for
# example when this cell is executed as an exported plain-Python script), so
# the magics are only issued when a shell actually exists.
import IPython
ipython = IPython.get_ipython()
if ipython is not None:
    ipython.run_line_magic('load_ext', 'autoreload')
    # Mode '2' is passed straight through to the %autoreload magic.
    ipython.run_line_magic('autoreload', '2')

# Auxiliary imports
import sys, os, time
import matplotlib.pyplot as plt
import numpy as np

# Gym imports
import gym
from gym.vector import SyncVectorEnv

# PyTorch imports
import torch
from torch import nn, optim

# Custom imports
# Add the parent directory to the import path. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate entries.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from ppo_network import PPONetworkContinuous
from ppo import PPOContinuous
from hp_tuner import HPTuner

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Vectorized BipedalWalker setup
num_envs = 16
max_episode_steps = 1024
env_id = 'BipedalWalker-v3'

# Keyword arguments forwarded unchanged to gym.make for every sub-environment
env_kwargs = {'id': env_id, 'max_episode_steps': max_episode_steps}

# One zero-argument factory per sub-environment; each reads env_kwargs when called
env_factories = [lambda: gym.make(**env_kwargs) for _ in range(num_envs)]
envs_vector = SyncVectorEnv(env_factories)

# Reset once so later cells start from a valid batch of states
states, infos = envs_vector.reset()

In [None]:
# Mean-var-value network configuration, assembled group by group.
# Input/output sizes first.
network_kwargs = {'input_dims': 24, 'output_dims': 4}

# 'shared_*' settings (presumably the trunk common to all heads —
# confirm against PPONetworkContinuous)
network_kwargs.update({
    'shared_hidden_dims': [1024, 1024, 512],
    'shared_norm': nn.LayerNorm,
    'shared_activation': nn.SiLU,
})

# 'mean_*' head settings
network_kwargs.update({
    'mean_hidden_dims': [512, 256, 128, 64],
    'mean_norm': nn.LayerNorm,
    'mean_activation': nn.SiLU,
})

# 'log_var_*' head settings (same layout as the mean head)
network_kwargs.update({
    'log_var_hidden_dims': [512, 256, 128, 64],
    'log_var_norm': nn.LayerNorm,
    'log_var_activation': nn.SiLU,
})

# 'value_*' head settings (same layout as the mean head)
network_kwargs.update({
    'value_hidden_dims': [512, 256, 128, 64],
    'value_norm': nn.LayerNorm,
    'value_activation': nn.SiLU,
})

# Instantiate the mean-var-value network from the collected settings
network = PPONetworkContinuous(**network_kwargs)

In [None]:
# Test forward passes: sample actions from the network and step the envs.
# The loop runs under torch.no_grad() because nothing here is
# back-propagated, so recording an autograd graph would be wasted work.
with torch.no_grad():
    for _ in range(3):
        states_tensor = torch.tensor(states, dtype=torch.float32)
        mean, log_var, value = network(states_tensor)
        std_dev = torch.exp(log_var / 2)  # log-variance -> standard deviation

        actions_dist = torch.distributions.Normal(mean, std_dev)
        # No .detach() needed: gradients are disabled inside this context.
        actions = actions_dist.sample().numpy()

        # NOTE: `states` is rebound on every step, so re-running this cell
        # continues from wherever the previous run left the environments.
        states, rewards, dones, truncateds, infos = envs_vector.step(actions)
        # Truncate the printed line to 65 characters to keep the output short
        print(f"State: {states[0]}"[:65])

State: [-0.02630167 -0.04093861 -0.0264974  -0.00217259  0.471512
State: [-0.05390408 -0.05521606 -0.02996548 -0.01982511  0.511444
State: [-0.06219898 -0.01661935 -0.00316062  0.01299398  0.532144
State: [-0.09121178 -0.05808256 -0.02399905  0.00456464  0.611245
State: [-0.11475505 -0.0471875  -0.01811942 -0.01543638  0.689527
State: [-0.15634106 -0.08339607 -0.03936466 -0.0361727   0.791444
State: [-0.21213384 -0.11151683 -0.04510277 -0.0315982   0.900696
State: [-0.25759536 -0.09099551 -0.03491103 -0.05181476  0.978399
State: [-0.3035023  -0.09192593 -0.0350437  -0.0696331   1.056652
State: [-0.357984   -0.1091228  -0.03873978 -0.07200801  1.135344


In [None]:
# PPO hyperparameters, assembled group by group (key order is preserved)
ppo_kwargs = {}

# Network class/config and problem sizing.
# NOTE(review): 'action_dims' equals network_kwargs['output_dims'] (both 4);
# presumably they must stay in sync — confirm against PPOContinuous.
ppo_kwargs.update({
    'network_class': PPONetworkContinuous,
    'network_kwargs': network_kwargs,
    'action_dims': 4,
    'num_envs': num_envs,
})

# Learning rate, gamma/lam, clipping and loss coefficients. The 'final_*'
# entries pair with their un-prefixed counterparts (presumably the end
# points of a schedule — confirm against PPOContinuous).
ppo_kwargs.update({
    'lr': 3e-4,
    'final_lr': 5e-6,
    'gamma': 0.99,
    'lam': 0.95,
    'clip_eps': 0.25,
    'final_clip_eps': 0.025,
    'value_coef': 0.7,
    'entropy_coef': 0.05,
    'final_entropy_coef': 0.025,
})

# Batching and shuffling.
# ('seperate_envs_shuffle' is spelled as written here; the key is passed to
# PPOContinuous verbatim, so it must not be "corrected" on this side alone.)
ppo_kwargs.update({
    'batch_size': 2048,
    'mini_batch_size': 512,
    'batch_epochs': 8,
    'batch_shuffle': True,
    'seperate_envs_shuffle': True,
})

# Reward handling and diagnostics
ppo_kwargs.update({
    'reward_normalize': True,
    'truncated_reward': 0,
    'debug_prints': False,
})

ppo = PPOContinuous(envs_vector, **ppo_kwargs)

# Test training: a single train(1) call; its return value is the cell output
ppo.train(1)


array([-124.14])

In [None]:
# Hyperparameter optimization: the tuner receives the environment and PPO
# configuration rather than already-built instances.
hp_tuner = HPTuner(
    env_kwargs=env_kwargs, num_envs=num_envs,
    ppo_class=PPOContinuous, ppo_kwargs=ppo_kwargs,
)

# Search space: hyperparameter name -> candidate values.
# NOTE(review): -0.1 is a negative entropy coefficient — confirm intended.
# NOTE(review): three of the four batch_size candidates are below
# ppo_kwargs['mini_batch_size'] (512) — verify PPOContinuous copes.
search_space = {
    'entropy_coef': [0.1, -0.1],
    'batch_size': [64, 128, 256, 512],
    'batch_epochs': [2, 4, 8, 16],
}
# The tuner is handed a list of (name, candidates) tuples in insertion order
parameters = list(search_space.items())

# Optimize hyperparameters
evolutions = hp_tuner.optimize_hyperparameters(
    parameters,
    generations=50,
    num_trials=16,
)

# Save evolution data.
# NOTE(review): generations=100 here vs. generations=50 above — confirm the
# mismatch is deliberate.
video_options = {
    'generations': 100,
    'video_folder': 'videos',
    'increments': 20,
    'max_frames': max_episode_steps,
}
hp_tuner.evolution_video(**video_options)

Optimizing entropy_coef with values: [0.1, -0.1]
Running trials for entropy_coef = 0.1
