In [1]:
import os
from pathlib import Path
project_root = os.path.join(str(Path.home()) + '/Documents', 'PPGADev')
os.chdir(project_root)
%pwd # should be PPGA root dir

'/home/icaros/Documents/PPGADev'

In [2]:
import pickle

import numpy as np
from attrdict import AttrDict
from RL.ppo import *
from utils.utilities import log
from envs.brax_custom.brax_env import make_vec_env_brax
from models.actor_critic import Actor, PGAMEActor
from pandas import DataFrame

from IPython.display import HTML, Image
from IPython.display import display
from brax.io import html, image

In [3]:
# params to config
device = torch.device('cuda')
# env_name = 'humanoid'
env_name = 'ant'
seed = 1111
normalize_obs = True
normalize_rewards = True
# non-configurable params
obs_shapes = {
    'humanoid': (227,),
    'ant': (87,),
    'halfcheetah': (18,),
    'walker2d': (17,)
}
action_shapes = {
    'humanoid': (17,),
    'ant': (8,),
    'halfcheetah': (6,),
    'walker2d': (6,)
}

# define the final config objects
actor_cfg = AttrDict({
        'obs_shape': obs_shapes[env_name],
        'action_shape': action_shapes[env_name],
        'normalize_obs': normalize_obs,
        'normalize_rewards': normalize_rewards,
})
env_cfg = AttrDict({
        'env_name': env_name,
        'env_batch_size': None,
        'num_dims': 2 if not 'ant' in env_name else 4,
        'envs_per_model': 1,
        'seed': seed,
        'num_envs': 1,
        'clip_obs_rew': True
})


In [4]:
# now lets load in a saved archive dataframe and scheduler
# change this to be your own checkpoint path
archive_path = 'experiments/paper_ppga_ant/1111/checkpoints/cp_00002000/archive_df_00002000.pkl'
scheduler_path = 'experiments/paper_ppga_ant/1111/checkpoints/cp_00002000/scheduler_00002000.pkl'
with open(archive_path, 'rb') as f:
    archive_df = pickle.load(f)
with open(scheduler_path, 'rb') as f:
    scheduler = pickle.load(f)

In [5]:
# create the environment
env = make_vec_env_brax(env_cfg)

In [6]:
def get_best_elite():
    best_elite = scheduler.archive.best_elite
    print(f'Loading agent with reward {best_elite.objective} and measures {best_elite.measures}')
    agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(best_elite.solution).to(device)
    if actor_cfg.normalize_obs:
        norm = best_elite.metadata['obs_normalizer']
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [20]:
def get_random_elite():
    elite = scheduler.archive.sample_elites(1)
    print(f'Loading agent with reward {elite.objective[0]} and measures {elite.measures[0]}')
    agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(elite.solution_batch.flatten()).to(device)
    if actor_cfg.normalize_obs:
        norm = elite.metadata['obs_normalizer']
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [18]:
def get_elite(measures):
    elite = scheduler.archive.elites_with_measure_single(measures)
    print(f'Loading agent with reward {elite.objective_batch} and measures {elite.measures_batch}')
    agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(elite.solution_batch.flatten()).to(device)
    if actor_cfg.normalize_obs:
        norm = elite.metadata_batch[0]['obs_normalizer']
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [9]:
def enjoy_brax(agent, render=True, deterministic=True):
    if actor_cfg.normalize_obs:
        obs_mean, obs_var = agent.obs_normalizer.obs_rms.mean, agent.obs_normalizer.obs_rms.var
        print(f'{obs_mean=}, {obs_var=}')

    obs = env.reset()
    rollout = [env.unwrapped._state]
    total_reward = 0
    measures = torch.zeros(env_cfg.num_dims).to(device)
    done = False
    while not done:
        with torch.no_grad():
            obs = obs.unsqueeze(dim=0).to(device)
            if actor_cfg.normalize_obs:
                obs = (obs - obs_mean) / torch.sqrt(obs_var + 1e-8)

            if deterministic:
                act = agent.actor_mean(obs)
            else:
                act, _, _ = agent.get_action(obs)
            act = act.squeeze()
            obs, rew, done, info = env.step(act.cpu())
            measures += info['measures']
            rollout.append(env.unwrapped._state)
            total_reward += rew
    if render:
        i = HTML(html.render(env.unwrapped._env.sys, [s.qp for s in rollout]))
        display(i)
        print(f'{total_reward=}')
        print(f' Rollout length: {len(rollout)}')
        measures /= len(rollout)
        print(f'Measures: {measures.cpu().numpy()}')
    return total_reward.detach().cpu().numpy()

In [26]:
# agent = get_random_elite()
agent = get_best_elite()
enjoy_brax(agent, render=True, deterministic=True)

Loading agent with reward 8823.600077173685 and measures [0.16945767 0.20681413 0.13881017 0.13228038]
obs_mean=tensor([ 6.5301e-01,  6.3354e-01, -3.4612e-02, -3.6092e-02,  5.9431e-01,
        -1.8513e-02,  7.5502e-01, -1.7157e-01, -7.0986e-01, -1.7584e-03,
        -8.6272e-01, -5.7578e-03,  7.0354e-01,  4.1718e+00,  2.6171e-01,
         9.0299e-04, -1.7017e-02,  1.2161e-01,  5.6365e-02, -6.4819e-03,
        -6.0192e-02, -1.6615e-02,  3.4406e-02, -1.9879e-02, -8.4628e-03,
         2.6814e-03, -1.2641e-02, -1.1064e-05, -2.7860e-05,  2.4297e-02,
         3.4619e-10,  3.4619e-10,  3.4619e-10, -1.4340e-01,  4.7115e-02,
         2.8587e-01,  3.4619e-10,  3.4619e-10,  3.4619e-10, -2.0682e-01,
        -6.1307e-02,  2.7230e-01,  3.4619e-10,  3.4619e-10,  3.4619e-10,
         9.9396e-02, -8.5295e-03,  2.3821e-01,  3.4619e-10,  3.4619e-10,
         3.4619e-10,  6.4761e-02,  1.6999e-04,  1.4527e-01,  3.4619e-10,
         3.4619e-10,  3.4619e-10, -8.5106e-05,  2.2965e-05,  3.4619e-10,
         3.4

total_reward=tensor(9035.1855, device='cuda:0')
 Rollout length: 1001
Measures: [0.17382617 0.2017982  0.13086912 0.13186812]


array(9035.186, dtype=float32)

In [19]:
agent = get_elite([0.4, 0.4, 0.1, 0.1])
enjoy_brax(agent, render=True, deterministic=True)

AttributeError: 'Elite' object has no attribute 'objective_batch'