In [1]:
import pybullet_envs
# Don't forget to install PyBullet!
from gym import make
import numpy as np
import torch
from torch import nn
from torch.distributions import Normal
from torch.nn import functional as F
from torch.optim import Adam
import random
from itertools import product
import joblib
from os import mkdir
import uuid
from train import *
import json
from joblib import Parallel, delayed

In [2]:
def run(iterations: int = 10000,
        min_episodes_per_update: int = 4,
        min_transitions_per_update: int = 2048,
        actor_lr: float = 3e-4,
        critic_lr: float = 2e-4,
        lam: float = 0.97,
        gamma: float = 0.99,
        clip: float = 0.2,
        entropy_coef: float = 1e-2,
        batches_per_update: int = 4,
        batch_size: int = 64
        ):
    """Train one PPO agent on "Walker2DBulletEnv-v0" and record the experiment on disk.

    A fresh directory ``experiments/<uuid4>`` is created for the run. It receives
    ``params.json`` (the arguments of this call), ``log.csv`` (one line per
    evaluation) and one checkpoint written by ``ppo.save`` per evaluation.

    Args:
        iterations: Number of sample/update iterations to perform.
        min_episodes_per_update: Sampling for an iteration continues until at
            least this many episodes have been collected ...
        min_transitions_per_update: ... and the collected episodes contain at
            least this many steps in total (both conditions must hold).
        actor_lr: Forwarded to the ``PPO`` constructor.
        critic_lr: Forwarded to the ``PPO`` constructor.
        lam: Forwarded to ``sample_episode`` (presumably the GAE lambda -- confirm in train.py).
        gamma: Forwarded to ``sample_episode`` (presumably the discount factor -- confirm in train.py).
        clip: Forwarded to ``ppo.update``.
        entropy_coef: Forwarded to ``ppo.update``.
        batches_per_update: Forwarded to ``ppo.update``.
        batch_size: Forwarded to ``ppo.update``.

    Returns:
        None. All results are written to the run directory.
    """
    # Local import: the notebook only does `from os import mkdir`, so the name
    # `os` is not guaranteed to be bound at module level (nor inside joblib workers).
    import os

    working_dir = "experiments/" + str(uuid.uuid4())
    # makedirs also creates the parent "experiments" directory on a first run.
    os.makedirs(working_dir, exist_ok=True)

    params = {
        "iterations": iterations,
        "min_episodes_per_update": min_episodes_per_update,
        "min_transitions_per_update": min_transitions_per_update,
        "actor_lr": actor_lr,
        "critic_lr": critic_lr,
        "lam": lam,
        "gamma": gamma,
        "clip": clip,
        "entropy_coef": entropy_coef,
        "batches_per_update": batches_per_update,
        "batch_size": batch_size
    }
    with open(f"{working_dir}/params.json", "w") as param:
        json.dump(params, param, indent=4)

    # Evaluate roughly 100 times per run. Clamp to 1 so that runs shorter than
    # 100 iterations do not hit a modulo-by-zero below.
    eval_every = max(1, iterations // 100)

    env = make("Walker2DBulletEnv-v0")
    ppo = PPO(state_dim=env.observation_space.shape[0],
              action_dim=env.action_space.shape[0],
              actor_lr=actor_lr,
              critic_lr=critic_lr)
    env.reset()
    episodes_sampled = 0
    steps_sampled = 0

    # The context manager guarantees the log is closed even if training raises.
    with open(f"{working_dir}/log.csv", "a+") as log:
        for i in range(iterations):
            trajectories = []
            steps_ctn = 0

            # Keep sampling until BOTH the episode and the transition minimum are met.
            while len(trajectories) < min_episodes_per_update or steps_ctn < min_transitions_per_update:
                traj = sample_episode(env, ppo, lam=lam, gamma=gamma)
                steps_ctn += len(traj)
                trajectories.append(traj)
            episodes_sampled += len(trajectories)
            steps_sampled += steps_ctn

            ppo.update(trajectories,
                       clip=clip,
                       entropy_coef=entropy_coef,
                       batches_per_update=batches_per_update,
                       batch_size=batch_size)

            if (i + 1) % eval_every == 0:
                rewards = evaluate_policy(env, ppo, 50)
                rmean = np.mean(rewards)
                rstd = np.std(rewards)
                log.write(f"Rmean: {rmean:0.4f}, Rstd: {rstd:0.4f}, Episodes: {episodes_sampled}, Steps: {steps_sampled}\n")
                # Flush so progress survives an interrupted or killed worker process.
                log.flush()
                ppo.save(name=f"{working_dir}/{i + 1}_{int(rmean)}_{int(rstd)}.pkl")

In [3]:
# joblib's `delayed` wraps `run` so that drun(**kwargs) only records the call
# (function + arguments) for `Parallel` to execute later, instead of running it here.
drun = delayed(run)

In [4]:
# Baseline hyper-parameters; every key is a keyword argument of `run`.
base_config = dict(
    iterations=10000,
    min_episodes_per_update=4,
    min_transitions_per_update=2048,
    actor_lr=0.0003,
    critic_lr=0.0002,
    lam=0.97,
    gamma=0.99,
    clip=0.2,
    entropy_coef=0.01,
    batches_per_update=64,
    batch_size=128,
)

# Five independent shallow copies of the baseline, so that each one can be
# modified without touching the baseline or the other copies.
config1, config2, config3, config4, config5 = (dict(base_config) for _ in range(5))

In [5]:
# One-factor-at-a-time variations: each config departs from the baseline in a single key.
config1.update(entropy_coef=0)
config2.update(lam=0.99)
config3.update(clip=0.01)
config4.update(batch_size=64)
config5.update(min_episodes_per_update=16)

In [6]:
# Jobs to launch: the five modified configs first, the unmodified baseline last.
config_lst = [config1, config2, config3, config4, config5, base_config]

In [7]:
# Launch every configuration as its own joblib task, up to six at a time
# (there are six entries in config_lst, so all of them start together).
Parallel(n_jobs=6)(
    drun(**cfg)
    for cfg in config_lst
)

KeyboardInterrupt: 