In [5]:
import json, os

In [12]:
def setup(env_id):

    with open(os.path.join('conf', env_id + '.json'), 'r+') as json_file: #open read and overwrite.
        
        data = json.load(json_file)

        global ENV_ID
        global NUM_ENVS            
        global HIDDEN_SIZE         
        global LEARNING_RATE       
        global GAMMA               
        global GAE_LAMBDA          
        global PPO_EPSILON         
        global CRITIC_DISCOUNT     
        global ENTROPY_BETA        
        global PPO_STEPS           
        global MINI_BATCH_SIZE     
        global PPO_EPOCHS          
        global TEST_EPOCHS         
        global NUM_TESTS           
        global TARGET_REWARD       

        ENV_ID              = data.setdefault('env_id', 'RoboschoolHalfCheetah-v1')
        NUM_ENVS            = data.setdefault('num_envs', 1)
        HIDDEN_SIZE         = data.setdefault('hidden_size', 256)
        LEARNING_RATE       = data.setdefault('learning_rate', 1e-4)
        GAMMA               = data.setdefault('gamma', 0.99)
        GAE_LAMBDA          = data.setdefault('gae_lambda', 0.95)
        PPO_EPSILON         = data.setdefault('ppo_epsilon', 0.2)
        CRITIC_DISCOUNT     = data.setdefault('critic_discount', 0.5)
        ENTROPY_BETA        = data.setdefault('entropy_beta', 0.00)
        PPO_STEPS           = data.setdefault('ppo_steps', 256)
        MINI_BATCH_SIZE     = data.setdefault('mini_batch_size', 64)
        PPO_EPOCHS          = data.setdefault('ppo_epochs', 10)
        TEST_EPOCHS         = data.setdefault('test_epochs', 10)
        NUM_TESTS           = data.setdefault('num_tests', 10)
        TARGET_REWARD       = data.setdefault('target_reward', 2500)
        teste=data.setdefault('teste', 'carlos')

        json_file.seek(0)
        json.dump(data, json_file)
        json_file.truncate()


In [13]:
# Display the value currently held in the LEARNING_RATE global.
# NOTE(review): this cell's execution count is lower than that of the setup(...)
# call below, so the value shown came from earlier kernel state — re-run in order.
LEARNING_RATE

0.0001

In [14]:
# Load conf/RoboschoolHalfCheetah-v1.json and publish its hyperparameters as globals.
setup('RoboschoolHalfCheetah-v1')

In [9]:

def make_env(env_id):
    """Build a zero-argument factory for a single environment.

    Nothing is constructed here; the returned callable calls
    ``gym.make(env_id)`` each time it is invoked and returns the result.
    """
    def _create():
        return gym.make(env_id)

    return _create

    
def test_env(env, model, device, deterministic=True):
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        action = dist.mean.detach().cpu().numpy()[0] if deterministic \
            else dist.sample().cpu().numpy()[0]
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
    return total_reward


def normalize(x):
    """Shift ``x`` to zero mean and scale it by its standard deviation.

    The argument is modified in place (augmented assignment) and the same
    object is returned. A small constant is added to the standard deviation
    so the division is defined when all entries are equal.
    """
    centre = x.mean()
    x -= centre
    spread = x.std() + 1e-8
    x /= spread
    return x


def compute_gae(next_value, rewards, masks, values, gamma=None, lam=None):
    """Compute per-step returns using generalized advantage estimation.

    Walks the rollout backwards, accumulating the discounted advantage, and
    returns ``advantage + value`` for every step in chronological order.

    Args:
        next_value: Value estimate for the state following the last step.
        rewards: Per-step rewards.
        masks: Per-step multipliers applied to everything bootstrapped from
            the following step (a zero cuts the accumulation at that step).
        values: List of per-step value estimates; must support ``+ [item]``.
        gamma: Discount factor. ``None`` means "use the current ``GAMMA``".
        lam: GAE smoothing factor. ``None`` means "use the current ``GAE_LAMBDA``".

    Returns:
        A list with one return per entry of ``rewards``.
    """
    # Resolve the defaults at call time: a default written as `gamma=GAMMA`
    # is frozen when the def executes and ignores any later setup() call.
    if gamma is None:
        gamma = GAMMA
    if lam is None:
        lam = GAE_LAMBDA

    values = values + [next_value]  # new list; the caller's list is untouched
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] * gae
        returns.append(gae + values[step])
    # Built last-to-first above; flip once instead of inserting at the front.
    returns.reverse()
    return returns


def ppo_iter(states, actions, log_probs, returns, advantage):
    """Yield random mini-batches drawn from one rollout.

    Produces ``batch_size // MINI_BATCH_SIZE`` mini-batches, where
    ``batch_size`` is ``states.size(0)``. Row indices for each mini-batch are
    drawn independently with ``np.random.randint``, i.e. with replacement, so
    a row may appear more than once and some rows may not appear at all.

    Each yielded item is a 5-tuple of the inputs indexed by the same rows, in
    the order ``(states, actions, log_probs, returns, advantage)``.
    """
    total_rows = states.size(0)
    sources = (states, actions, log_probs, returns, advantage)
    remaining = total_rows // MINI_BATCH_SIZE
    while remaining > 0:
        picks = np.random.randint(0, total_rows, MINI_BATCH_SIZE)
        yield tuple(source[picks, :] for source in sources)
        remaining -= 1
        

def ppo_update(frame_idx, states, actions, log_probs, returns, advantages, clip_param=None):
    """Run the clipped-surrogate PPO update over one rollout and log averages.

    For ``PPO_EPOCHS`` passes, pulls mini-batches from ``ppo_iter``, computes
    the clipped actor loss, the squared-error critic loss and the entropy
    term, and takes one optimizer step per mini-batch. Afterwards the mean of
    each tracked quantity is written to ``writer`` under ``frame_idx``.

    Relies on the module-level ``model``, ``optimizer`` and ``writer`` objects
    and on the ``PPO_EPOCHS``, ``CRITIC_DISCOUNT`` and ``ENTROPY_BETA`` globals.

    Args:
        frame_idx: Step index passed to ``writer.add_scalar``.
        states, actions, log_probs, returns, advantages: Rollout tensors,
            forwarded unchanged to ``ppo_iter``.
        clip_param: Half-width of the ratio clipping interval around 1.
            ``None`` means "use the current ``PPO_EPSILON``".

    NOTE(review): if ``ppo_iter`` yields nothing (rollout smaller than
    ``MINI_BATCH_SIZE``) no update runs and the averaging below divides by zero.
    """
    # Resolve at call time: a default written as `clip_param=PPO_EPSILON` is
    # frozen when the def executes and ignores any later setup() call.
    if clip_param is None:
        clip_param = PPO_EPSILON

    count_steps = 0
    sum_returns = 0.0
    sum_advantage = 0.0
    sum_loss_actor = 0.0
    sum_loss_critic = 0.0
    sum_entropy = 0.0
    sum_loss_total = 0.0

    # PPO_EPOCHS is the number of passes made over the rollout data
    for _ in range(PPO_EPOCHS):
        for state, action, old_log_probs, return_, advantage in ppo_iter(states, actions, log_probs, returns, advantages):
            dist, value = model(state)
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(action)

            # Probability ratio between the updated and the rollout policy
            ratio = (new_log_probs - old_log_probs).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

            actor_loss  = - torch.min(surr1, surr2).mean()
            critic_loss = (return_ - value).pow(2).mean()

            loss = CRITIC_DISCOUNT * critic_loss + actor_loss - ENTROPY_BETA * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track statistics on detached values so the running sums do not
            # keep every mini-batch's autograd graph reachable.
            sum_returns += return_.mean().detach()
            sum_advantage += advantage.mean().detach()
            sum_loss_actor += actor_loss.detach()
            sum_loss_critic += critic_loss.detach()
            sum_loss_total += loss.detach()
            sum_entropy += entropy.detach()

            count_steps += 1

    writer.add_scalar("returns", sum_returns / count_steps, frame_idx)
    writer.add_scalar("advantage", sum_advantage / count_steps, frame_idx)
    writer.add_scalar("loss_actor", sum_loss_actor / count_steps, frame_idx)
    writer.add_scalar("loss_critic", sum_loss_critic / count_steps, frame_idx)
    writer.add_scalar("entropy", sum_entropy / count_steps, frame_idx)
    writer.add_scalar("loss_total", sum_loss_total / count_steps, frame_idx)


In [11]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")
print('Device:', device)

# Prepare environments
# ENV_ID is the name published by setup(); the lowercase `env_id` used here
# before is not defined by any cell and fails on a freshly started kernel.
env_fns = [make_env(ENV_ID) for _ in range(NUM_ENVS)]
envs = SubprocVecEnv(env_fns)  # NUM_ENVS environments behind one vectorised wrapper
env = gym.make(ENV_ID)         # one additional standalone environment

Device: cuda


In [16]:
# Exploratory: list the attributes exposed by the `envs` object.
# NOTE(review): leftover debugging — consider dropping this cell and its long output.
dir(envs)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'action_space',
 'close',
 'closed',
 'nenvs',
 'num_envs',
 'observation_space',
 'ps',
 'remotes',
 'reset',
 'reset_task',
 'step',
 'step_async',
 'step_wait',
 'waiting',
 'work_remotes']

In [23]:
# Display the observation space reported by `envs`.
envs.observation_space

Box(210, 160, 3)

In [12]:
# Take the first entry of the observation-space shape as the input size.
# NOTE(review): the recorded run raised IndexError here, which suggests the shape
# tuple was empty for the environment loaded at that time — confirm before relying on it.
num_inputs  = envs.observation_space.shape[0]


IndexError: tuple index out of range