# Using Behavior Cloning to Play HumanoidBulletEnv-v0

This notebook uses the PyTorch implementation of the agent.

In [1]:
import sys
import logging
import imp
import itertools
import time

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import pybullet_envs
import torch
torch.manual_seed(0)
import torch.optim as optim
import torch.nn as nn

# Reload `logging` before configuring it so that basicConfig() installs the
# handler below even if the root logger was already configured earlier in the
# session (basicConfig() is otherwise a no-op once a handler exists).
# importlib.reload replaces imp.reload: the `imp` module is deprecated and was
# removed in Python 3.12, so this call no longer depends on it.
import importlib
importlib.reload(logging)
logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

#### Environment

In [2]:
# Create the environment, seed it, and log every attribute of both the
# environment object and its spec.
env = gym.make("HumanoidBulletEnv-v0")
env.seed(0)
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

00:02:33 [INFO] env: <HumanoidBulletEnv<HumanoidBulletEnv-v0>>
00:02:33 [INFO] action_space: Box(-1.0, 1.0, (17,), float32)
00:02:33 [INFO] observation_space: Box(-inf, inf, (44,), float32)
00:02:33 [INFO] reward_range: (-inf, inf)
00:02:33 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 60}
00:02:33 [INFO] _max_episode_steps: 1000
00:02:33 [INFO] _elapsed_steps: None
00:02:33 [INFO] id: HumanoidBulletEnv-v0
00:02:33 [INFO] entry_point: pybullet_envs.gym_locomotion_envs:HumanoidBulletEnv
00:02:33 [INFO] reward_threshold: None
00:02:33 [INFO] nondeterministic: False
00:02:33 [INFO] max_episode_steps: 1000
00:02:33 [INFO] _kwargs: {}
00:02:33 [INFO] _env_name: HumanoidBulletEnv


#### Agent

In [3]:
# ExpertAgent is imported from the `expertagent` module (presumably a
# project-local file; its implementation is not shown here). The instance
# created below is the demonstrator that is later passed to BCAgent.
from expertagent import ExpertAgent
expert_agent = ExpertAgent(env)

In [4]:
class SAReplayer:
    """Buffer of (state, action) pairs with optional random sampling.

    Pairs are appended to plain Python lists; a pandas DataFrame view of
    them is rebuilt lazily, only when `sample` notices that new pairs were
    stored since the last rebuild.
    """

    def __init__(self):
        self.fields = ['state', 'action']
        self.data = {}
        for name in self.fields:
            self.data[name] = []
        self.memory = pd.DataFrame()

    def store(self, *args):
        """Append one value per field, matched positionally to `fields`."""
        for name, value in zip(self.fields, args):
            self.data[name].append(value)

    def sample(self, size=None):
        """Return an iterable of stacked arrays, one per field.

        With `size=None` every stored pair is returned in insertion order.
        Otherwise `size` row labels are drawn with `np.random.choice`
        (default settings, i.e. with replacement) and the same rows are
        used for every field, so states and actions stay aligned.
        """
        stored_count = len(self.data[self.fields[0]])
        if stored_count > len(self.memory):
            # New pairs arrived since the last call: refresh the frame.
            self.memory = pd.DataFrame(self.data, columns=self.fields)
        indices = (self.memory.index if size is None
                else np.random.choice(self.memory.index, size=size))
        return (np.stack(self.memory.loc[indices, name])
                for name in self.fields)

In [5]:
class BCAgent:
    """Behavior-cloning agent that imitates an expert with an MLP policy.

    In 'expert' mode the agent forwards every observation to the expert,
    records the resulting (state, action) pairs, and fits the network to
    them when the episode is closed. In any other mode it acts with the
    network alone.
    """

    def __init__(self, env, expert_agent):
        """Build the policy network from the env's state/action sizes.

        Args:
            env: object exposing `observation_space.shape` and
                `action_space.shape`; the first entry of each is used as
                the network's input and output width.
            expert_agent: demonstrator with `reset`, `step` and `close`.
        """
        self.expert_agent = expert_agent
        # Defined up front so that step()/close() before the first reset()
        # behave like test mode instead of raising AttributeError.
        self.mode = None

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 128], output_size=action_dim)
        self.loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters())

    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        """Return an `nn.Sequential` MLP.

        Linear layers are separated by ReLU; no ReLU follows the final
        layer. `output_activator`, when given, is appended as the last
        module.
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        # Distinct loop names: the original reused `input_size` and
        # `output_size`, shadowing the method's own parameters.
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers.pop()  # drop the ReLU that would follow the output layer
        if output_activator:
            layers.append(output_activator)
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Start an episode; `mode='expert'` enables demonstration capture.

        A fresh replayer is created for every expert episode, so the
        training done in `close` only sees that episode's pairs.
        """
        self.mode = mode
        if self.mode == 'expert':
            self.expert_agent.reset(mode)
            self.expert_replayer = SAReplayer()

    def step(self, observation, reward, done):
        """Return the action for `observation`.

        In expert mode the action comes from the expert and the pair is
        stored; otherwise it is the network's output as a NumPy array.
        """
        if self.mode == 'expert':
            # Use the expert handed to __init__, not a module-level global
            # that merely happens to share the name.
            action = self.expert_agent.step(observation, reward, done)
            self.expert_replayer.store(observation, action)
        else:
            state_tensor = torch.as_tensor(observation, dtype=torch.float
                    ).unsqueeze(0)
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                action_tensor = self.net(state_tensor)
            action = action_tensor.numpy()[0]
        return action

    def close(self):
        """End the episode; in expert mode run 10 training updates."""
        if self.mode == 'expert':
            self.expert_agent.close()
            for _ in range(10):
                self.learn()

    def learn(self):
        """Do one Adam step on the MSE between predicted and expert actions.

        The batch is obtained from `self.expert_replayer.sample(1024)`.
        """
        states, actions = self.expert_replayer.sample(1024)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.float)

        pred_tensor = self.net(state_tensor)
        loss_tensor = self.loss(pred_tensor, action_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()


# Create the behavior-cloning agent for this environment, giving it the
# expert whose actions it will record and imitate.
agent = BCAgent(env, expert_agent)

In [6]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode and return `(episode_reward, elapsed_steps)`.

    The agent is queried before the termination check, so it also receives
    the final observation together with `done=True`. The episode stops
    when the environment reports `done` or, if `max_episode_steps` is
    truthy, once that many environment steps have been taken.
    `agent.close()` is called once the loop has ended.
    """
    observation = env.reset()
    reward = 0.
    done = False
    agent.reset(mode=mode)

    episode_reward = 0.
    elapsed_steps = 0
    step_limit_hit = False
    while not step_limit_hit:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            break
        observation, reward, done, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
        step_limit_hit = (bool(max_episode_steps)
                and elapsed_steps >= max_episode_steps)

    agent.close()
    return episode_reward, elapsed_steps



# Run the two phases back to back with identical bookkeeping: 100 expert
# episodes (which record demonstrations and train the agent), followed by
# 100 episodes in which the agent acts with its own network.
for banner, mode, episode_format, summary_format in (
        ('==== expert ====', 'expert',
         'expert episode %d: reward = %.2f, steps = %d',
         'average expert episode reward = %.2f ± %.2f'),
        ('==== test ====', None,
         'test episode %d: reward = %.2f, steps = %d',
         'average episode reward = %.2f ± %.2f')):
    logging.info(banner)
    episode_rewards = []
    for episode in range(100):
        episode_reward, elapsed_steps = play_episode(env, agent, mode=mode)
        episode_rewards.append(episode_reward)
        logging.debug(episode_format, episode, episode_reward, elapsed_steps)
    logging.info(summary_format,
            np.mean(episode_rewards), np.std(episode_rewards))

00:02:33 [INFO] ==== expert ====
00:02:44 [DEBUG] expert episode 0: reward = 3636.76, steps = 1000
00:02:55 [DEBUG] expert episode 1: reward = 3584.22, steps = 1000
00:03:06 [DEBUG] expert episode 2: reward = 3582.76, steps = 1000
00:03:17 [DEBUG] expert episode 3: reward = 3641.40, steps = 1000
00:03:28 [DEBUG] expert episode 4: reward = 3581.67, steps = 1000
00:03:39 [DEBUG] expert episode 5: reward = 3597.90, steps = 1000
00:03:50 [DEBUG] expert episode 6: reward = 3576.58, steps = 1000
00:04:01 [DEBUG] expert episode 7: reward = 3627.58, steps = 1000
00:04:13 [DEBUG] expert episode 8: reward = 3585.69, steps = 1000
00:04:24 [DEBUG] expert episode 9: reward = 3612.16, steps = 1000
00:04:36 [DEBUG] expert episode 10: reward = 3534.30, steps = 1000
00:04:50 [DEBUG] expert episode 11: reward = 3621.15, steps = 1000
00:04:51 [DEBUG] expert episode 12: reward = 19.42, steps = 67
00:05:05 [DEBUG] expert episode 13: reward = 3595.52, steps = 1000
00:05:17 [DEBUG] expert episode 14: reward 

00:19:27 [DEBUG] test episode 24: reward = 107.31, steps = 67
00:19:27 [DEBUG] test episode 25: reward = -48.03, steps = 20
00:19:27 [DEBUG] test episode 26: reward = 84.42, steps = 47
00:19:27 [DEBUG] test episode 27: reward = 87.39, steps = 49
00:19:28 [DEBUG] test episode 28: reward = 76.35, steps = 40
00:19:28 [DEBUG] test episode 29: reward = 88.63, steps = 47
00:19:28 [DEBUG] test episode 30: reward = 44.59, steps = 38
00:19:29 [DEBUG] test episode 31: reward = 121.62, steps = 68
00:19:29 [DEBUG] test episode 32: reward = 90.10, steps = 47
00:19:29 [DEBUG] test episode 33: reward = 79.29, steps = 43
00:19:29 [DEBUG] test episode 34: reward = 103.21, steps = 53
00:19:30 [DEBUG] test episode 35: reward = 87.00, steps = 48
00:19:30 [DEBUG] test episode 36: reward = 176.82, steps = 87
00:19:30 [DEBUG] test episode 37: reward = 82.27, steps = 43
00:19:31 [DEBUG] test episode 38: reward = 86.58, steps = 46
00:19:31 [DEBUG] test episode 39: reward = 105.01, steps = 84
00:19:32 [DEBUG] t

In [7]:
# Both phases are finished; shut the environment down.
env.close()