# Using Behavior Cloning to Play HumanoidBulletEnv-v0

TensorFlow version

In [1]:
import sys
import logging
import itertools
import time

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import pybullet_envs
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers

# Send DEBUG-and-above records to stdout, prefixed with an HH:MM:SS timestamp
# and the level name.
logging.basicConfig(
        stream=sys.stdout,
        level=logging.DEBUG,
        datefmt='%H:%M:%S',
        format='%(asctime)s [%(levelname)s] %(message)s')

#### Environment

In [2]:
# Build the environment, seed it, then log every attribute of the
# environment object followed by every attribute of its spec.
env = gym.make("HumanoidBulletEnv-v0")
env.seed(0)
for described in (env, env.spec):
    for key, value in vars(described).items():
        logging.info('%s: %s', key, value)

00:09:50 [INFO] env: <HumanoidBulletEnv<HumanoidBulletEnv-v0>>
00:09:50 [INFO] action_space: Box(-1.0, 1.0, (17,), float32)
00:09:50 [INFO] observation_space: Box(-inf, inf, (44,), float32)
00:09:50 [INFO] reward_range: (-inf, inf)
00:09:50 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 60}
00:09:50 [INFO] _max_episode_steps: 1000
00:09:50 [INFO] _elapsed_steps: None
00:09:50 [INFO] id: HumanoidBulletEnv-v0
00:09:50 [INFO] entry_point: pybullet_envs.gym_locomotion_envs:HumanoidBulletEnv
00:09:50 [INFO] reward_threshold: None
00:09:50 [INFO] nondeterministic: False
00:09:50 [INFO] max_episode_steps: 1000
00:09:50 [INFO] _kwargs: {}
00:09:50 [INFO] _env_name: HumanoidBulletEnv


#### Agent

In [3]:
# ExpertAgent is provided by the local `expertagent` module (not shown in this
# notebook). BCAgent below queries it for actions in 'expert' mode and uses
# those actions as the regression targets for its own network.
from expertagent import ExpertAgent
expert_agent = ExpertAgent(env)

In [4]:
class SAReplayer:
    """Append-only buffer of (state, action) pairs with random sampling.

    Pairs are accumulated in plain Python lists and converted to a
    ``pandas.DataFrame`` the first time ``sample`` is called after new
    pairs have been stored.
    """

    def __init__(self):
        self.fields = ['state', 'action']
        self.data = {name: [] for name in self.fields}
        self.memory = pd.DataFrame()

    def store(self, *args):
        """Append one record; positional args are matched to ``fields``."""
        for name, value in zip(self.fields, args):
            self.data[name].append(value)

    def sample(self, size=None):
        """Return an iterable of stacked arrays, one per field.

        With ``size=None`` every stored record is returned in insertion
        order; otherwise ``size`` records are drawn with
        ``np.random.choice`` (which samples with replacement by default).
        """
        stored_count = len(self.data[self.fields[0]])
        if stored_count > len(self.memory):
            # New records arrived since the frame was last built.
            self.memory = pd.DataFrame(self.data, columns=self.fields)
        all_rows = self.memory.index
        if size is not None:
            rows = np.random.choice(all_rows, size=size)
        else:
            rows = all_rows
        return (np.stack(self.memory.loc[rows, name])
                for name in self.fields)

In [5]:
class BCAgent:
    """Behavior-cloning agent that learns to imitate an expert agent.

    In ``'expert'`` mode every observation is forwarded to the expert, the
    resulting (observation, action) pair is recorded, and the network is
    fitted on the recorded pairs when the episode is closed. In any other
    mode the agent acts with its own network.
    """

    def __init__(self, env, expert_agent):
        """Store the expert and build the policy network for ``env``.

        Args:
            env: environment whose ``observation_space`` and ``action_space``
                shapes give the network's input and output sizes.
            expert_agent: object exposing ``reset(mode)``,
                ``step(observation, reward, done)`` and ``close()``.
        """
        self.expert_agent = expert_agent
        # Start in non-expert mode so step()/close() are usable even when
        # reset() has not been called yet.
        self.mode = None

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 128], output_size=action_dim)

    def build_net(self, input_size=None, hidden_sizes=None, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=0.001):
        """Build and compile a fully connected Keras model.

        Args:
            input_size: accepted for interface compatibility; it is not
                used when constructing the model.
            hidden_sizes: iterable of hidden layer widths; ``None`` means
                no hidden layers.
            output_size: width of the output layer.
            activation: activation applied to every hidden layer.
            output_activation: activation applied to the output layer.
            loss: loss passed to ``model.compile``.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()
        # `hidden_sizes` defaults to None, which is not iterable.
        for hidden_size in hidden_sizes or []:
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        """Begin an episode in ``mode``.

        In ``'expert'`` mode the expert is reset as well and a fresh, empty
        replayer is created, so only pairs from the current episode are
        available to ``learn``.
        """
        self.mode = mode
        if self.mode == 'expert':
            self.expert_agent.reset(mode)
            self.expert_replayer = SAReplayer()

    def step(self, observation, reward, done):
        """Choose an action for ``observation``.

        In ``'expert'`` mode the action comes from the expert and the
        (observation, action) pair is stored for later training. Otherwise
        the action is the first row of the network's output for a
        single-observation batch.
        """
        if self.mode == 'expert':
            # Query the expert injected at construction time rather than a
            # module-level global of the same name.
            action = self.expert_agent.step(observation, reward, done)
            self.expert_replayer.store(observation, action)
        else:
            action = self.net(observation[np.newaxis])[0]
        return action

    def close(self):
        """End the episode; in ``'expert'`` mode also train the network."""
        if self.mode == 'expert':
            self.expert_agent.close()
            for _ in range(10):
                self.learn()

    def learn(self):
        """Fit the network on 1024 sampled (state, action) pairs."""
        states, actions = self.expert_replayer.sample(1024)
        self.net.fit(states, actions, verbose=0)



# The behavior-cloning agent used by both the expert and the test runs below.
agent = BCAgent(env, expert_agent)

In [6]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env``.

    The agent is reset with ``mode`` after the environment is reset, asked
    for an action on every observation (including the terminal one), and
    closed when the episode ends. The episode ends when the environment
    reports ``done`` or, if ``max_episode_steps`` is truthy, once that many
    environment steps have been taken.

    Returns:
        ``(episode_reward, elapsed_steps)``: the summed reward and the
        number of environment steps taken.
    """
    observation = env.reset()
    reward, done = 0., False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    out_of_steps = False
    while not out_of_steps:
        # The agent is stepped before the `done` check so that it also sees
        # the terminal observation and reward.
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            break
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        step_count += 1
        out_of_steps = (bool(max_episode_steps)
                and step_count >= max_episode_steps)

    agent.close()
    return total_reward, step_count



# Two phases of 100 episodes each, run in order:
#   1. 'expert' mode - the expert acts, and the agent trains on the recorded
#      pairs each time an episode is closed;
#   2. default mode  - the agent acts with its own network.
# Each row: (banner, play_episode mode, per-episode format, summary format).
phases = [
    ('==== expert ====', 'expert',
     'expert episode %d: reward = %.2f, steps = %d',
     'average expert episode reward = %.2f ± %.2f'),
    ('==== test ====', None,
     'test episode %d: reward = %.2f, steps = %d',
     'average episode reward = %.2f ± %.2f'),
]
for banner, mode, episode_format, summary_format in phases:
    logging.info(banner)
    episode_rewards = []
    for episode in range(100):
        episode_reward, elapsed_steps = play_episode(env, agent, mode=mode)
        episode_rewards.append(episode_reward)
        logging.debug(episode_format, episode, episode_reward, elapsed_steps)
    logging.info(summary_format,
            np.mean(episode_rewards), np.std(episode_rewards))

00:09:52 [INFO] ==== expert ====
00:10:07 [DEBUG] expert episode 0: reward = 3636.76, steps = 1000
00:10:19 [DEBUG] expert episode 1: reward = 3584.22, steps = 1000
00:10:31 [DEBUG] expert episode 2: reward = 3582.76, steps = 1000
00:10:43 [DEBUG] expert episode 3: reward = 3641.40, steps = 1000
00:10:55 [DEBUG] expert episode 4: reward = 3581.67, steps = 1000
00:11:07 [DEBUG] expert episode 5: reward = 3597.90, steps = 1000
00:11:19 [DEBUG] expert episode 6: reward = 3576.58, steps = 1000
00:11:32 [DEBUG] expert episode 7: reward = 3627.58, steps = 1000
00:11:44 [DEBUG] expert episode 8: reward = 3585.69, steps = 1000
00:11:57 [DEBUG] expert episode 9: reward = 3612.16, steps = 1000
00:12:09 [DEBUG] expert episode 10: reward = 3534.30, steps = 1000
00:12:21 [DEBUG] expert episode 11: reward = 3621.15, steps = 1000
00:12:23 [DEBUG] expert episode 12: reward = 19.42, steps = 67
00:12:36 [DEBUG] expert episode 13: reward = 3595.52, steps = 1000
00:12:48 [DEBUG] expert episode 14: reward 

In [7]:
# Close the environment now that both the expert and the test runs are done.
env.close()