In [1]:
import os
import time 
import numpy as np

In [2]:
from cs285.infrastructure.rl_trainer import RL_Trainer
from cs285.agents.bc_agent import BCAgent
from cs285.policies.loaded_gaussian_policy import Loaded_Gaussian_Policy

In [3]:
!pip install easydict

import easydict
args = easydict.EasyDict({
    'expert_policy_file': 'cs285/policies/experts/Ant.pkl',
    'expert_data': 'cs285/expert_data/expert_data_Ant-v2.pkl', 
    'env_name': 'Ant-v2', 
    'exp_name': 'test_bc_ant',
    'do_dagger': False,
    'ep_len': 0,
    'num_agent_train_steps_per_iter': 1000, 
    'n_iter': 1, 
    'batch_size': 1000, 
    'eval_batch_size': 200, 
    'train_batch_size': 100, 
    'n_layers': 2, 
    'size': 64, 
    'learning_rate': 5e-3, 
    'video_log_freq': 5, 
    'scalar_log_freq': 1, 
    'use_gpu': True, 
    'which_gpu': 0, 
    'max_replay_buffer_size': 1000000, 
    'seed': 1
})
params = vars(args)



In [4]:
class BC_Trainer(object):
    """Wire a BCAgent into an RL_Trainer and load the expert policy.

    The ``params`` mapping is stored by reference and mutated in place:
    ``'agent_class'`` and ``'agent_params'`` are written into it before it is
    handed to ``RL_Trainer``.
    """

    def __init__(self, params):
        """Build the underlying trainer and restore the expert policy.

        Args:
            params: mapping that must provide ``'n_layers'``, ``'size'``,
                ``'learning_rate'``, ``'max_replay_buffer_size'`` and
                ``'expert_policy_file'``. It is passed on unchanged (apart
                from the two keys added here) to ``RL_Trainer``.

        Raises:
            KeyError: if one of the keys listed above is missing.
        """
        # Agent hyper-parameters are a fixed subset of the run configuration.
        forwarded_keys = (
            'n_layers',
            'size',
            'learning_rate',
            'max_replay_buffer_size',
        )
        agent_cfg = {key: params[key] for key in forwarded_keys}

        self.params = params
        self.params.update(agent_class=BCAgent, agent_params=agent_cfg)

        self.rl_trainer = RL_Trainer(self.params)

        expert_file = self.params['expert_policy_file']
        print('Loading expert policy from ...', expert_file)
        self.loaded_expert_policy = Loaded_Gaussian_Policy(expert_file)
        print('Done restoring expert policy...')

    def run_training_loop(self):
        """Delegate to ``RL_Trainer.run_training_loop``.

        The agent's actor is used both to collect data and for evaluation;
        ``'n_iter'``, ``'expert_data'`` and ``'do_dagger'`` are read from the
        stored params, and the loaded expert policy is passed along.
        """
        cfg = self.params
        trainer = self.rl_trainer
        loop_kwargs = dict(
            n_iter=cfg['n_iter'],
            initial_expertdata=cfg['expert_data'],
            collect_policy=trainer.agent.actor,
            eval_policy=trainer.agent.actor,
            relabel_with_expert=cfg['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
        trainer.run_training_loop(**loop_kwargs)
    

In [5]:
# Choose the log-directory prefix from the training mode and sanity-check that
# the iteration count matches that mode.
logdir_prefix = 'bc_'
if args.do_dagger:
    logdir_prefix = 'dagger_'
    assert args.n_iter>1, 'DAGGER needs more than 1 iteration of training, to iteratively query to expert and train'
else:
    assert args.n_iter == 1, 'Vanilla behavior cloning collects expert data just once'

# Join by components rather than embedding './cs285/data', which leaves a
# literal './' (and mixed separators on Windows) in the resulting path.
data_path = os.path.join(os.getcwd(), 'cs285', 'data')

# One directory per run: <prefix><exp_name>_<env_name>_<timestamp>.
logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime('%Y-%m-%d-%H-%M-%S')
logdir = os.path.join(data_path, logdir)
params['logdir'] = logdir

# makedirs creates missing parents (including data_path); exist_ok avoids the
# check-then-create race and makes re-running this cell harmless.
os.makedirs(logdir, exist_ok=True)

In [6]:

import gym
import pybullet_envs
import pybullet as p

import torch
from torch import nn
from cs285.infrastructure.torch_utils import MLP

# Connect to pybullet in DIRECT mode and build the Bullet counterpart of the
# configured environment ('Ant-v2' -> 'AntBulletEnv-v0').
p.connect(p.DIRECT)
env_name = params['env_name'].split('-')[0] + 'BulletEnv-v0'
env = gym.make(env_name)
env.seed(params['seed'])
env.render()

# A falsy ep_len (the config uses 0) falls back to the env's own episode limit.
params['ep_len'] = params['ep_len'] or env.spec.max_episode_steps

ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]

# Agent configuration: network/optimizer settings from the run config plus the
# dimensions read from the environment above.
params['agent_class'] = BCAgent
params['agent_params'] = {
    'n_layers': params['n_layers'],
    'size': params['size'],
    'learning_rate': params['learning_rate'],
    'max_replay_buffer_size': params['max_replay_buffer_size'],
}
params['agent_params']['ac_dim'] = ac_dim
params['agent_params']['ob_dim'] = ob_dim
params['agent_params']['discrete'] = False

fps = env.env.metadata['video.frames_per_second']

agent_class = params['agent_class']

# Stand-alone Gaussian-policy pieces: an MLP for the mean and a learnable
# log-std vector with one entry per action dimension.
mean = MLP(ob_dim, output_size=ac_dim, n_layers=params['n_layers'], size=params['size'])
logstd = torch.zeros(ac_dim, dtype=torch.float32, requires_grad=True)
a = {'mean': mean, 'logstd': nn.Parameter(logstd)}

# One parameter group per component. The previous single dict repeated the
# 'params' key, so the later value silently replaced the earlier one and only
# `logstd` was given to Adam; the MLP weights were never optimized.
optimizer = torch.optim.Adam(
    [
        {'params': a['mean'].parameters()},
        {'params': [a['logstd']]},
    ],
    params['learning_rate'],
)

print(a)


agent = agent_class(env, params['agent_params'])
policy = agent.actor


{'mean': MLP(
  (layers): ModuleList(
    (0): Linear(in_features=28, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=8, bias=True)
  )
), 'logstd': Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)}


In [7]:
from cs285.infrastructure.utils import *

# Quick manual check that the agent's actor accepts an environment observation.
# Reset the environment and keep the value it returns as the observation.
ob = env.reset()
# Same object that cell In [6] already bound to `policy`; re-bound here so the
# cell does not depend on that earlier assignment.
policy = agent.actor
# sample_trajectory(env, policy, 300, False)
# Convert the observation to a float32 tensor before passing it to the policy.
obs = torch.tensor(ob, dtype=torch.float32)
# Calls the 'mean' entry of policy.parameters on the observation. The result is
# not assigned and this is not the last line of the cell, so it is discarded.
# NOTE(review): presumably the mean network of the policy; confirm against the
# policy class.
policy.parameters['mean'](obs)
# NOTE: this rebinds `a`, which cell In [6] used for the {'mean', 'logstd'} dict.
a = policy.get_action(obs)


tensor([[-1.2914,  1.2602, -0.5985,  1.0054, -1.0766, -1.1620, -1.0520,  0.6074]],
       grad_fn=<AddBackward0>)

In [8]:


# Build the behavior-cloning trainer from the shared `params` dict and run it.
# NOTE(review): the recorded run of this cell stopped in iteration 0 with
# "TypeError: string indices must be integers". No traceback was captured, so
# the failing call is unknown. Possibly the 'expert_data' path string is being
# indexed as if it were already-loaded data; verify inside RL_Trainer.
trainer = BC_Trainer(params)
trainer.run_training_loop()

########################
logging outputs to  d:\projects\re_papers\lectures\cs285\hw1\./cs285/data\bc_test_bc_ant_Ant-v2_2020-09-01-15-52-47
########################
Loading expert policy from ... cs285/policies/experts/Ant.pkl
Done restoring expert policy...


********** Iteration 0 ************


TypeError: string indices must be integers