# imports

In [1]:
import argparse
import yaml
import torch

import rlkit.torch.pytorch_util as ptu
from rlkit.data_management.torch_replay_buffer import TorchReplayBuffer
from rlkit.envs import make_env
from rlkit.envs.vecenv import SubprocVectorEnv, VectorEnv
from rlkit.launchers.launcher_util import set_seed, setup_logger
from rlkit.samplers.data_collector import (VecMdpPathCollector, VecMdpStepCollector)
from rlkit.torch.idsac.idsac import IDSACTrainer
from rlkit.torch.idsac.networks import QuantileMlp, Critic, softmax
from rlkit.torch.networks import FlattenMlp
from rlkit.torch.sac.policies import MakeDeterministic, TanhGaussianPolicy
from rlkit.torch.torch_iq_algorithm import TorchVecOnlineIQAlgorithm

# Size used for both the intra-op and the inter-op PyTorch thread pools.
NUM_TORCH_THREADS = 10
torch.set_num_threads(NUM_TORCH_THREADS)
# torch.set_num_interop_threads() may only be called once per process and
# raises RuntimeError on a second call. Skipping it when the pool already has
# the requested size keeps this cell re-runnable without a kernel restart,
# while a genuine conflict (different size already fixed) still raises.
if torch.get_num_interop_threads() != NUM_TORCH_THREADS:
    torch.set_num_interop_threads(NUM_TORCH_THREADS)

No personal conf_private.py found.
doodad not detected


In [7]:
import gym
# Stand-alone 'Ant-v2' environment; it is handed to TorchReplayBuffer in the
# next cell when building the scratch expert buffer.
env = gym.make('Ant-v2')

In [8]:
# Scratch buffer for inspecting the expert demonstrations. The first argument
# sits where experiment() passes a value derived from 'replay_buffer_size'.
expert_buffer = TorchReplayBuffer(10000, env)
# Positional arguments line up with the load() call inside experiment():
# expert path, demos, subsample frequency, seed.
expert_buffer.load('experts/Ant-v2_25.pkl', 0.25, 1, 0)

In [9]:
# Show the buffer's diagnostics as the cell output (the recorded run reports a 'size' entry).
expert_buffer.get_diagnostics()

OrderedDict([('size', 250)])

In [10]:
# Draw a random batch (argument 10) to eyeball the stored tensors; the recorded
# output is a dict with at least 'observations' and 'actions' tensors.
expert_buffer.random_batch(10)

{'observations': tensor([[ 0.7290,  0.9249, -0.0619,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.6150,  0.9505, -0.0318,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.4717,  0.9382,  0.0204,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.5975,  0.9034, -0.0074,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.8610,  0.9294, -0.0747,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.7022,  0.8926,  0.0258,  ...,  0.0000,  0.0000,  0.0000]]),
 'actions': tensor([[ 0.2778, -0.5681, -0.3774, -0.5610, -0.6768,  0.5890,  0.4642,  0.2426],
         [ 0.7474, -0.5049,  0.0566,  0.1307, -0.2907,  0.2747,  0.5495,  0.1118],
         [ 0.3070, -0.9549, -0.9439, -0.9228, -0.5230,  0.7307,  0.6126,  0.5333],
         [ 0.5126, -0.1134, -0.2657, -0.0662, -0.1495,  0.5631,  0.2768,  0.4151],
         [ 0.6558, -0.8908,  0.0688, -0.8468, -0.0483,  0.6869, -0.0747,  0.0988],
         [ 0.5004, -0.5434, -0.0555,  0.4589,  0.0439,  0.3971,  0.7375,  0.1133],
         [ 0.5232, -0.9651, -0.03

# experiment

In [None]:
def experiment(variant):
    """Assemble an IDSAC run from ``variant`` and train it.

    Creates the environments, quantile critics, policies, optional fraction
    proposal networks, data collectors, replay and expert buffers, the
    ``IDSACTrainer`` and the ``TorchVecOnlineIQAlgorithm``; then moves the
    algorithm to ``ptu.device`` and calls ``train()``.

    Args:
        variant: Configuration mapping. Keys read here: ``env``,
            ``expl_env_num``, ``eval_env_num``, ``seed``, ``layer_size``,
            ``num_quantiles``, ``replay_buffer_size``, ``trainer_kwargs``
            (must contain ``tau_type``), ``iq_kwargs`` (``expert_path``,
            ``demos``, ``subsample_freq``) and ``algorithm_kwargs``. The whole
            mapping is also passed to the trainer as ``args``.

    Returns:
        None.
    """
    env_id = variant['env']

    def build_env():
        # Factory handed to the vector envs; every call makes a fresh env.
        return make_env(env_id)

    # Single env used for the space sizes, the buffers and the trainer.
    dummy_env = build_env()
    obs_dim = dummy_env.observation_space.low.size
    action_dim = dummy_env.action_space.low.size

    # Exploration and evaluation envs share the same seed value.
    expl_env = VectorEnv([build_env for _ in range(variant['expl_env_num'])])
    seed = variant["seed"]
    expl_env.seed(seed)
    expl_env.action_space.seed(seed)
    eval_env = SubprocVectorEnv([build_env for _ in range(variant['eval_env_num'])])
    eval_env.seed(seed)

    width = variant["layer_size"]
    num_quantiles = variant["num_quantiles"]
    tau_type = variant["trainer_kwargs"]["tau_type"]
    # Critics and fraction proposal nets take observation and action sizes summed.
    critic_input_size = obs_dim + action_dim

    def build_quantile_critic():
        return QuantileMlp(
            input_size=critic_input_size,
            output_size=1,
            num_quantiles=num_quantiles,
            hidden_sizes=[width, width],
        )

    def build_policy():
        return TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[width, width, width],
        )

    def build_fraction_proposal():
        return FlattenMlp(
            input_size=critic_input_size,
            output_size=num_quantiles,
            hidden_sizes=[width // 2, width // 2],
            output_activation=softmax,
        )

    # Networks are created in a fixed order: online critics, target critics,
    # policy, target policy, then (optionally) the fraction proposal pair.
    zf1 = build_quantile_critic()
    zf2 = build_quantile_critic()
    target_zf1 = build_quantile_critic()
    target_zf2 = build_quantile_critic()

    policy = build_policy()
    eval_policy = MakeDeterministic(policy)
    target_policy = build_policy()

    # Fraction proposal networks exist only for the 'fqf' tau type.
    if tau_type == 'fqf':
        fp = build_fraction_proposal()
        target_fp = build_fraction_proposal()
    else:
        fp = None
        target_fp = None

    eval_path_collector = VecMdpPathCollector(eval_env, eval_policy, zf1, tau_type)
    expl_path_collector = VecMdpStepCollector(expl_env, policy)

    buffer_size = variant['replay_buffer_size']
    replay_buffer = TorchReplayBuffer(buffer_size, dummy_env)
    # The expert buffer is given a tenth of the replay buffer size.
    expert_buffer = TorchReplayBuffer(buffer_size // 10, dummy_env)
    iq_args = variant['iq_kwargs']
    expert_buffer.load(
        iq_args['expert_path'],
        iq_args['demos'],
        iq_args['subsample_freq'],
        variant['seed'],
    )

    trainer = IDSACTrainer(
        args=variant,
        env=dummy_env,
        policy=policy,
        target_policy=target_policy,
        zf1=zf1,
        zf2=zf2,
        target_zf1=target_zf1,
        target_zf2=target_zf2,
        fp=fp,
        target_fp=target_fp,
        num_quantiles=num_quantiles,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchVecOnlineIQAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        expert_buffer=expert_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()


# args

In [None]:
# Read the experiment configuration; `variant` is used by the setup cell and
# passed to experiment() further down.
with open('configs/ant.yaml', encoding="utf-8") as f:
    # NOTE(review): FullLoader accepts more YAML tags than yaml.safe_load;
    # consider safe_load if this config only holds plain data - confirm first.
    variant = yaml.load(f, yaml.FullLoader)

In [None]:
# Switch rlkit's pytorch_util to GPU mode (arguments: True, 0) when CUDA is available.
if torch.cuda.is_available():
    ptu.set_gpu_mode(True, 0)
seed = variant["seed"]
set_seed(seed)
# Logger prefix = lower-cased env id without its trailing "-v<N>" version tag.
# rpartition() copes with multi-digit versions (a fixed [:-3] slice would turn
# "Ant-v10" into "ant-") and leaves ids that carry no version tag intact.
env_id = variant["env"]
env_base, version_sep, version = env_id.rpartition("-v")
log_prefix = (env_base if version_sep and version.isdigit() else env_id).lower()
setup_logger(log_prefix, variant=variant, seed=seed)
# Stored after the setup_logger call; experiment() forwards the whole variant
# to the trainer, so the device travels with it.
variant["device"] = ptu.device

# main

In [None]:
# Start training with the configuration prepared in the cells above.
if __name__ == "__main__":
    experiment(variant)