## experiment (tqc)

In [None]:
def experiment_tqc(variant):
    """Assemble and run the single-``Critic`` (``n_nets``) IQ training pipeline.

    Builds the environments, networks, data collectors, replay/expert buffers,
    a ``TruncIDSACTrainer`` and a ``TorchVecOnlineIQAlgorithm`` from
    ``variant``, moves the algorithm to ``ptu.device`` and calls ``train()``.

    Args:
        variant: Configuration mapping. Keys read here: ``env``,
            ``expl_env_num``, ``eval_env_num``, ``seed``, ``layer_size``,
            ``num_quantiles``, ``n_nets``, ``replay_buffer_size``,
            ``trainer_kwargs`` (``tau_type`` is optional), ``iq_kwargs``
            (``expert_path``, ``demos``, ``subsample_freq``) and
            ``algorithm_kwargs``. The mapping itself is also passed to the
            trainer as ``args``.

    Returns:
        None.
    """
    # NOTE(review): ``TruncIDSACTrainer`` is not among the imports visible in
    # this notebook (only ``IDSACTrainer`` is) -- confirm it is imported
    # before this function is called.

    def env_factories(count):
        # One zero-argument environment constructor per vectorised worker.
        return [lambda: make_env(variant['env']) for _ in range(count)]

    # A stand-alone environment, used for its space sizes and handed to the
    # buffers and the trainer.
    spec_env = make_env(variant['env'])
    obs_dim = spec_env.observation_space.low.size
    action_dim = spec_env.action_space.low.size

    expl_env = VectorEnv(env_factories(variant['expl_env_num']))
    expl_env.seed(variant["seed"])
    expl_env.action_space.seed(variant["seed"])
    eval_env = SubprocVectorEnv(env_factories(variant['eval_env_num']))
    eval_env.seed(variant["seed"])

    width = variant['layer_size']
    num_quantiles = variant['num_quantiles']
    n_nets = variant['n_nets']

    def new_critic():
        # Online and target critics share one architecture.
        return Critic(
            input_size=obs_dim + action_dim,
            output_size=1,
            num_quantiles=num_quantiles,
            hidden_sizes=[width, width],
            n_nets=n_nets,
        )

    zf = new_critic()
    target_zf = new_critic()

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[width, width],
    )
    eval_policy = MakeDeterministic(policy)

    # Fraction proposal networks are only built for tau_type == 'fqf'.
    if variant['trainer_kwargs'].get('tau_type') == 'fqf':

        def new_fraction_net():
            return FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=num_quantiles,
                hidden_sizes=[width // 2, width // 2],
                output_activation=softmax,
            )

        fp = new_fraction_net()
        target_fp = new_fraction_net()
    else:
        fp = target_fp = None

    eval_path_collector = VecMdpPathCollector(eval_env, eval_policy)
    expl_path_collector = VecMdpStepCollector(expl_env, policy)

    replay_buffer = TorchReplayBuffer(variant['replay_buffer_size'], spec_env)
    # The expert buffer is given a tenth of the replay buffer's size.
    expert_buffer = TorchReplayBuffer(variant['replay_buffer_size'] // 10, spec_env)
    iq_args = variant['iq_kwargs']
    expert_buffer.load(
        iq_args['expert_path'],
        iq_args['demos'],
        iq_args['subsample_freq'],
        variant['seed'],
    )

    trainer = TruncIDSACTrainer(
        args=variant,
        env=spec_env,
        policy=policy,
        zf=zf,
        target_zf=target_zf,
        fp=fp,
        target_fp=target_fp,
        num_quantiles=num_quantiles,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchVecOnlineIQAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        expert_buffer=expert_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()


# imports

In [1]:
import argparse
import yaml
import torch

import rlkit.torch.pytorch_util as ptu
from rlkit.data_management.torch_replay_buffer import TorchReplayBuffer
from rlkit.envs import make_env
from rlkit.envs.vecenv import SubprocVectorEnv, VectorEnv
from rlkit.launchers.launcher_util import set_seed, setup_logger
from rlkit.samplers.data_collector import (VecMdpPathCollector, VecMdpStepCollector)
from rlkit.torch.idsac.idsac import IDSACTrainer
from rlkit.torch.idsac.networks import QuantileMlp, Critic, softmax
from rlkit.torch.networks import FlattenMlp
from rlkit.torch.sac.policies import MakeDeterministic, TanhGaussianPolicy
from rlkit.torch.torch_iq_algorithm import TorchVecOnlineIQAlgorithm

# Cap PyTorch's CPU parallelism: 4 intra-op threads and 4 inter-op threads.
torch.set_num_threads(4)
try:
    torch.set_num_interop_threads(4)
except RuntimeError:
    # PyTorch only lets the inter-op pool size be set once per process, so
    # re-running this cell in a live kernel raises. Tolerate that only when
    # the pool already has the size requested here; otherwise surface it.
    if torch.get_num_interop_threads() != 4:
        raise

No personal conf_private.py found.
doodad not detected


# experiment (original)

In [2]:
def experiment(variant):
    """Assemble and run the twin-``QuantileMlp`` IQ training pipeline.

    Builds the environments, four quantile networks (two online, two target),
    an online and a target policy, data collectors, replay/expert buffers, an
    ``IDSACTrainer`` and a ``TorchVecOnlineIQAlgorithm`` from ``variant``,
    moves the algorithm to ``ptu.device`` and calls ``train()``.

    Args:
        variant: Configuration mapping. Keys read here: ``env``,
            ``expl_env_num``, ``eval_env_num``, ``seed``, ``layer_size``,
            ``num_quantiles``, ``replay_buffer_size``, ``trainer_kwargs``
            (must contain ``tau_type``), ``iq_kwargs`` (``expert_path``,
            ``demos``, ``subsample_freq``) and ``algorithm_kwargs``. The
            mapping itself is also passed to the trainer as ``args``.

    Returns:
        None.
    """

    def env_factories(count):
        # One zero-argument environment constructor per vectorised worker.
        return [lambda: make_env(variant['env']) for _ in range(count)]

    # A stand-alone environment, used for its space sizes and handed to the
    # buffers and the trainer.
    spec_env = make_env(variant['env'])
    obs_dim = spec_env.observation_space.low.size
    action_dim = spec_env.action_space.low.size

    expl_env = VectorEnv(env_factories(variant['expl_env_num']))
    expl_env.seed(variant["seed"])
    expl_env.action_space.seed(variant["seed"])
    eval_env = SubprocVectorEnv(env_factories(variant['eval_env_num']))
    eval_env.seed(variant["seed"])

    width = variant["layer_size"]
    num_quantiles = variant["num_quantiles"]
    tau_type = variant["trainer_kwargs"]["tau_type"]

    def new_quantile_net():
        # All four quantile networks share one architecture.
        return QuantileMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            num_quantiles=num_quantiles,
            hidden_sizes=[width, width],
        )

    def new_policy():
        # Online and target policies share one architecture.
        return TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[width, width, width // 2],
        )

    # Construction order is kept: zf1, zf2, target_zf1, target_zf2.
    zf1, zf2, target_zf1, target_zf2 = (new_quantile_net() for _ in range(4))

    policy = new_policy()
    eval_policy = MakeDeterministic(policy)
    target_policy = new_policy()

    # Fraction proposal networks are only built for tau_type == 'fqf'.
    if variant['trainer_kwargs'].get('tau_type') == 'fqf':

        def new_fraction_net():
            return FlattenMlp(
                input_size=obs_dim + action_dim,
                output_size=num_quantiles,
                hidden_sizes=[width // 2, width // 2],
                output_activation=softmax,
            )

        fp = new_fraction_net()
        target_fp = new_fraction_net()
    else:
        fp = target_fp = None

    # The evaluation collector is additionally given zf1 and the tau type.
    eval_path_collector = VecMdpPathCollector(eval_env, eval_policy, zf1, tau_type)
    expl_path_collector = VecMdpStepCollector(expl_env, policy)

    replay_buffer = TorchReplayBuffer(variant['replay_buffer_size'], spec_env)
    # The expert buffer is given a tenth of the replay buffer's size.
    expert_buffer = TorchReplayBuffer(variant['replay_buffer_size'] // 10, spec_env)
    iq_args = variant['iq_kwargs']
    expert_buffer.load(
        iq_args['expert_path'],
        iq_args['demos'],
        iq_args['subsample_freq'],
        variant['seed'],
    )

    trainer = IDSACTrainer(
        args=variant,
        env=spec_env,
        policy=policy,
        target_policy=target_policy,
        zf1=zf1,
        zf2=zf2,
        target_zf1=target_zf1,
        target_zf2=target_zf2,
        fp=fp,
        target_fp=target_fp,
        num_quantiles=num_quantiles,
        **variant['trainer_kwargs'],
    )
    algorithm = TorchVecOnlineIQAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        expert_buffer=expert_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()


# args

In [3]:
def get_config(dsac_cfg_path,
               expert_path,
               iq_cfg_path='configs/dsac-normal-iqn-neutral/iq.yaml',
               cql_cfg_path='configs/dsac-normal-iqn-neutral/cql.yaml'
              ):
    """Load the run configuration and attach the IQ and CQL sub-configs.

    Args:
        dsac_cfg_path: Path of the main YAML config; its parsed content is the
            returned ``variant``.
        expert_path: Stored under ``variant['iq_kwargs']['expert_path']``.
        iq_cfg_path: Path of the YAML file stored as ``variant['iq_kwargs']``.
        cql_cfg_path: Path of the YAML file stored as ``variant['cql_kwargs']``.

    Returns:
        The parsed main config with ``iq_kwargs`` and ``cql_kwargs`` added.
    """

    def read_yaml(path):
        # NOTE(review): FullLoader is kept as in the original; if the config
        # files could come from an untrusted source, consider yaml.safe_load.
        with open(path, 'r', encoding="utf-8") as stream:
            return yaml.load(stream, Loader=yaml.FullLoader)

    # Files are read in the same order as before: main, IQ, CQL.
    variant, iq_cfg, cql_cfg = (
        read_yaml(path) for path in (dsac_cfg_path, iq_cfg_path, cql_cfg_path)
    )

    iq_cfg['expert_path'] = expert_path
    variant['iq_kwargs'] = iq_cfg
    variant['cql_kwargs'] = cql_cfg
    return variant

In [4]:
# Load the Ant config and point the IQ settings at the Ant-v2 expert file;
# the IQ and CQL config paths fall back to get_config's defaults.
variant = get_config(
    dsac_cfg_path='configs/dsac-normal-iqn-neutral/ant.yaml',
    expert_path='experts/Ant-v2_25.pkl',
)

In [5]:
def _strip_env_version(env_id):
    """Drop a trailing ``-v<digits>`` tag from an environment id ("Ant-v2" -> "Ant").

    Ids without such a tag are returned unchanged, so two-digit versions
    ("Foo-v10") and unversioned ids are handled too.
    """
    base, sep, version = env_id.rpartition("-v")
    return base if sep and version.isdigit() else env_id


# Enable rlkit's GPU mode when CUDA is available (second argument is
# presumably the GPU index -- confirm against ptu.set_gpu_mode).
if torch.cuda.is_available():
    ptu.set_gpu_mode(True, 0)

seed = variant["seed"]
set_seed(seed)

# Log prefix: "idsac_<env name, lower-cased, version tag removed>_<version>".
log_prefix = "_".join(["idsac", _strip_env_version(variant["env"]).lower(), str(variant["version"])])
setup_logger(log_prefix, variant=variant, seed=seed)

# Added only after the variant has been passed to setup_logger (original order kept).
variant["device"] = ptu.device

2024-06-10 20:36:09.647462 +0330 | Variant:
2024-06-10 20:36:09.649086 +0330 | {
  "algorithm_kwargs": {
    "batch_size": 256,
    "max_path_length": 1000,
    "min_num_steps_before_training": 10000,
    "num_epochs": 300,
    "num_eval_paths_per_epoch": 10,
    "num_expl_steps_per_train_loop": 1000,
    "num_trains_per_train_loop": 1000
  },
  "env": "Ant-v2",
  "seed": 0,
  "expectation_z": false,
  "eval_env_num": 10,
  "expl_env_num": 10,
  "layer_size": 256,
  "num_quantiles": 24,
  "replay_buffer_size": 1000000,
  "trainer_kwargs": {
    "alpha": 0.01,
    "discount": 0.99,
    "policy_lr": 7.5e-05,
    "soft_target_tau": 0.005,
    "target_update_period": 1,
    "tau_type": "iqn",
    "use_automatic_entropy_tuning": false,
    "zf_lr": 0.0003,
    "bias": 5,
    "bias_lr": 0.001,
    "use_automatic_bias_tuning": true
  },
  "version": "normal-iqn-neutral",
  "iq_kwargs": {
    "expert_path": "experts/Ant-v2_25.pkl",
    "subsample_freq": 1,
    "demos": 10,
    "regularize": tr

# main

In [6]:
# Start training with the `experiment` pipeline on the loaded `variant`.
# The guard keeps this from running if the code is imported as a module.
if __name__ == "__main__":
    experiment(variant)



2024-06-10 20:36:32.060121 +0330 | [idsac_ant_normal-iqn-neutral_2024_06_10_20_36_09_0000--s-0] Epoch 0 finished
---------------------------------  ---------------
replay_buffer/size                 11000
trainer/ZF1 Loss                       6.63922
trainer/ZF2 Loss                       6.48249
trainer/ZF Expert Reward               0.452431
trainer/ZF Policy Reward               0.599272
trainer/ZF CHI2 Term                   1.11882
trainer/Policy Loss                   -0.00745451
trainer/Bias Loss                     10.3762
trainer/Bias Value                     4.999
trainer/Policy Grad Norm               0.059052
trainer/Policy Param Norm             14.6355
trainer/Zf1 Grad Norm                 11.3205
trainer/Zf1 Param Norm                32.0301
trainer/Zf2 Grad Norm                 15.1294
trainer/Zf2 Param Norm                32.0622
trainer/Z Expert Predictions Mean      0.0583173
trainer/Z Expert Predictions Std       0.16814
trainer/Z Expert Predictions Max       0.77