In [1]:
import pathlib
import numpy as np
import torch as th
import random

from stable_baselines3.ppo import MlpPolicy
from experiment3.RLHFAgent import RLHFAgent
from experiment3.AIRLAgent import AIRLAgent
from experiment3.Utils import Utils
from experiment3.Environment import Environment

In [2]:
# Shared experiment configuration: base seed, environment id and the value passed as num_envs.
SEED = 42
ENV_ID = "seals:seals/CartPole-v0"
NUM_ENVS = 8

# Create the project's Environment wrapper and initialise its vectorised env.
# init_vec_env() stays the last expression so the cell still displays its return value.
env = Environment(ENV_ID, SEED, num_envs=NUM_ENVS)
env.init_vec_env()

                We allow to pass a mode argument to maintain a backwards compatible VecEnv API, but the mode (rgb_array)
                has to be the same as the environment render mode (None) which is not the case.


<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x2d02c035600>

In [3]:
def train_rlhf(num_comparisons, exploration_frac=0.05, fragment_length=100):
    """Create a fresh RLHFAgent on the shared ``env`` and train it on its own.

    Args:
        num_comparisons: Forwarded to ``RLHFAgent.train`` as ``total_human_comparisons``.
        exploration_frac: Forwarded to ``RLHFAgent.init_trajectory_gen``.
        fragment_length: Forwarded to ``RLHFAgent.train``.

    Training always uses ``total_timesteps=400_000`` and ``save_path="rlhf_agent"``.
    Nothing is returned.
    """
    agent = RLHFAgent(env_object=env)

    # Set up the PPO generator first, then the trajectory generator.
    agent.init_gen_algo(policy_name="PPO", ac_policy=MlpPolicy, env_object=env)
    agent.init_trajectory_gen(env_object=env, exploration_frac=exploration_frac)

    train_kwargs = dict(
        save_path=pathlib.Path("rlhf_agent"),
        env_object=env,
        total_human_comparisons=num_comparisons,
        total_timesteps=400_000,
        fragment_length=fragment_length,
    )
    agent.train(**train_kwargs)


In [4]:
def train_airl(train_steps=20_000):
    """Create a fresh AIRLAgent on the shared ``env`` and train it on its own.

    Args:
        train_steps: Forwarded to ``AIRLAgent.train``. The default (20_000) is the
            value that used to be hard-coded here. According to the original note
            in this cell, the intended settings are: >400k for a 100% expert,
            280k for 70%, 200k for 50%, 80k for 20% and 20k for 5%.

    Nothing is returned.
    """
    airl_agent = AIRLAgent(env_object=env)
    airl_agent.init_gen_algo(ac_policy=MlpPolicy, env_object=env)

    # Train the AIRL agent alone for the requested number of steps.
    airl_agent.train(env_object=env, train_steps=train_steps)

In [5]:
def train_irlhf(reward_net_irl, num_comparisons, exploration_frac=0.05, fragment_length=100, num_it=60, initial_epoch_multiplier=4):
    """Train an RLHFAgent whose reward is initialised from an AIRL reward network.

    Args:
        reward_net_irl: Reward network handed to ``RLHFAgent.set_reward_from_airl``.
        num_comparisons: Forwarded to ``RLHFAgent.train`` as ``total_human_comparisons``.
        exploration_frac: Forwarded to ``RLHFAgent.init_trajectory_gen``.
        fragment_length: Forwarded to ``RLHFAgent.train``.
        num_it: Forwarded to ``RLHFAgent.train``.
        initial_epoch_multiplier: Forwarded to ``RLHFAgent.train``.

    Training always uses ``total_timesteps=400_000`` and ``save_path="irlhf_agent"``.
    Nothing is returned.
    """
    agent = RLHFAgent(env_object=env)

    # The AIRL reward is installed before the generator and trajectory generator are built.
    agent.set_reward_from_airl(reward_net_irl)
    agent.init_gen_algo(policy_name="PPO", ac_policy=MlpPolicy, env_object=env)
    agent.init_trajectory_gen(env_object=env, exploration_frac=exploration_frac)

    train_kwargs = dict(
        save_path=pathlib.Path("irlhf_agent"),
        env_object=env,
        total_human_comparisons=num_comparisons,
        total_timesteps=400_000,
        fragment_length=fragment_length,
        num_it=num_it,
        initial_epoch_multiplier=initial_epoch_multiplier,
    )
    agent.train(**train_kwargs)
    

In [6]:
def get_reward_stats(path):
    """Load the object stored at ``path`` and summarise it.

    Args:
        path: File readable by ``torch.load``.

    Returns:
        A 3-tuple ``(rewards, mean, std)``: the loaded object followed by
        ``np.mean`` and ``np.std`` of it.
    """
    # torch.load unpickles its input; only point this at files you trust.
    loaded_rewards = th.load(path)
    reward_mean = np.mean(loaded_rewards)
    reward_std = np.std(loaded_rewards)
    return loaded_rewards, reward_mean, reward_std

In [7]:
from imitation.util.networks import RunningNorm
from imitation.policies.base import NormalizeFeaturesExtractor


def train_with_learned_reward_and_evaluate(reward_path, train_path, tensorboard_dir, tb_log_name, 
                                           wandb_project_name, wandb_save_path,
                                           batch_size=64, lr=0.001, gamma=0.98, clip_range=0.2, n_epochs=20
                                           ):
    """Train a policy on a saved reward network, then evaluate it on the shared env.

    Args:
        reward_path: File holding the learned reward network (read with ``torch.load``).
        train_path: Passed as ``save_path`` for training and as ``load_path`` for evaluation.
        tensorboard_dir: Forwarded to ``Utils.train_with_learned_reward``.
        tb_log_name: Forwarded to ``Utils.train_with_learned_reward``.
        wandb_project_name: Forwarded to ``Utils.train_with_learned_reward``.
        wandb_save_path: Forwarded to ``Utils.train_with_learned_reward``.
        batch_size, lr, gamma, clip_range, n_epochs: Hyperparameters forwarded
            unchanged to ``Utils.train_with_learned_reward``.

    Returns:
        ``(eval_mean, eval_std)`` as produced by
        ``Utils.evaluate_trained_agent_with_learned_reward``.
    """
    # torch.load unpickles its input; only load reward networks you produced yourself.
    learned_reward = th.load(reward_path)

    # Policy features go through NormalizeFeaturesExtractor configured with RunningNorm.
    normalizing_policy_kwargs = dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    )

    Utils.train_with_learned_reward(
        learned_reward=learned_reward,
        save_path=train_path,
        ac_policy=MlpPolicy,
        tensorboard_dir=tensorboard_dir,
        tb_log_name=tb_log_name,
        env_object=env,
        wandb_project_name=wandb_project_name,
        wandb_save_path=wandb_save_path,
        batch_size=batch_size,
        lr=lr,
        gamma=gamma,
        clip_range=clip_range,
        n_epochs=n_epochs,
        policy_kwargs=normalizing_policy_kwargs,
    )

    mean_return, std_return = Utils.evaluate_trained_agent_with_learned_reward(
        load_path=train_path, venv=env.venv
    )
    return mean_return, std_return

In [8]:
def set_seed(seed: int) -> None:
    """Seed every random source used by the notebook and rebuild the shared env.

    Args:
        seed: Value applied to Python's ``random``, NumPy's global generator,
            PyTorch (CPU, plus CUDA when available) and the shared ``env``.

    Side effects:
        Overwrites ``env.seed`` and ``env.rng``, calls ``env.init_vec_env()``
        again and seeds ``env.venv``.
    """
    # Global generators: Python, NumPy, PyTorch - in that order.
    for seed_fn in (random.seed, np.random.seed, th.manual_seed):
        seed_fn(seed)

    cuda_present = th.cuda.is_available()
    if cuda_present:
        th.cuda.manual_seed(seed)
        th.cuda.manual_seed_all(seed)

    # Re-create the vectorised env under the new seed, then seed it explicitly.
    env.seed = seed
    env.rng = np.random.default_rng(seed)
    env.init_vec_env()
    env.venv.seed(seed)

# Training the AIRL agent with different seeds and different numbers of demonstrations

In [9]:
def main_train_airl():
    """Run the AIRL-only experiment once per seed and collect the evaluation means.

    For every seed this function reseeds everything via ``set_seed``, trains
    AIRL with ``train_airl()``, loads the learner reward statistics from
    ``./airl_agent/learner_rewards.pt``, then trains and evaluates a policy on
    the learned reward stored in ``airl_agent/reward_net.pt``.

    Returns:
        list: One entry per seed, in ``seed_list`` order. Each entry is a list
        holding the evaluation mean obtained for that seed. Previously this
        list was built but discarded; it is now returned to the caller.
    """
    rewards_over_seeds = []
    seed_list = [79]

    for seed in seed_list:
        # Reseed all generators and rebuild the shared env for this run.
        set_seed(seed)

        train_airl()

        # Only the summary statistics are needed here; the raw rewards are ignored.
        _, mean, std = get_reward_stats("./airl_agent/learner_rewards.pt")

        eval_mean, eval_std = train_with_learned_reward_and_evaluate(
            "airl_agent/reward_net.pt",
            "airl_agent/airl_agent_trained_with_learned_reward",
            "./airl_cartpole_tensorboard",
            f"run_3_expert_with_{seed}",
            batch_size=64, lr=0.0005, gamma=0.95, clip_range=0.1, n_epochs=5,
            wandb_project_name="airl",
            wandb_save_path=f"models/run_3_expert_with_{seed}",
        )

        # Report per-seed results, mirroring what main_train prints.
        print(f"seed = {seed}: mean={mean}, std={std}")
        print(f"seed = {seed}: eval_mean={eval_mean}, eval_std={eval_std}")

        # One list per seed keeps the same nesting as main_train's result.
        eval_policy_mean = [eval_mean]
        rewards_over_seeds.append(eval_policy_mean)

    return rewards_over_seeds

In [None]:
# Run the AIRL-only experiment. This cell has no execution count in the saved
# notebook, i.e. it was not run in the session that produced the stored outputs.
main_train_airl()

# Training RLHF / IRLHF with different seeds and different numbers of queries

In [10]:
def main_train(agent_name, path):
    """Run the RLHF or IRLHF experiment over all seeds and comparison budgets.

    For every seed and every entry of ``comparisons_list`` this function trains
    the selected agent, loads the learner reward statistics from ``path``,
    then trains and evaluates a policy on the learned reward network.

    Args:
        agent_name: ``"rlhf"`` to train RLHF alone, or ``"irlhf"`` to first
            train AIRL and use its reward network to initialise RLHF.
        path: File passed to ``get_reward_stats`` after each training run.

    Returns:
        list: One entry per seed, in ``seed_list`` order. Each entry is a list
        of evaluation means, one per value in ``comparisons_list``. Previously
        this list was built but discarded; it is now returned to the caller.

    Raises:
        ValueError: If ``agent_name`` is neither ``"rlhf"`` nor ``"irlhf"``.
            Before this check an unknown name trained nothing and silently
            recorded ``None`` as the evaluation result.
    """
    if agent_name not in ("rlhf", "irlhf"):
        raise ValueError(
            f"Unsupported agent_name {agent_name!r}; expected 'rlhf' or 'irlhf'."
        )

    rewards_over_seeds = []  # [0]: list of eval means for the first seed, one per comparison budget
    seed_list = [25, 34, 43, 52, 61, 70, 79]
    comparisons_list = [700]

    for seed in seed_list:
        # Reseed all generators and rebuild the shared env for this run.
        set_seed(seed)

        eval_policy_mean = []

        for num_comparisons in comparisons_list:
            if agent_name == "rlhf":
                train_rlhf(num_comparisons)
            else:
                # Train AIRL under the current seed, then reuse its reward network.
                train_airl()
                # torch.load unpickles its input; this file was just written by train_airl().
                reward_net_irl = th.load("airl_agent/reward_net.pt")
                # A budget of 10 comparisons uses fewer iterations than any other budget.
                num_it = 8 if num_comparisons == 10 else 60
                train_irlhf(reward_net_irl, num_comparisons, exploration_frac=0.05,
                            fragment_length=100, num_it=num_it, initial_epoch_multiplier=4)

            # Only the summary statistics are needed here; the raw rewards are ignored.
            _, mean, std = get_reward_stats(path)

            if agent_name == "rlhf":
                eval_mean, eval_std = train_with_learned_reward_and_evaluate(
                    reward_path="rlhf_agent/reward_net.pt",
                    train_path="rlhf_agent/rlhf_agent_trained_with_learned_reward",
                    tensorboard_dir="./ppo_rlhf_cartpole_tensorboard/",
                    tb_log_name=f"run5_comparisons_{num_comparisons}_with_seed_{seed}",
                    wandb_project_name="rlhf",
                    wandb_save_path=f"models/run_comparisons_{num_comparisons}_with_seed_{seed}"
                )
            else:
                eval_mean, eval_std = train_with_learned_reward_and_evaluate(
                    reward_path="irlhf_agent/reward_net.pt",
                    train_path="irlhf_agent/irlhf_agent_trained_with_learned_reward",
                    tensorboard_dir="./ppo_irlhf_cartpole_tensorboard/",
                    tb_log_name=f"run7_comparisons_{num_comparisons}_with_seed_{seed}_irlhf_5%AIRLexpert",
                    wandb_project_name="irlhf",
                    wandb_save_path=f"models/run_comparisons_{num_comparisons}_with_seed_{seed}_irlhf"
                )

            eval_policy_mean.append(eval_mean)

            print(f"number of comparisons = {num_comparisons}: mean={mean}, std={std}")
            print(f"number of comparisons = {num_comparisons}: eval_mean={eval_mean}, eval_std={eval_std}")

        rewards_over_seeds.append(eval_policy_mean)

    return rewards_over_seeds

In [11]:
# Run the seed sweep for one agent type; swap which line is commented out to
# choose between RLHF and IRLHF.
# NOTE(review): the saved output of this cell stops with
# "AttributeError: 'AIRL' object has no attribute 'reward_net'" right after the
# AIRL training round. The raising frame is not shown in the output - presumably
# it is inside experiment3.AIRLAgent; confirm and fix it there.
# main_train("rlhf", "./rlhf_agent/learner_rewards.pt")
main_train("irlhf", "./irlhf_agent/learner_rewards.pt")

                We allow to pass a mode argument to maintain a backwards compatible VecEnv API, but the mode (rgb_array)
                has to be the same as the environment render mode (None) which is not the case.


Expert stats:  {'n_traj': 64, 'return_min': 500.0, 'return_mean': 500.0, 'return_std': 0.0, 'return_max': 500.0, 'len_min': 500, 'len_mean': 500.0, 'len_std': 0.0, 'len_max': 500}


round:   0%|          | 0/1 [00:00<?, ?it/s]

------------------------------------------
| raw/                        |          |
|    gen/rollout/ep_len_mean  | 500      |
|    gen/rollout/ep_rew_mean  | 32.6     |
|    gen/time/fps             | 2924     |
|    gen/time/iterations      | 1        |
|    gen/time/time_elapsed    | 5        |
|    gen/time/total_timesteps | 16384    |
------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.509    |
|    disc/disc_acc_expert             | 1        |
|    disc/disc_acc_gen                | 0.0176   |
|    disc/disc_entropy                | 0.635    |
|    disc/disc_loss                   | 0.76     |
|    disc/disc_proportion_expert_pred | 0.991    |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 2.05e+03 |
|    disc/n_generated                 | 2.05e+03 |
-

round: 100%|██████████| 1/1 [00:11<00:00, 11.39s/it]


Rewards before training: 206.74 +/- 186.23751609168332
Rewards after training: 8.31 +/- 0.7706490770772388


AttributeError: 'AIRL' object has no attribute 'reward_net'