In [1]:
import pathlib
import numpy as np
import torch 
import random
import yaml

from stable_baselines3.ppo import MlpPolicy
from experiment3.RLHFAgent import RLHFAgent
from experiment3.AIRLAgent import AIRLAgent
from experiment3.Utils import Utils
from experiment3.Environment import Environment

In [2]:
def load_config(config_path, env_name):
    """Parse the YAML file at ``config_path`` and return its ``env_name`` entry.

    Args:
        config_path: Path of the YAML file to open for reading.
        env_name: Key looked up in the parsed document.

    Returns:
        Whatever the parsed document stores under ``env_name``.
    """
    with open(config_path, 'r') as stream:
        sections = yaml.safe_load(stream)
    selected_section = sections[env_name]
    return selected_section

In [4]:
# One YAML file holds every section; keys are the environment name
# followed directly by the algorithm suffix ("RLHF" / "AIRL").
config_path = "parameters.yaml"
env_name = "cartpole"  # or "pendulum"
config_rlhf, config_airl = (
    load_config(config_path, f"{env_name}{suffix}") for suffix in ("RLHF", "AIRL")
)

In [8]:
# Initialize environment
SEED = 42
if env_name == "cartpole":
    env = Environment("seals:seals/CartPole-v0", SEED, num_envs=8)
elif env_name == "pendulum":
    env = Environment("Pendulum-v1", SEED, num_envs=4)
else:
    # Without this branch an unrecognised name would leave `env` undefined
    # and surface only as a NameError on the next statement.
    raise ValueError(
        f"Unsupported env_name {env_name!r}; expected 'cartpole' or 'pendulum'"
    )

# Build the vectorised environment before any agent is created.
env.init_vec_env()
print(env.env_id)

seals:seals/CartPole-v0


# Training scripts

In [9]:
def train_rlhf(num_comparisons, exploration_frac=0.05, fragment_length=100):
    """Create an RLHF agent on the notebook-level ``env`` and train it on its own.

    Args:
        num_comparisons: Passed to ``train`` as ``total_human_comparisons``.
        exploration_frac: Forwarded to ``init_trajectory_gen``.
        fragment_length: Forwarded to ``train``.
    """
    agent = RLHFAgent(env_object=env)
    agent.init_gen_algo(config=config_rlhf, ac_policy=MlpPolicy, env_object=env)
    agent.init_trajectory_gen(env_object=env, exploration_frac=exploration_frac)

    # Stand-alone RLHF run (no AIRL initialisation); "rlhf_agent" is the save path.
    agent.train(
        save_path=pathlib.Path("rlhf_agent"),
        env_object=env,
        total_human_comparisons=num_comparisons,
        total_timesteps=400_000,
        fragment_length=fragment_length,
    )


In [10]:
def train_airl(expert_type, nr_demonstrations, random_prob=0, switch_prob=0):
    """Create an AIRL agent on the notebook-level ``env`` and train it on its own.

    Args:
        expert_type: Forwarded to ``AIRLAgent``.
        nr_demonstrations: Forwarded to ``AIRLAgent``.
        random_prob: Forwarded to ``AIRLAgent``; set together with
            ``switch_prob`` to introduce suboptimality in the expert
            demonstrations.
        switch_prob: Forwarded to ``AIRLAgent`` (see ``random_prob``).
    """
    agent = AIRLAgent(
        env_object=env,
        expert_type=expert_type,
        nr_demonstrations=nr_demonstrations,
        random_prob=random_prob,
        switch_prob=switch_prob,
    )
    agent.init_gen_algo(config=config_airl, ac_policy=MlpPolicy, env_object=env)

    # Stand-alone AIRL run with a fixed step budget.
    agent.train(env_object=env, train_steps=400_000)

In [11]:
def train_irlhf(reward_net_airl, num_comparisons, exploration_frac=0.05, fragment_length=100, num_it=60, initial_epoch_multiplier=4):
    """Train an RLHF agent whose reward and policy both start from an AIRL run.

    Args:
        reward_net_airl: Reward network handed to ``set_reward_from_airl``.
        num_comparisons: Passed to ``train`` as ``total_human_comparisons``.
        exploration_frac: Forwarded to ``init_trajectory_gen``.
        fragment_length: Forwarded to ``train``.
        num_it: Forwarded to ``train``.
        initial_epoch_multiplier: Forwarded to ``train``.
    """
    agent = RLHFAgent(env_object=env)
    agent.set_reward_from_airl(reward_net_airl, env_object=env)

    # Passing path_to_algo also loads the AIRL generator policy; drop the
    # argument (or pass None) to initialise from the AIRL reward only.
    agent.init_gen_algo(
        config=config_rlhf,
        ac_policy=MlpPolicy,
        env_object=env,
        path_to_algo="airl_agent/gen_policy/model.zip",
    )
    agent.init_trajectory_gen(env_object=env, exploration_frac=exploration_frac)

    # RLHF training on top of the AIRL-initialised reward and policy.
    agent.train(
        save_path=pathlib.Path("irlhf_agent"),
        env_object=env,
        total_human_comparisons=num_comparisons,
        total_timesteps=400_000,
        fragment_length=fragment_length,
        num_it=num_it,
        initial_epoch_multiplier=initial_epoch_multiplier,
    )
    

In [12]:
def get_reward_stats(path):
    """Load saved rewards with ``torch.load`` and summarise them.

    Args:
        path: Location of the file to load.

    Returns:
        A 3-tuple ``(rewards, mean, std)``: the loaded object followed by
        ``np.mean`` and ``np.std`` computed over it.
    """
    loaded_rewards = torch.load(path)
    reward_mean = np.mean(loaded_rewards)
    reward_std = np.std(loaded_rewards)
    return loaded_rewards, reward_mean, reward_std

In [13]:
from imitation.util.networks import RunningNorm
from imitation.policies.base import NormalizeFeaturesExtractor


def train_with_learned_reward_and_evaluate(
    reward_path,
    train_path,
    tensorboard_dir,
    tb_log_name,
    wandb_project_name,
    wandb_save_path,
    config,
):
    """Train a policy against a saved reward network, then evaluate that policy.

    Args:
        reward_path: File loaded with ``torch.load`` to obtain the reward network.
        train_path: Used both as the training ``save_path`` and as the
            ``load_path`` for evaluation.
        tensorboard_dir: Forwarded to ``Utils.train_with_learned_reward``.
        tb_log_name: Forwarded to ``Utils.train_with_learned_reward``.
        wandb_project_name: Forwarded to ``Utils.train_with_learned_reward``.
        wandb_save_path: Forwarded to ``Utils.train_with_learned_reward``.
        config: Forwarded to ``Utils.train_with_learned_reward``.

    Returns:
        ``(eval_mean, eval_std)`` as produced by
        ``Utils.evaluate_trained_agent_with_learned_reward``.
    """
    learned_reward = torch.load(reward_path)

    # The policy's feature extractor normalises its inputs with RunningNorm.
    normalizing_policy_kwargs = dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    )

    Utils.train_with_learned_reward(
        learned_reward=learned_reward,
        save_path=train_path,
        config=config,
        ac_policy=MlpPolicy,
        tensorboard_dir=tensorboard_dir,
        tb_log_name=tb_log_name,
        env_object=env,
        wandb_project_name=wandb_project_name,
        wandb_save_path=wandb_save_path,
        policy_kwargs=normalizing_policy_kwargs,
    )

    # Evaluation reads the policy back from the path training just wrote to.
    eval_mean, eval_std = Utils.evaluate_trained_agent_with_learned_reward(
        load_path=train_path, venv=env.venv
    )
    return eval_mean, eval_std

In [14]:
def set_seed(seed: int) -> None:
    """Seed Python, NumPy and PyTorch, then re-seed the notebook-level ``env``.

    Args:
        seed: Value applied to every random number generator touched here.
    """
    # CPU-side generators: Python's `random`, NumPy's legacy global, PyTorch.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Point the shared environment wrapper at the new seed, rebuild its
    # vectorised env via init_vec_env(), and seed the resulting venv.
    env.seed = seed
    env.rng = np.random.default_rng(seed)
    env.init_vec_env()
    env.venv.seed(seed)

# Training the AIRL agent with different seeds and expert demonstration settings

In [17]:
def main_train_airl(expert_type, nr_demonstrations, random_prob=0, switch_prob=0):
    """Train and evaluate an AIRL agent once per seed and return the evaluation means.

    For every seed the function re-seeds everything via ``set_seed``, trains
    AIRL, loads the saved learner rewards, then trains a policy on the learned
    reward and evaluates it.

    Args:
        expert_type: Forwarded to ``train_airl``.
        nr_demonstrations: Forwarded to ``train_airl``.
        random_prob: Forwarded to ``train_airl``.
        switch_prob: Forwarded to ``train_airl``.

    Returns:
        list: One single-element list per seed, holding the evaluation mean
        obtained for that seed (same nesting as ``main_train`` produces).
    """
    rewards_over_seeds = []
    seed_list = [34, 43, 52, 61, 70]

    for seed in seed_list:
        set_seed(seed)

        train_airl(
            expert_type=expert_type,
            nr_demonstrations=nr_demonstrations,
            random_prob=random_prob,
            switch_prob=switch_prob,
        )

        # Kept from the original flow: loading the learner rewards fails
        # right here if training did not write the expected file.
        get_reward_stats("./airl_agent/learner_rewards.pt")

        eval_mean, _eval_std = train_with_learned_reward_and_evaluate(
            "airl_agent/reward_net.pt",
            "airl_agent/airl_agent_trained_with_learned_reward",
            "./airl_cartpole_tensorboard",
            f"run_4_expert_with_{seed}",
            wandb_project_name="airl",
            wandb_save_path=f"models/run_4_expert_with_{seed}",
            config=config_airl,
        )

        rewards_over_seeds.append([eval_mean])

    # Previously the collected results were dropped when the function ended.
    return rewards_over_seeds

In [None]:
# AIRL on a suboptimal expert: random_prob / switch_prob add the suboptimality.
main_train_airl(
    expert_type="suboptimal",
    nr_demonstrations=60,
    random_prob=0.25,
    switch_prob=0.2,
)

# Training RLHF / IRLHF with different seeds and different numbers of queries

In [10]:
def main_train(agent_name, path, expert_type=None, nr_demonstrations=None, random_prob=0, switch_prob=0):
    """Train and evaluate an RLHF or IRLHF agent for every seed and comparison budget.

    For each seed (after ``set_seed``) and each entry of ``comparisons_list``:
    the reward model is trained ("rlhf": RLHF alone; "irlhf": AIRL first, then
    RLHF initialised from the AIRL reward and policy), the saved learner
    rewards are loaded from ``path``, and a policy is trained on the learned
    reward and evaluated.

    Args:
        agent_name: ``"rlhf"`` or ``"irlhf"``.
        path: File with saved learner rewards, read via ``get_reward_stats``.
        expert_type: Forwarded to ``train_airl``; required for ``"irlhf"``.
        nr_demonstrations: Forwarded to ``train_airl``; required for ``"irlhf"``.
        random_prob: Forwarded to ``train_airl`` (``"irlhf"`` only).
        switch_prob: Forwarded to ``train_airl`` (``"irlhf"`` only).

    Returns:
        list: ``result[i][j]`` is the evaluation mean for ``seed_list[i]`` and
        ``comparisons_list[j]``.

    Raises:
        ValueError: If ``agent_name`` is not recognised, or if ``"irlhf"`` is
            requested without ``expert_type`` / ``nr_demonstrations``.
    """
    # Validate before any (expensive) training starts; an unknown name used to
    # skip training silently and record None as the evaluation result.
    if agent_name not in ("rlhf", "irlhf"):
        raise ValueError(f"Unknown agent_name {agent_name!r}; expected 'rlhf' or 'irlhf'")
    if agent_name == "irlhf" and (expert_type is None or nr_demonstrations is None):
        raise ValueError("agent_name='irlhf' requires expert_type and nr_demonstrations")

    rewards_over_seeds = []  # [i][j]: eval mean for seed_list[i], comparisons_list[j]
    seed_list = [34, 43, 52, 61, 70]
    comparisons_list = [1400]

    for seed in seed_list:
        set_seed(seed)

        eval_policy_mean = []

        for num_comparisons in comparisons_list:
            # --- Learn the reward function ---
            if agent_name == "rlhf":
                train_rlhf(num_comparisons)
            else:
                # Train AIRL under the current seed, then hand its reward
                # network to the RLHF stage.
                train_airl(expert_type=expert_type, nr_demonstrations=nr_demonstrations,
                           random_prob=random_prob, switch_prob=switch_prob)
                reward_net_airl = torch.load("airl_agent/reward_net.pt")
                if num_comparisons == 10:
                    # Special case kept from the original: fewer iterations
                    # when the comparison budget is 10.
                    train_irlhf(reward_net_airl, num_comparisons, num_it=8)
                else:
                    train_irlhf(reward_net_airl, num_comparisons)

            _, mean, std = get_reward_stats(path)

            # --- Optimise a policy on the learned reward and evaluate it ---
            if agent_name == "rlhf":
                eval_mean, eval_std = train_with_learned_reward_and_evaluate(
                    reward_path="rlhf_agent/reward_net.pt",
                    train_path="rlhf_agent/rlhf_agent_trained_with_learned_reward",
                    tensorboard_dir="./ppo_rlhf_cartpole_tensorboard/",
                    tb_log_name=f"run5_comparisons_{num_comparisons}_with_seed_{seed}",
                    wandb_project_name="rlhf",
                    wandb_save_path=f"models/run_comparisons_{num_comparisons}_with_seed_{seed}",
                    config=config_rlhf
                )
            else:
                eval_mean, eval_std = train_with_learned_reward_and_evaluate(
                    reward_path="irlhf_agent/reward_net.pt",
                    train_path="irlhf_agent/irlhf_agent_trained_with_learned_reward",
                    tensorboard_dir="./ppo_irlhf_cartpole_tensorboard_3/",
                    tb_log_name=f"run_irlhf_comparisons_{num_comparisons}_demonstrations_{nr_demonstrations}_with_seed_{seed}_with_{expert_type}_expert",
                    wandb_project_name="irlhf",
                    wandb_save_path=f"models/run_comparisons_{num_comparisons}_with_seed_{seed}_irlhf",
                    config=config_rlhf
                )

            eval_policy_mean.append(eval_mean)

            print(f"number of comparisons = {num_comparisons}: mean={mean}, std={std}")
            print(f"number of comparisons = {num_comparisons}: eval_mean={eval_mean}, eval_std={eval_std}")

        rewards_over_seeds.append(eval_policy_mean)

    # Previously the collected results were dropped when the function ended.
    return rewards_over_seeds

In [11]:
# main_train("rlhf", "./rlhf_agent/learner_rewards.pt")

In [None]:
# One IRLHF sweep per demonstration count, using the suboptimal expert.
for demonstrations in (1400,):
    main_train(
        "irlhf",
        "./irlhf_agent/learner_rewards.pt",
        expert_type="suboptimal",
        nr_demonstrations=demonstrations,
        random_prob=0.25,
        switch_prob=0.2,
    )