In [33]:
import sys
sys.path.append("../../")
sys.path.append("../../models/Memory_RL")

# from environments.Passive_T_Maze_Flag.env.env_passive_t_maze_flag import TMazeClassicPassive
from models.Memory_RL.envs.tmaze import TMazeClassicPassive
from models.Memory_RL.policies.models.policy_rnn_dqn import ModelFreeOffPolicy_DQN_RNN
import os 

import numpy as np
import gym
import matplotlib.pyplot as plt
import random
import torch
import yaml
import time
from moviepy.editor import ImageSequenceClip, VideoFileClip


In [34]:
from configs.rl.name_fns import name_fn as name_fn1
from ml_collections import ConfigDict
from typing import Tuple
from torchkit import pytorch_utils as ptu

def dqn_name_fn(
    config: ConfigDict, max_episode_steps: int, max_training_steps: int
) -> Tuple[ConfigDict, str]:
    """Derive the epsilon schedule fields of a DQN config and its run name.

    The base config/name pair comes from ``name_fn1`` (imported from
    ``configs.rl.name_fns``); this function then writes ``init_eps``,
    ``end_eps`` and ``schedule_steps`` onto the returned config.

    Args:
        config: RL config; must expose ``schedule_end``.
        max_episode_steps: Episode horizon T; the final epsilon is ``1 / T``.
        max_training_steps: Scaled by ``config.schedule_end`` to give
            ``schedule_steps``.

    Returns:
        The ``(config, name)`` pair, with the config updated in place.
    """
    config, run_name = name_fn1(config)

    # Exploration starts fully random.
    config.init_eps = 1.0

    # Final eps = 1/T: the asymptotic probability of sampling a fully
    # exploited trajectory during exploration is then (1 - 1/T)^T = 1/e.
    final_eps = 1.0 / max_episode_steps
    config.end_eps = final_eps

    # schedule_end is a fraction of the total number of training steps.
    anneal_fraction = config.schedule_end
    config.schedule_steps = anneal_fraction * max_training_steps

    return config, run_name


def get_rl_config():
    """Build a fresh ``ConfigDict`` holding the default DQN settings.

    ``name_fn`` is set to ``dqn_name_fn``, which later fills in the epsilon
    schedule from ``schedule_end``.

    Returns:
        ConfigDict: the newly created RL config.
    """
    rl_config = ConfigDict()
    rl_config.name_fn = dqn_name_fn
    rl_config.algo = "dqn"

    # Critic learning rate and network layout.
    rl_config.critic_lr = 3e-4
    critic = ConfigDict()
    critic.hidden_dims = (256, 256)
    rl_config.config_critic = critic

    # Remaining scalar settings, written in the original field order.
    scalar_fields = (
        ("discount", 0.99),
        ("tau", 0.005),
        ("schedule_end", 0.1),  # at least good for TMaze-like envs
        ("replay_buffer_size", 1e6),
        ("replay_buffer_num_episodes", 1e3),
    )
    for field, value in scalar_fields:
        setattr(rl_config, field, value)

    return rl_config

In [35]:
from ml_collections import ConfigDict
from typing import Tuple


def name_fn(config: ConfigDict, max_episode_steps: int) -> Tuple[ConfigDict, str]:
    """Resolve the sampled sequence length and build the run-name prefix.

    Args:
        config: Sequence-model config. It is modified in place:
            ``sampled_seq_len`` is replaced when it equals ``-1`` and the
            ``name_fn`` field is deleted.
        max_episode_steps: Value substituted for ``sampled_seq_len == -1``.

    Returns:
        ``(config, name)`` where ``name`` is
        ``"<seq model name>-len-<sampled_seq_len>/"``.

    Raises:
        AssertionError: if ``config.clip`` is not ``False``.
    """
    # -1 is a placeholder that gets replaced by the episode horizon.
    length_is_placeholder = config.sampled_seq_len == -1
    if length_is_placeholder:
        config.sampled_seq_len = max_episode_steps

    seq_model_name = config.model.seq_model_config.name
    run_name = "" + "{}-len-{}/".format(seq_model_name, config.sampled_seq_len)

    assert config.clip is False

    # The hook itself is removed from the config before it is handed back.
    del config.name_fn
    return config, run_name


def get_seq_config():
    """Build a fresh ``ConfigDict`` describing the LSTM sequence model.

    ``sampled_seq_len`` is left at ``-1``; ``name_fn`` (stored on the config)
    replaces that value with the episode horizon when it is called.

    Returns:
        ConfigDict: the newly created sequence-model config.
    """

    def _mlp_embedder(hidden_size):
        # One embedder sub-config; all three embedders use the "mlp" type.
        embedder = ConfigDict()
        embedder.name = "mlp"
        embedder.hidden_size = hidden_size
        return embedder

    seq_config = ConfigDict()
    seq_config.name_fn = name_fn

    # Model-family switches.
    seq_config.is_markov = False
    seq_config.is_attn = False
    seq_config.use_dropout = False

    seq_config.sampled_seq_len = -1

    # Gradient clipping / normalisation settings.
    seq_config.clip = False
    seq_config.max_norm = 1.0
    seq_config.use_l2_norm = False

    # Sequence-model specific settings.
    seq_model = ConfigDict()
    seq_model.name = "lstm"
    seq_model.hidden_size = 128
    seq_model.n_layer = 1

    # Everything under ``model`` is fed into the Module.
    model = ConfigDict()
    model.seq_model_config = seq_model
    model.observ_embedder = _mlp_embedder(32)
    model.action_embedder = _mlp_embedder(16)
    model.reward_embedder = _mlp_embedder(0)
    seq_config.model = model

    return seq_config

In [36]:
from itertools import permutations

def generate_permutations(nums):
    """Return every ordering of ``nums`` encoded as a single integer.

    The elements of each permutation are converted with ``str``, concatenated
    and parsed with ``int`` (``[1, 2, 3]`` -> ``123, 132, 213, ...``). Results
    follow the order produced by ``itertools.permutations``.

    Note:
        Because the encoding is plain string concatenation, duplicate elements
        give duplicate results, a leading ``0`` is dropped by ``int``, and
        elements whose ``str`` form is not made of digits can raise
        ``ValueError``.

    Args:
        nums: Iterable of values to permute (single digits in this notebook).

    Returns:
        list[int]: One integer per permutation; ``[]`` when ``nums`` is empty.
    """
    items = list(nums)
    if not items:
        # permutations([]) yields one empty tuple and int("") raises
        # ValueError, so an empty input is answered with an empty list.
        return []

    return [int("".join(map(str, perm))) for perm in permutations(items)]

In [37]:

# AGENT_CLASSES = {
#     "Policy_MLP": Policy_MLP,
#     "Policy_RNN_MLP": Policy_RNN_MLP,
#     "Policy_Separate_RNN": Policy_Separate_RNN,
#     "Policy_Shared_RNN": Policy_Shared_RNN,
#     "Policy_DQN_RNN": Policy_DQN_RNN,
# }
# Select GPU 0 for torchkit's helpers. The "set device: cuda:0" line in this
# cell's output presumably comes from this call — confirm in torchkit.
# NOTE(review): CUDA is hard-coded here and in `device` below; there is no CPU
# fallback for machines without a GPU.
from torchkit.pytorch_utils import set_gpu_mode
set_gpu_mode('cuda', 0)

# Agent implementation used for the rest of the notebook, plus its ARCH attribute.
agent_class = ModelFreeOffPolicy_DQN_RNN
agent_arch = agent_class.ARCH

# Make cuda:0 the default device for newly created torch tensors.
device = torch.device('cuda:0')
torch.set_default_device(device)

set device: cuda:0


In [20]:


# Episode budget; the corridor length and the penalty are both derived from it.
episode_timeout = 31
corridor_length = episode_timeout - 2
# (episode_timeout - 1) * penalty == -1. Presumably the penalty is applied per
# step so that a full-length episode sums to about -1 — confirm in the env.
penalty = -1/(episode_timeout - 1)


# Passive T-maze environment that the agent is evaluated on below.
env = TMazeClassicPassive(episode_length=episode_timeout, 
                            corridor_length=corridor_length, 
                            goal_reward=1.0,
                            penalty=penalty)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [21]:
# Values consumed by the two name functions: name_fn substitutes
# max_episode_steps for sampled_seq_len == -1, and dqn_name_fn derives
# end_eps = 1 / max_episode_steps and schedule_steps from them.
# NOTE(review): the environment is created with episode_timeout = 31 while
# max_episode_steps is 15 here — confirm this matches the checkpoint's setup.
max_episode_steps = 15
max_training_steps = 999

# Only the resolved configs are kept; the generated run names are discarded.
config_seq, _ = name_fn(get_seq_config(), max_episode_steps = max_episode_steps)
config_rl, _ = dqn_name_fn(config = get_rl_config(), max_episode_steps =max_episode_steps , max_training_steps =max_training_steps)



In [22]:
# Image-encoder factory that always returns None (no image encoder is built here).
image_encoder_fn = lambda: None

# Observation size is taken from the first dimension of the env's observation space.
obs_dim = env.observation_space.shape[0]
# NOTE(review): the action count is hard-coded — confirm it matches the env's action space.
act_dim = 4

freeze_critic = False

# Build the recurrent DQN agent from the two configs and move it to `device`.
agent = agent_class(
    obs_dim=obs_dim,
    action_dim=act_dim,
    config_seq=config_seq,
    config_rl=config_rl,
    image_encoder_fn=image_encoder_fn,
    freeze_critic=freeze_critic,
).to(device)

# Checkpoint 1

In [31]:
# NOTE(review): absolute, machine-specific checkpoint path. Its directory is
# named MinigridMemory while the environment built in this notebook is
# TMazeClassicPassive — confirm this is the intended checkpoint.
ckpt_path = '/opt/Memory-RL-Codebase/models/Memory_RL/logs_2024_09_30_23_00/MinigridMemory/LSTM_DQN/SHORT_TERM/2024_09_30-17_33_03/best_agent.pt'
# ckpt_path = '/opt/Memory-RL-Codebase/models/Memory_RL/logs_2024_09_30_12_00/LSTM_DQN/2024_09_30-02_28_51/curr_agent.pt'

In [None]:
# Restore the agent's weights from ckpt_path, mapping stored tensors onto `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the file is a plain state dict.
agent.load_state_dict(torch.load(ckpt_path, map_location=device))


In [25]:
from utils import helpers as utl

# Forwarded as `deterministic=` to agent.act in the evaluation loop. False
# presumably leaves the agent's exploration on — confirm in the agent class.
deterministic = False
# eval_episodes = 2

In [26]:
# The agent is already moved to `device` when it is constructed; this repeats the move.
agent = agent.to(device)

In [27]:
# One evaluation episode per permutation of `nums`, each permutation written as one integer.
nums = [1, 2, 3, 4, 5]
eval_seeds = generate_permutations(nums)

# NOTE(review): videos_limit and render are not read by any cell visible in
# this part of the notebook.
videos_limit = len(eval_seeds) + 1
n_episode = len(eval_seeds)


render = False

# Running totals that the evaluation loop adds to.
total_reward = 0
num_successes = 0
total_steps = 0

In [28]:

agent.eval()  # set to eval mode for deterministic dropout

# Per-episode records; entry i belongs to episode i.
returns_per_episode = np.zeros(n_episode)
success_rate = np.zeros(n_episode)

# Start the running totals from zero inside this cell, so that re-running the
# cell does not keep adding to totals left over from a previous run.
total_reward = 0
num_successes = 0
total_steps = 0

# Seeding through env.reset(seed=...) is switched off (the original guard was
# `eval_seeds is not None and False`), so every episode uses a plain reset and
# the seed printed per episode is only a label.
# NOTE(review): confirm that env.reset accepts `seed` before enabling this.
use_eval_seeds = False

for task_idx in range(n_episode):
    step = 0
    running_reward = 0.0
    done_rollout = False

    if use_eval_seeds and eval_seeds is not None:
        obs = ptu.from_numpy(env.reset(seed=eval_seeds[task_idx])).to(device)  # reset
    else:
        obs = ptu.from_numpy(env.reset()).to(device)  # reset

    # Add a leading dimension of size 1 in front of the observation vector.
    obs = obs.reshape(1, obs.shape[-1])

    # assume initial reward = 0.0
    action, reward, internal_state = agent.get_initial_info(
        config_seq.sampled_seq_len
    )

    while not done_rollout:
        action, internal_state = agent.act(
            prev_internal_state=internal_state,
            prev_action=action.to(device),
            reward=reward.to(device),
            obs=obs.to(device),
            deterministic=deterministic,
        )

        # observe reward and next obs
        next_obs, reward, done, info = utl.env_step(
            env, action.squeeze(dim=0)
        )

        # add raw reward
        running_reward += reward.item()
        step += 1
        done_rollout = bool(ptu.get_numpy(done[0][0]) != 0.0)

        # set: obs <- next_obs
        obs = next_obs.clone()

    # Record the episode return (the array was previously allocated but never filled).
    returns_per_episode[task_idx] = running_reward
    if "success" in info and info["success"] == True:  # keytodoor
        success_rate[task_idx] = 1.0
        num_successes += 1

    total_reward += running_reward
    total_steps += step

    curr_seed = eval_seeds[task_idx]
    print(f'Episode: {task_idx}, seed: {curr_seed} Reward: {running_reward}, Steps: {step} Mean reward: {total_reward / (task_idx + 1)}, Mean steps: {total_steps / (task_idx + 1)}')


# Avoid ZeroDivisionError in the summary when no episodes were run.
episodes_denominator = max(n_episode, 1)
print(f'Total num episodes: {n_episode} Success rate: {num_successes / episodes_denominator}, Mean reward: {total_reward / episodes_denominator}, Mean steps: {total_steps / episodes_denominator}')


Episode: 0, seed: 12345 Reward: -0.9999999962747097, Steps: 30 Mean reward: -0.9999999962747097, Mean steps: 30.0
Episode: 1, seed: 12354 Reward: -0.931034479290247, Steps: 30 Mean reward: -0.9655172377824783, Mean steps: 30.0
Episode: 2, seed: 12435 Reward: -0.8965517207980156, Steps: 30 Mean reward: -0.9425287321209908, Mean steps: 30.0
Episode: 3, seed: 12453 Reward: -0.931034479290247, Steps: 30 Mean reward: -0.9396551689133048, Mean steps: 30.0
Episode: 4, seed: 12534 Reward: -0.6896551698446274, Steps: 30 Mean reward: -0.8896551690995693, Mean steps: 30.0
Episode: 5, seed: 12543 Reward: -0.41379310190677643, Steps: 30 Mean reward: -0.8103448245674372, Mean steps: 30.0
Episode: 6, seed: 13245 Reward: -0.6896551698446274, Steps: 30 Mean reward: -0.7931034453213215, Mean steps: 30.0
Episode: 7, seed: 13254 Reward: -0.931034479290247, Steps: 30 Mean reward: -0.8103448245674372, Mean steps: 30.0
Episode: 8, seed: 13425 Reward: -0.7586206868290901, Steps: 30 Mean reward: -0.80459769815