In [1]:
import datetime

import ray

# Shut down any Ray runtime already attached to this kernel before starting a new one,
# so that re-running the cell does not reuse an old session.
ray.shutdown()
# Start a local Ray runtime with the dashboard enabled and metrics exported on port 8080.
# ignore_reinit_error=True keeps a repeated init from being fatal.
# NOTE(review): num_gpus=1 is hard-coded — confirm the host actually exposes one GPU.
ray.init(ignore_reinit_error=True, num_gpus=1, _metrics_export_port=8080, include_dashboard=True,
         configure_logging=False)

0,1
Python version:,3.9.13
Ray version:,2.9.1
Dashboard:,http://127.0.0.1:8265


In [2]:
from minigrid.wrappers import FlatObsWrapper


class CustomFlatObsWrapper(FlatObsWrapper):
    """Project-specific subclass of MiniGrid's ``FlatObsWrapper``.

    It currently adds no behaviour of its own: the constructor only forwards ``env``
    to the parent class. It exists as a named hook for later customisation.
    """

    def __init__(self, env):
        """Wrap ``env`` exactly as ``FlatObsWrapper`` would."""
        super().__init__(env)

pygame 2.5.2 (SDL 2.28.3, Python 3.9.13)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
from gymnasium.envs.registration import register

# NOTE(review): `register` is already imported from gymnasium at the top of this cell, so
# if gymnasium were missing that import would have raised first — the `except` branch
# below is effectively unreachable.
try:
    import gymnasium as gym

    # NOTE(review): this binds the name `gymnasium` to a bool flag, not to the module.
    gymnasium = True
except Exception:
    import gym

    gymnasium = False

ENV_ID = "MiniGrid-CustomMultiRoom-N6-v0"

# Register the custom environment under ENV_ID; the entry point is resolved lazily from
# the `custom_env` module and episodes are capped at 1000 steps.
register(
    id=ENV_ID,
    entry_point='custom_env:CustomMultiRoomEnv',
    max_episode_steps=1000,
)


In [4]:
from custom_env import CustomMultiRoomEnv

try:
    import gymnasium as gym

    gymnasium = True
except Exception:
    import gym

    gymnasium = False
from ray.tune import register_env


def env_creator(env_config=None):
    """Build a flattened-observation ``CustomMultiRoomEnv``.

    Args:
        env_config: Optional mapping of keyword overrides for ``CustomMultiRoomEnv``
            (for example ``{"render_mode": "human"}``). ``None`` or an empty mapping
            means "use the defaults below unchanged".

    Returns:
        The environment wrapped in ``CustomFlatObsWrapper``.
    """
    config = {
        "agent_start_pos": (1, 1),
        "agent_start_dir": 0,
        "goal_pos": (15, 15),
        "minNumRooms": 2,
        "maxNumRooms": 5,
        "enable_dowham": True,
        "max_episode_steps": 1000,
        # `**None` raises TypeError, so fall back to an empty mapping when the caller
        # relies on the default argument. Caller-supplied keys override the defaults.
        **(env_config or {}),
    }
    env = CustomMultiRoomEnv(**config)
    # The bare environment is reset once before wrapping.
    # NOTE(review): presumably so the grid exists before first use — confirm it is needed.
    env.reset()
    env = CustomFlatObsWrapper(env)
    return env


# Register env_creator with Ray Tune under the name "my_minigrid_env"; this is the name
# passed to DQNConfig().environment(...) when the algorithm is configured.
register_env("my_minigrid_env", env_creator)


In [5]:
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib import BaseEnv, Policy
from typing import Dict, Union, Optional
from ray.rllib import BaseEnv, Policy
from ray.rllib.evaluation import Episode
from ray.rllib.evaluation.episode_v2 import EpisodeV2
from ray.rllib.utils.typing import PolicyID


class AccuracyCallback(DefaultCallbacks):
    """RLlib callback that attaches a ``mean_accuracy`` custom metric to finished episodes.

    The metric is currently a constant placeholder (``0``); no accuracy is computed yet.
    """

    def on_episode_end(
            self,
            *,
            worker: "RolloutWorker",
            base_env: BaseEnv,
            policies: Dict[PolicyID, Policy],
            episode: Union[Episode, EpisodeV2, Exception],
            env_index: Optional[int] = None,
            **kwargs,
    ) -> None:
        """Run the default end-of-episode hook, then record ``mean_accuracy``.

        Args:
            worker: Rollout worker that produced the episode.
            base_env: Environment container the episode ran in.
            policies: Mapping from policy id to policy object.
            episode: The finished episode; per the annotation it may also be an
                ``Exception`` instance, in which case no metric is written.
            env_index: Index of the sub-environment that finished, if any.
            **kwargs: Forwarded unchanged to the parent hook.
        """
        # Zero-argument super() resolves to DefaultCallbacks.on_episode_end.
        # super(DefaultCallbacks, self) would start the MRO lookup *after*
        # DefaultCallbacks and therefore skip the very hook this call is meant to run.
        super().on_episode_end(worker=worker, base_env=base_env,
                               policies=policies, episode=episode,
                               env_index=env_index, **kwargs)

        # An Exception object has no `custom_metrics`; leave failed episodes untouched.
        if isinstance(episode, Exception):
            return

        episode.custom_metrics["mean_accuracy"] = 0

In [6]:
from ray.rllib.algorithms.dqn import DQNConfig
from ray.air import RunConfig
from ray import train

# Epsilon-greedy schedule: start fully random and anneal epsilon from 1.0 to 0.1
# over the first 10000 timesteps.
exploration_settings = {
    "type": "EpsilonGreedy",
    "initial_epsilon": 1.0,
    "final_epsilon": 0.1,
    "epsilon_timesteps": 10000,
}

# DQN on the registered MiniGrid environment: 20 vectorised envs sampled in the local
# worker (no remote rollout workers), with mean/std observation filtering.
tune_config = (
    DQNConfig()
    .environment("my_minigrid_env")
    .rollouts(
        num_envs_per_worker=20,
        observation_filter="MeanStdFilter",
        num_rollout_workers=0,
    )
    .exploration(explore=True, exploration_config=exploration_settings)
    .training()
)

# Plain-dict form of the same configuration, for APIs that expect a dict.
tune_config_dict = tune_config.to_dict()

# Stop criteria: whichever limit is reached first ends a run.
# A wall-clock limit can be added with a "time_total_s" entry (seconds) if required.
stop = dict(
    training_iteration=10000,  # upper bound on training iterations
    timesteps_total=5000000,  # upper bound on sampled environment steps
)

# Checkpoint retention: keep the five best checkpoints ranked by "mean_accuracy".
checkpoint_settings = train.CheckpointConfig(
    checkpoint_score_attribute="mean_accuracy",
    num_to_keep=5,
)

# Run-level settings: experiment name, stop criteria, output location, checkpointing.
run_config = RunConfig(
    name="new_experiment",
    stop=stop,
    storage_path="C:\\Users\\BerkayEren\\PycharmProjects\\rl-learning\\ray_results",
    checkpoint_config=checkpoint_settings,
)

In [None]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

from ray.tune import register_env

# Register the custom environment (repeated here so this cell can run on its own
# once env_creator is defined).
register_env("my_minigrid_env", env_creator)

tuner = tune.Tuner(
    "DQN",
    param_space={
        # Base DQN settings go FIRST. In a dict literal later keys win, so unpacking the
        # base config last would silently replace the "lr" search space and the custom
        # "callbacks" entry below with the base config's own values.
        **tune_config_dict,
        "lr": tune.loguniform(0.001, 0.1),
        # NOTE(review): "momentum" and "should_checkpoint" do not look like DQN
        # settings — confirm they have any effect beyond multiplying the trial count.
        "momentum": tune.grid_search([0.8, 0.9, 0.99]),
        "should_checkpoint": True,
        "callbacks": AccuracyCallback,
    },
    run_config=train.RunConfig(
        name="new_experiment",
        stop=stop,
        checkpoint_config=train.CheckpointConfig(
            num_to_keep=5,
            checkpoint_at_end=True,
            checkpoint_frequency=50
        ),
        storage_path="C:\\Users\\BerkayEren\\PycharmProjects\\rl-learning\\ray_results",
    ),
    # Maximise mean episode reward; two samples per grid point, with ASHA early stopping.
    tune_config=tune.TuneConfig(mode="max", metric="episode_reward_mean", num_samples=2, scheduler=ASHAScheduler(), ),

)
# Start the tuning process
result = tuner.fit()


In [None]:
import os
import torch

# Select the trial with the highest episode_reward_mean.
# `result` must be the object returned by tuner.fit(); re-run the tuning cell first if
# that name has since been rebound to something else.
best_result = result.get_best_result("episode_reward_mean", mode="max")
# Print the directory that holds the best trial's checkpoint.
with best_result.checkpoint.as_directory() as checkpoint_dir:
    print(checkpoint_dir)



In [7]:
from ray.rllib.algorithms import DQN
from ray.tune.logger import pretty_print
import numpy as np

# Checkpoint that appears to come from one of the "new_experiment" Tune trials.
# Kept for reference only; it is not the path restored below.
TUNE_TRIAL_CHECKPOINT = "C:/Users/BerkayEren/PycharmProjects/rl-learning/ray_results/new_experiment/DQN_my_minigrid_env_f93a7_00003_3_momentum=0.8000_2024-04-19_21-31-01/checkpoint_000001"
# Single directory this cell both restores from and saves to, so the two cannot drift apart.
CHECKPOINT_DIR = "C:/Users/BerkayEren/PycharmProjects/rl-learning/ray_results/checkpoints"
TRAIN_ITERATIONS = 1000
CHECKPOINT_EVERY = 50

best_result_path = CHECKPOINT_DIR
trainer = DQN(config=tune_config_dict)
trainer.restore(best_result_path)

for trial in range(TRAIN_ITERATIONS):
    print(f"Running trial {trial + 1}")
    # Bound to its own name so the `result` returned by tuner.fit() is not overwritten.
    train_result = trainer.train()

    # Save the trainer every CHECKPOINT_EVERY iterations
    if (trial + 1) % CHECKPOINT_EVERY == 0:
        checkpoint_path = trainer.save(CHECKPOINT_DIR)
        print(f"Saved checkpoint to: {checkpoint_path.checkpoint.path}")

# Metrics of the last completed training iteration.
print(pretty_print(train_result))

# best_result = result.get_best_result()
# print(best_result)
# best_checkpoint = best_result.get_best_checkpoint('episode_reward_mean', 'max')
# 
# trainer = DQN(config=tune_config_dict)
# # You can now restore your trainer from this checkpoint as shown previously
# trainer.restore(best_checkpoint)

# One {agent position: visit count} dict per evaluation episode
all_observations = []

for trial in range(10):
    print(f"Running trial {trial + 1}")
    env = env_creator({"render_mode": "human"})
    observation, info = env.reset()
    done = False
    action = None
    reward = 0

    visited_states = {}

    while not done:
        # Compute the action using the trained policy.
        # NOTE(review): `explore` is not passed, so the algorithm's configured exploration
        # setting presumably applies — consider explore=False for a greedy evaluation.
        action = trainer.compute_single_action(observation=observation, prev_action=action, prev_reward=reward)

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        # The episode is over on EITHER flag; looking only at `terminated` would spin
        # forever once the step limit truncates an episode that never reaches the goal.
        observation, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Read the position from the base environment rather than relying on the
        # wrapper forwarding attributes; tuple() guarantees a hashable dict key.
        position = tuple(env.unwrapped.agent_pos)
        visited_states[position] = visited_states.get(position, 0) + 1

        # Render the environment
        env.render()

    all_observations.append(visited_states)


`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


Running trial 1
Running trial 2
Running trial 3
Running trial 4
Running trial 5
Running trial 6
Running trial 7
Running trial 8
Running trial 9
Running trial 10
Running trial 11
Running trial 12
Running trial 13
Running trial 14
Running trial 15
Running trial 16
Running trial 17
Running trial 18
Running trial 19
Running trial 20
Running trial 21
Running trial 22
Running trial 23
Running trial 24
Running trial 25
Running trial 26
Running trial 27
Running trial 28
Running trial 29
Running trial 30
Running trial 31
Running trial 32
Running trial 33
Running trial 34
Running trial 35
Running trial 36
Running trial 37
Running trial 38
Running trial 39
Running trial 40
Running trial 41
Running trial 42
Running trial 43
Running trial 44
Running trial 45
Running trial 46
Running trial 47
Running trial 48
Running trial 49
Running trial 50
Saved checkpoint to: C:/Users/BerkayEren/PycharmProjects/rl-learning/ray_results/checkpoints
Running trial 51
Running trial 52
Running trial 53
Running trial 5

KeyboardInterrupt: 

In [None]:
from numpy import inf
from minigrid import Wall
import matplotlib.pyplot as plt

# Create a 2D array representing the environment
# Grid-shaped array indexed [x, y]. Every cell starts at 0; wall cells are set to 10
# by the scan further below.
env_array = np.full((env.width, env.height), 0)


def find_goal_position(env):
    """Return the ``(x, y)`` coordinates of the first goal cell in ``env.grid``.

    Cells are scanned column by column (x outermost, y innermost). If no cell has
    ``type == 'goal'``, ``(None, None)`` is returned.
    """
    coordinates = ((col, row) for col in range(env.width) for row in range(env.height))
    for col, row in coordinates:
        cell = env.grid.get(col, row)
        if cell is None:
            continue
        if cell.type == 'goal':
            return col, row
    return None, None


goal_x, goal_y = find_goal_position(env)

# Collect the (x, y) coordinates of every wall cell, scanning x outermost and y innermost.
wall_cells = [
    (col, row)
    for col in range(env.width)
    for row in range(env.height)
    if isinstance(env.grid.get(col, row), Wall)
]

# Mark walls in the environment array with a high value (10).
for col, row in wall_cells:
    env_array[col, row] = 10

# Separate coordinate lists for the scatter overlay.
wall_x = [col for col, _ in wall_cells]
wall_y = [row for _, row in wall_cells]

# Generate and save a heatmap for each trial
for i, trial_observations in enumerate(all_observations):
    visited_states_array = np.zeros((env.width, env.height))

    # Iterate over the visited states
    for position, count in trial_observations.items():
        # Set the value at the agent's position in the array to the visit count
        visited_states_array[position] = count

    # Overlay the visited states array on the environment array
    heatmap_array = env_array + visited_states_array
    # Generate the heatmap
    plt.imshow(heatmap_array, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.scatter(wall_x, wall_y, c='white', marker='s', s=100)
    plt.scatter(goal_x, goal_y, c='green', marker='*', s=200)
    plt.title(f'Heatmap of Visited States in Trial {i + 1}')

    date_string = datetime.datetime.now().strftime("%Y-%m-%d")

    # Save the heatmap to a file
    plt.savefig(f'heatmaps/heatmap_dowham_trial_{i + 1}_{date_string}.png')

    # Clear the current figure so the next heatmap doesn't overlap with this one
    plt.clf()