# Run Bee World Environment

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
# !pip install gymnasium
# !pip install stable_baselines3

# !git clone https://github.com/alTeska/rl-bee-multimodal-sensing.git
# !mv rl-bee-multimodal-sensing/bee.py ./

In [3]:
import os

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation

from IPython.display import HTML

import torch
import torch.nn as nn

import gymnasium as gym
from bee import BeeWorld
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise


In [4]:
def display_video(frames, framerate=30):
  """Renders a sequence of RGB frames as an inline HTML5 video.

  Args:
    frames (ndarray): Array of shape (n_frames, height, width, 3). Must
      contain at least one frame.
    framerate (int): Frame rate in units of Hz. Must be positive.

  Returns:
    Display object (`IPython.display.HTML`) embedding the encoded video.

  Raises:
    ValueError: If `frames` is empty or `framerate` is not positive.
  """
  # Validate up front so the caller gets a clear message instead of an
  # IndexError from `frames[0]` or a ZeroDivisionError from the interval.
  if len(frames) == 0:
    raise ValueError("display_video() requires at least one frame.")
  if framerate <= 0:
    raise ValueError("framerate must be a positive number of frames per second.")

  height, width, _ = frames[0].shape
  dpi = 70
  orig_backend = matplotlib.get_backend()
  matplotlib.use('Agg')  # Switch to headless 'Agg' to inhibit figure rendering.
  try:
    fig, ax = plt.subplots(1, 1, figsize=(width / dpi, height / dpi), dpi=dpi)
  finally:
    # Restore the original backend even if figure creation fails.
    matplotlib.use(orig_backend)

  try:
    ax.set_axis_off()
    ax.set_aspect('equal')
    ax.set_position([0, 0, 1, 1])
    im = ax.imshow(frames[0])

    def update(frame):
      # Swap the image data in place; with blit=True only `im` is redrawn.
      im.set_data(frame)
      return [im]

    interval = 1000/framerate  # Delay between frames in milliseconds.
    anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                    interval=interval, blit=True, repeat=False)
    return HTML(anim.to_html5_video())
  finally:
    # The video is already encoded at this point; close the figure so that
    # repeated calls do not accumulate open figures in the kernel.
    plt.close(fig)

## Initialize Gym

In [5]:
# Register the custom BeeWorld class under a gym id so it can be built with
# `gym.make`; `max_episode_steps` is the per-episode step limit passed to gym.
bee_env_spec = dict(
    id="BeeWorld",
    entry_point=BeeWorld,
    max_episode_steps=3000,
)
gym.register(**bee_env_spec)

# Build the environment in "rgb_array" render mode, then show the result of
# the initial reset as the cell output.
env = gym.make(bee_env_spec["id"], render_mode="rgb_array")
env.reset()

({'vision': 0,
  'smell': array([0.0713274], dtype=float32),
  'velocity': array([0., 0.], dtype=float32),
  'time': array([0.001], dtype=float32)},
 {'is_success': False})

## Initialize the RL model

In [6]:
# Path template for saved models; `{}` is filled in with the algorithm name.
models_dir = "drive/MyDrive/neuromatch/models/{}"
model_alg = 'TD3'

# Directory that receives the training/evaluation logs.
logdir ="drive/MyDrive/logs"

# `exist_ok=True` already makes creation idempotent, so no `os.path.exists`
# guard is needed. The previous guards tested the unformatted template path
# (for both directories), which never matched the directory being created.
for directory in (models_dir.format(model_alg), logdir):
    os.makedirs(directory, exist_ok=True)

In [9]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor

# Wrap the training environment so its episode statistics are recorded
# under `logdir`.
env = Monitor(env, logdir, allow_early_resets=True)

# Evaluate on a separate environment instance. Passing the training env to
# EvalCallback would reset and step it in the middle of a training episode
# every time an evaluation runs, and would mix evaluation episodes into the
# training Monitor statistics.
eval_env = Monitor(gym.make("BeeWorld", render_mode="rgb_array"))

logger = configure("test_logs",["stdout", "csv", "log", "tensorboard", "json"])

# Stop training once the evaluation reward has not improved for 3 consecutive
# evaluations; the check only starts after the first 5 evaluations.
stop_train_callback = StopTrainingOnNoModelImprovement(max_no_improvement_evals=3, min_evals=5, verbose=1)

# Every 1000 training steps run 10 deterministic evaluation episodes, keep the
# best model under `models_dir`, and then consult the early-stopping callback.
eval_callback = EvalCallback(eval_env,
                             callback_after_eval=stop_train_callback,
                             best_model_save_path=models_dir.format(model_alg),
                             log_path=logdir,
                             eval_freq=1000,
                             n_eval_episodes=10,
                             deterministic=True,
                             render=False)

Logging to test_logs


In [10]:
# Gaussian exploration noise added to every action dimension during training.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
)


# "MultiInputPolicy" is used because the environment returns a dict
# observation (see the output of `env.reset()`).
model = TD3(
    "MultiInputPolicy",
    env,
    action_noise=action_noise,
    verbose=1,
    policy_kwargs = {
        "net_arch": [200, 200],  # Specify the number of hidden units per layer
        "activation_fn": nn.ReLU,  # Specify the activation function
    }   ,
    learning_rate=0.01
)
model.set_logger(logger)

vec_env = model.get_env()
obs = vec_env.reset()

# Train in chunks of `timesteps` steps. With `reset_num_timesteps=False` each
# `learn()` call continues the step counter from the previous call.
timesteps = 10000
max_iters = 10
for iters in range(1, max_iters + 1):
    target_timesteps = model.num_timesteps + timesteps

    model.learn(total_timesteps=timesteps, reset_num_timesteps=False, callback=eval_callback)
    # model.save(f"{models_dir.format(model_alg)}/{timesteps*iters}")

    # If `learn()` returned before reaching its step budget, the callback
    # requested a stop (no model improvement). Calling `learn()` again would
    # silently resume training and defeat the early stopping.
    if model.num_timesteps < target_timesteps:
        print(f"Early stopping after {model.num_timesteps} timesteps.")
        break


env.close()

Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 218      |
|    ep_rew_mean     | 943      |
|    success_rate    | 1        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 459      |
|    time_elapsed    | 1        |
|    total_timesteps | 872      |
| train/             |          |
|    actor_loss      | -22.8    |
|    critic_loss     | 184      |
|    learning_rate   | 0.01     |
|    n_updates       | 671      |
---------------------------------
Eval num_timesteps=1000, episode_reward=878.55 +/- 80.45
Episode length: 186.40 +/- 122.16
Success rate: 100.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 186      |
|    mean_reward     | 879      |
|    success_rate    | 1        |
| time/              |          |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss    