In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

import argparse

from habitat.datasets import make_dataset
from ss_baselines.av_nav.config import get_config
from ss_baselines.common.environments import AudioNavRLEnv

# Experiment YAML for the MP3D AudioNav test environment (RGB + AudiogoalSensor).
EXP_CONFIG_PATH = "ss_baselines/av_nav/config/audionav/mp3d/env_test_0.yaml"

# Optional override that was tried here: opts=["CONTINUOUS", "True"]
config = get_config(config_paths=EXP_CONFIG_PATH, run_type="eval")

# The config has to be unfrozen before it can be edited, and is re-frozen afterwards.
config.defrost()
task_config = config.TASK_CONFIG
# Request the top-down map as an extra measurement.
# Open question: can audio sensory info fields be added here too?
task_config.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
# Use the evaluation split for the dataset.
task_config.DATASET.SPLIT = config.EVAL.SPLIT
config.freeze()

print(config)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


BASE_TASK_CONFIG_PATH: configs/audionav/av_nav/mp3d/base_env_test.yaml
CHECKPOINT_FOLDER: data/models/output/data
CHECKPOINT_INTERVAL: 50
CMD_TRAILING_OPTS: []
CONTINUOUS: True
DEBUG: True
DISPLAY_RESOLUTION: 128
ENV_NAME: AudioNavRLEnv
EVAL:
  SPLIT: val_telephone
  USE_CKPT_CONFIG: True
EVAL_CKPT_PATH_DIR: data/models/output/data
EXTRA_RGB: False
FOLLOW_SHORTEST_PATH: False
LOG_FILE: data/models/output/train.log
LOG_INTERVAL: 10
NUM_PROCESSES: 1
NUM_UPDATES: 300000
RL:
  DISTANCE_REWARD_SCALE: 1.0
  PPO:
    clip_param: 0.1
    entropy_coef: 0.2
    eps: 1e-05
    gamma: 0.99
    hidden_size: 512
    lr: 0.00025
    max_grad_norm: 0.5
    num_mini_batch: 1
    num_steps: 150
    ppo_epoch: 4
    reward_window_size: 50
    tau: 0.95
    use_gae: True
    use_linear_clip_decay: True
    use_linear_lr_decay: True
    value_loss_coef: 0.5
  SLACK_REWARD: -0.01
  SUCCESS_REWARD: 10.0
  TIME_DIFF: False
  WITH_DISTANCE_REWARD: True
  WITH_TIME_PENALTY: True
SEED: 0
SENSORS: ['RGB_SENSOR']


In [2]:
# Build the dataset described by the task config, then wrap it in the RL environment.
dataset_config = config.TASK_CONFIG.DATASET
dataset = make_dataset(id_dataset=dataset_config.TYPE, config=dataset_config)
env = AudioNavRLEnv(config=config, dataset=dataset)

2022-07-20 15:39:44,331 Initializing dataset AudioNav
2022-07-20 15:39:44,347 initializing sim ContinuousSoundSpacesSim


In [None]:
# Start a new episode and keep the first observation for inspection in the cells below.
observation = env.reset()

In [None]:
# Scratch cell for inspecting the first observation; uncomment a line to evaluate it.
# type(observation) # habitat.core.simulator.Observations
list(observation.keys()) # ['rgb', 'spectrogram'] or ['rgb', 'audiogoal'], depending on TASK.SENSORS
# observation["rgb"].shape # (128, 128, 3)
# observation["spectrogram"].shape # (65, 26, 2), available when TASK.SENSORS = ["SPECTROGRAM_SENSOR"]
# observation["audiogoal"].shape # (2, 16000), available when TASK.SENSORS = ['AUDIOGOAL_SENSOR']
# env.action_space # ActionSpace(MOVE_FORWARD:EmptySpace(), STOP:EmptySpace(), TURN_LEFT:EmptySpace(), TURN_RIGHT:EmptySpace())
# action = env.action_space.sample()

In [None]:
# plt.imshow(observation["rgb"])

In [None]:
# Take a single MOVE_FORWARD step. The dict is unpacked into env.step's
# keyword arguments (action=..., action_args=...).
action_fwd = {"action": "MOVE_FORWARD", "action_args": None}
# step() is unpacked into four values: next observation, reward, done flag and info.
observation, reward, done, info = env.step(**action_fwd)

In [None]:
# list(observation.keys())
# reward
# done
# list(info.keys()) # ['distance_to_goal', 'normalized_distance_to_goal', 'success', 'spl', 'softspl', 'na', 'sna', 'top_down_map']

In [None]:
## Collect one full episode
observation = env.reset()
done = False
ep_length = 0
ep_observations = [observation]

while not done:
    # Sampling "STOP" would end the episode early, so keep re-sampling until
    # a different action comes up. Simple rejection sampling, not very efficient.
    while True:
        action = env.action_space.sample()
        if action["action"] != 'STOP':
            break

    observation, reward, done, info = env.step(**action)
    ep_length += 1
    ep_observations.append(observation)

    # Per-step debug banner, emitted as one write (same text as six separate prints).
    print("\n".join([
        "",
        "###########################################",
        f"# DEBUG: Episode length: {ep_length}",
        f"DEBUG: Done value {done}",
        "###########################################",
        "",
    ]))

In [None]:
# 'info' variable after the end of the episode: do we get any additional information about the full episode? For example, all the RGB frames / audio waves as a nice list?
list(info.keys())

In [None]:
len(ep_observations) # 501 for randomly sampled episode
# Each observation is indexed by sensor name, e.g. ['rgb', 'audiogoal'].
# Split the episode into one list per modality in a single pass.
ep_rgb_observations, ep_audiogoal_observations = [], []
for step_obs in ep_observations:
    ep_rgb_observations.append(step_obs["rgb"])
    ep_audiogoal_observations.append(step_obs["audiogoal"])
# Both are plain Python lists; stacked with np.array they would presumably have
# shapes (501, 128, 128, 3) and (501, 2, 16000) - TODO confirm.

In [None]:
import os
from typing import Dict, List, Optional
import moviepy.editor as mpy
from moviepy.audio.AudioClip import CompositeAudioClip, AudioArrayClip

def images_to_video_with_audio(
    images: List[np.ndarray],
    output_dir: str,
    video_name: str,
    audios: List[np.ndarray],
    sr: int,
    fps: int = 1,
    quality: Optional[float] = 5,
    **kwargs
):
    r"""Write a frame sequence and its per-frame audio chunks to an ``.mp4``.

    The video track is built with moviepy's ``ImageSequenceClip``. Each entry
    of ``audios`` becomes an ``AudioArrayClip`` that starts at the timestamp of
    the frame with the same index. The chunks are then merged with
    ``CompositeAudioClip`` and attached to the video.

    Args:
        images: The list of images. Images should be HxWx3 in RGB order.
        output_dir: The folder to put the video in; created if missing.
        video_name: The name for the video, without extension. Spaces and
            newlines are replaced by underscores and ``.mp4`` is appended.
        audios: One raw waveform array per frame. Each array is transposed and
            truncated to ``int(sr / fps)`` samples, i.e. one frame's worth of
            audio. Assumed to be laid out as (channels, samples), e.g.
            (2, 16000) - TODO confirm against the sensor.
        sr: Audio sampling rate in Hz, used as the ``fps`` of every audio clip.
        fps: Frames per second for the video. Must be positive.
        quality: ``None`` or a value in [0, 10]. It is only validated here and
            is NOT forwarded to moviepy, so it has no effect on the encoded
            file at present.
        **kwargs: Accepted for call compatibility; currently ignored.

    Raises:
        AssertionError: If ``quality`` is neither ``None`` nor within [0, 10].
        ValueError: If ``images`` is empty or ``fps`` is not positive.
    """
    # None is a documented value, so it must not reach the range comparison
    # (comparing None with an int raises TypeError).
    assert quality is None or 0 <= quality <= 10
    # Fail early with a clear message, before touching the filesystem.
    if len(images) == 0:
        raise ValueError("images must contain at least one frame")
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    video_name = video_name.replace(" ", "_").replace("\n", "_") + ".mp4"

    # Loop invariants: the audio length and the duration of a single frame.
    samples_per_frame = int(sr * 1 / fps)
    frame_duration = 1 / fps
    # Scales every chunk to half amplitude (presumably to leave headroom
    # against clipping - confirm before changing).
    multiplier = 0.5

    audio_clips = []
    for i, audio in enumerate(audios):
        # Transpose so that samples run along the first axis, keep one frame's
        # worth of samples and attenuate.
        audio_clip = AudioArrayClip(audio.T[:samples_per_frame] * multiplier, fps=sr)
        # Align the chunk with the frame that shares its index.
        audio_clip = audio_clip.set_start(frame_duration * i)
        audio_clips.append(audio_clip)

    composite_audio_clip = CompositeAudioClip(audio_clips)
    video_clip = mpy.ImageSequenceClip(images, fps=fps)
    video_with_new_audio = video_clip.set_audio(composite_audio_clip)
    video_with_new_audio.write_videofile(os.path.join(output_dir, video_name))

In [None]:
# Peek at the two rates passed to the video export below. Only the last
# expression of a cell is displayed.
config.TASK_CONFIG.SIMULATOR.AUDIO.RIR_SAMPLING_RATE # 16000 for mp3d dataset
config.TASK_CONFIG.SIMULATOR.VIEW_CHANGE_FPS # 10 for mp3d by default ?

In [None]:
# Export the collected episode as an mp4, taking both rates from the simulator config.
sim_config = config.TASK_CONFIG.SIMULATOR
images_to_video_with_audio(
    images=ep_rgb_observations,
    output_dir="/tmp/ss-videos",
    video_name="ss_video_dgb",
    audios=ep_audiogoal_observations,
    sr=sim_config.AUDIO.RIR_SAMPLING_RATE,  # audio sampling rate, 16000 for mp3d dataset
    fps=sim_config.VIEW_CHANGE_FPS,  # video frame rate
)