# Dataset Extraction

This is a notebook for getting started with the data.

The dataset used is **SOLAQUA**, available from [SINTEF Open Data](https://data.sintef.no/feature/fe-a8f86232-5107-495e-a3dd-a86460eebef6).  



## Importing Packages

In [1]:
from collections import defaultdict 
from rosbags.highlevel import AnyReader
from pathlib import Path
import numpy as np
import cv2
import re

## Defining .bag files

All data files should be placed in the `data/bags` folder (relative to the directory the notebook is run from), matching the paths configured in the cell below.

- `*_data.bag` → contains **sensor data** (ROS bag format).
- `*_video.bag` → contains **video images** (ROS bag format).

The dataset used is **SOLAQUA**, available from [SINTEF Open Data](https://data.sintef.no/feature/fe-a8f86232-5107-495e-a3dd-a86460eebef6).  

In [6]:
# Where extracted frames, videos, sonar arrays, etc. are written.
OUT_ROOT = Path("data/exports/vision")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

# Input recordings: edit both paths to switch to another dataset run.
DATA_BAG = Path("data/bags/2024-08-20_14-31-29_data.bag")    # sensor data
VIDEO_BAG = Path("data/bags/2024-08-20_14-31-29_video.bag")  # camera and sonar video

# Echo the absolute locations so a wrong working directory is easy to spot.
print(f"Using data bag : {DATA_BAG.resolve()}")
print(f"Using video bag: {VIDEO_BAG.resolve()}")
print(f"Output folder : {OUT_ROOT.resolve()}")

Using data bag : /cluster/home/henrban/SOLAQUA-UOD/solaqua/data/bags/2024-08-20_14-31-29_data.bag
Using video bag: /cluster/home/henrban/SOLAQUA-UOD/solaqua/data/bags/2024-08-20_14-31-29_video.bag
Output folder : /cluster/home/henrban/SOLAQUA-UOD/solaqua/data/exports/vision


## List Topics

In [3]:
def human_hz(count, duration_s):
    """Return the average message rate in Hz.

    Yields 0.0 when there are no messages or when the duration is zero or
    negative (e.g. a topic with a single message), otherwise count / duration_s.
    """
    no_rate = count == 0 or duration_s <= 0
    return 0.0 if no_rate else count / duration_s

# Print one table per bag: topic, message type, message count, first/last
# timestamp (ns), covered duration and approximate rate.
for bag in [DATA_BAG, VIDEO_BAG]:
    print(f"\n=== {bag.name} ===")
    if not bag.exists():
        print("  (missing)")
        continue

    counts = defaultdict(int)
    first_ts = {}
    last_ts = {}
    types = {}

    with AnyReader([bag]) as r:
        for c in r.connections:
            types[c.topic] = c.msgtype
        # A full pass over the bag is needed to get per-topic first/last stamps.
        for conn, ts, _ in r.messages():
            t = conn.topic
            counts[t] += 1
            first_ts[t] = min(ts, first_ts.get(t, ts))
            last_ts[t] = max(ts, last_ts.get(t, ts))

    if not counts:
        print("  (no messages)")
        continue

    topics = sorted(counts)

    # Column widths are derived from the data so the header lines up with the
    # rows (nanosecond epoch stamps are wider than a fixed 16-character column).
    col_topic = max(len(s) for s in ["TOPIC", *topics])
    col_type = max(len(s) for s in ["TYPE", *(types.get(t, "") for t in topics)])
    col_count = max(len(s) for s in ["COUNT", *(str(counts[t]) for t in topics)])
    col_ts = max(
        len(s)
        for s in ["START(ns)", *(str(first_ts[t]) for t in topics), *(str(last_ts[t]) for t in topics)]
    )

    header = (
        f"{'TOPIC':<{col_topic}}  {'TYPE':<{col_type}}  {'COUNT':>{col_count}}  "
        f"{'START(ns)':>{col_ts}}  {'END(ns)':>{col_ts}}  {'DURATION(s)':>11}  {'~HZ':>7}"
    )
    print(header)
    print("-" * len(header))

    for t in topics:
        n = counts[t]
        t0 = first_ts[t]
        t1 = last_ts[t]
        dur_s = (t1 - t0) / 1e9  # bag timestamps are in nanoseconds
        hz = human_hz(n, dur_s)
        print(
            f"{t:<{col_topic}}  {types.get(t, ''):<{col_type}}  {n:>{col_count}}  "
            f"{t0:>{col_ts}}  {t1:>{col_ts}}  {dur_s:11.3f}  {hz:7.2f}"
        )


=== 2024-08-20_14-31-29_data.bag ===
TOPIC                                     TYPE                                     COUNT    START(ns)          END(ns)            DURATION(s)  ~HZ
--------------------------------------------------------------------------------------------------------------------------------------------------
/bluerov2/alive                           std_msgs/msg/Float32                       129    1724157094493788400  1724157158270660900       63.777   2.02
/bluerov2/armed                           std_msgs/msg/Float32                       131    1724157093740108300  1724157158270716300       64.531   2.03
/bluerov2/battery                         messages/msg/BatteryStatus                  60    1724157095568144200  1724157158350797800       62.783   0.96
/bluerov2/modes                           joystick/msg/ModeManager2                    1    1724157091929707100  1724157091929707100        0.000   0.00
/commanded_thrust                         rospy_tutorial

## Extracting data

### Making frames

In [4]:
from pathlib import Path
import numpy as np
import cv2



# --- Config ---
# Exact topic names to extract, given as a list. Use None (or an empty list) to
# include every image topic. A bare string would be matched by substring, so
# always keep this a list.
TOPIC_INCLUDE = ["/image/compressed_image/data"]  # e.g., ["/image/compressed_image/data", "/ted/image"]

# If you want to exclude certain topics, list them here (checked after include).
TOPIC_EXCLUDE = []    # e.g., ["/image/compressed_image/camera_info"]

# --- Output scaffolding ---
# Fail before creating any output folders if the input bag is not there.
if not VIDEO_BAG.exists():
    raise FileNotFoundError(f"Missing video bag: {VIDEO_BAG}")

# Frames go to <OUT_ROOT>/<run stamp>/raw/<sanitized topic>/.
bag_stem = VIDEO_BAG.stem.replace("_video", "")
RUN_ROOT = OUT_ROOT / bag_stem / "raw"
RUN_ROOT.mkdir(parents=True, exist_ok=True)

print(f"[INFO] Reading {VIDEO_BAG.name}")
print(f"[INFO] Saving frames under: {RUN_ROOT}")

def sanitize_topic(topic: str) -> str:
    """Turn a ROS topic name into a filesystem-safe folder label.

    Leading and trailing slashes are dropped, inner slashes become a double
    underscore, and every character that is not alphanumeric, "_", "-" or "."
    is replaced by "_". An empty result falls back to "topic".
    """
    allowed_punctuation = frozenset("_-.")
    label = topic.strip("/").replace("/", "__")

    cleaned = []
    for char in label:
        keep = char.isalnum() or char in allowed_punctuation
        cleaned.append(char if keep else "_")

    result = "".join(cleaned)
    return result if result else "topic"

def ensure_topic_dirs(topic: str, cache: dict) -> Path:
    """Return the output folder for `topic`, creating it on first use.

    The folder is RUN_ROOT / <sanitized topic>. The result is stored in `cache`
    (keyed by the raw topic name) so the directory is created and announced
    only once per topic.
    """
    if topic in cache:
        return cache[topic]

    topic_dir = RUN_ROOT / sanitize_topic(topic)
    topic_dir.mkdir(parents=True, exist_ok=True)
    cache[topic] = topic_dir
    print(f"[INFO] → Topic '{topic}' → {topic_dir}")
    return topic_dir

def decode_raw_image(msg):
    """Decode a sensor_msgs/msg/Image message into a BGR np.ndarray (uint8).

    Rows are read `msg.step` bytes apart, so any padding at the end of a row is
    dropped before the pixels are interpreted.

    Explicitly handled encodings:
      * bgr8, rgb8, bgra8, rgba8 (alpha is discarded)
      * mono8 and its aliases (expanded to 3 channels)
      * packed YUV 4:2:2 in UYVY order ("yuv422", "uyvy") and in YUYV order
        ("yuyv", "yuyv422", "yuv422_yuy2"), following the ROS encoding names

    Any other encoding goes through a channel-count heuristic based on
    step // width, which can be wrong for Bayer or 16-bit data; in that case the
    array is returned as-is unless it has a single channel.

    Returns:
        The decoded frame, or None when height/width/step are not positive, the
        data buffer holds fewer than height * step bytes, or a row is too short
        for the requested number of channels.
    """
    h, w, step = int(msg.height), int(msg.width), int(msg.step)
    enc = (msg.encoding or "").lower()
    buf = np.frombuffer(msg.data, dtype=np.uint8)

    # Reject degenerate headers and truncated buffers instead of raising from reshape.
    if h <= 0 or w <= 0 or step <= 0 or buf.size < h * step:
        return None
    rows = buf[: h * step].reshape(h, step)

    def pixels(channels):
        """Return the (h, w, channels) pixel view/copy, (h, w) for 1 channel, or None."""
        row_bytes = w * channels
        if row_bytes > step:
            return None
        data = rows[:, :row_bytes]
        return data if channels == 1 else data.reshape(h, w, channels)

    # Encodings that map directly onto an OpenCV conversion: (channels, code).
    # A code of None means the data is already BGR.
    if enc == "bgr8":
        layout = (3, None)
    elif enc == "rgb8":
        layout = (3, cv2.COLOR_RGB2BGR)
    elif enc == "bgra8":
        layout = (4, cv2.COLOR_BGRA2BGR)
    elif enc == "rgba8":
        layout = (4, cv2.COLOR_RGBA2BGR)
    elif enc in ("mono8", "8uc1", "8uc1c1", "mono"):
        layout = (1, cv2.COLOR_GRAY2BGR)
    elif enc in ("yuv422", "uyvy"):
        # ROS "yuv422" is the UYVY byte order; OpenCV needs a 2-channel image.
        layout = (2, cv2.COLOR_YUV2BGR_UYVY)
    elif enc in ("yuyv", "yuyv422", "yuv422_yuy2"):
        layout = (2, cv2.COLOR_YUV2BGR_YUY2)
    else:
        layout = None

    if layout is not None:
        channels, code = layout
        frame = pixels(channels)
        if frame is None or code is None:
            return frame
        return cv2.cvtColor(frame, code)

    # Fallback heuristic (can be wrong for Bayer/16-bit)
    chans = 3 if (step % w != 0) else max(1, step // w)
    raw = pixels(chans)
    if raw is None:
        return None
    if chans == 1:
        return cv2.cvtColor(raw, cv2.COLOR_GRAY2BGR)
    return raw

def save_frame(topic_dir: Path, topic_safe: str, ts_ns: int, frame_bgr: np.ndarray):
    """Write one frame as `<topic_dir>/<ts_ns>.jpg`.

    Args:
        topic_dir: Folder the JPEG is written into.
        topic_safe: Sanitized topic label; currently not used for the file name.
        ts_ns: Message timestamp, used verbatim as the file stem.
        frame_bgr: Image array handed to cv2.imwrite.

    Returns:
        True if cv2.imwrite reported success, False otherwise. A failed write is
        also reported on stdout instead of being silently ignored.
    """
    out = topic_dir / f"{ts_ns}.jpg"
    ok = bool(cv2.imwrite(str(out), frame_bgr))
    if not ok:
        print(f"[WARN] Could not write {out}")
    return ok

# --- Main read loop (per-topic saving) ---
saved_by_topic = {}
skipped_by_topic = {}
topic_dirs_cache = {}

# Only these message types carry decodable images (skips CameraInfo, sonar custom msgs, etc.).
IMAGE_MSGTYPES = ("sensor_msgs/msg/CompressedImage", "sensor_msgs/msg/Image")

# Normalise the filters to sets of exact topic names. A bare string is treated as
# a single topic so that `in` never degrades into a substring test.
include_topics = {TOPIC_INCLUDE} if isinstance(TOPIC_INCLUDE, str) else set(TOPIC_INCLUDE or ())
exclude_topics = {TOPIC_EXCLUDE} if isinstance(TOPIC_EXCLUDE, str) else set(TOPIC_EXCLUDE or ())

from contextlib import ExitStack
with ExitStack() as stack:
    r = stack.enter_context(AnyReader([VIDEO_BAG]))

    # Select the wanted connections up front so the reader only yields image messages.
    image_conns = [
        c for c in r.connections
        if c.msgtype in IMAGE_MSGTYPES
        and (not include_topics or c.topic in include_topics)
        and c.topic not in exclude_topics
    ]

    if not image_conns:
        # An empty connection list would disable filtering in the reader, so do not iterate at all.
        print("[WARN] No image topics matched the include/exclude filters.")
        selected_messages = ()
    else:
        selected_messages = r.messages(connections=image_conns)

    for conn, ts, raw in selected_messages:
        msgtype = conn.msgtype
        topic = conn.topic

        topic_dir = ensure_topic_dirs(topic, topic_dirs_cache)
        topic_safe = topic_dir.name  # the sanitized topic folder name

        # Decode
        msg = r.deserialize(raw, msgtype)
        if msgtype == "sensor_msgs/msg/CompressedImage":
            arr = np.frombuffer(msg.data, np.uint8)
            frame = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        else:  # sensor_msgs/msg/Image
            frame = decode_raw_image(msg)

        # Count bookkeeping
        saved_by_topic.setdefault(topic, 0)
        skipped_by_topic.setdefault(topic, 0)

        if frame is None:
            skipped_by_topic[topic] += 1
            continue

        save_frame(topic_dir, topic_safe, ts, frame)
        saved_by_topic[topic] += 1

        # Per-topic progress
        if saved_by_topic[topic] % 100 == 0:
            print(f"[INFO] [{topic}] Saved {saved_by_topic[topic]} frames …")

# --- Summary ---
# Report, per topic, how many frames were written or skipped and where they are.
print("\n[DONE] Per-topic results:")
total_saved = 0
total_skipped = 0
for t in sorted(saved_by_topic):
    s = saved_by_topic[t]
    k = skipped_by_topic.get(t, 0)
    total_saved += s
    total_skipped += k
    # Frames are written straight into RUN_ROOT/<sanitized topic>; there is no
    # extra "image_frames" sub-folder.
    out_dir = RUN_ROOT / sanitize_topic(t)
    print(f"  - {t} → saved: {s:5d}, skipped: {k:5d}, dir: {out_dir}")

print(f"\n[TOTAL] Saved {total_saved} frames across {len(saved_by_topic)} topics.")
if total_skipped:
    print(f"[WARN] Skipped {total_skipped} frames (decode failures).")


[INFO] Reading 2024-08-20_14-31-29_video.bag
[INFO] Saving frames under: data/exports/vision/2024-08-20_14-31-29/raw
[INFO] → Topic '/image/compressed_image/data' → data/exports/vision/2024-08-20_14-31-29/raw/image__compressed_image__data
[INFO] [/image/compressed_image/data] Saved 100 frames …
[INFO] [/image/compressed_image/data] Saved 200 frames …
[INFO] [/image/compressed_image/data] Saved 300 frames …
[INFO] [/image/compressed_image/data] Saved 400 frames …
[INFO] [/image/compressed_image/data] Saved 500 frames …
[INFO] [/image/compressed_image/data] Saved 600 frames …
[INFO] [/image/compressed_image/data] Saved 700 frames …
[INFO] [/image/compressed_image/data] Saved 800 frames …
[INFO] [/image/compressed_image/data] Saved 900 frames …
[INFO] [/image/compressed_image/data] Saved 1000 frames …
[INFO] [/image/compressed_image/data] Saved 1100 frames …
[INFO] [/image/compressed_image/data] Saved 1200 frames …
[INFO] [/image/compressed_image/data] Saved 1300 frames …
[INFO] [/image/c

### Make MP4 from images

We are using Variable frame rate (VFR) and the ffmpeg library.

Getting the ffmpeg dependency was a bit tricky. I ran `module load Anaconda3/2024.02-1`, then `conda --version` to confirm. Next, `conda create -n myffmpeg -c conda-forge ffmpeg -y` installs ffmpeg; `conda activate myffmpeg` followed by `ffmpeg -version` confirms it works. To make it available in the `.venv`, I ran `source .venv/bin/activate` and then `export PATH="$HOME/.conda/envs/myffmpeg/bin:$PATH"`. Lastly, select the kernel that uses the Python environment (`.venv`).

To activate environments next time do:
- module load Anaconda3/2024.02-1
- source .venv/bin/activate
- export PATH="$HOME/.conda/envs/myffmpeg/bin:$PATH"
- select right kernel

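- If the kernel can't find ffmpeg, run this inside the notebook:
```python
import os
os.environ["PATH"] = os.path.expanduser("~/.conda/envs/myffmpeg/bin") + os.pathsep + os.environ["PATH"]
```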

In [2]:
import os
os.environ["PATH"] = "/cluster/home/henrban/.conda/envs/myffmpeg/bin:" + os.environ["PATH"]
!ffmpeg -version

ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with gcc 14.3.0 (conda-forge gcc 14.3.0-5)
  configuration: --prefix=/cluster/home/henrban/.conda/envs/myffmpeg --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-libvpl --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --enable-

In [5]:
from utils.sonar_visualization import build_vfr_mp4_from_ns_frames


# The recording is selected by VIDEO_BAG, and the export root by OUT_ROOT, both
# set in the configuration cell at the top of the notebook.
bag_stem = VIDEO_BAG.stem.replace("_video", "")
raw_dir = OUT_ROOT / bag_stem / "raw"
vision_frames = raw_dir / "image__compressed_image__data"  # folder holding the extracted <timestamp_ns>.jpg frames
vision_out = raw_dir / f"{bag_stem}_vision.mp4"             # change the output name if you want to

# Fail early with a clear message if the frames have not been extracted yet.
if not vision_frames.is_dir():
    raise FileNotFoundError(
        f"No extracted frames at {vision_frames}; run the frame extraction cell first."
    )

build_vfr_mp4_from_ns_frames(vision_frames, vision_out, crf=18, preset="slow")

Running: ffmpeg -y -f concat -safe 0 -i data/exports/vision/2024-08-20_13-57-42/raw/2024-08-20_13-57-42_vision.list.txt -fps_mode vfr -pix_fmt yuv420p -c:v libx264 -crf 18 -preset slow -movflags +faststart data/exports/vision/2024-08-20_13-57-42/raw/2024-08-20_13-57-42_vision.mp4


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with gcc 14.3.0 (conda-forge gcc 14.3.0-5)
  configuration: --prefix=/cluster/home/henrban/.conda/envs/myffmpeg --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1758923993009/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-libvpl --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --enable-

✅ Wrote data/exports/vision/2024-08-20_13-57-42/raw/2024-08-20_13-57-42_vision.mp4


PosixPath('data/exports/vision/2024-08-20_13-57-42/raw/2024-08-20_13-57-42_vision.mp4')