# Dataset Extraction

This is a notebook for getting started with the data.

The dataset used is **SOLAQUA**, available from [SINTEF Open Data](https://data.sintef.no/feature/fe-a8f86232-5107-495e-a3dd-a86460eebef6).  



## Installing Packages

In [1]:
from collections import defaultdict 
from rosbags.highlevel import AnyReader
from pathlib import Path
import numpy as np
import cv2
import re

## Defining .bag files

All data files should be placed in the `../data/SOLAQUA` folder.

- `*_data.bag` → contains **sensor data** (ROS bag format).
- `*_video.bag` → contains **video images** (ROS bag format).

The dataset used is **SOLAQUA**, available from [SINTEF Open Data](https://data.sintef.no/feature/fe-a8f86232-5107-495e-a3dd-a86460eebef6).  

In [6]:
# Change these two lines to switch dataset
DATA_BAG  = Path("data/bags/2024-08-20_14-31-29_data.bag")   # sensor data
VIDEO_BAG = Path("data/bags/2024-08-20_14-31-29_video.bag")  # camera and sonar video

# Output folder for extracted frames, videos, sonar arrays, etc.
OUT_ROOT = Path("data/exports/vision")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

print(f"Using data bag : {DATA_BAG.resolve()}")
print(f"Using video bag: {VIDEO_BAG.resolve()}")
print(f"Output folder : {OUT_ROOT.resolve()}")

Using data bag : /cluster/home/henrban/SOLAQUA-UOD/solaqua/data/bags/2024-08-20_14-31-29_data.bag
Using video bag: /cluster/home/henrban/SOLAQUA-UOD/solaqua/data/bags/2024-08-20_14-31-29_video.bag
Output folder : /cluster/home/henrban/SOLAQUA-UOD/solaqua/data/exports/vision


## List Topics

In [3]:
def human_hz(count, duration_s):
    if count == 0 or duration_s <= 0:
        return 0.0
    return count / duration_s

for bag in [DATA_BAG, VIDEO_BAG]:
    print(f"\n=== {bag.name} ===")
    if not bag.exists():
        print("  (missing)")
        continue

    counts = defaultdict(int)
    first_ts = defaultdict(lambda: None)
    last_ts  = defaultdict(lambda: None)
    types = {}

    with AnyReader([bag]) as r:
        for c in r.connections:
            types[c.topic] = c.msgtype
        for conn, ts, _ in r.messages():
            t = conn.topic
            counts[t] += 1
            if first_ts[t] is None or ts < first_ts[t]:
                first_ts[t] = ts
            if last_ts[t] is None or ts > last_ts[t]:
                last_ts[t] = ts

    if not counts:
        print("  (no messages)")
        continue

    col_topic = max(len(t) for t in counts.keys())
    col_type  = max(len(types.get(t, "")) for t in counts.keys())
    header = f"{'TOPIC'.ljust(col_topic)}  {'TYPE'.ljust(col_type)}  COUNT    START(ns)          END(ns)            DURATION(s)  ~HZ"
    print(header)
    print("-" * len(header))

    for t in sorted(counts.keys()):
        n = counts[t]
        t0 = first_ts[t]
        t1 = last_ts[t]
        dur_s = (t1 - t0) / 1e9 if (t0 is not None and t1 is not None) else 0.0
        hz = human_hz(n, dur_s)
        print(
            f"{t.ljust(col_topic)}  "
            f"{types.get(t,'').ljust(col_type)}  "
            f"{str(n).rjust(5)}    "
            f"{str(t0).rjust(16)}  "
            f"{str(t1).rjust(16)}  "
            f"{dur_s:11.3f}  {hz:5.2f}"
        )


=== 2024-08-20_14-31-29_data.bag ===
TOPIC                                     TYPE                                     COUNT    START(ns)          END(ns)            DURATION(s)  ~HZ
--------------------------------------------------------------------------------------------------------------------------------------------------
/bluerov2/alive                           std_msgs/msg/Float32                       129    1724157094493788400  1724157158270660900       63.777   2.02
/bluerov2/armed                           std_msgs/msg/Float32                       131    1724157093740108300  1724157158270716300       64.531   2.03
/bluerov2/battery                         messages/msg/BatteryStatus                  60    1724157095568144200  1724157158350797800       62.783   0.96
/bluerov2/modes                           joystick/msg/ModeManager2                    1    1724157091929707100  1724157091929707100        0.000   0.00
/commanded_thrust                         rospy_tutorial

## Extracting data

In [4]:
from pathlib import Path
import numpy as np
import cv2



# --- Config ---
# If you want to include only certain topics, put them here; otherwise leave as None to include all image topics.
TOPIC_INCLUDE = "/image/compressed_image/data"  # e.g., ["/image/compressed_image/data", "/ted/image"]

# If you want to exclude certain topics, list them here (checked after include).
TOPIC_EXCLUDE = []    # e.g., ["/image/compressed_image/camera_info"]

# --- Output scaffolding ---
bag_stem = VIDEO_BAG.stem.replace("_video", "")
RUN_ROOT = OUT_ROOT / bag_stem / "raw"
RUN_ROOT.mkdir(parents=True, exist_ok=True)

assert VIDEO_BAG.exists(), f"Missing video bag: {VIDEO_BAG}"

print(f"[INFO] Reading {VIDEO_BAG.name}")
print(f"[INFO] Saving frames under: {RUN_ROOT}")

def sanitize_topic(topic: str) -> str:
    """Make a filesystem-safe topic label (stable and readable)."""
    # Strip leading slash, replace remaining slashes with double underscores
    # Keep alphanum, underscore and dash, map others to underscore.
    base = topic.strip("/")
    safe = base.replace("/", "__")
    safe = "".join(ch if (ch.isalnum() or ch in ("_", "-", ".", "__")) else "_" for ch in safe)
    return safe or "topic"

def ensure_topic_dirs(topic: str, cache: dict) -> Path:
    if topic not in cache:
        safe = sanitize_topic(topic)
        topic_dir = RUN_ROOT / safe  
        topic_dir.mkdir(parents=True, exist_ok=True)
        cache[topic] = topic_dir
        print(f"[INFO] → Topic '{topic}' → {topic_dir}")
    return cache[topic]

def decode_raw_image(msg):
    """Decode sensor_msgs/msg/Image to BGR np.ndarray (uint8)."""
    h, w, step = msg.height, msg.width, msg.step
    enc = (msg.encoding or "").lower()
    buf = np.frombuffer(msg.data, dtype=np.uint8)

    # Common 8-bit encodings
    if enc in ("bgr8",):
        frame = buf.reshape(h, step)[:, :w*3].reshape(h, w, 3)
        return frame
    if enc in ("rgb8",):
        frame = buf.reshape(h, step)[:, :w*3].reshape(h, w, 3)
        return cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    if enc in ("mono8", "8uc1", "8uc1c1", "mono"):
        frame = buf.reshape(h, step)[:, :w]
        return cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
    if enc in ("yuv422", "yuyv", "yuyv422", "yuv422_yuy2"):
        # 2 bytes per pixel
        frame = buf.reshape(h, step)[:, :w*2]
        return cv2.cvtColor(frame, cv2.COLOR_YUV2BGR_YUY2)

    # Fallback heuristic (can be wrong for Bayer/16-bit)
    chans = 3 if (step % w != 0) else max(1, step // w)
    try:
        raw = buf.reshape(h, step)[:, :w*chans].reshape(h, w, chans)
    except Exception:
        return None

    if chans == 1:
        return cv2.cvtColor(raw, cv2.COLOR_GRAY2BGR)
    if enc == "rgb8":  # just in case encoding was weirdly reported
        return cv2.cvtColor(raw, cv2.COLOR_RGB2BGR)
    return raw

def save_frame(topic_dir: Path, topic_safe: str, ts_ns: int, frame_bgr: np.ndarray):
    out = topic_dir / f"{ts_ns}.jpg"
    cv2.imwrite(str(out), frame_bgr)

# --- Main read loop (per-topic saving) ---
saved_by_topic = {}
skipped_by_topic = {}
topic_dirs_cache = {}

from contextlib import ExitStack
with ExitStack() as stack:
    r = stack.enter_context(AnyReader([VIDEO_BAG]))

    for i, (conn, ts, raw) in enumerate(r.messages()):
        msgtype = conn.msgtype
        topic = conn.topic

        # Filter for image-like topics only (skip CameraInfo, sonar custom msgs, etc.)
        if msgtype not in ("sensor_msgs/msg/CompressedImage", "sensor_msgs/msg/Image"):
            continue

        if TOPIC_INCLUDE and topic not in TOPIC_INCLUDE:
            continue
        if TOPIC_EXCLUDE and topic in TOPIC_EXCLUDE:
            continue

        topic_dir = ensure_topic_dirs(topic, topic_dirs_cache)
        topic_safe = topic_dir.parent.name  # the sanitized topic folder name

        # Decode
        if msgtype == "sensor_msgs/msg/CompressedImage":
            msg = r.deserialize(raw, msgtype)
            arr = np.frombuffer(msg.data, np.uint8)
            frame = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        else:  # sensor_msgs/msg/Image
            msg = r.deserialize(raw, msgtype)
            frame = decode_raw_image(msg)

        # Count bookkeeping
        if topic not in saved_by_topic:
            saved_by_topic[topic] = 0
            skipped_by_topic[topic] = 0

        if frame is None:
            skipped_by_topic[topic] += 1
            continue

        save_frame(topic_dir, topic_safe, ts, frame)
        saved_by_topic[topic] += 1

        # Per-topic progress
        if saved_by_topic[topic] % 100 == 0:
            print(f"[INFO] [{topic}] Saved {saved_by_topic[topic]} frames …")

# --- Summary ---
print("\n[DONE] Per-topic results:")
total_saved = 0
total_skipped = 0
for t in sorted(saved_by_topic.keys()):
    s = saved_by_topic[t]
    k = skipped_by_topic.get(t, 0)
    total_saved += s
    total_skipped += k
    safe = sanitize_topic(t)
    out_dir = RUN_ROOT / safe / "image_frames"
    print(f"  - {t} → saved: {s:5d}, skipped: {k:5d}, dir: {out_dir}")

print(f"\n[TOTAL] Saved {total_saved} frames across {len(saved_by_topic)} topics.")
if total_skipped:
    print(f"[WARN] Skipped {total_skipped} frames (decode failures).")


[INFO] Reading 2024-08-20_14-31-29_video.bag
[INFO] Saving frames under: data/exports/vision/2024-08-20_14-31-29/raw
[INFO] → Topic '/image/compressed_image/data' → data/exports/vision/2024-08-20_14-31-29/raw/image__compressed_image__data
[INFO] [/image/compressed_image/data] Saved 100 frames …
[INFO] [/image/compressed_image/data] Saved 200 frames …
[INFO] [/image/compressed_image/data] Saved 300 frames …
[INFO] [/image/compressed_image/data] Saved 400 frames …
[INFO] [/image/compressed_image/data] Saved 500 frames …
[INFO] [/image/compressed_image/data] Saved 600 frames …
[INFO] [/image/compressed_image/data] Saved 700 frames …
[INFO] [/image/compressed_image/data] Saved 800 frames …
[INFO] [/image/compressed_image/data] Saved 900 frames …
[INFO] [/image/compressed_image/data] Saved 1000 frames …
[INFO] [/image/compressed_image/data] Saved 1100 frames …
[INFO] [/image/compressed_image/data] Saved 1200 frames …
[INFO] [/image/compressed_image/data] Saved 1300 frames …
[INFO] [/image/c

### Make MP4 from images

In [5]:
from pathlib import Path
import re
import cv2

# --- Inputs you already have ---
# VIDEO_BAG: Path(...)  -> e.g. ../.../2024-08-20_13-57-42_video.bag
# OUT_ROOT:  Path(...)  -> e.g. /cluster/home/.../data/exports/vision

bag_stem = VIDEO_BAG.stem.replace("_video", "")          # e.g. "2024-08-20_13-55-34"
RUN_ROOT = OUT_ROOT / bag_stem / "raw"                   # e.g. .../exports/vision/<bag_stem>/raw

# Choose topic directory (matches your per-topic folder name under raw/)
TOPIC_SAFE = "image__compressed_image__data"             # adjust if needed

FRAMES_DIR = RUN_ROOT / TOPIC_SAFE                       # <raw>/<topic>/
OUT_MP4    = RUN_ROOT / f"{bag_stem}_{TOPIC_SAFE}.mp4"   # saved in <raw>/

assert FRAMES_DIR.exists(), f"Frame folder not found: {FRAMES_DIR}. Extract frames first."

# Files look like: <timestamp>.jpg  (timestamps are ns)
pat = re.compile(r"^(\d+)\.jpg$")

def ts_from_name(p: Path) -> int:
    m = pat.match(p.name)
    return int(m.group(1)) if m else -1

# Collect & sort frames by timestamp
frames = [p for p in FRAMES_DIR.glob("*.jpg") if pat.match(p.name)]
frames.sort(key=ts_from_name)
assert frames, f"No frames found in {FRAMES_DIR} matching '*.jpg' with numeric names."

# Estimate FPS from timestamps (nanoseconds)
ts_list = [ts_from_name(p) for p in frames]
dur_s = (ts_list[-1] - ts_list[0]) / 1e9 if len(ts_list) > 1 else 0.0
fps_est = (len(ts_list) / dur_s) if dur_s > 0 else 25.0
FPS = round(fps_est, 2)

# Get size from first readable frame
first = None
for fp in frames:
    first = cv2.imread(str(fp))
    if first is not None:
        break
assert first is not None, f"Failed to read any frame in {FRAMES_DIR}"
h, w = first.shape[:2]

fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # try 'avc1' if you have H.264 installed
vw = cv2.VideoWriter(str(OUT_MP4), fourcc, float(FPS), (w, h))
assert vw.isOpened(), "VideoWriter failed to open. Try a different FOURCC (e.g. 'avc1')."

print(f"[INFO] Writing {len(frames)} frames → {OUT_MP4}")
print(f"[INFO] FPS={FPS}  size={w}x{h}")

written = 0
for i, fp in enumerate(frames, 1):
    img = cv2.imread(str(fp))
    if img is None:
        continue
    if img.shape[:2] != (h, w):
        img = cv2.resize(img, (w, h), interpolation=cv2.INTER_AREA)
    vw.write(img)
    written += 1
    if written % 100 == 0:
        print(f"[INFO] Wrote {written}/{len(frames)} frames …")

vw.release()
print(f"[DONE] MP4 saved: {OUT_MP4}  ({written} frames at {FPS} FPS)")


[INFO] Writing 1688 frames → data/exports/vision/2024-08-20_14-31-29/raw/2024-08-20_14-31-29_image__compressed_image__data.mp4
[INFO] FPS=25.02  size=1280x720
[INFO] Wrote 100/1688 frames …
[INFO] Wrote 200/1688 frames …
[INFO] Wrote 300/1688 frames …
[INFO] Wrote 400/1688 frames …
[INFO] Wrote 500/1688 frames …
[INFO] Wrote 600/1688 frames …
[INFO] Wrote 700/1688 frames …
[INFO] Wrote 800/1688 frames …
[INFO] Wrote 900/1688 frames …
[INFO] Wrote 1000/1688 frames …
[INFO] Wrote 1100/1688 frames …
[INFO] Wrote 1200/1688 frames …
[INFO] Wrote 1300/1688 frames …
[INFO] Wrote 1400/1688 frames …
[INFO] Wrote 1500/1688 frames …
[INFO] Wrote 1600/1688 frames …
[DONE] MP4 saved: data/exports/vision/2024-08-20_14-31-29/raw/2024-08-20_14-31-29_image__compressed_image__data.mp4  (1688 frames at 25.02 FPS)
