# Load Training Data

In [None]:
!pip install opencv-python-headless  # lighter, no GUI deps

In [1]:
import pandas as pd
import os
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [31]:
import boto3, json, pandas as pd, itertools
from tqdm.notebook import tqdm            # Jupyter/HTML bar

s3 = boto3.client("s3")

In [3]:
# ------------------------------------------------------------
# helpers
# ------------------------------------------------------------
def body_iter_lines(uri: str):
    bucket, key = uri.replace("s3://", "", 1).split("/", 1)
    return boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].iter_lines()

def row_count(uri: str) -> int:
    "One streaming pass → exact line count."
    return sum(1 for _ in body_iter_lines(uri))

def stream_jsonl(uri: str):
    for raw in body_iter_lines(uri):
        if raw:
            yield json.loads(raw)

In [4]:
import re
import pandas as pd

def _split_bullets(block: str) -> list[str]:
    """
    Turn a multiline block that starts with "- " and uses "\n- "
    into a list of clean strings.
    """
    if not block:
        return []

    # Remove the first leading dash and any leading/trailing whitespace
    block = block.lstrip("-").strip()

    # Split on newline followed by dash (with optional spaces)
    items = re.split(r"\n-\s*", block)

    # Final strip for each item, drop empties
    return [x.strip() for x in items if x.strip()]

def parse_messages_to_df(df: pd.DataFrame, *, col: str = "messages") -> pd.DataFrame:
    """
    Extract SID, study-findings, numeric-measurements (as lists),
    and META tags from the 'messages' column.
    """
    rows_out = []

    for row in df[col]:
        # 1 · pull first user message text ---------------------------------
        user_parts = next((m["content"] for m in row if m.get("role") == "user"), [])
        full_text  = "\n".join(p["text"] for p in user_parts if "text" in p)

        # 2 · regex captures -----------------------------------------------
        sid          = re.search(r"<SID:([^>]+)>", full_text)
        study_find   = re.search(r"Study findings:\n(.*?)\n\nNumeric measurements:", full_text, re.S)
        numeric_meas = re.search(r"Numeric measurements:\n(.*?)\n\n<META_START>",   full_text, re.S)
        meta_tags    = re.search(r"<META_START>\n(.*?)\n<META_END>",                full_text, re.S)

        # 3 · split the two blocks into lists ▼-----------------------------
        study_list   = _split_bullets(study_find.group(1))   if study_find   else []
        numeric_list = _split_bullets(numeric_meas.group(1)) if numeric_meas else []

        rows_out.append({
            "sid"           : sid.group(1).strip()        if sid       else None,
            "study_findings": study_list,
            "numeric_meas"  : numeric_list,
            "meta_tags"     : meta_tags.group(1).strip()  if meta_tags else None,
        })

    return pd.DataFrame(rows_out)


In [5]:
import pandas as pd

def is_tag_only_list(lst):
    """
    True  → every element looks like a bare tag
            e.g.  ["LVH_obs", "AoV_sten_degree_SD_obs"]
    False → at least one element carries a value or a sentence
            e.g.  ["LVH_obs : mild", "The LV chamber size is normal."]
    """
    if not isinstance(lst, list):
        return False                 # guard for NaNs or bad types
    return all((":" not in s) and ("." not in s) for s in lst)

In [6]:
import pandas as pd, orjson, re
sid_pat = re.compile(r"<SID:([^>]+)>")

def parse_row(row):
    # fast JSON load only if still a string
    mi = orjson.loads(row.modelInput)  if isinstance(row.modelInput,  str) else row.modelInput
    mo = orjson.loads(row.modelOutput) if isinstance(row.modelOutput, str) else row.modelOutput

    # ── SID ──────────────────────────────────────────────────────────────
    sid = None
    for m in mi.get("messages", ()):
        for seg in m.get("content", ()):
            if isinstance(seg, dict):
                m0 = sid_pat.search(seg.get("text", ""))
                if m0:
                    sid = m0.group(1)
                    break
        if sid: break

    # ── conversation text ───────────────────────────────────────────────
    try:
        conv = mo["output"]["message"]["content"][0]["text"]
    except Exception:
        conv = None

    return sid, conv

In [7]:
import re, json, orjson
from tqdm.auto import tqdm

COL       = "conversations"

# ────────────────────────────────────────────────────────────────
# 1 · structural typos
# ────────────────────────────────────────────────────────────────
_fix_struct = (
    (re.compile(r'"value="\s*'),                       '"value":"'),
    (re.compile(r'\{"from":"(gpt|human)",\s*""'),      r'{"from":"\1","value":"'),
    (re.compile(r'\{"from":"(gpt|human)"\s*:\s*'),     r'{"from":"\1","value":')
)

def structural(txt: str) -> str:
    for pat, repl in _fix_struct:
        txt = pat.sub(repl, txt)
    return txt


# ────────────────────────────────────────────────────────────────
# 2 · escape control chars & lone back-slashes inside strings
# ────────────────────────────────────────────────────────────────
_str_pat = re.compile(r'"(?:[^"\\]|\\.)*"', re.S)        # every JSON string

def _escape_ctrl(ch: str) -> str:
    """Map control char to JSON escape sequence."""
    if   ch == '\n': return r'\n'
    elif ch == '\r': return r'\r'
    elif ch == '\t': return r'\t'
    else:            return f'\\u{ord(ch):04x}'

_ctrl_pat = re.compile(r'[\x00-\x1F]')                   # 0–31

def escape_in_quotes(txt: str) -> str:
    def patch(m):
        s = m.group(0)
        s = _ctrl_pat.sub(lambda c: _escape_ctrl(c.group(0)), s)
        s = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', s)     # lone "\"
        return s
    return _str_pat.sub(patch, txt)


# ────────────────────────────────────────────────────────────────
# 3 · convert raw → Python list   (None if truly truncated)
# ────────────────────────────────────────────────────────────────
def to_list(raw: str):
    try:                         # fast path
        return orjson.loads(raw)
    except orjson.JSONDecodeError:
        fixed = escape_in_quotes(structural(raw))
        try:
            return json.loads(fixed)  # stdlib tolerates pretty well
        except Exception:
            return None               # still broken

# Inspect Inputs

In [8]:
uris = [
    "s3://echodata25/results/echo-images/nova-pro/gen10/job_00/prompts.jsonl",
    "s3://echodata25/results/echo-images/nova-pro/gen10/job_01/prompts.jsonl",
    "s3://echodata25/results/echo-images/nova-pro/gen10/job_02/prompts.jsonl",
    "s3://echodata25/results/echo-images/nova-pro/gen10/job_03/prompts.jsonl",
    "s3://echodata25/results/echo-images/nova-pro/gen10/job_04/prompts.jsonl",
    "s3://echodata25/results/echo-images/nova-pro/gen10/job_05/prompts.jsonl",
]

 # ------------------------------------------------------------
# main loop
# ------------------------------------------------------------
frames = []
outer = tqdm(uris, desc="all jobs", unit="file")   # overall progress

for uri in outer:
    job   = uri.split("/")[-3]
    total = row_count(uri)

    rows = []
    for rec in tqdm(stream_jsonl(uri),
                    total=total,
                    desc=job,
                    unit="rows",
                    leave=True):      # keep each bar
        rows.append(rec)

    frames.append(pd.DataFrame.from_records(rows))
    outer.update()                    # tick the master bar


# final concatenation
df_inputs = pd.concat(frames, ignore_index=True)

all jobs:   0%|          | 0/6 [00:00<?, ?file/s]

gen10:   0%|          | 0/50000 [00:00<?, ?rows/s]

gen10:   0%|          | 0/50000 [00:00<?, ?rows/s]

gen10:   0%|          | 0/50000 [00:00<?, ?rows/s]

gen10:   0%|          | 0/50000 [00:00<?, ?rows/s]

gen10:   0%|          | 0/50000 [00:00<?, ?rows/s]

gen10:   0%|          | 0/16008 [00:00<?, ?rows/s]

In [9]:
print("Combined shape:", df_inputs.shape)

Combined shape: (266008, 2)


In [10]:
uris = [
 "s3://echodata25/results/echo-images/nova-pro/gen11/job_00/prompts.jsonl"
]

frames = []
outer = tqdm(uris, desc="all jobs", unit="file")   # overall progress

for uri in outer:
    job   = uri.split("/")[-3]
    total = row_count(uri)

    rows = []
    for rec in tqdm(stream_jsonl(uri),
                    total=total,
                    desc=job,
                    unit="rows",
                    leave=True):      # keep each bar
        rows.append(rec)

    frames.append(pd.DataFrame.from_records(rows))
    outer.update()                    # tick the master bar


# final concatenation
gen11_inputs = pd.concat(frames, ignore_index=True)
print("Combined shape:", gen11_inputs.shape)

all jobs:   0%|          | 0/1 [00:00<?, ?file/s]

gen11:   0%|          | 0/49025 [00:00<?, ?rows/s]

Combined shape: (49025, 2)


# Parse Into Findings + Measurements

In [11]:
df_inputs_parsed = parse_messages_to_df(df_inputs, col="messages")

In [12]:
gen11_parsed = parse_messages_to_df(gen11_inputs, col="messages") 

In [13]:
hls_dict = pd.read_csv('hls_dict_v3.csv')

# Drop Empty Findings

In [14]:
def tag_only_str(s: str) -> bool:
    """
    True  -> looks like a bare tag (no colon, no spaces)
    False -> carries a value or is a free-text sentence
    """
    return (":" not in s) and (" " not in s)

def tag_only_row(lst) -> bool:
    return isinstance(lst, list) and all(tag_only_str(x) for x in lst)

def none_provided(lst):
    return (isinstance(lst, list)                 and
            len(lst) == 1                        and
            lst[0].strip().lower() == "none provided.")

print(f"original rows: {len(df_inputs_parsed)}")

tag_only_mask   = df_inputs_parsed["study_findings"].apply(tag_only_row)
none_mask       = df_inputs_parsed["study_findings"].apply(none_provided)

# counts -----------------------------------------------------------------
n_tag_only = tag_only_mask.sum()
n_none     = none_mask.sum()

print(f"tag-only rows  : {n_tag_only:,}")
print(f'“none provided” rows: {n_none:,}')

# separate DataFrames ----------------------------------------------------
tag_only_df = df_inputs_parsed[tag_only_mask]
none_df     = df_inputs_parsed[none_mask]

# drop both categories from the main DataFrame ---------------------------
df_inputs_parsed = df_inputs_parsed[~(tag_only_mask | none_mask)].reset_index(drop=True)
print("remaining rows :", len(df_inputs_parsed))

original rows: 266008
tag-only rows  : 48,708
“none provided” rows: 317
remaining rows : 216983


In [15]:
dropped_sids = set(tag_only_df["sid"]).union(none_df["sid"])
print(f"Total number of dropped SIDs: {len(dropped_sids):,}")
print(f"Dropped SIDs that overlap with gen11_parsed: {(len(dropped_sids)/len(gen11_parsed))*100:.0f}%")

Total number of dropped SIDs: 49,025
Dropped SIDs that overlap with gen11_parsed: 100%


Now, dropped_sidshe JSONL outputs for only those prompt outputs whose SIDs overlap with the remaining `df_inputs_parsed`. The rest will be pulled from `gen11`.

# Outputs

In [16]:
uris = [
    "s3://echodata25/results/echo-images/nova-pro/gen10-outputs/job_00/koh3yhoh38cy/prompts.jsonl.out",
    "s3://echodata25/results/echo-images/nova-pro/gen10-outputs/job_01/zhnxl9kb6alc/prompts.jsonl.out",
    "s3://echodata25/results/echo-images/nova-pro/gen10-outputs/job_02/h74k0w4wqyuy/prompts.jsonl.out",
    "s3://echodata25/results/echo-images/nova-pro/gen10-outputs/job_03/0db4t3ctdamr/prompts.jsonl.out",
    "s3://echodata25/results/echo-images/nova-pro/gen10-outputs/job_04/9xztwg01glwe/prompts.jsonl.out",
    "s3://echodata25/results/echo-images/nova-pro/gen10-outputs/job_05/9ynxch4xlfeq/prompts.jsonl.out",
]

 # ------------------------------------------------------------
# main loop
# ------------------------------------------------------------
frames = []
outer = tqdm(uris, desc="all jobs", unit="file")   # overall progress

for uri in outer:
    job   = uri.split("/")[-3]
    total = row_count(uri)

    rows = []
    for rec in tqdm(stream_jsonl(uri),
                    total=total,
                    desc=job,
                    unit="rows",
                    leave=True):      # keep each bar
        rows.append(rec)

    frames.append(pd.DataFrame.from_records(rows))
    outer.update()                    # tick the master bar


# final concatenation
df_all = pd.concat(frames, ignore_index=True)
print("Combined shape:", df_all.shape)

all jobs:   0%|          | 0/6 [00:00<?, ?file/s]

job_00:   0%|          | 0/50000 [00:00<?, ?rows/s]

job_01:   0%|          | 0/50000 [00:00<?, ?rows/s]

job_02:   0%|          | 0/50000 [00:00<?, ?rows/s]

job_03:   0%|          | 0/50000 [00:00<?, ?rows/s]

job_04:   0%|          | 0/50000 [00:00<?, ?rows/s]

job_05:   0%|          | 0/16008 [00:00<?, ?rows/s]

Combined shape: (266008, 3)


In [17]:
uris = [
 "s3://echodata25/results/echo-images/nova-pro/gen11-outputs/job_00/lhmwnkih6ov0/prompts.jsonl.out"
]

frames = []
outer = tqdm(uris, desc="all jobs", unit="file")   # overall progress

for uri in outer:
    job   = uri.split("/")[-3]
    total = row_count(uri)

    rows = []
    for rec in tqdm(stream_jsonl(uri),
                    total=total,
                    desc=job,
                    unit="rows",
                    leave=True):      # keep each bar
        rows.append(rec)

    frames.append(pd.DataFrame.from_records(rows))
    outer.update()                    # tick the master bar


# final concatenation
gen11_outputs = pd.concat(frames, ignore_index=True)

all jobs:   0%|          | 0/1 [00:00<?, ?file/s]

job_00:   0%|          | 0/49025 [00:00<?, ?rows/s]

In [18]:
print("Combined shape:", gen11_outputs.shape)

Combined shape: (49025, 3)


# Parse + Fix Conversations

In [21]:
# process rows lazily; no extra dataframe copies
ids, convs = zip(*map(parse_row, df_all.itertuples(index=False)))
all_processed = pd.DataFrame({"id": ids, "conversations": convs})

In [22]:
all_processed.shape

(266008, 2)

In [23]:
# ────────────────────────────────────────────────────────────────
# 4 · clean whole column with a progress bar
# ────────────────────────────────────────────────────────────────
all_processed_fixed        = all_processed.copy()                  # keep original safe
good, bad = [], []

for i, txt in tqdm(enumerate(all_processed_fixed[COL]), total=len(all_processed_fixed), unit="conv"):
    parsed = to_list(txt)
    if parsed is None:
        bad.append(i)
    else:
        good.append(parsed)

print(f"{len(bad)} rows truncated ➜ {bad[:10]}")

# drop unrecoverable rows, assign cleaned lists
all_processed_fixed = all_processed_fixed.drop(index=bad).reset_index(drop=True)
all_processed_fixed[COL] = good

  0%|          | 0/266008 [00:00<?, ?conv/s]

1213 rows truncated ➜ [366, 1501, 2620, 3930, 5297, 6029, 6868, 7208, 7825, 8270]


In [24]:
# process rows lazily; no extra dataframe copies
ids, convs = zip(*map(parse_row, gen11_outputs.itertuples(index=False)))
gen11_processed = pd.DataFrame({"id": ids, "conversations": convs})

In [25]:
gen11_processed_fixed        = gen11_processed.copy()                  # keep original safe
good, bad = [], []

for i, txt in tqdm(enumerate(gen11_processed_fixed[COL]), total=len(gen11_processed_fixed), unit="conv"):
    parsed = to_list(txt)
    if parsed is None:
        bad.append(i)
    else:
        good.append(parsed)

print(f"{len(bad)} rows truncated ➜ {bad[:10]}")

# drop unrecoverable rows, assign cleaned lists
gen11_processed_fixed = gen11_processed_fixed.drop(index=bad).reset_index(drop=True)
gen11_processed_fixed[COL] = good

  0%|          | 0/49025 [00:00<?, ?conv/s]

1221 rows truncated ➜ [62, 96, 207, 244, 256, 276, 307, 342, 368, 423]


In [92]:
# # raw_df (or new) → the dataframe that still has the raw strings
# # bad           → list/array of bad-row indices

# def peek_rows(df, indices, n=20, ctx=120):
#     """
#     Print a context slice around the JSON error byte for the first `n` indices.
#     """
#     import orjson
#     for idx in indices[:n]:
#         txt = df.at[idx, "conversations"]
#         try:
#             orjson.loads(txt)            # will raise
#         except orjson.JSONDecodeError as e:
#             pos = e.pos                  # byte offset where parsing broke
#             frag = txt[max(0, pos-ctx): pos+ctx]
#             print(f"\n── row {idx}  (byte {pos}) ──\n{frag}\n")

# peek_rows(raw_df, bad, n=20)

# Drop Invalid Rows

In [26]:
before = len(all_processed_fixed)     
print(f"length of all_processed_fixed before: {len(all_processed_fixed)}")

all_processed_fixed = (all_processed_fixed
            [~all_processed_fixed["id"].astype(str).isin(dropped_sids)]
            .reset_index(drop=True))

after = len(all_processed_fixed)
print(f"rows removed from other_df: {before - after:,}")
print(f"remaining rows            : {after:,}")

length of all_processed_fixed before: 264795
rows removed from other_df: 47,965
remaining rows            : 216,830


We don't want to do the same for `gen11_processed_fixed` because it contains the rows that were reprocessed by the LLM with the fixed inputs.

In [27]:
col = "id"                 

ids_all   = set(all_processed_fixed[col].astype(str))
ids_gen11 = set(gen11_processed_fixed[col].astype(str))

overlap   = ids_all & ids_gen11          # intersection

print(f"IDs in all_processed_fixed : {len(ids_all):,}")
print(f"IDs in gen11_processed_fixed: {len(ids_gen11):,}")
print(f"overlap count             : {len(overlap):,}")

before = len(all_processed_fixed)

all_processed_fixed = (
    all_processed_fixed[~all_processed_fixed[col].astype(str).isin(overlap)]
    .reset_index(drop=True)
)

after = len(all_processed_fixed)
print(f"rows removed from all_processed_fixed: {before - after:,}")
print(f"remaining rows                       : {after:,}")


IDs in all_processed_fixed : 216,830
IDs in gen11_processed_fixed: 47,804
overlap count             : 0
rows removed from all_processed_fixed: 0
remaining rows                       : 216,830


In [28]:
cols_all   = set(all_processed_fixed.columns)
cols_gen11 = set(gen11_processed_fixed.columns)

master_conv = pd.concat(
    [all_processed_fixed, gen11_processed_fixed],
    ignore_index=True,      # new 0-based index
    sort=False              # keep column order, don’t alphabetize
)

print("master_conv shape:", master_conv.shape)

master_conv shape: (264634, 2)


# Build Data Sources

In [29]:
hls_master = pd.read_csv('hls_master_v3.csv') # now we import the master CSV to get the study directories

In [30]:
# build a Series once → O(n) memory-light lookup table
study_dir_map = (
    hls_master
    .set_index("DeidentifiedStudyID")["study_dir"]
    .astype(str)          # make sure keys/vals are strings
)

# add column (vectorised; no join-copy)
master_conv["data_source"] = master_conv["id"].map(study_dir_map) # we set the data_source to the study directory

In [31]:
master_conv.iloc[0]

id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

# (Old) Get S3 Concatenated Video Paths

In [38]:
# prerequisites
# pip install boto3 tqdm orjson

import random, boto3, orjson, re, pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm.auto import tqdm

# ---------- part 2 · pick one shuf*.mp4 per study --------------
BUCKET   = "echodata25"
ROOT     = "results/echo-images/video-concat"          # constant path prefix
MATCHES  = ("shuf1.mp4", "shuf2.mp4")                  # wanted filenames
MAX_WORKERS = 32                                       # tune for your net-bandwidth / vCPU

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")        # reused → keeps HTTP-pool alive

def find_video(study_id: str, source_dir: str | float) -> str | None:
    """Return s3://…/shuf1|2.mp4 (random pick) or None."""
    if pd.isna(source_dir):
        return None

    prefix = f"{ROOT}/{source_dir}/{study_id}/"        # e.g. results/…/echo-study/<ID>/
    try:
        for page in paginator.paginate(Bucket=BUCKET, Prefix=prefix):
            keys = [obj["Key"] for obj in page.get("Contents", ())
                    if obj["Key"].endswith(MATCHES)]
            if keys:                                   # got one or more candidates
                return f"s3://{BUCKET}/{random.choice(keys)}"
    except s3.exceptions.NoSuchBucket:
        pass                                           # bucket typo guard
    return None

# vectorised parallel lookup with progress bar
ids   = out["id"].values
dirs  = out["data_source"].values

with ThreadPoolExecutor(MAX_WORKERS) as ex:
    out["video"] = list(
        tqdm(ex.map(find_video, ids, dirs), total=len(out), unit="file")
    )

# out now has columns: id · conversations · data_source · video

In [89]:
"""
Compute average frame-count, FPS, and duration over the first `N`
videos listed in df["video"] (each value is an s3://…/shuf*.mp4 URI).

Dependencies  (install once):
    pip install boto3 opencv-python-headless tqdm
"""

import os, tempfile, boto3, cv2, pandas as pd, numpy as np
from urllib.parse import urlparse
from tqdm.auto import tqdm

N        = 20                             # ← how many videos to sample
tmp_dir  = tempfile.mkdtemp()
s3       = boto3.client("s3")

frames, fpss, durs = [], [], []

def download(uri: str, dest_dir: str) -> str:
    """Download S3 object to `dest_dir`, return local path."""
    parsed = urlparse(uri)
    bucket, key = parsed.netloc, parsed.path.lstrip("/")
    local = os.path.join(dest_dir, os.path.basename(key))
    if os.path.exists(local):                      # cached
        return local
    s3.download_file(bucket, key, local)
    return local

for uri in tqdm(df["video"].iloc[:N], total=N, desc="videos"):
    path = download(uri, tmp_dir)

    cap = cv2.VideoCapture(path)
    if not cap.isOpened():                         # skip broken files
        continue
    f = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    r = cap.get(cv2.CAP_PROP_FPS) or np.nan
    cap.release()

    frames.append(f)
    fpss.append(r)
    durs.append(f / r if r else np.nan)

# ── report ──────────────────────────────────────────────────────────────
print(f"sampled videos : {len(frames)}")
print(f"avg frames     : {np.nanmean(frames):.1f}")
print(f"avg fps        : {np.nanmean(fpss):.2f}")
print(f"avg duration s : {np.nanmean(durs):.2f}")


videos:   0%|          | 0/20 [00:00<?, ?it/s]

sampled videos : 20
avg frames     : 3506.2
avg fps        : 30.00
avg duration s : 116.87


# Get Valid Study Paths

In [34]:
# ---- Python (works in scripts / Jupyter) ----
import os, psutil, multiprocessing

ram = psutil.virtual_memory()
print(f"RAM total: {ram.total/1e9:.1f} GB   free: {ram.available/1e9:.1f} GB")

cpus_logical  = os.cpu_count()                 #

import os, multiprocessing as mp
n_logical  = os.cpu_count()        # includes hyper-threads
n_physical = mp.cpu_count()        # same on Linux; fallback

print(n_logical, "logical cores")

RAM total: 401.2 GB   free: 360.0 GB
96 logical cores


In [36]:
master_llava = master_conv.copy()

In [37]:
# prerequisites
# pip install boto3 tqdm orjson

import random, boto3, orjson, pandas as pd, re
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from tqdm.auto import tqdm

BUCKET       = "echodata25"
MAX_WORKERS  = 96                                   # adjust to network / vCPU
MIN_CLIPS    = 10                                   # threshold for “good” dir

s3        = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")     # keeps HTTP pool alive

# ----------------------------------------------------------------------
# helper: find the first subdir with ≥10 mp4s and return (s3_dir, count)
# ----------------------------------------------------------------------
def select_dir(study_id: str, data_source: str | float):
    """Return (s3://bucket/.../subdir/, n_clips) or (None, 0)."""
    if pd.isna(data_source):
        return None, 0

    root = f"results/{data_source}/{study_id}/"      # new root rule
    try:
        mp4_counts = defaultdict(int)                # subdir → clip count

        # list every object under the study prefix
        for page in paginator.paginate(Bucket=BUCKET, Prefix=root):
            for obj in page.get("Contents", ()):
                key = obj["Key"]
                if not key.endswith(".mp4"):
                    continue

                # key looks like results/…/study_id/<subdir>/file.mp4
                rel = key[len(root):]
                if "/" not in rel:                   # mp4 directly under study root
                    continue                         # skip (needs subdir)
                subdir = rel.split("/", 1)[0] + "/"  # keep trailing slash
                mp4_counts[subdir] += 1

        # pick the first subdir (sorted) with enough clips
        for subdir in sorted(mp4_counts):
            if mp4_counts[subdir] >= MIN_CLIPS:
                s3_dir = f"s3://{BUCKET}/{root}{subdir}"
                return s3_dir, mp4_counts[subdir]

    except s3.exceptions.NoSuchBucket:
        pass                                         # safety guard

    return None, 0

# ----------------------------------------------------------------------
# vectorised parallel lookup with progress bar
# `master_llava` must have columns: id  ·  data_source
# ----------------------------------------------------------------------
ids   = master_llava["id"].values
srcs  = master_llava["data_source"].values

with ThreadPoolExecutor(MAX_WORKERS) as ex:
    results = list(tqdm(ex.map(select_dir, ids, srcs),
                        total=len(master_llava), unit="study"))

# unpack tuples into two new columns
master_llava["video"]       = [r[0] for r in results]     # s3://…/subdir/
master_llava["n_clips_dir"] = [r[1] for r in results]     # integer count

# ----------------------------------------------------------------------
# stats
# ----------------------------------------------------------------------
has_video_mask = master_llava["video"].notna()

clip_counts = master_llava.loc[has_video_mask, "n_clips_dir"]

print("\n--- directory statistics ---")
if not clip_counts.empty:
    print(f" avg clips : {clip_counts.mean():.1f}")
    print(f" min clips : {clip_counts.min()}")
    print(f" max clips : {clip_counts.max()}")
else:
    print("No directories selected.")

print(f"studies with eligible dir : {has_video_mask.sum():,}")
print(f"studies without dir       : {len(master_llava) - has_video_mask.sum():,}")


  0%|          | 0/264634 [00:00<?, ?study/s]


--- directory statistics ---
 avg clips : 58.2
 min clips : 10
 max clips : 281
studies with eligible dir : 263,964
studies without dir       : 670


In [39]:
master_llava.to_csv('master_llava.csv')

In [90]:
len(df)

264795

# Combine w/ Salient Vids

In [75]:
import pandas as pd
import os
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [76]:
master_llava = pd.read_csv('master_llava.csv')

In [77]:
print(master_llava.shape)
print(master_llava.iloc[0])

(264634, 6)
Unnamed: 0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [78]:
es0_vids = pd.read_csv('es0_salient_vids.csv')
es1_vids = pd.read_csv('es1_salient_vids.csv')
es2_vids = pd.read_csv('es2_salient_vids.csv')

In [79]:
# import ast

# count = (
#     es1_vids['salient_videos']
#         .map(lambda v: len(ast.literal_eval(v)) if isinstance(v, str) else len(v))
#         .lt(10)
#         .sum()
# )
# print(count)

In [80]:
print(es0_vids.shape)
print(es1_vids.shape)
print(es2_vids.shape)
# print(es1_vids.iloc[0:3])

(213773, 5)
(25944, 5)
(79526, 5)


In [82]:
import pandas as pd

all_vids_list = [es0_vids, es1_vids, es2_vids]        # replace with your three dataframes
all_vids = (
    pd.concat(all_vids_list, ignore_index=True)
      .drop(columns='Unnamed: 0', errors='ignore')         # remove index column
      .drop_duplicates(subset='DeidentifiedStudyID')       # keep first occurrence
)

In [83]:
all_vids.shape

(319243, 4)

In [84]:
master_llava.columns

Index(['Unnamed: 0', 'id', 'conversations', 'data_source', 'video',
       'n_clips_dir'],
      dtype='object')

In [85]:
master_llava.iloc[0]

Unnamed: 0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [86]:
# 2. (optional) ensure master_llava has unique IDs
master_llava = master_llava.drop_duplicates('id')
print(master_llava.shape)

(264634, 6)


In [87]:
master_llava_vids = (
    all_vids
        .merge(master_llava[['id', 'conversations']], left_on='DeidentifiedStudyID', right_on='id', how='inner')
        .drop(columns='DeidentifiedStudyID')        # keep 'id', discard old column
)

In [88]:
master_llava_vids.to_csv('master_llava_vids.csv')

# Tags

In [89]:
# master_llava_vids = pd.read_csv('master_llava_vids.csv')

In [90]:
from pathlib import Path
from sagemaker.pytorch import PyTorch
import sagemaker, boto3

role = sagemaker.get_execution_role()

# 👇  we’re already inside the BIMBA folder, so just use cwd()
SRC_DIR = Path.cwd()                         # /home/.../user-default-efs/BIMBA
REQ_FILE = SRC_DIR / "BIMBA-LLaVA-NeXT" / "requirements.txt"
IMAGE_URI = "495467399120.dkr.ecr.us-west-2.amazonaws.com/bimba-train:latest"

estimator = PyTorch(
    image_uri        = IMAGE_URI,
    entry_point      = "train_entrypoint.py",   # file is right here
    source_dir       = str(SRC_DIR),            # <-- fixed path
    dependencies     = [str(REQ_FILE)],
    role             = role,
    instance_type    = "ml.p4d.24xlarge", # ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p5.48xlarge
    instance_count   = 1,
    framework_version = "2.1",
    py_version       = "py310",
    base_job_name    = "bimba-train",
    disable_profiler = True,
    environment={"ECHO_DEBUG": "1"}, 
)

estimator.fit(
    inputs = {
        "training": "s3://echodata25/BIMBA-LLaVA-NeXT/mlv_1.jsonl"
    },
    wait   = True,
)

In [91]:
# ── 1. measurement-tag dictionary ────────────────────────────────────────────
meas_tag_map = {
    "<N0>":  "<aortic_valve_peak_velocity>",
    "<N1>":  "<aortic_valve_mean_gradient>",
    "<N2>":  "<aortic_valve_effective_orifice_area>",
    "<N3>":  "<aortic_valve_dimensionless_index>",
    "<N4>":  "<left_ventricular_ejection_fraction>",
    "<N5>":  "<left_ventricular_end_diastolic_volume_index>",
    "<N6>":  "<left_ventricular_end_systolic_volume_index>",
    "<N7>":  "<left_ventricular_mass_index>",
    "<N8>":  "<left_ventricular_dpdt>",
    "<N9>":  "<left_ventricular_outflow_tract_velocity_time_integral>",
    "<N10>": "<left_ventricular_outflow_tract_peak_gradient>",
    "<N11>": "<mitral_inflow_deceleration_time>",
    "<N12>": "<mitral_e_a_ratio>",
    "<N13>": "<mitral_e_e_prime_ratio>",
    "<N14>": "<right_ventricular_systolic_pressure>",
    "<N15>": "<right_ventricular_fractional_area_change>",
    "<N16>": "<tricuspid_annular_plane_systolic_excursion>",
    "<N17>": "<left_atrial_volume_index>",
    "<N18>": "<aortic_root_diameter>",
    "<N19>": "<ascending_aorta_diameter>",
}

# ── 2. replacement helper (re-usable with the condition tags) ───────────────
import re, ast

def to_py(x):
    if isinstance(x, list): return x
    if isinstance(x, str) and x.lstrip().startswith("["):
        try: return ast.literal_eval(x)
        except Exception: pass
    return x                       # NaN / other types

def make_replacer(tag_map):
    pattern = re.compile("|".join(map(re.escape, tag_map)))
    return lambda s: pattern.sub(lambda m: tag_map[m.group()], s)

meas_repl = make_replacer(meas_tag_map)

def expand_meas_tags(conv_list):
    if not isinstance(conv_list, list):        # NaN etc.
        return conv_list
    out = []
    for msg in conv_list:
        if isinstance(msg, dict) and "value" in msg and isinstance(msg["value"], str):
            msg = msg.copy()
            msg["value"] = meas_repl(msg["value"])
        out.append(msg)
    return out

# ── 3. apply to the dataframe column 'conversations' ────────────────────────
master_llava_vids["conversations"] = (
    master_llava_vids["conversations"]
        .map(to_py)
        .map(expand_meas_tags)
)


In [92]:
master_llava_vids.iloc[0]

data_source                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [93]:
import re
from collections import Counter
import pandas as pd

# ── 1. helper: pull every <…> token from a string ───────────────────────────
_tag_re = re.compile(r"<[^>]+>")          # matches <anything_inside_angle_brackets>

def tags_in_text(t: str):
    return _tag_re.findall(t)             # returns list of tags (may be empty)


# ── 2. walk through the dataframe, accumulate counts ────────────────────────
counter = Counter()

for conv in master_llava_vids["conversations"]:
    if not isinstance(conv, list):
        continue                          # skip NaN / malformed rows
    for msg in conv:
        if isinstance(msg, dict) and "value" in msg and isinstance(msg["value"], str):
            counter.update(tags_in_text(msg["value"]))

# ── 3. convert to a tidy pandas Series (optional) ───────────────────────────
tag_counts = pd.Series(counter).sort_values(ascending=False)

# tag_counts now holds a per-tag frequency table, e.g.:
# <left_ventricular_ejection_fraction>      812
# <tricuspid_regurgitation_mild>            647
# …


In [109]:
master_llava_vids.shape

(264186, 5)

In [110]:
import ast

# ensure every cell is a list object
master_llava_vids['salient_videos'] = master_llava_vids['salient_videos'].apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(x)
)

In [111]:
# keep only rows whose list is non-empty
master_llava_vids = (
    master_llava_vids[master_llava_vids['salient_videos'].apply(len) >= 1]
      .reset_index(drop=True)
)

In [112]:
master_llava_vids.to_csv('master_llava_vids_retagged.csv')

In [117]:
master_llava_vids.shape

(252030, 5)

In [116]:
# rows where the list has fewer than 10 elements
n_short = (master_llava_vids['salient_videos'].apply(len) < 33).sum()
print(n_short)

252030


# Drop Rows Not Yet Processed

1. ~Also drop all of Syngo (54K rows) until it is reprocessed (findings column)~
2. ~Change all C63 to C61 (RA dilation)~

In [2]:
import pandas as pd
import os
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
import boto3, json, pandas as pd, itertools
from tqdm.notebook import tqdm            # Jupyter/HTML bar

s3 = boto3.client("s3")

In [3]:
master_llava_vids = pd.read_csv('master_llava_vids_retagged.csv')

In [35]:
# quick sanity-check
df_ok = master_llava_vids
try:
    df_ok["conversations"] = df_ok["conversations"].apply(to_list)
except Exception as e:
    print("❌  at least one bad row in 'conversations':", e)
    raise

In [36]:
import json, ast, pandas as pd

def to_list(x):
    """Make sure each cell is a list of dicts."""
    if isinstance(x, list):
        return x                    # already good
    if isinstance(x, str):
        # try proper JSON first (double-quoted)
        try:
            return json.loads(x)
        except json.JSONDecodeError:
            # fall back to Python repr (single-quoted)
            return ast.literal_eval(x)
    raise ValueError("unexpected type in conversations column")

df_ok["conversations"] = df_ok["conversations"].apply(to_list)

In [37]:
print("rows in df_ok:", len(df_ok))
# print(df_ok.head(3))
assert len(df_ok) > 0, "⚠️  DataFrame is empty!"

rows in df_ok: 252030


In [38]:
def to_py(obj):
    """Convert JSON / repr string to real Python object."""
    if isinstance(obj, list):
        return obj
    if isinstance(obj, str):
        try:
            return json.loads(obj)          # valid JSON list?
        except json.JSONDecodeError:
            return ast.literal_eval(obj)    # Python-repr list
    return obj                              # fall through

df_ok["conversations"]   = df_ok["conversations"].apply(to_py)
df_ok["salient_videos"]  = df_ok["salient_videos"].apply(to_py)
df_ok["salient_views"]   = df_ok["salient_views"].apply(to_py)

In [39]:

OUT = "BIMBA-LLaVA-NeXT/mlv_1.jsonl"

with open(OUT, "w", encoding="utf-8") as f:
    for _, row in tqdm(df_ok.iterrows(), total=len(df_ok),
                       desc="writing", unit="rows"):
        rec = {
            "id":          str(row["id"]),
            "salient_videos":       row["salient_videos"],
            "salient_views":       row["salient_views"],
            "conversations": row["conversations"],   # already a list of dicts
        }

        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"✅ wrote {OUT}")

writing:   0%|          | 0/252030 [00:00<?, ?rows/s]

✅ wrote BIMBA-LLaVA-NeXT/mlv_1.jsonl


In [28]:
# after the cleaning loop, before the S3 upload
num_lines = sum(1 for _ in open(OUT))
print("👀 final jsonl rows:", num_lines)
# print("first two rows:\n", open(OUT).read().splitlines()[:2])

👀 final jsonl rows: 252030


In [14]:
# import boto3
# s3 = boto3.client("s3")

# local_file  = OUT
# bucket      = "echodata25"
# s3_key      = "data/mlv_1.jsonl"

# s3.upload_file(local_file, bucket, s3_key)
# print(f"✔️ uploaded to s3://{bucket}/{s3_key}")


In [27]:
# row

# Training

In [29]:
import json, collections, pathlib, gzip, boto3, shutil, os
from tqdm.auto import tqdm

# ─── paths ───────────────────────────────────────────────────────────
SRC = pathlib.Path("BIMBA-LLaVA-NeXT/mlv_1.jsonl")
DST = SRC.with_stem(SRC.stem + "_clean")     # mlv_1_clean.jsonl
S3_BUCKET  = "echodata25"
S3_KEY     = "BIMBA-LLaVA-NeXT/mlv_1.jsonl"

problems = collections.Counter()
kept = dropped = 0

# helper for .gz / plain text
def _opener(p: pathlib.Path, mode: str):
    if p.suffix == ".gz":
        return gzip.open(p, mode + "t", encoding="utf-8")
    return p.open(mode, encoding="utf-8")

# ─── filter loop ─────────────────────────────────────────────────────
with _opener(SRC, "r") as fin, DST.open("w", encoding="utf-8") as fout, \
     tqdm(desc="filtering", unit="line") as bar:
    for line in fin:
        bar.update()
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            problems["invalid json"] += 1
            dropped += 1
            continue

        for t in row.get("conversations", []):
            if "value" not in t:
                problems["missing value"] += 1
                dropped += 1
                break
            if not isinstance(t["value"], str):
                problems["not a string"] += 1
                dropped += 1
                break
        else:                       # ← good row
            fout.write(line)
            kept += 1

print("summary:", problems or "all good")
print(f"kept {kept:,}  |  dropped {dropped:,}")
print(f"local file saved → {DST}")

filtering: 0line [00:00, ?line/s]

summary: Counter({'missing value': 4})
kept 252,026  |  dropped 4
local file saved → BIMBA-LLaVA-NeXT/mlv_1_clean.jsonl


In [34]:
# after the cleaning loop, before the S3 upload
num_lines = sum(1 for _ in open(DST))
print("👀 final jsonl rows:", num_lines)
print("first two rows:\n", open(DST).read().splitlines()[:2])


👀 final jsonl rows: 252026
first two rows:
 ['{"id": "1.2.276.0.7230010.3.1.2.1714512485.1.1703120985.10480112", "salient_videos": "[\'s3://echodata25/results/echo-study/1.2.276.0.7230010.3.1.2.1714512485.1.1703120985.10480112/1.2.276.0.7230010.3.1.3.1714512485.1.1703120985.10480113/1.2.276.0.7230010.3.1.4.811753780.1.1703121096.8327763.mp4\', \'s3://echodata25/results/echo-study/1.2.276.0.7230010.3.1.2.1714512485.1.1703120985.10480112/1.2.276.0.7230010.3.1.3.1714512485.1.1703120985.10480113/1.2.276.0.7230010.3.1.4.845494328.1.1703121183.13461417.mp4\', \'s3://echodata25/results/echo-study/1.2.276.0.7230010.3.1.2.1714512485.1.1703120985.10480112/1.2.276.0.7230010.3.1.3.1714512485.1.1703120985.10480113/1.2.276.0.7230010.3.1.4.1714578744.1.1703121416.8082906.mp4\', \'s3://echodata25/results/echo-study/1.2.276.0.7230010.3.1.2.1714512485.1.1703120985.10480112/1.2.276.0.7230010.3.1.3.1714512485.1.1703120985.10480113/1.2.276.0.7230010.3.1.4.1714512485.1.1703121105.10482320.mp4\', \'s3://echo

In [32]:
# ─── upload the cleaned file as-is ───────────────────────────────────
print(f"uploading to s3://{S3_BUCKET}/{S3_KEY} …")
boto3.client("s3").upload_file(str(DST), S3_BUCKET, S3_KEY)
print("✓ upload complete")

uploading to s3://echodata25/BIMBA-LLaVA-NeXT/mlv_1.jsonl …
✓ upload complete


In [None]:
from pathlib import Path
from sagemaker.pytorch import PyTorch
import sagemaker, boto3

role = sagemaker.get_execution_role()

# 👇  we’re already inside the BIMBA folder, so just use cwd()
SRC_DIR = Path.cwd()                         # /home/.../user-default-efs/BIMBA
REQ_FILE = SRC_DIR / "BIMBA-LLaVA-NeXT" / "requirements.txt"
IMAGE_URI = "495467399120.dkr.ecr.us-west-2.amazonaws.com/bimba-train:latest"

estimator = PyTorch(
    image_uri        = IMAGE_URI,
    entry_point      = "train_entrypoint.py",   # file is right here
    source_dir       = str(SRC_DIR),            # <-- fixed path
    dependencies     = [str(REQ_FILE)],
    role             = role,
    instance_type    = "ml.p4d.24xlarge", # ml.g5.2xlarge, ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p5.48xlarge
    instance_count   = 1,
    framework_version = "2.1",
    py_version       = "py310",
    base_job_name    = "bimba-train",
    disable_profiler = True,
)

estimator.fit(
    inputs = {
        "training": "s3://echodata25/BIMBA-LLaVA-NeXT/mlv_1.jsonl"
    },
    wait   = True,
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: bimba-train-2025-05-15-17-56-40-383


2025-05-15 18:04:57 Starting - Starting the training job
2025-05-15 18:04:57 Pending - Training job waiting for capacity........

# Local

In [42]:
from pathlib import Path
from sagemaker.pytorch import PyTorch
import sagemaker, boto3

role = sagemaker.get_execution_role()

# 👇  we’re already inside the BIMBA folder, so just use cwd()
SRC_DIR = Path.cwd()                         # /home/.../user-default-efs/BIMBA
REQ_FILE = SRC_DIR / "BIMBA-LLaVA-NeXT" / "requirements.txt"
IMAGE_URI = "495467399120.dkr.ecr.us-west-2.amazonaws.com/bimba-train:latest"

estimator = PyTorch(
    image_uri        = IMAGE_URI,
    entry_point      = "train_entrypoint.py",   # file is right here
    source_dir       = str(SRC_DIR),            # <-- fixed path
    dependencies     = [str(REQ_FILE)],
    role             = role,
    instance_type    = "local", # ml.g5.2xlarge, ml.p4d.24xlarge, ml.p4de.24xlarge, ml.p5.48xlarge
    instance_count   = 1,
    framework_version = "2.1",
    py_version       = "py310",
    base_job_name    = "bimba-train",
    disable_profiler = True,
)

estimator.fit(
    inputs = {
        "training": "s3://echodata25/BIMBA-LLaVA-NeXT/mlv_1.jsonl"
    },
    wait   = True,
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
