In [37]:
import torch
print("Built with CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
print("CUDA devices:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))

Built with CUDA: 11.8
CUDA available: True
CUDA devices: 1
Device name: Quadro RTX 8000


In [38]:
import pandas as pd
import json, numpy as np
from sentence_transformers import SentenceTransformer
import numpy as np
import spacy, re
import networkx as nx
from pathlib import Path
import pickle
import cv2
import torch
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from ultralytics import YOLO
from torchvision import transforms as T
from torchvision.transforms import InterpolationMode
import itertools
import os

import spacy, re
nlp = spacy.load("en_core_web_sm")

In [39]:
# get subset uids
with open('/scratch/dk3343/egoschema/subset_answers.json') as f:
    subsest_questions = json.load(f) 
subsest_uids = list(subsest_questions.keys())

# go through currently downloaded videos
directory = '/scratch/dk3343/egoschema/videos'
my_videos = [f.replace('.mp4', '') for f in os.listdir(directory) if f.endswith('.mp4')]

# filter subset videos
filtered_subset_video = [video for video in my_videos if video in subsest_uids]
len(filtered_subset_video)

114

In [40]:
FPS = 30
N_SEC = 180
VIDEO_DIR = "/scratch/dk3343/egoschema/videos/"
BLIP2_PATH = f'/scratch/dk3343/egoschema/captions/egoschema/blip2_fullset.json'
LAVILA_PATH = f'/scratch/dk3343/egoschema/captions/egoschema/lavila_fullset.json'
HCQA_PATH = "/scratch/dk3343/egoschema/hcqa_captions_subset"

# Load spacy

In [41]:
nlp = spacy.load("en_core_web_sm")

# Load Captions

In [42]:
with open(BLIP2_PATH) as f:
    blip2_fullset = json.load(f)                 
    
with open(LAVILA_PATH) as f:
    lavila_fullset = json.load(f)            # {video_uid: [180 captions]}

# Get query func

In [43]:
def get_question_dict(questions, video_uid):
    question_json = [item for item in questions if item.get("q_uid") == video_uid]
    return question_json[0]

# Get captions func for each video

In [44]:
def get_lav_4s_list(video_uid):
    hcqa_captions_path = f'{HCQA_PATH}/{video_uid}.json'
    with open(hcqa_captions_path) as f:
        hcqa_captions = json.load(f)  
    lav_4s_list = [hcqa['Caption'] for hcqa in hcqa_captions]
    return lav_4s_list

In [45]:
def get_lav_1s(video_uid):
    lav_1s = lavila_fullset[video_uid]
    return lav_1s
def get_blip2_1s(video_uid):
    blip2_1s = blip2_fullset[video_uid]
    return blip2_1s

        
    # blip2_df = pd.DataFrame({"sec": range(N_SEC), "blip2": blip2_caption}) # plain list of 180 strings
    # lavila_df = pd.DataFrame({"sec": range(N_SEC), "lavila": lavila_caption})

# Frame filtering

In [46]:
def extract_k_most_relevant_4s_blocks(lav_4s_list, question, K=8):
    # Embed captions and the question
    encoder = SentenceTransformer("all-MiniLM-L6-v2")   # 384-D, fast CPU/GPU
    
    cap_vecs = encoder.encode(lav_4s_list, normalize_embeddings=True)   # [45,384]
    q_vec    = encoder.encode([question], normalize_embeddings=True)[0] # [384]
    
    scores   = cap_vecs @ q_vec            # cosine similarity for each 4-s block
    # K        = 8                           # keep 8 windows  →  8×4 s = 32 s of vision
    top_idxs = np.argsort(scores)[::-1][:K]     # highest-scoring blocks
    
    print("Chosen 4-sec windows:", top_idxs)
    
    # Convert to second indices (or pivot indices)
    kept_secs = []
    for blk in top_idxs:
        start_sec = blk * 4                 # each block = 4 seconds
        kept_secs.extend(range(start_sec, start_sec + 4))
    
    kept_secs = sorted(set(kept_secs))      # 32 second-indices to analyse
    return kept_secs

# Sampling frames

In [47]:
def sample_frames(video_path: str, every_sec: float = 1.0) -> dict[int, Image.Image]:
    """
    Grab one frame every <every_sec> seconds and return
    {original_frame_index: PIL_Image}.
    """
    cap       = cv2.VideoCapture(video_path)
    src_fps   = cap.get(cv2.CAP_PROP_FPS) or 30        # fallback if FPS unknown
    step      = int(src_fps * every_sec)               # frame interval
    frames    = {}                                     # idx → image

    idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if idx % step == 0:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames[idx] = Image.fromarray(rgb)
        idx += 1

    cap.release()
    return frames

# Build base graph with detections

In [48]:
# detections_df columns: pivot_idx · track_id · label · box · depth
# ---------------------------------------------------------------

def build_base_graph(detections_df: pd.DataFrame) -> nx.MultiDiGraph:
    G = nx.MultiDiGraph()

    # 1 -- add one node per tracked object instance
    for _, r in detections_df.iterrows():
        node_id = int(r.track_id)                # global persistent ID
        G.add_node(
            node_id,
            label  = r.label,
            box    = [r.xmin, r.ymin, r.xmax, r.ymax],
            depth  = float(r.mean_depth),
            pivot  = int(r.pivot_idx),
            semantic_only = False
        )

    # 2 -- (optional) add spatial edges here …
    #      e.g., left_of / in_front_of edges you already computed
    #      G.add_edge(node_u, node_v, predicate="left_of", pivot=p)

    return G

In [49]:
def add_spatial_edges(G: nx.MultiDiGraph,
                      detections_df: pd.DataFrame,
                      depth_thresh: float = 0.15) -> None:
    """
    Add spatial relation edges to G.
    • left_of / right_of when objects are on roughly the same depth plane
    • in_front_of / behind      when depth gap is significant
    Edge attrs:  predicate, pivot (=sec), source='spatial'
    """
    for sec, grp in detections_df.groupby("pivot_idx"):
        rows = grp.reset_index(drop=True)           # consecutive integer index
        # iterate over unordered pairs
        for (i, a), (j, b) in itertools.combinations(rows.iterrows(), 2):

            da, db = a.mean_depth, b.mean_depth     # use your depth column name
            if np.isnan(da) or np.isnan(db):
                continue

            # ---------- decide predicate --------------------------------
            rel_diff = abs(da - db) / max(da, db)

            if rel_diff < depth_thresh:             # roughly same depth plane
                # compare x-centers
                xa = (a.xmin + a.xmax) / 2.0
                xb = (b.xmin + b.xmax) / 2.0
                if xa < xb:
                    subj, obj, pred = int(a.track_id), int(b.track_id), "left_of"
                else:
                    subj, obj, pred = int(a.track_id), int(b.track_id), "right_of"
            else:                                   # depth gap
                # larger depth value ⇒ closer to camera (assumption)
                if da > db:
                    subj, obj, pred = int(a.track_id), int(b.track_id), "in_front_of"
                else:
                    subj, obj, pred = int(a.track_id), int(b.track_id), "behind"

            # ---------- ensure nodes exist (they should, but be safe) ---
            for node_id, row in [(subj, a), (obj, b)]:
                if node_id not in G:
                    G.add_node(
                        node_id,
                        label=row.label,
                        box=[row.xmin, row.ymin, row.xmax, row.ymax],
                        depth=float(row.mean_depth),
                        pivot=int(row.pivot_idx),
                        semantic_only=False
                    )

            # ---------- add the spatial edge ----------------------------
            G.add_edge(
                subj, obj,
                predicate=pred,
                pivot=int(sec),
                source="spatial"
            )


# Contruct graph from captions

In [50]:
def construct_action_graphs_lav(kept_secs, lav_1s):
    lav_verb_edges = []                # (sec, subj_txt, verb_lemma, obj_txt)
    
    def loose_svo(doc):
        """Return first (subj, verb, obj) tuple if available"""
        verb = next((t.lemma_ for t in doc if t.pos_ == "VERB"), None)
        nouns= [nc.root.lemma_ for nc in doc.noun_chunks]
        if verb and len(nouns) >= 2:
            return nouns[0], verb, nouns[1]
        return None
    
    for s in kept_secs:
        doc = nlp(re.sub(r"#C ", "", lav_1s[s]))   # strip '#C '
        triple = loose_svo(doc)
        if triple:
            subj, verb, obj = triple
            lav_verb_edges.append((s, subj, verb, obj))
    # print(lav_verb_edges)
    return lav_verb_edges

def construct_noun_nodes(kept_secs, blip2_1s): 
    noun_nodes = {}                               # sec -> set of noun lemmas
    for s in kept_secs:
        nouns = [nc.root.lemma_
                 for nc in nlp(blip2_1s[s]).noun_chunks
                 if nc.root.pos_ == "NOUN"]
        noun_nodes[s] = set(nouns)
    # print(noun_nodes)
    return noun_nodes

# revised to replace "man", "woman", "person" with "c"

In [51]:
GENERIC_VERBS = {"be", "have", "stand", "sit", "hold", "look"}  # expand blacklist

def construct_action_graphs_blip(
        kept_secs: list[int],
        blip2_1s: list[str],
        lav_verb_edges: list[tuple] = None   # (sec, subj, verb, obj) from LaViLLa
) -> list[tuple]:
    """
    Return a list of caption-derived edges from BLIP-1 s captions:
        (sec, subj_txt, verb_lemma, obj_txt)
    If there is no verb or fewer than two nouns → skip.
    If the verb is in the generic blacklist → skip.
    If (s, subj, obj) is in lav_pairs → LaViLLa already explained that relation → skip.
    Otherwise append (s, subj, verb, obj) to the output list ➜ blip2_verb_edges.
    Additionally, normalize subjects 'man', 'woman', or 'person' to 'c'.
    """
    lav_pairs = {(sec, s_txt, o) for sec, s_txt, _, o in (lav_verb_edges or [])}

    verb_edges: list[tuple[int, str, str, str]] = []
    for s in kept_secs:
        doc = nlp(blip2_1s[s])
        # --- extract first S-V-O triple ---
        verb = next((t.lemma_ for t in doc if t.pos_ == "VERB"), None)
        nouns = [nc.root.lemma_ for nc in doc.noun_chunks]
        if not (verb and len(nouns) >= 2):
            continue

        if verb in GENERIC_VERBS:
            continue                     # too common → noise

        subj_txt, obj_txt = nouns[0], nouns[1]
        # normalize generic subjects
        if subj_txt.lower() in ("man", "woman", "person"):
            subj_txt = "c"

        # skip if LaViLLa already has that (sec, subj, obj) pair
        if (s, subj_txt, obj_txt) in lav_pairs:
            continue                     # LaViLLa already gave a verb

        verb_edges.append((s, subj_txt, verb, obj_txt))
    return verb_edges


In [52]:

# def construct_action_graphs_blip(
#         kept_secs: list[int],
#         blip2_1s: list[str],
#         lav_verb_edges: list[tuple] = None   # (sec, subj, verb, obj) from LaViLLa
# ) -> list[tuple]:
#     """
#     Return a list of caption-derived edges from BLIP-1 s captions:
#         (sec, subj_txt, verb_lemma, obj_txt)
#     If there is no verb or fewer than two nouns → skip.
#     If the verb is in the generic blacklist → skip.
#     If (s, subj, obj) is in lav_pairs → LaViLLa already explained that relation → skip.
#     Otherwise append (s, subj, verb, obj) to the output list ➜ blip2_verb_edges.
#     """
#     lav_pairs = {(sec, s, o) for sec, s, _, o in (lav_verb_edges or [])}

#     verb_edges = []
#     for s in kept_secs:
#         doc = nlp(blip2_1s[s])
#         # --- extract first S-V-O triple ---
#         verb  = next((t.lemma_ for t in doc if t.pos_ == "VERB"), None)
#         nouns = [nc.root.lemma_ for nc in doc.noun_chunks]
#         if not (verb and len(nouns) >= 2):
#             continue

#         if verb in GENERIC_VERBS:
#             continue                     # too common → noise

#         subj_txt, obj_txt = nouns[0], nouns[1]
#         # skip if LaViLLa already has that (sec, sid, oid) pair
#         if (s, subj_txt, obj_txt) in lav_pairs:
#             continue                     # LaViLLa already gave a verb

#         verb_edges.append((s, subj_txt, verb, obj_txt))
#     return verb_edges

In [53]:
# def add_caption_edge_to_graph(G: nx.MultiDiGraph,   
#                               sec: int,
#                               subj_txt: str,
#                               verb: str,
#                               obj_txt: str,
#                               # df_dets: pd.DataFrame,
#                               source: str = "lav"):
#     """
#     sec      : second / pivot_idx
#     subj_txt : raw subject string from caption
#     verb     : lemmatized verb   (predicate)
#     obj_txt  : raw object string
#     source   : 'lav' | 'blip'  (stored in edge attr)
#     """
#     # 1. helper: map caption text → track_id (or None)
#     dets = df_dets[df_dets.pivot_idx == sec].to_dict("records")

#     def match_txt(txt):
#         txt_low = txt.lower()
#         # strongest match = highest confidence where txt appears in label
#         best = max(
#             (d for d in dets if txt_low in d["label"].lower()),
#             key=lambda d: d["confidence"],
#             default=None,
#         )
#         return int(best["track_id"]) if best else None

#     sid = match_txt(subj_txt)
#     oid = match_txt(obj_txt)

#     # 2. ensure nodes exist in graph --------------------------------------
#     def ensure_node(node_id, label):
#         if node_id not in G:
#             G.add_node(node_id,
#                        label=label,
#                        pivot=sec,
#                        box=None,
#                        depth=None,
#                        semantic_only=True)

#     if sid is None:
#         sid = f"{sec}:{subj_txt}"
#         ensure_node(sid, subj_txt)
#     if oid is None:
#         oid = f"{sec}:{obj_txt}"
#         ensure_node(oid, obj_txt)

#     # 3. add the caption edge --------------------------------------------
#     G.add_edge(sid, oid,
#                predicate=verb,
#                pivot=sec,
#                source=source)   # keep track of origin

def add_caption_edge_to_graph(
        G: nx.MultiDiGraph,
        sec: int,
        subj_txt: str,
        verb: str,
        obj_txt: str,
        source: str = "lav"         # 'lav' or 'blip'
):
    """
    Insert one caption‑derived S‑V‑O edge.
    If a visual node for the subject / object already exists in `G`
    (same second, matching label), reuse it; otherwise create a
    new semantic‑only node   f"{sec}:{raw_text}".
    """

    # ------------------------------------------------------------------ #
    # 1)  Helper: look up an existing node by label + timestamp
    # ------------------------------------------------------------------ #
    def reuse_node(label_txt: str):
        lbl_low = label_txt.lower()
        for n, data in G.nodes(data=True):
            # We reuse a node if:
            #   • it belongs to the same second (`pivot`)
            #   • label matches (case‑insensitive)
            #   • it is *not* purely semantic_only  ---> optional
            if data.get("pivot") == sec and data.get("label", "").lower() == lbl_low:
                return n
        return None

    sid = reuse_node(subj_txt)
    oid = reuse_node(obj_txt)

    # ------------------------------------------------------------------ #
    # 2)  Ensure both endpoints exist, create if necessary
    # ------------------------------------------------------------------ #
    def ensure_node(node_id: str, raw_label: str, semantic_only=True):
        if node_id not in G:
            G.add_node(
                node_id,
                label=raw_label,
                pivot=sec,
                box=None,
                depth=None,
                semantic_only=semantic_only,
                source=f"{source}_caption"
            )

    if sid is None:
        sid = f"{sec}:{subj_txt}"
        ensure_node(sid, subj_txt, semantic_only=True)

    if oid is None:
        oid = f"{sec}:{obj_txt}"
        ensure_node(oid, obj_txt, semantic_only=True)

    # ------------------------------------------------------------------ #
    # 3)  Add / update the caption edge
    # ------------------------------------------------------------------ #
    G.add_edge(
        sid,
        oid,
        predicate=verb,     # already lemmatised
        pivot=sec,
        source=source       # 'lav' or 'blip'
    )



# SPATIAL_MAP = {
#     "on": "on_top_of", "onto": "on_top_of",
#     "under": "under", "beneath": "under", "below": "under",
#     "in front of": "in_front_of",
#     "behind": "behind",
#     "next to": "next_to", "beside": "next_to",
#     # "with": "with",
# }

SPATIAL_MAP = {
    # existing
    "on":          "on_top_of",
    "onto":        "on_top_of",
    "under":       "under",
    "beneath":     "under",
    "below":       "under",
    "in front of": "in_front_of",
    "behind":      "behind",
    "next to":     "next_to",
    "beside":      "next_to",

    # added language‐level synonyms
    "above":       "on_top_of",
    "over":        "on_top_of",
    "inside":      "inside_of",
    "in":          "inside_of",
    "within":      "inside_of",
    "between":     "between",
    "near":        "near",
    "adjacent to": "next_to",
    "against":     "touching",
    "on top":      "on_top_of",

    # (optionally) more fine‐grained spatial predicates
    "left of":     "to_left_of",
    "right of":    "to_right_of",
    "above":       "above",      # if you’d rather distinguish on_top_of vs. just above
    "below":       "below",
}



def extract_spatial_relations(caption: str):
    """
    Return list of (source, canonical_relation, target) triples.
    """
    triples = []
    doc = nlp(caption)
    for tok in doc:
        # Preposition or prepositional phrase
        if tok.dep_ == "prep":
            # Try to recover multi-token preps like "in front of"
            span_text = tok.text.lower()
            if span_text == "in":
                # Look ahead for "front of"
                nxt = tok.nbor(1) if tok.i + 1 < len(doc) else None
                nxt2 = tok.nbor(2) if tok.i + 2 < len(doc) else None
                if nxt and nxt.text == "front" and nxt2 and nxt2.text == "of":
                    span_text = "in front of"
            # Map only if we know the canonical form
            if span_text not in SPATIAL_MAP:
                continue

            # The governing head is the *source* (usually NOUN/VERB)
            head = tok.head
            if head.pos_ not in {"NOUN", "PROPN"}:
                continue
            source = head.lemma_.lower()

            # Find the object of the preposition (pobj) → target
            pobj = [c for c in tok.children if c.dep_ == "pobj" and c.pos_ == "NOUN"]
            if not pobj:
                continue
            target = pobj[0].lemma_.lower()

            triples.append((source,
                            SPATIAL_MAP[span_text],
                            target))
    return triples



def add_spatial_relations(G, blip2_1s, kept_secs):
    for sec in kept_secs:
        cap = blip2_1s[sec]
        for src, rel, tgt in extract_spatial_relations(cap):

            # ── ensure both nodes carry a "label" ───────────────
            for node_txt in (src, tgt):
                node_id = f"{sec}:{node_txt}"
                # print(node_txt)
                G.add_node(                   # idempotent update
                    node_id,
                    label=node_txt,
                    pivot=sec,
                    semantic_only=True,
                    source="blip_spatial"
                )

            # ── add / update edge ───────────────────────────────
            src_id, tgt_id = f"{sec}:{src}", f"{sec}:{tgt}"
            G.add_edge(
                src_id,
                tgt_id,
                predicate=rel,
                pivot=sec,
                source="blip_spatial"
            )


In [54]:
# def add_blip_noun_nodes(G, detections_df, noun_nodes):
#     for sec, nouns in noun_nodes.items():
#         # detections for that second
#         dets = detections_df[detections_df.pivot_idx == sec]
#         det_labels = {lbl.lower() for lbl in dets.label.unique()}
    
#         for noun in nouns:
#             if noun.lower() in det_labels:
#                 continue                         # YOLO already has it
#             node_id = f"{sec}:{noun}"
#             if node_id not in G:
#                 G.add_node(
#                     node_id,
#                     label=noun,
#                     pivot=sec,
#                     box=None,
#                     depth=None,
#                     semantic_only=True,          # mark as non-visual
#                     source="blip_noun"
#                 )
#                 # optional: connect to 'scene' or 'person' for graph connectivity
#                 # person_id could be the first 'person' track in this second
#                 pers = dets[dets.label=="person"]
#                 if not pers.empty:
#                     person_track = int(pers.iloc[0].track_id)
#                     G.add_edge(person_track, node_id,
#                                predicate="present",
#                                pivot=sec,
#                                source="blip_noun")
def add_blip_noun_nodes(G, noun_nodes):
    """
    Add BLIP noun‑only nodes when no YOLO detections_df is available.

    Parameters
    ----------
    G : nx.Graph
        Scene graph built so far (must contain 'label' + 'pivot' on every node).
    noun_nodes : dict[int, list[str]]
        { second : [noun1, noun2, ...] } extracted from BLIP‑1 s captions.
    """
    for sec, nouns in noun_nodes.items():

        # ── what visual nodes already exist at this second? ──────────────
        visual_labels = {
            data["label"].lower()
            for n, data in G.nodes(data=True)
            if data.get("pivot") == sec and not data.get("semantic_only", True)  # non‑semantic ⇒ visual
        }

        # ── add only the new nouns ───────────────────────────────────────
        for noun in nouns:
            if noun.lower() in visual_labels:
                continue                      # already grounded visually

            node_id = f"{sec}:{noun}"
            if node_id in G:                 # maybe added by another caption pass
                continue

            G.add_node(
                node_id,
                label=noun,
                pivot=sec,
                box=None,
                depth=None,
                semantic_only=True,
                source="blip_noun"
            )

            # optional connectivity: link to a 'person' in the same second
            person_id = next(
                (n for n, d in G.nodes(data=True)
                 if d.get("pivot") == sec and d.get("label") == "person"),
                None
            )
            if person_id is not None:
                G.add_edge(
                    person_id,
                    node_id,
                    predicate="present",
                    pivot=sec,
                    source="blip_noun"
                )


# Laod Midas

In [55]:
# Step 5: Load the lightweight MiDaS_small model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas  = torch.hub.load("intel-isl/MiDaS", "MiDaS_small", trust_repo=True).to(device)
midas.eval()

# Manual “small” transform for MiDaS_small
small_transform = T.Compose([
    T.Resize((256, 256), interpolation=InterpolationMode.BICUBIC),
    T.ToTensor(),  # scales pixel values to [0,1]
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std =[0.229, 0.224, 0.225])
])

# Precompute depth maps for each pivot frame, squeezing out extra dims
def computes_depth_maps(frames_kept):
    depth_maps = {}
    for pivot_idx, frame in frames_kept.items():
        img = frame                 # PIL.Image
        inp = small_transform(img).unsqueeze(0).to(device)  # [1,3,H,W]
    
        with torch.no_grad():
            pred = midas(inp)                # [1, H', W']
            pred = torch.nn.functional.interpolate(
                pred.unsqueeze(1),           # [1,1,H',W']
                size=img.size[::-1],         # → [1,1,H,W]
                mode="bicubic",
                align_corners=False
            )
            pred = pred.squeeze()            # now [H, W]
    
        depth_maps[pivot_idx] = pred.cpu().numpy()  # store a (H,W) mapdf_dets
    return depth_maps
    
def compute_mean_abs_depth(row, depth_maps):
    dm = depth_maps[row.pivot_idx]       # shape (H, W)
    x1, y1, x2, y2 = map(int, (row.xmin, row.ymin, row.xmax, row.ymax))
    crop = dm[y1:y2, x1:x2]
    if crop.size == 0:
        return np.nan
    abs_crop = np.abs(crop)
    valid = np.isfinite(abs_crop)
    return float(abs_crop[valid].mean()) if valid.any() else np.nan

Using cache found in /home/dk3343/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /home/dk3343/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Loading weights:  None


# Load YOLO

In [56]:
yolo = YOLO('yolov8l.pt')

In [57]:
# def detect_objects(frames_kept):
#     detections = {}
#     for pivot_idx, frame in frames_kept.items():
#         # YOLO expects either a numpy array (BGR) or PIL Image
#         results = yolo(frame)  # returns a list of Results objects
    
#         det_list = []
#         for r in results:
#             # r.boxes contains xyxy, cls, conf
#             for box, cls, conf in zip(r.boxes.xyxy, r.boxes.cls, r.boxes.conf):
#                 x1, y1, x2, y2 = box.cpu().numpy().tolist()
#                 det_list.append({
#                     "label":     yolo.names[int(cls)],
#                     "confidence": float(conf.cpu()),
#                     "xmin":       x1, "ymin": y1,
#                     "xmax":       x2, "ymax": y2
#                 })
#         detections[pivot_idx] = det_list
#     return detections

def detect_objects(frames_kept, conf_thresh=0.4):
    """
    frames_kept: dict[sec → PIL.Image]
    conf_thresh: only keep detections with conf ≥ this
    returns dict[sec → list[det dicts]]
    """
    detections = {}
    for pivot_idx, frame in frames_kept.items():
        results = yolo(frame)
        det_list = []
        for r in results:
            for box, cls, conf in zip(r.boxes.xyxy,
                                      r.boxes.cls,
                                      r.boxes.conf):
                score = float(conf.cpu())
                if score < conf_thresh:
                    continue
                x1, y1, x2, y2 = box.cpu().numpy().tolist()
                det_list.append({
                    "label":      yolo.names[int(cls)],
                    "confidence": score,
                    "xmin":       x1, "ymin": y1,
                    "xmax":       x2, "ymax": y2
                })
        detections[pivot_idx] = det_list
    return detections


def build_df_for_detections(detections):
    records = []
    for pivot_idx, det_list in detections.items():
        for det in det_list:
            records.append({
                "pivot_idx":    pivot_idx,
                "label":        det["label"],
                "confidence":   det["confidence"],
                "xmin":         det["xmin"],
                "ymin":         det["ymin"],
                "xmax":         det["xmax"],
                "ymax":         det["ymax"],
            })
    df_dets = pd.DataFrame(records)
    df_dets
    return df_dets

def build_detections_df_from_yolo_midas(frames_kept):
    # frames_kept = {
    #     sec * FPS: frames_all[sec * FPS]      # FPS = native fps (30)
    #     for sec in kept_secs
    #     if sec * FPS in frames_all
    # }

    depth_maps   = computes_depth_maps(frames_kept)          
    detections   = detect_objects(frames_kept)              # list[dict]
    detections_df= build_df_for_detections(detections)      # pivot_idx, xmin…
    
    # ---------- row-wise depth ------------------------------------------------
    def compute_mean_abs_depth(row, depth_maps):
        """Return the mean depth inside the detection’s bbox."""
        dm   = depth_maps[row.pivot_idx]                    # depth map of that frame
        x1,y1,x2,y2 = map(int, [row.xmin, row.ymin, row.xmax, row.ymax])
        return float(dm[y1:y2, x1:x2].mean())
    
    detections_df["mean_depth"] = detections_df.apply(
        lambda r: compute_mean_abs_depth(r, depth_maps), axis=1
    )

    if "track_id" not in detections_df.columns:
        # give each detection a persistent ID = running integer
        detections_df = detections_df.reset_index(drop=True)
        detections_df["track_id"] = detections_df.index        # or any custom IDs   
    return detections_df

# Conver graph to fact list for LLM

In [58]:
# Revised

In [59]:
def graph_to_fact_list(G: nx.MultiDiGraph) -> list[str]:
    """
    Convert the fused scene-graph into chronological fact sentences,
    collapsing runs of the *same* fact over consecutive seconds into a
    single “start–end” window.

    Priority is ignored and there is no limit on the number of facts.

    Returns
    -------
    facts : list[str]  (aggregated over consecutive seconds)
    """
    # 1) collect all (sec, fact_text)
    rows: list[tuple[int, str]] = []
    for u, v, d in G.edges(data=True):
        subj = G.nodes[u]["label"]
        obj  = G.nodes[v]["label"]
        pred = d.get("predicate", "").replace("_", " ")
        sec  = int(d.get("pivot", 0))
        text = f"{subj} {pred} {obj}".strip()
        rows.append((sec, text))

    # 2) sort and remove duplicate facts within the same second
    seen: set[tuple[int, str]] = set()
    unique: list[tuple[int, str]] = []
    for sec, text in sorted(rows, key=lambda t: (t[0], t[1])):
        if (sec, text) not in seen:
            seen.add((sec, text))
            unique.append((sec, text))

    # 3) group timestamps per fact_text
    from collections import defaultdict
    fact_to_secs: dict[str, list[int]] = defaultdict(list)
    for sec, text in unique:
        fact_to_secs[text].append(sec)

    # 4) collapse consecutive runs for each fact
    runs: list[tuple[int, int, str]] = []
    for text, secs in fact_to_secs.items():
        seq = sorted(set(secs))
        start = prev = seq[0]
        for s in seq[1:]:
            if s == prev + 1:
                prev = s
            else:
                runs.append((start, prev, text))
                start = prev = s
        runs.append((start, prev, text))

    # 5) sort runs by start time
    runs.sort(key=lambda x: x[0])

    # 6) format the output strings
    facts: list[str] = []
    for start, end, text in runs:
        time_str = f"{start}s" if start == end else f"{start}-{end}s"
        facts.append(f"{time_str}: {text}")

    return facts



In [60]:
# def graph_to_fact_list(G: nx.MultiDiGraph,
#                        max_facts: int = 10000000) -> list[str]:
#     """
#     Convert the fused scene-graph into ≤ `max_facts` short, chronological
#     fact sentences, but collapse runs of the *same* fact over consecutive
#     seconds into a single “start–end” window.

#     Priority
#     --------
#     1. visual / spatial     (source == "spatial")
#     2. visual  VidVRD       (source == "visual")
#     3. caption  LaViLLa     (source == "lav")
#     4. caption  BLIP        (source == "blip")
#     5. caption  BLIP nouns  (source == "blip_noun")
#     """
#     PRIORITY = {
#         "lav"      : 1,
#         "spatial"  : 3,
#         "visual"   : 5,
#         "blip"     : 2,
#         "blip_noun": 4
#     }

#     # 1) collect all (sec, priority, fact_text)
#     rows: list[tuple[int,int,str]] = []
#     for u, v, d in G.edges(data=True):
#         subj = G.nodes[u]["label"]
#         obj  = G.nodes[v]["label"]
#         pred = d["predicate"].replace("_", " ")
#         sec  = int(d["pivot"])
#         src  = d.get("source", "visual")
#         text = f"{subj} {pred} {obj}"
#         rows.append((sec, PRIORITY.get(src, 5), text))

#     # 2) sort and remove duplicate facts *within* the same second
#     seen = set()
#     unique: list[tuple[int,str]] = []
#     for sec, prio, text in sorted(rows, key=lambda t: (t[0], t[1], t[2])):
#         if (sec, text) not in seen:
#             seen.add((sec, text))
#             unique.append((sec, text))

#     # 3) group runs of the *same* fact over consecutive seconds
#     facts: list[str] = []
#     if unique:
#         curr_text = unique[0][1]
#         start_sec = prev_sec = unique[0][0]

#         for sec, text in unique[1:]:
#             if text == curr_text and sec == prev_sec + 1:
#                 # extend the current run
#                 prev_sec = sec
#             else:
#                 # flush the current run
#                 if start_sec == prev_sec:
#                     time_str = f"{start_sec}s"
#                 else:
#                     time_str = f"{start_sec}-{prev_sec}s"
#                 facts.append(f"{time_str}: {curr_text}")
#                 if len(facts) >= max_facts:
#                     return facts

#                 # start a new run
#                 curr_text = text
#                 start_sec = prev_sec = sec

#         # flush the final run
#         if start_sec == prev_sec:
#             time_str = f"{start_sec}s"
#         else:
#             time_str = f"{start_sec}-{prev_sec}s"
#         facts.append(f"{time_str}: {curr_text}")

#     return facts


In [61]:
# def graph_to_fact_list(G: nx.MultiDiGraph,
#                        max_facts: int = 15) -> list[str]:
#     """
#     Convert the fused scene-graph into ≤ `max_facts` short, chronological
#     fact sentences.

#     Priority
#     --------
#     1. visual / spatial     (source == "spatial")
#     2. visual  VidVRD       (source == "visual")
#     3. caption  LaViLLa     (source == "lav")
#     4. caption  BLIP        (source == "blip")
#     5. caption  BLIP nouns  (source == "blip_noun")

#     Returns
#     -------
#     facts : list[str]  (deduplicated, sorted by pivot then priority)
#     """
#     PRIORITY = {
#         "lav"    : 1,
#         "spatial": 3,
#         "visual" : 5,
#         "blip"   : 2,
#         "blip_noun": 4
#     }

#     # ------ collect candidate sentences ----------------------------------
#     rows = []
#     for u, v, d in G.edges(data=True):
#         # print(G.nodes[u])
#         subj = G.nodes[u]["label"]
#         obj  = G.nodes[v]["label"]
#         pred = d["predicate"].replace("_", " ")
#         sec  = int(d["pivot"])
#         src  = d.get("source", "visual")   # spatial/visual edges may not have 'source'

#         fact = f"{sec}s: {subj} {pred} {obj}"
#         rows.append((sec, PRIORITY.get(src, 5), fact))

#     # ------ sort & deduplicate -------------------------------------------
#     seen, facts = set(), []
#     for _, _, fact in sorted(rows, key=lambda t: (t[0], t[1])):
#         if fact not in seen:
#             seen.add(fact)
#             facts.append(fact)
#         if len(facts) == max_facts:
#             break
#     return facts

# Build prompt for LLM

In [62]:
def build_prompt(q_entry: dict, facts: list[str]) -> str:
    """
    Build a chain-of-thought prompt that asks the model to output
    a single digit 0–4 (matching 'option 0' … 'option 4').

    Parameters
    ----------
    q_entry : dict   # fields: 'question', 'option 0' … 'option 4'
    facts   : list[str]   # sentences from graph_to_fact_list
    """
    # -------- format answer options --------------------------------------
    options_txt = "\n".join(
        f"{i}) {q_entry[f'option {i}']}" for i in range(5)
    )

    # -------- format facts ----------------------------------------------
    facts_txt = "\n".join(f"{k+1}. {f}" for k, f in enumerate(facts))

    # -------- assemble prompt -------------------------------------------
    prompt = f"""You are an expert in egocentric-video reasoning. Given the timeline of actions, focus only on what 'c' personally does. You should finish with "Answer: <0|1|2|3|4>" after analysis.

Video-derived facts:
{facts_txt}

Question:
{q_entry['question']}

Answer choices:
{options_txt}

Mention every thought process while you are answering, once you select the option by reasoning, compare your option with other closely related options from 0|1|2|3|4 , if any other options look close, try to do the reasoning and inference to find which one option would be better. 
After that, finally mention the reasons for which you are disregarding other options and choosing your option from the given 5 options.
Then, give confidence score for your answer between 1 to 5, if you think the confidence is below 5, reanalyze your reasoning with common sense and more attention to "c"'s interaction.

Finish with one line after the whole steps of reasoning:
Answer: <0|1|2|3|4>"""

# """
#     prompt = f"""You are an expert in egocentric-video reasoning. In the video, "c" refers to the camera wearer and "man", "woman", and "person" could refer to the same individual "c". 
#     Given the timeline of actions, focus only on what 'c' personally does. You should finish with "Answer: <0|1|2|3|4>" after analysis.

# Video-derived facts:
# {facts_txt}

# Question:
# {q_entry['question']}

# Answer choices:
# {options_txt}

# Mention every thought process while you are answering, once you select the option by reasoning, compare your option with other closely related options from 0|1|2|3|4 , if any other options look close, try to do the reasoning and inference to find which one option would be better. 
# After that, finally mention the reasons for which you are disregarding other options and choosing your option from the given 5 options.
# Then, give confidence score for your answer between 1 to 5, if you think the confidence is below 4, reanalyze your reasoning with common sense and more attention to "c"'s interaction.

# Finish with one line after the whole steps of reasoning:
# Answer: <0|1|2|3|4>"""
# """

#     prompt = f"""You are an expert in egocentric-video reasoning. In the video, "c" refers to the camera wearer, and "man", "woman", and "person" could refer to the same individual, including "c" in some cases. 
# Your goal is to determine the main task that 'c' personally and repeatedly engaged in, based **only on their direct actions** throughout the video timeline — not the general scene or what others were doing.

# Video-derived facts:
# {facts_txt}

# Question:
# {q_entry['question']}

# Answer choices:
# {options_txt}

# Start by analyzing only what "c" did. Mention every step of your reasoning, identifying patterns or repeated actions done by "c" and what task they most align with. After you choose an answer, compare your choice with the closest other options from 0|1|2|3|4 and explain why your selected answer best fits 'c''s **personal activity focus**. 

# Then, explain why the other options are less suitable given 'c''s specific actions, not the general context. 
# Rate your confidence on a scale from 1 to 5. If your confidence is below 3, reanalyze your reasoning with more attention to repeated patterns in 'c''s interactions.

# End with a single line:
# Answer: <0|1|2|3|4>
# """

    return prompt.strip()


# Get currently available subset videos

In [63]:
def get_avail_subset_vidoes():
    # get subset uids
    with open('/scratch/dk3343/egoschema/subset_answers.json') as f:
        subsest_questions = json.load(f) 
    subsest_uids = list(subsest_questions.keys())
    
    # go through currently downloaded videos
    directory = '/scratch/dk3343/egoschema/videos'
    my_videos = [f.replace('.mp4', '') for f in os.listdir(directory) if f.endswith('.mp4')]
    
    # filter subset videos
    filtered_subset_video = [video for video in my_videos if video in subsest_uids]
    filtered_subset_video
    return filtered_subset_video

In [64]:
def sample_frames_by_second(video_path, every_sec=1):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    duration = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) / fps)
    frames = {}
    for sec in range(0, duration, every_sec):
        cap.set(cv2.CAP_PROP_POS_MSEC, sec*1000)
        ret, frame = cap.read()
        if not ret:
            break
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames[sec] = Image.fromarray(rgb)
    cap.release()
    return frames

# Main Loop

In [65]:
len(filtered_subset_video)

114

In [66]:
result_json_path = "/scratch/dk3343/egoschema/caption_only_trial/aggregate__more_spatial_4o.jsonl"

In [67]:
def append_result(path, record):
    """Appends one JSON object (as one line) to `path`."""
    with open(path, "a") as f:
        f.write(json.dumps(record) + "\n")

In [68]:
import os
os.environ["OPENAI_API_KEY"] = "REPLACE WITH YOURS"

In [69]:
import re
def extract_answer_from_llm(output_text):
    """
    Tries several common patterns to pull out a single-digit answer.
    Supported formats:
      **Answer: 4**
      Answer: <4>
      Answer: <0>
      **Answer:** <1>
    """
    patterns = [
        # **Answer: 4**
        r"\*\*Answer:\s*([0-9]+)\*\*",
        # **Answer:** <1>
        r"\*\*Answer:\*\*\s*<\s*([0-9]+)\s*>",
        # Answer: <4>  or  Answer:<4>
        r"Answer:\s*<\s*([0-9]+)\s*>",
    ]
    for pat in patterns:
        m = re.search(pat, output_text)
        if m:
            return int(m.group(1))
    # fallback: last character if it’s a digit 0–9
    if output_text and output_text.strip()[-1].isdigit():
        return int(output_text.strip()[-1])
    # nothing matched
    print("no answer found!")
    return None



def has_video_uid(path, video_uid):
    """Return True if any record in the JSONL at `path` has video_uid == video_uid."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                rec = json.loads(line)
                if rec.get("video_uid") == video_uid:
                    return True
        return False
    except:
        print("json not created yet")
        return False

In [70]:
import time

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from openai import OpenAI

with open('/scratch/dk3343/egoschema/questions.json') as f:
    questions = json.load(f)   

with open('/scratch/dk3343/egoschema/subset_answers.json') as f:
    subset_answers = json.load(f) 

filtered_subset_video = get_avail_subset_vidoes()


# videos = ['9d578253-ae7e-4445-bbfc-974a5e540857', '7cccb681-3447-410f-8747-6937d146725c', '111b7189-cc45-438c-8144-ded1eed0f6c2', 
#           'b81bb7b2-a16d-4cdc-8326-f4019f3be544', 'bbf66dab-376a-4c11-8528-22ca0c5b01c8', 'f10119f9-631e-47d0-8921-9ce6859a3708',
#           'fe375c7f-6cf0-41aa-8eac-df89817c38e2', '68f6ddfd-bf42-4bee-b1c9-a48db428e586']

accuracy = 0
total = 0
correct = 0

for video_uid in filtered_subset_video:
    if has_video_uid(result_json_path, video_uid):
        print(f"Already processed {video_uid}. Move on!")
        continue
    # 1 ──────────────────────────────────────────────────────────────────────
    # Get Captions
    lav_4s_list = get_lav_4s_list(video_uid)
    lav_1s = get_lav_1s(video_uid)
    blip2_1s = get_blip2_1s(video_uid)
    
    # 2 ──────────────────────────────────────────────────────────────────────
    # Get Question
    question_dict = get_question_dict(questions, video_uid)
    """
    {'q_uid': 'a203b4a9-0639-43c8-b05d-cbcbacb77f48', 'google_drive_id': '1C46w5lDQpPOeSWXbB4wuRiPN_-6TWPtm', 
    question': 'What is the overall goal of the actions performed by "c" in the video?', 
    'option 0': 'Currently, c is diligently cleaning her reliable sewing machine with care.', 
    'option 1': 'Carefully, c is skillfully threading a fine needle with precision.', 
    'option 2': 'Currently, c is carefully cutting fabric for a project.', 
    'option 3': 'C is ironing fabric.', 
    'option 4': 'C is sewing a piece of fabric.'}
    """
    print(video_uid)
    question = question_dict['question']
    ans = subset_answers[question_dict['q_uid']]

    # 3 ──────────────────────────────────────────────────────────────────────
    # Extract relevant frames based on the question and captions
    kept_secs = extract_k_most_relevant_4s_blocks(lav_4s_list, question, K=30)

    # 4 ──────────────────────────────────────────────────────────────────────
    # Sample frames from a video
    video_path=f"{VIDEO_DIR}/{video_uid}.mp4"
    # frames_all = sample_frames(video_path, every_sec=1)  
    frames_all   = sample_frames_by_second(video_path, every_sec=1)
    kept_frames  = {s: frames_all[s] for s in kept_secs}

    # 5 ──────────────────────────────────────────────────────────────────────
    # Construct edges from captions
    # Extract one verb triple per kept second (LaViLLa-1 s)
    # Gives a high-quality action edge (verb) anchored in each second the retrieval module kept.
    lav_verb_edges = construct_action_graphs_lav(kept_secs, lav_1s)
    
    # Harvest salient nouns from BLIP2 - 1s
    # Collects all nouns BLIP “sees” so we can later add semantic-only nodes for objects YOLO might have missed.
    noun_nodes = construct_noun_nodes(kept_secs, blip2_1s)
    
    # Extract one verb triple (BLIP2 - 1s)
    # Acts as a fallback to recover actions LaViLLa omitted while keeping noise low.
    blip2_verb_edges = construct_action_graphs_blip(kept_secs, blip2_1s, lav_verb_edges)

    # # 6 ──────────────────────────────────────────────────────────────────────
    # 1) Build the base graph, then enrich with caption edges
    G = nx.DiGraph()

    # 2) add LaViLLa verb edges
    for sec, subj, verb, obj in lav_verb_edges:
        # add_caption_edge_to_graph(G, sec, subj, verb, obj, detections_df, source="lav")
        add_caption_edge_to_graph(G, sec, subj, verb, obj, source="lav")
   
    # 3) add BLIP verb edges (filtered / fallback)
    for sec, subj, verb, obj in blip2_verb_edges:
        # add_caption_edge_to_graph(G, sec, subj, verb, obj, detections_df, source="blip")
        add_caption_edge_to_graph(G, sec, subj, verb, obj, source="blip")
        
    # Experiment without blip noun
    # # 4) add BLIP noun nodes (semantic_only = True)
    add_blip_noun_nodes(G, noun_nodes)   

    add_spatial_relations(G, blip2_1s, kept_secs)
    
    facts = graph_to_fact_list(G)   # chronological, deduped
    prompt = build_prompt(question_dict, facts)
    print(prompt)

    client = OpenAI()
    try:
        response = client.responses.create(
            model= "gpt-4.1-mini", #"gpt-4.1-nano", #"gpt-4o",
            input= prompt
        )
    except RateLimitError as e:
        # The error message even tells you how long to wait (≈20 s)
        wait = 20
        print(f"Rate limit hit—sleeping {wait}s…")
        time.sleep(wait)
        # then retry once more
        response = client.responses.create(
            model= "gpt-4o", #"gpt-4.1-nano", #"gpt-4o",
            input= prompt
        )
    
    print(response.output_text)
    output = extract_answer_from_llm(response.output_text.strip())
    record = {
        "video_uid": video_uid,
        "prompt":     prompt,
        "response": response.output_text,
        "output":     output,
        "ans":        ans
    }
    append_result(result_json_path, record)
        
    total += 1
    if str(output) == str(ans):
        correct += 1
    accuracy = correct/total*100
    print(f"Ouput: {output}")
    print(f"Answer: {ans}")
    print(f"Accuracy: {accuracy}")

    # assume you still have:
    # • frames: List[PIL.Image] sampled at 1 fps
    # • df:    your detections DataFrame with cols ['pivot_idx','label','xmin','ymin','xmax','ymax']
    # • rel_df: relations DataFrame with cols ['pivot_idx','obj_A','obj_B','relation']
    
    # for pivot, subrels in detections_df.groupby("pivot_idx"):
    #     # 1) grab the frame and its detections
    #     img = kept_frames[pivot]
    #     dets = detections_df[detections_df.pivot_idx == pivot]
    
    #     # 2) plot
    #     fig, ax = plt.subplots(1, figsize=(8,6))
    #     ax.imshow(img)
    #     ax.axis("off")
    
    #     # draw each box + label
    #     for _, det in dets.iterrows():
    #         x1,y1,x2,y2 = det.xmin, det.ymin, det.xmax, det.ymax
    #         rect = patches.Rectangle(
    #             (x1, y1), x2-x1, y2-y1,
    #             linewidth=2, edgecolor="red", facecolor="none"
    #         )
    #         ax.add_patch(rect)
    #         ax.text(x1, y1, det.label, color="yellow", fontsize=12, weight="bold")
    
    #     # # 3) show relations in the title
    #     # rels = ", ".join(subrels.relation.tolist())
    #     # plt.title(f"Pivot {pivot} → {rels}", fontsize=10)
    #     # plt.tight_layout()
    #     # plt.show()


    # # Assuming kept_frames: dict[int, PIL.Image] and all_facts: list[str] are defined
    
    # secs = sorted(kept_frames.keys())
    # # Break into chunks of 4 seconds each
    # chunks = [secs[i:i+4] for i in range(0, len(secs), 4)]
    
    # for chunk in chunks:
    #     n = len(chunk)
    #     fig, axs = plt.subplots(2, n, figsize=(5 * n, 8), gridspec_kw={'height_ratios': [3, 1]})
        
    #     for i, s in enumerate(chunk):
    #         # Top: pivot frame
    #         ax_img = axs[0, i] if n > 1 else axs[0]
    #         ax_img.imshow(kept_frames[s])
    #         ax_img.set_title(f"{s}s", fontsize=14)
    #         ax_img.axis('off')
    
    #         # Bottom: facts for that pivot second
    #         ax_txt = axs[1, i] if n > 1 else axs[1]
    #         facts_for_s = [f for f in facts if f.startswith(f"{s}s")]
    #         ax_txt.axis('off')
    #         ax_txt.text(
    #             0, 1, "\n".join(facts_for_s),
    #             va='top', ha='left', fontsize=12, wrap=True
    #         )
        
    #     plt.tight_layout()
    #     plt.show()

json not created yet
a203b4a9-0639-43c8-b05d-cbcbacb77f48
Chosen 4-sec windows: [13 11  3 16  2 24 15 17 26 43 25 39  9 18 38 40 21 27 10 33 35 31 36 34 30 28 29  1 12 32]
You are an expert in egocentric-video reasoning. Given the timeline of actions, focus only on what 'c' personally does. You should finish with "Answer: <0|1|2|3|4>" after analysis.

Video-derived facts:
1. 4s: c cut thread
2. 4s: c sew dress
3. 4s: dress on top of machine
4. 5s: c put hand
5. 5s: c use machine
6. 5s: person present flower
7. 5s: person present person
8. 6s: c remove thread
9. 6s: c sew machine
10. 7s: c cut thread
11. 7s: c sew dress
12. 7s: dress on top of machine
13. 8s: c make hat
14. 8s: c sew cloth
15. 9s: c adjust thread
16. 9s: c sew blouse
17. 10s: c adjust knob
18. 11s: c sew pair
19. 11s: c tie cloth
20. 12s: c fold cloth
21. 13s: c fold thread
22. 14s: c sew blouse
23. 14s: c fold cloth
24. 14s: blouse on top of machine
25. 15s: c sew dress
26. 15s: dress on top of machine
27. 15s: c pick 