In [22]:
import os
import glob
import pandas as pd

def iter_annotation_files(annot_root: str):
    """
    Walk an annotation tree and yield one (lab_id, video_id, path) tuple
    per parquet file.

    Expected layout:
      annot_root/<lab_id>/<video_id>.parquet

    lab_id is the name of the sub-directory and video_id is the file name
    with its extension stripped. Labs, and the files inside each lab, are
    visited in sorted path order; entries directly under annot_root that
    are not directories are skipped.
    """
    # Lazily keep only the directory entries found directly under annot_root.
    lab_dirs = (
        entry
        for entry in sorted(glob.glob(os.path.join(annot_root, "*")))
        if os.path.isdir(entry)
    )
    for lab_dir in lab_dirs:
        lab_id = os.path.basename(lab_dir)
        parquet_paths = sorted(glob.glob(os.path.join(lab_dir, "*.parquet")))
        for path in parquet_paths:
            stem, _ext = os.path.splitext(os.path.basename(path))
            yield lab_id, stem, path

def collect_action_segments_from_anno(
    annot_root: str,
    target_lab: str | None = None,
) -> pd.DataFrame:
    """
    Collect behaviour segments from the parquet annotation files.

    Each file is expected to carry the columns
      agent_id, target_id, action, start_frame, stop_frame
    (target_id is optional and is filled with None when absent). Files
    missing any other required column are reported and skipped.

    Parameters:
      annot_root: directory laid out as annot_root/<lab_id>/<video_id>.parquet
      target_lab: when given, only files belonging to that lab are read.

    Returns:
      A DataFrame with one row per annotated segment and the columns
        lab_id, video_id, action, agent, target,
        start_frame, end_frame, n_frames
      sorted by (lab_id, video_id, start_frame, end_frame), where
      n_frames = stop_frame - start_frame + 1. When no segment is found the
      result is an empty DataFrame that still has these columns, so
      column access by callers keeps working.
    """
    out_columns = [
        "lab_id", "video_id", "action", "agent", "target",
        "start_frame", "end_frame", "n_frames",
    ]
    source_columns = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    rows = []

    for lab_id, vid, fp in iter_annotation_files(annot_root):
        if target_lab is not None and lab_id != target_lab:
            continue

        df = pd.read_parquet(fp)

        # Validate the schema before touching the data.
        required_cols = ["agent_id", "action", "start_frame", "stop_frame"]
        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            print(f"[WARN] {fp} thiếu cột {missing}, bỏ qua.")
            continue

        # Create target_id when absent so every file has the same schema.
        if "target_id" not in df.columns:
            df["target_id"] = None

        # Keep only rows with an action, and only the columns we need, so
        # extra columns in the file cannot collide with the output names.
        sub = df.loc[df["action"].notna(), source_columns].copy()
        if sub.empty:
            continue

        # Force integer frame indices.
        sub["start_frame"] = sub["start_frame"].astype(int)
        sub["stop_frame"] = sub["stop_frame"].astype(int)

        # Segment length, counting both the start and the stop frame.
        sub["n_frames"] = sub["stop_frame"] - sub["start_frame"] + 1

        sub["lab_id"] = lab_id
        sub["video_id"] = vid
        sub = sub.rename(
            columns={
                "agent_id": "agent",
                "target_id": "target",
                "stop_frame": "end_frame",
            }
        )

        # Column-wise conversion to records instead of a per-row iterrows loop.
        rows.extend(sub[out_columns].to_dict("records"))

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    seg_df = pd.DataFrame(rows, columns=out_columns)

    if not seg_df.empty:
        seg_df = seg_df.sort_values(
            ["lab_id", "video_id", "start_frame", "end_frame"]
        ).reset_index(drop=True)

    return seg_df


annot_root = "MABe-mouse-behavior-detection/train_annotation"

# 1) All labs
list_lab = [
    "AdaptableSnail",
    "BoisterousParrot",
    "CalMS21_supplemental",
    "CalMS21_task1",
    "CalMS21_task2",
    "CautiousGiraffe",
    "CRIM13",
    "DeliriousFly",
    "ElegantMink",
    "GroovyShrew",
    "InvincibleJellyfish",
    "JovialSwallow",
    "LyricalHare",
    "NiftyGoldfinch",
    "PleasantMeerkat",
    "ReflectiveManatee",
    "SparklingTapir",
    "TranquilPanther",
    "UppityFerret",
]

# Print the distinct action labels found in each lab's annotations.
for lab_id in list_lab:
    lab_id_seg = collect_action_segments_from_anno(annot_root, target_lab=lab_id)
    print(f"{lab_id}: ")
    # An empty result may have no columns at all (no readable annotation
    # files for this lab); `.action` would then raise AttributeError and
    # abort the listing for the remaining labs.
    if lab_id_seg.empty:
        print("[]")
        continue
    print(lab_id_seg.action.unique())




AdaptableSnail: 
['chase' 'avoid' 'attack' 'chaseattack' 'rear' 'approach' 'submit']
BoisterousParrot: 
['shepherd']
CalMS21_supplemental: 
['sniff' 'sniffgenital' 'attemptmount' 'mount' 'intromit' 'approach'
 'sniffbody' 'sniffface' 'dominancemount' 'attack']
CalMS21_task1: 
['sniffgenital' 'sniff' 'mount' 'approach' 'sniffbody' 'attack'
 'sniffface' 'intromit' 'genitalgroom']
CalMS21_task2: 
['sniff' 'mount' 'attack']
CautiousGiraffe: 
['reciprocalsniff' 'sniffbody' 'sniff' 'sniffgenital' 'chase' 'escape']
CRIM13: 
['approach' 'sniff' 'mount' 'disengage' 'rear' 'selfgroom' 'attack']
DeliriousFly: 
['sniff' 'dominance' 'attack']
ElegantMink: 
['attack' 'sniff' 'allogroom' 'mount' 'intromit' 'ejaculate'
 'attemptmount']
GroovyShrew: 
['sniff' 'sniffgenital' 'dig' 'approach' 'escape' 'rear' 'selfgroom' 'run'
 'rest' 'attemptmount' 'climb' 'defend']
InvincibleJellyfish: 
['allogroom' 'sniff' 'dominancegroom' 'sniffgenital' 'attack' 'escape'
 'selfgroom' 'dig']
JovialSwallow: 
['sniff' 'c

In [39]:
lab_id_seg = collect_action_segments_from_anno(annot_root, target_lab="CautiousGiraffe")

# Build list_vid from the video ids (the .parquet file names without extension).
list_vid = sorted(lab_id_seg["video_id"].unique())
print(list_vid)

target_action1 = "reciprocalsniff"
target_action2 = "attack"
video_with_action = set()

# Collect the videos in which target_action1 is annotated at least once.
# (To also require target_action2, test it against the same set of actions.)
video = []
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[lab_id_seg["video_id"] == str(vid_id)]
    actions_in_video = set(vid1["action"])
    if target_action1 not in actions_in_video:
        continue
    video.append(vid_id)
print(video)


['1341883680', '1539773935', '1657896715', '1729143180', '21954203', '265592701', '531143932', '673614042', '684541933', '956938805']
['1341883680', '1539773935', '1657896715', '1729143180', '21954203', '265592701', '531143932', '673614042', '684541933', '956938805']


In [40]:
# Print every 'reciprocalsniff' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "reciprocalsniff")
    ]
    print(vid1.to_string())


              lab_id    video_id           action  agent  target  start_frame  end_frame  n_frames
0    CautiousGiraffe  1341883680  reciprocalsniff      1       2          108        115         8
1    CautiousGiraffe  1341883680  reciprocalsniff      2       1          108        115         8
5    CautiousGiraffe  1341883680  reciprocalsniff      1       2          812        845        34
6    CautiousGiraffe  1341883680  reciprocalsniff      2       1          812        845        34
7    CautiousGiraffe  1341883680  reciprocalsniff      2       1          938       1010        73
8    CautiousGiraffe  1341883680  reciprocalsniff      1       2          938       1010        73
10   CautiousGiraffe  1341883680  reciprocalsniff      1       2         1558       1605        48
11   CautiousGiraffe  1341883680  reciprocalsniff      2       1         1558       1605        48
13   CautiousGiraffe  1341883680  reciprocalsniff      1       2         2768       2795        28
14   Cauti

In [41]:
# Print every 'sniffbody' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "sniffbody")
    ]
    print(vid1.to_string())

             lab_id    video_id     action  agent  target  start_frame  end_frame  n_frames
2   CautiousGiraffe  1341883680  sniffbody      1       2          118        130        13
3   CautiousGiraffe  1341883680  sniffbody      1       2          387        400        14
9   CautiousGiraffe  1341883680  sniffbody      1       2         1028       1065        38
23  CautiousGiraffe  1341883680  sniffbody      1       2         3623       3659        37
28  CautiousGiraffe  1341883680  sniffbody      1       2         3959       3965         7
29  CautiousGiraffe  1341883680  sniffbody      1       2         4202       4235        34
36  CautiousGiraffe  1341883680  sniffbody      1       2         4803       4824        22
38  CautiousGiraffe  1341883680  sniffbody      1       2         5148       5235        88
53  CautiousGiraffe  1341883680  sniffbody      1       2         7253       7270        18
67  CautiousGiraffe  1341883680  sniffbody      1       2         8393       841

In [42]:
# Print every 'sniff' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "sniff")
    ]
    print(vid1.to_string())

              lab_id    video_id action  agent  target  start_frame  end_frame  n_frames
4    CautiousGiraffe  1341883680  sniff      2       1          401        409         9
12   CautiousGiraffe  1341883680  sniff      2       1         2742       2764        23
43   CautiousGiraffe  1341883680  sniff      2       1         6282       6317        36
56   CautiousGiraffe  1341883680  sniff      2       1         7412       7422        11
64   CautiousGiraffe  1341883680  sniff      2       1         8297       8312        16
75   CautiousGiraffe  1341883680  sniff      2       1         9827       9837        11
103  CautiousGiraffe  1341883680  sniff      2       1        12527      12599        73
104  CautiousGiraffe  1341883680  sniff      2       1        12817      12866        50
              lab_id    video_id action  agent  target  start_frame  end_frame  n_frames
126  CautiousGiraffe  1539773935  sniff      2       1         1516       1518         3
134  CautiousGiraffe 

In [43]:
# Print every 'sniffbody' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
# NOTE: this repeats the 'sniffbody' listing already produced by an earlier cell.
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "sniffbody")
    ]
    print(vid1.to_string())

             lab_id    video_id     action  agent  target  start_frame  end_frame  n_frames
2   CautiousGiraffe  1341883680  sniffbody      1       2          118        130        13
3   CautiousGiraffe  1341883680  sniffbody      1       2          387        400        14
9   CautiousGiraffe  1341883680  sniffbody      1       2         1028       1065        38
23  CautiousGiraffe  1341883680  sniffbody      1       2         3623       3659        37
28  CautiousGiraffe  1341883680  sniffbody      1       2         3959       3965         7
29  CautiousGiraffe  1341883680  sniffbody      1       2         4202       4235        34
36  CautiousGiraffe  1341883680  sniffbody      1       2         4803       4824        22
38  CautiousGiraffe  1341883680  sniffbody      1       2         5148       5235        88
53  CautiousGiraffe  1341883680  sniffbody      1       2         7253       7270        18
67  CautiousGiraffe  1341883680  sniffbody      1       2         8393       841

In [44]:
# Print every 'sniffgenital' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "sniffgenital")
    ]
    print(vid1.to_string())

              lab_id    video_id        action  agent  target  start_frame  end_frame  n_frames
24   CautiousGiraffe  1341883680  sniffgenital      1       2         3660       3852       193
37   CautiousGiraffe  1341883680  sniffgenital      1       2         4825       5132       308
46   CautiousGiraffe  1341883680  sniffgenital      1       2         6345       6362        18
59   CautiousGiraffe  1341883680  sniffgenital      1       2         7485       7497        13
72   CautiousGiraffe  1341883680  sniffgenital      1       2         9100       9127        28
78   CautiousGiraffe  1341883680  sniffgenital      1       2         9905       9937        33
81   CautiousGiraffe  1341883680  sniffgenital      1       2        10105      10122        18
82   CautiousGiraffe  1341883680  sniffgenital      1       2        10265      10312        48
87   CautiousGiraffe  1341883680  sniffgenital      1       2        10465      10505        41
89   CautiousGiraffe  1341883680  sniffg

In [45]:
# Print every 'escape' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "escape")
    ]
    print(vid1.to_string())

Empty DataFrame
Columns: [lab_id, video_id, action, agent, target, start_frame, end_frame, n_frames]
Index: []
              lab_id    video_id  action  agent  target  start_frame  end_frame  n_frames
127  CautiousGiraffe  1539773935  escape      1       2         1557       1598        42
128  CautiousGiraffe  1539773935  escape      1       2         1622       1678        57
129  CautiousGiraffe  1539773935  escape      1       2         1697       1753        57
130  CautiousGiraffe  1539773935  escape      1       2         1767       1798        32
131  CautiousGiraffe  1539773935  escape      1       2         1857       1884        28
171  CautiousGiraffe  1539773935  escape      1       2         9543       9670       128
203  CautiousGiraffe  1539773935  escape      1       2        14043      14224       182
206  CautiousGiraffe  1539773935  escape      1       2        14358      14367        10
Empty DataFrame
Columns: [lab_id, video_id, action, agent, target, start_frame,

In [46]:
# Print every 'chase' segment, one table per video in list_vid
# (a video without this action prints an empty frame).
for vid_id in list_vid:
    vid1 = lab_id_seg.loc[
        (lab_id_seg["video_id"] == vid_id) & (lab_id_seg["action"] == "chase")
    ]
    print(vid1.to_string())

             lab_id    video_id action  agent  target  start_frame  end_frame  n_frames
27  CautiousGiraffe  1341883680  chase      1       2         3911       3958        48
86  CautiousGiraffe  1341883680  chase      1       2        10416      10453        38
88  CautiousGiraffe  1341883680  chase      1       2        10506      10598        93
93  CautiousGiraffe  1341883680  chase      1       2        10726      10765        40
Empty DataFrame
Columns: [lab_id, video_id, action, agent, target, start_frame, end_frame, n_frames]
Index: []
Empty DataFrame
Columns: [lab_id, video_id, action, agent, target, start_frame, end_frame, n_frames]
Index: []
              lab_id    video_id action  agent  target  start_frame  end_frame  n_frames
324  CautiousGiraffe  1729143180  chase      1       2         2402       2431        30
Empty DataFrame
Columns: [lab_id, video_id, action, agent, target, start_frame, end_frame, n_frames]
Index: []
              lab_id   video_id action  agent  ta