In [2]:
import os
import glob
import pandas as pd

def iter_tracking_files(tracking_root: str):
    """Walk ``tracking_root`` and yield every parquet tracking file found.

    The expected layout is ``<tracking_root>/<lab_id>/<video_id>.parquet``.
    Only directories directly under ``tracking_root`` are treated as labs;
    plain files at that level are ignored. Labs and the files inside each lab
    are visited in lexicographic path order.

    Args:
        tracking_root: Directory containing one sub-directory per lab.

    Yields:
        Tuples ``(lab_id, video_id, path)`` where ``lab_id`` is the lab
        directory name, ``video_id`` is the file name without its extension
        and ``path`` is the path of the parquet file as returned by ``glob``.
    """
    top_level_pattern = os.path.join(tracking_root, "*")
    # Lazy filter: each candidate is checked only when the loop reaches it.
    lab_paths = (
        entry for entry in sorted(glob.glob(top_level_pattern))
        if os.path.isdir(entry)
    )

    for lab_path in lab_paths:
        lab_name = os.path.basename(lab_path)
        parquet_paths = glob.glob(os.path.join(lab_path, "*.parquet"))
        parquet_paths.sort()
        for parquet_path in parquet_paths:
            video_name, _extension = os.path.splitext(os.path.basename(parquet_path))
            yield lab_name, video_name, parquet_path


def collect_bodypart_stats(tracking_root: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Summarise how often each bodypart is tracked, per video and per lab.

    Every file yielded by ``iter_tracking_files(tracking_root)`` is read with
    ``pd.read_parquet``. Files missing a ``bodypart`` or ``video_frame``
    column are reported on stdout and skipped; files with zero distinct
    frames are skipped silently.

    Args:
        tracking_root: Directory passed on to ``iter_tracking_files``.

    Returns:
        ``(bp_lab_df, bp_video_df)``:

        * ``bp_video_df`` has one row per (lab, video, bodypart) with columns
          ``lab_id``, ``video_id``, ``bodypart``, ``frames_with_bp`` (distinct
          frames in which the bodypart occurs), ``total_frames`` (distinct
          frames in the video) and ``ratio`` (their quotient).
        * ``bp_lab_df`` has one row per (lab, bodypart) with columns
          ``lab_id``, ``bodypart``, ``frames_with_bp_lab`` (sum of
          ``frames_with_bp`` over the lab's videos) and ``ratio_in_lab``.
          ``ratio_in_lab`` is the bodypart's share of the lab's summed
          ``frames_with_bp_lab`` over all bodyparts, so it is a share of
          bodypart-frame counts, not a fraction of the lab's frames.

        Both frames always carry these columns, even when no data was found,
        so downstream column lookups do not fail on an empty result.
    """
    video_columns = [
        "lab_id", "video_id", "bodypart",
        "frames_with_bp", "total_frames", "ratio",
    ]
    lab_columns = ["lab_id", "bodypart", "frames_with_bp_lab", "ratio_in_lab"]

    rows_video = []

    for lab_id, vid, fp in iter_tracking_files(tracking_root):
        df = pd.read_parquet(fp)

        if "bodypart" not in df.columns or "video_frame" not in df.columns:
            print(f"[WARN] {fp} thiếu 'bodypart' hoặc 'video_frame', bỏ qua.")
            continue

        # Total number of distinct frames in the video.
        total_frames = df["video_frame"].nunique()
        if total_frames == 0:
            continue

        # Distinct frames in which each bodypart appears. ``nunique`` already
        # counts a frame once even when several mouse_ids report the same
        # bodypart in it, so no prior de-duplication is needed.
        frames_per_bp = df.groupby("bodypart")["video_frame"].nunique()

        for bp, n_frames in frames_per_bp.items():
            rows_video.append({
                "lab_id": lab_id,
                "video_id": vid,
                "bodypart": bp,
                "frames_with_bp": int(n_frames),
                "total_frames": int(total_frames),
                "ratio": float(n_frames) / float(total_frames),
            })

    # Explicit columns keep the schema stable when ``rows_video`` is empty.
    bp_video_df = pd.DataFrame(rows_video, columns=video_columns)

    if bp_video_df.empty:
        return pd.DataFrame(columns=lab_columns), bp_video_df

    # Aggregate to lab level: total bodypart-frame count per (lab, bodypart),
    # then normalise by the lab's total over all bodyparts.
    bp_lab_df = (
        bp_video_df
        .groupby(["lab_id", "bodypart"])["frames_with_bp"]
        .sum()
        .reset_index(name="frames_with_bp_lab")
    )
    lab_totals = bp_lab_df.groupby("lab_id")["frames_with_bp_lab"].transform("sum")
    bp_lab_df["ratio_in_lab"] = bp_lab_df["frames_with_bp_lab"] / lab_totals

    return bp_lab_df, bp_video_df



In [3]:
# Root folder holding one sub-directory of parquet tracking files per lab.
tracking_root = "MABe-mouse-behavior-detection/train_tracking"  # adjust to the correct location
bp_lab_df, bp_video_df = collect_bodypart_stats(tracking_root)


In [4]:
# Inspect a single lab: print its video ids, then show the bodyparts tracked.
x = bp_video_df[bp_video_df['lab_id'] == 'TranquilPanther']
print(x.video_id.unique())
# A bare filter on video_id '17308182' was dropped from this spot: a mid-cell
# expression is never displayed, so it had no effect on the output.
x.bodypart.unique()


['1057775501' '1360332361' '1456719978' '1462669248' '1510465149'
 '1557355665' '1582739342' '1651649809' '1732555338' '1735658729'
 '1831430173' '1831501720' '1895797255' '1907520217' '1979655353'
 '2138781893' '357532314' '431495718' '477037938' '489560891' '62660378'
 '651966739' '675356666' '675808983' '962638464']


array(['ear_left', 'ear_right', 'hip_left', 'hip_right', 'neck', 'nose',
       'tail_base'], dtype=object)

In [5]:
# Load one tracking file to inspect its raw schema.
import pandas as pd

# Path to the parquet file. Forward slashes resolve on every OS (Windows
# included) and match the separator style used for `tracking_root`;
# backslashes would be treated as literal filename characters on Linux/macOS.
file_path = "MABe-mouse-behavior-detection/train_tracking/LyricalHare/121552177.parquet"

# Read the parquet file into a DataFrame
df = pd.read_parquet(file_path)

print(df.head())

   video_frame  mouse_id   bodypart           x           y
0            0         1   ear_left  266.665253  190.124222
1            0         1  ear_right  265.763611  157.215668
2            0         1       nose  288.347565  169.481750
3            0         1  tail_base  191.019943  183.924103
4            0         1   tail_tip  136.075562  160.868439
