In [36]:
from ultralytics import YOLO
import cv2
import numpy as np
import os
import pandas as pd
import json
import torch
from tqdm import tqdm

In [27]:
original_scenes = pd.read_csv("../../data/videos-trimmed.csv")
print(original_scenes.shape)
original_scenes.head()

(2277, 12)


Unnamed: 0,video_base,t_start_sec_x,t_end_sec_x,trimmed_name,start_frame12,window_activity,lat,long,timestamp,id,id_person,camera
0,columpios_cam4-2024-11-21 18:53:10.mp4,13.833333,17.833333,trimmed_columpios_cam4-2024-11-21_18_53_10_00-...,166.0,7.713624,25.653061,-100.285843,2024-11-22 00:53:10.041613+00:00,columpios_cam4-2024-11-21_185310.mp4:17527661-...,17527661-d095-4e94-bad5-cf142e0d4a79,columpios_cam4
1,columpioscam3-2024-09-25 11:41:35.mp4,6.25,10.25,trimmed_columpioscam3-2024-09-25_11_41_35_00-0...,75.0,7.713624,25.653034,-100.286041,2024-09-25 17:41:35.045100+00:00,columpioscam3-2024-09-25_114135.mp4:96:0,96,columpioscam3
2,columpioscam3-2024-10-03 18:29:30.mp4,12.916667,16.916667,trimmed_columpioscam3-2024-10-03_18_29_30_00-0...,155.0,7.713624,25.652994,-100.286118,2024-10-04 00:29:30.051348+00:00,columpioscam3-2024-10-03_182930.mp4:66a106a0-b...,66a106a0-bd03-49e9-8418-8c6ae98d020c,columpioscam3
3,columpioscam1-2024-11-17 10:25:24.mp4,10.0,14.0,trimmed_columpioscam1-2024-11-17_10_25_24_00-0...,120.0,7.713624,25.653151,-100.285637,2024-11-17 16:25:24.041494+00:00,columpioscam1-2024-11-17_102524.mp4:85745e56-a...,85745e56-a3e9-4b62-bb2d-c1e8ee152f86,columpioscam1
4,columpios_cam4-2024-11-21 19:16:11.mp4,6.333333,10.333333,trimmed_columpios_cam4-2024-11-21_19_16_11_00-...,76.0,7.713624,25.653048,-100.285912,2024-11-22 01:16:11.041568+00:00,columpios_cam4-2024-11-21_191611.mp4:addad6d5-...,addad6d5-1fe6-484f-964d-3302e3899325,columpios_cam4


## Preprocessing

We’ll combine three lightweight checks, all possible using yolov11n.pt (detection-only):



|**Check**|**Metric**|**Threshold**|**Purpose**|
|---|---|---|---|
|**Human presence**|% frames with ≥1 “person” detection|< 0.2 (20%)|Skip videos mostly empty|
|**Activity (motion)**|Mean inter-frame IoU change or optical flow|< 0.05|Skip static scenes|
|**Duration**|Clip length (seconds)|< 1.0|Skip ultra-short clips|
|**Edge occupancy**|% boxes touching frame edge|> 0.5|Flag for low-quality (partial) views|


In [19]:
detector = YOLO("yolo11n.pt")

def detect_persons(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Could not open video: {video_path}")
        return None

    total_frames, person_frames = 0, 0
    edge_touches, motion_values = [], []
    prev_boxes = None

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        total_frames += 1

        results = detector(frame, verbose=False)[0]
        boxes = results.boxes.xyxy.cpu().numpy()
        cls = results.boxes.cls.cpu().numpy()
        people = boxes[cls == 0]  # class 0 = person

        if len(people) > 0:
            person_frames += 1
            H, W = frame.shape[:2]
            touches = np.mean((people[:,0] < 5) | (people[:,1] < 5) |
                              (people[:,2] > W-5) | (people[:,3] > H-5))
            edge_touches.append(touches)

            if prev_boxes is not None:
                ious = []
                for p in people:
                    for q in prev_boxes:
                        ix1, iy1 = max(p[0], q[0]), max(p[1], q[1])
                        ix2, iy2 = min(p[2], q[2]), min(p[3], q[3])
                        inter = max(0, ix2-ix1) * max(0, iy2-iy1)
                        union = (p[2]-p[0])*(p[3]-p[1]) + (q[2]-q[0])*(q[3]-q[1]) - inter
                        if union > 0:
                            ious.append(inter/union)
                if ious:
                    motion_values.append(1 - np.mean(ious))

            prev_boxes = people

    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()

    if fps == 0 or np.isnan(fps):
        fps = 30.0  # fallback assumption

    if total_frames == 0:
        print(f"❌ No readable frames: {video_path}")
        return None

    return {
        "presence_ratio": person_frames / total_frames,
        "mean_edge_touches": np.mean(edge_touches) if edge_touches else 0,
        "mean_motion": np.mean(motion_values) if motion_values else 0,
        "duration_sec": total_frames / fps,
    }

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt': 100% ━━━━━━━━━━━━ 5.4MB 19.2MB/s 0.3s.2s<0.2s3s


In [16]:
def should_keep(meta):
    return (
        meta["presence_ratio"] >= 0.2 and
        meta["mean_motion"] >= 0.05 and
        meta["duration_sec"] > 1.0
    )

In [None]:
rows = []
skipped = []
base_dir = "./../../data/temp/videos/trimmed"
videos = [v for v in os.listdir(base_dir) if v.endswith(".mp4")]

for video in tqdm(videos, desc="Analyzing videos", unit="video"):
    video_path = os.path.join(base_dir, video)
    tqdm.write(f"[STEP] Processing: {video}")

    meta = detect_persons(video_path)
    if meta:
        meta["keep"] = should_keep(meta)
        meta["video"] = video
        rows.append(meta)

        if not meta["keep"]:
            tqdm.write(f"[SKIPPED] Skipped (low quality): {video} "
                       f"[presence={meta['presence_ratio']:.2f}, motion={meta['mean_motion']:.2f}]")
    else:
        skipped.append(video)
        tqdm.write(f"❌ Skipped (unreadable): {video}")

processed_videos = pd.DataFrame(rows)

print(f"Processed: {len(videos)} videos | Skipped unreadable: {len(skipped)} | Kept in results: {len(processed_videos)}")
if skipped:
    print("Skipped files:")
    for s in skipped:
        print(f"   - {s}")

> Processed: 1643 videos | Skipped unreadable: 0 | Kept in results: 1643

In [22]:
print(len(processed_videos))
processed_videos.head()

1643


Unnamed: 0,presence_ratio,mean_edge_touches,mean_motion,duration_sec,keep,video
0,0.0,0.0,0.0,1.79266,False,trimmed_columpioscam3-2024-11-12_08_32_46_00-0...
1,1.0,0.0,0.71643,3.920928,True,trimmed_columpioscam3-2024-09-25_21_26_59_00-0...
2,0.270833,0.163462,0.302138,4.005791,True,trimmed_columpioscam3-2024-10-22_20_12_50_00-0...
3,0.666667,0.0,0.636362,3.791675,True,trimmed_columpios_cam4-2024-12-13_20-02-00_00-...
4,0.0,0.0,0.0,0.375734,False,trimmed_columpios_cam4-2024-11-21_18_53_10_00-...


In [23]:
not_keep_count = (~processed_videos["keep"]).sum()
keep_count = processed_videos["keep"].sum()

print(f"Videos kept: {keep_count}")
print(f"Videos to remove: {not_keep_count}")

Videos kept: 1146
Videos to remove: 497


In [26]:
df_merged = original_scenes.merge(
    processed_videos,
    how="left",
    left_on="trimmed_name",
    right_on="video"
)

print("Merged shape:", df_merged.shape)
df_merged.head(3)

Merged shape: (2277, 18)


Unnamed: 0,video_base,t_start_sec_x,t_end_sec_x,trimmed_name,start_frame12,window_activity,lat,long,timestamp,id,id_person,camera,presence_ratio,mean_edge_touches,mean_motion,duration_sec,keep,video
0,columpios_cam4-2024-11-21 18:53:10.mp4,13.833333,17.833333,trimmed_columpios_cam4-2024-11-21_18_53_10_00-...,166.0,7.713624,25.653061,-100.285843,2024-11-22 00:53:10.041613+00:00,columpios_cam4-2024-11-21_185310.mp4:17527661-...,17527661-d095-4e94-bad5-cf142e0d4a79,columpios_cam4,0.0,0.0,0.0,0.375734,False,trimmed_columpios_cam4-2024-11-21_18_53_10_00-...
1,columpioscam3-2024-09-25 11:41:35.mp4,6.25,10.25,trimmed_columpioscam3-2024-09-25_11_41_35_00-0...,75.0,7.713624,25.653034,-100.286041,2024-09-25 17:41:35.045100+00:00,columpioscam3-2024-09-25_114135.mp4:96:0,96,columpioscam3,0.034091,0.0,0.168131,4.021505,False,trimmed_columpioscam3-2024-09-25_11_41_35_00-0...
2,columpioscam3-2024-10-03 18:29:30.mp4,12.916667,16.916667,trimmed_columpioscam3-2024-10-03_18_29_30_00-0...,155.0,7.713624,25.652994,-100.286118,2024-10-04 00:29:30.051348+00:00,columpioscam3-2024-10-03_182930.mp4:66a106a0-b...,66a106a0-bd03-49e9-8418-8c6ae98d020c,columpioscam3,0.333333,0.0,0.0,0.239414,False,trimmed_columpioscam3-2024-10-03_18_29_30_00-0...


In [28]:
df_merged = df_merged.drop(columns=["video"])

In [29]:
df_cleaned = df_merged[df_merged["keep"] == True].copy()

print("Cleaned merged shape:", df_cleaned.shape)
print("Removed:", (~df_merged["keep"]).sum(), "rows")

Cleaned merged shape: (1643, 17)
Removed: 634 rows


In [30]:
df_cleaned.to_csv("./../../data/prefiltered_scenes.csv", index=False)

## Pose Estimation

In [37]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [39]:
video_dir = "./../../data/temp/videos/trimmed"
output_dir = "./../../data/temp/skeletons"
os.makedirs(output_dir, exist_ok=True)

In [None]:
pose_model = YOLO("yolo11m-pose.pt")

In [40]:
videos = [f for f in os.listdir(video_dir) if f.endswith(".mp4")]
print(f"🎥 Found {len(videos)} videos to process.\n")

🎥 Found 1643 videos to process.



In [None]:
for video_file in tqdm(videos, desc="Processing videos"):
    video_path = os.path.join(video_dir, video_file)
    out_json = os.path.join(output_dir, video_file.replace(".mp4", ".json"))

    if os.path.exists(out_json):
        tqdm.write(f"[SKIPPED] Skipping {video_file} (already processed)")
        continue

    try:
        results = pose_model(video_path, stream=True, device=device, verbose=False)

        skeleton_data = []
        for frame_idx, r in enumerate(results):
            people = []
            if r.keypoints is not None and len(r.keypoints) > 0:
                kpts = r.keypoints.xy.cpu().numpy()      # (n_persons, 17, 2)
                conf = r.keypoints.conf.cpu().numpy()    # (n_persons, 17)
                for pid in range(len(kpts)):
                    joints = [
                        {"x": float(x), "y": float(y), "conf": float(c)}
                        for (x, y), c in zip(kpts[pid], conf[pid])
                    ]
                    people.append({"id": pid, "keypoints": joints})
            skeleton_data.append({"frame": frame_idx, "people": people})

        with open(out_json, "w") as f:
            json.dump(skeleton_data, f, indent=2)

        tqdm.write(f"Saved skeletons for {video_file}")

    except Exception as e:
        tqdm.write(f"[❌ERROR] Error processing {video_file}: {e}")
        continue

print("\nAll videos processed.")