## **A) Combined YOLO + DeepSORT**



In [None]:
# A) Combine YOLO + DeepSORT
# === Install required packages (run once) ===
# !pip install ultralytics deep-sort-realtime opencv-python pandas

from google.colab import drive
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2
import os
import json
import pandas as pd
from pathlib import Path
import numpy as np

# --- Configuration ---------------------------------------------------------
# Where the input videos are read from and where results are written.
VIDEO_FOLDER = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Input")
OUTPUT_FOLDER = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Output")

# Detection / tracking knobs.
CONFIDENCE_THRESHOLD = 0.4  # detections scoring below this are discarded
DETECTION_INTERVAL = 1      # the detector runs on every Nth frame
MAX_TRACK_AGE = 30          # frames to keep a lost track (DeepSort max_age)
MIN_HITS = 3                # detections before confirming a track (DeepSort n_init)

# Overlay appearance; colours are given in BGR order.
MASK_COLOR, MASK_THICKNESS = (0, 255, 0), 2
TEXT_COLOR = (255, 255, 255)
TEXT_FONT, TEXT_SCALE, TEXT_THICKNESS = cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2

# --- Step 1: make Google Drive visible under /content/drive ----------------
drive.mount('/content/drive')

# --- Step 2: instantiate the segmentation model and the tracker ------------
# The nano segmentation weights are fetched automatically when missing.
model = YOLO('yolov8n-seg')
tracker = DeepSort(n_init=MIN_HITS, max_age=MAX_TRACK_AGE)

# utility to compute IoU between two boxes

def compute_iou(boxA, boxB):
    """Return the intersection-over-union of two (x1, y1, x2, y2) boxes.

    The result is 0 when the union area is not positive.
    """
    left_a, top_a, right_a, bottom_a = boxA
    left_b, top_b, right_b, bottom_b = boxB

    # Width/height of the overlap rectangle, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(right_a, right_b) - max(left_a, left_b))
    overlap_h = max(0, min(bottom_a, bottom_b) - max(top_a, top_b))
    overlap = overlap_w * overlap_h

    area_a = (right_a - left_a) * (bottom_a - top_a)
    area_b = (right_b - left_b) * (bottom_b - top_b)
    total = area_a + area_b - overlap

    if total > 0:
        return overlap / total
    return 0

# Annotation rows accumulated over every processed video; exported as one CSV below.
annotations = []

# ensure output CSV and video folder exist
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing order (and therefore the CSV row order) reproducible.
for video_file in sorted(os.listdir(VIDEO_FOLDER)):
    if not video_file.lower().endswith(('.mp4','.mov','.avi')):
        continue

    cap = cv2.VideoCapture(str(VIDEO_FOLDER/video_file))
    if not cap.isOpened():
        print(f"Skipping {video_file}: could not open video")
        cap.release()
        continue

    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some containers report 0 (or NaN) fps; that would raise ZeroDivisionError
        # in the timestamp computation below. 30.0 is an arbitrary fallback.
        fps = 30.0
    width      = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height     = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_num  = 0
    video_name = Path(video_file).stem

    # The tracker is shared by all videos; drop tracks left over from the
    # previous file so identities do not leak from one video into the next.
    tracker.delete_all_tracks()

    # prepare video writer
    output_path = OUTPUT_FOLDER / video_file
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_num % DETECTION_INTERVAL == 0:
                # NOTE(review): timestamps are whole seconds, so every analysed frame
                # within the same second shares one frame_id / object_id prefix.
                timestamp_sec = int(frame_num / fps)
                frame_id      = f"{video_name}_{timestamp_sec:04d}"

                # YOLOv8 segmentation inference
                results = model(frame)[0]
                # masks is None when the result carries no masks; guard the lookup
                mask_polys = results.masks.xy if results.masks is not None else []

                dets_for_tracker = []
                det_meta = []
                for idx, (box, score, cls) in enumerate(zip(
                        results.boxes.xyxy, results.boxes.conf, results.boxes.cls)):
                    conf = float(score)
                    if conf < CONFIDENCE_THRESHOLD:
                        continue

                    x1, y1, x2, y2 = box.cpu().numpy().astype(int)
                    cls_id = int(cls.cpu().numpy())
                    name   = model.names[cls_id]

                    # polygon for this detection, truncated to integer pixel coordinates
                    if idx < len(mask_polys):
                        poly = np.array(mask_polys[idx], dtype=np.int32).reshape(-1, 2)
                    else:
                        poly = np.zeros((0, 2), dtype=np.int32)

                    # tracker input format: ([left, top, width, height], confidence, class)
                    dets_for_tracker.append([[x1, y1, x2-x1, y2-y1], conf, name])
                    det_meta.append({
                        "bbox": (x1, y1, x2, y2),
                        "MID": f"/m/{cls_id:07d}",
                        "object_name": name,
                        "object_category": "unknown",
                        "mask_poly": poly.tolist(),
                        "confidence": conf,
                    })

                # update tracker (also called with an empty list so lost tracks age out)
                tracks = tracker.update_tracks(dets_for_tracker, frame=frame)

                if det_meta:
                    for trk in tracks:
                        if not trk.is_confirmed():
                            continue
                        tx1, ty1, tx2, ty2 = trk.to_tlbr()
                        track_id = trk.track_id

                        # pair the track with the current detection it overlaps most
                        best_iou, best = max(
                            ((compute_iou((tx1, ty1, tx2, ty2), m["bbox"]), m) for m in det_meta),
                            key=lambda x: x[0]
                        )
                        if best_iou > 0.3:
                            # draw mask outline using original-scale polygon
                            pts = np.array(best["mask_poly"], np.int32)
                            if pts.size:
                                cv2.polylines(frame, [pts], isClosed=True,
                                              color=MASK_COLOR, thickness=MASK_THICKNESS)
                                # label at the first vertex; plain ints avoid passing
                                # numpy scalars to OpenCV as a point
                                label_pos = tuple(int(v) for v in pts[0])
                                cv2.putText(frame, best["object_name"], label_pos, TEXT_FONT,
                                            TEXT_SCALE, TEXT_COLOR, TEXT_THICKNESS, cv2.LINE_AA)

                            # record annotation
                            annotations.append({
                                "video_filename":    video_file,
                                "frame_id":          frame_id,
                                "track_id":          f"{video_name}_{track_id}",
                                "object_id":         f"{frame_id}_obj{track_id}",
                                "timestamp_sec":     timestamp_sec,
                                "image_width_px":    width,
                                "image_height_px":   height,
                                "MID":               best["MID"],
                                "object_name":       best["object_name"],
                                "object_category":   best["object_category"],
                                "x_min":             int(tx1),
                                "y_min":             int(ty1),
                                "x_max":             int(tx2),
                                "y_max":             int(ty2),
                                "segmentation_mask": json.dumps([best["mask_poly"]]),
                                "confidence":        best["confidence"],
                                "interaction_score": 0.0
                            })

            # write frame (with masks) to output
            writer.write(frame)
            frame_num += 1
    finally:
        # release the handles even if inference or drawing raised
        cap.release()
        writer.release()

# write CSV of annotations
out_csv = OUTPUT_FOLDER / "draft_annotations.csv"
pd.DataFrame(annotations).to_csv(out_csv, index=False)
print(f"Saved annotated video(s) to {OUTPUT_FOLDER}")
print(f"Saved annotations to {out_csv}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

0: 384x640 (no detections), 178.3ms
Speed: 4.7ms preprocess, 178.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 187.8ms
Speed: 4.0ms preprocess, 187.8ms inference, 5.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 motorcycle, 188.8ms
Speed: 5.1ms preprocess, 188.8ms inference, 13.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 212.1ms
Speed: 5.0ms preprocess, 212.1ms inference, 7.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 221.6ms
Speed: 4.8ms preprocess, 221.6ms inference, 13.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 184.0ms
Speed: 4.0ms preprocess, 184.0ms inference, 12.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 181.7ms
Speed: 4.2ms preprocess, 181.7ms inference, 

## **B) Detectron2 (Facebook AI Research)**

In [16]:
# Combine Detectron2 + DeepSORT + Mask Drawing
# Refactored from YOLO to Detectron2 instance segmentation
# === Install required packages (run once) ===
!pip install detectron2 deep-sort-realtime opencv-python pandas

from google.colab import drive
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.data import MetadataCatalog
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2
import os
import json
import pandas as pd
from pathlib import Path
import numpy as np

# === Parameters ===
VIDEO_FOLDER         = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Input")
OUTPUT_FOLDER        = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Output")
CONFIDENCE_THRESHOLD = 0.5
DETECTION_INTERVAL   = 5      # analyze every Nth frame
MAX_TRACK_AGE        = 30     # frames to keep a lost track
MIN_HITS             = 3      # detections before confirming a track

# drawing settings
MASK_COLOR           = (0, 255, 0)    # BGR mask outline color
MASK_THICKNESS       = 2              # mask polygon line thickness
TEXT_COLOR           = (255, 255, 255)# BGR text color
TEXT_FONT            = cv2.FONT_HERSHEY_SIMPLEX
TEXT_SCALE           = 0.6
TEXT_THICKNESS       = 2

# === 1) Mount Google Drive ===
drive.mount('/content/drive')

# === 2) Configure Detectron2 ===
import torch  # local import: only needed here to pick the inference device

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = CONFIDENCE_THRESHOLD
# Detectron2's default device is CUDA; fall back to the CPU on runtimes
# without a GPU instead of failing while building the predictor.
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
predictor = DefaultPredictor(cfg)
# metadata for class names: use the config's training dataset when it names one
dataset = cfg.DATASETS.TRAIN[0] if len(cfg.DATASETS.TRAIN)>0 else "coco_2017_train"
class_names = MetadataCatalog.get(dataset).thing_classes

# Initialize DeepSORT tracker
tracker = DeepSort(max_age=MAX_TRACK_AGE, n_init=MIN_HITS)

# IoU utility
def compute_iou(boxA, boxB):
    """Compute IoU for two boxes given as (x_min, y_min, x_max, y_max).

    Returns 0 whenever the union area is not strictly positive.
    """
    (ax1, ay1, ax2, ay2), (bx1, by1, bx2, by2) = boxA, boxB

    # Corners of the intersection rectangle.
    ix1 = max(ax1, bx1)
    iy1 = max(ay1, by1)
    ix2 = min(ax2, bx2)
    iy2 = min(ay2, by2)

    # Negative extents mean the boxes do not overlap on that axis.
    inter_area = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    union_area = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter_area

    if not union_area > 0:
        return 0
    return inter_area / union_area

# Annotation rows accumulated over every processed video; exported as one CSV below.
annotations = []

# ensure output folders exist
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing order (and therefore the CSV row order) reproducible.
for video_file in sorted(os.listdir(VIDEO_FOLDER)):
    if not video_file.lower().endswith(('.mp4','.mov','.avi')):
        continue

    cap = cv2.VideoCapture(str(VIDEO_FOLDER/video_file))
    if not cap.isOpened():
        print(f"Skipping {video_file}: could not open video")
        cap.release()
        continue

    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some containers report 0 (or NaN) fps; that would raise ZeroDivisionError
        # in the timestamp computation below. 30.0 is an arbitrary fallback.
        fps = 30.0
    width      = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height     = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_num  = 0
    video_name = Path(video_file).stem

    # The tracker is shared by all videos; drop tracks left over from the
    # previous file so identities do not leak from one video into the next.
    tracker.delete_all_tracks()

    # prepare video writer
    output_path = OUTPUT_FOLDER / video_file
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_num % DETECTION_INTERVAL == 0:
                # NOTE(review): timestamps are whole seconds, so every analysed frame
                # within the same second shares one frame_id / object_id prefix.
                timestamp_sec = int(frame_num / fps)
                frame_id      = f"{video_name}_{timestamp_sec:04d}"

                # Detectron2 inference
                outputs = predictor(frame)
                instances = outputs["instances"].to("cpu")
                boxes = instances.pred_boxes.tensor.numpy().astype(int)
                scores = instances.scores.numpy()
                classes = instances.pred_classes.numpy().astype(int)
                masks = instances.pred_masks.numpy()  # (N, H, W)

                dets_for_tracker = []
                det_meta = []
                for box, score, raw_cls, mask in zip(boxes, scores, classes, masks):
                    conf = float(score)
                    if conf < CONFIDENCE_THRESHOLD:
                        continue

                    x1, y1, x2, y2 = (int(v) for v in box)
                    cls_id = int(raw_cls)
                    name = class_names[cls_id]

                    # extract polygon from mask; a mask can split into several
                    # blobs, so keep the largest contour rather than whichever
                    # one OpenCV happens to return first
                    mask_arr = mask.astype(np.uint8) * 255
                    contours, _ = cv2.findContours(mask_arr, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                    if contours:
                        poly = max(contours, key=cv2.contourArea).reshape(-1, 2).tolist()
                    else:
                        poly = []

                    # tracker input format: ([left, top, width, height], confidence, class)
                    dets_for_tracker.append([[x1, y1, x2-x1, y2-y1], conf, name])
                    det_meta.append({
                        "bbox": (x1, y1, x2, y2),
                        "MID": f"/m/{cls_id:07d}",
                        "object_name": name,
                        "object_category": "unknown",
                        "mask_poly": poly,
                        "confidence": conf,
                    })

                # update tracker (also called with an empty list so lost tracks age out)
                tracks = tracker.update_tracks(dets_for_tracker, frame=frame)
                if det_meta:
                    for trk in tracks:
                        if not trk.is_confirmed():
                            continue
                        tx1, ty1, tx2, ty2 = trk.to_tlbr()
                        track_id = trk.track_id

                        # pair the track with the current detection it overlaps most
                        best_iou, best = max(
                            ((compute_iou((tx1, ty1, tx2, ty2), m["bbox"]), m) for m in det_meta),
                            key=lambda x: x[0]
                        )
                        if best_iou > 0.3:
                            pts = np.array(best["mask_poly"], np.int32)
                            if pts.size:
                                cv2.polylines(frame, [pts], isClosed=True,
                                              color=MASK_COLOR, thickness=MASK_THICKNESS)
                                # plain ints avoid passing numpy scalars to OpenCV as a point
                                label_pos = tuple(int(v) for v in pts[0])
                                cv2.putText(frame, best["object_name"], label_pos, TEXT_FONT,
                                            TEXT_SCALE, TEXT_COLOR, TEXT_THICKNESS, cv2.LINE_AA)

                            annotations.append({
                                "video_filename":    video_file,
                                "frame_id":          frame_id,
                                "track_id":          f"{video_name}_{track_id}",
                                "object_id":         f"{frame_id}_obj{track_id}",
                                "timestamp_sec":     timestamp_sec,
                                "image_width_px":    width,
                                "image_height_px":   height,
                                "MID":               best["MID"],
                                "object_name":       best["object_name"],
                                "object_category":   best["object_category"],
                                "x_min":             int(tx1),
                                "y_min":             int(ty1),
                                "x_max":             int(tx2),
                                "y_max":             int(ty2),
                                "segmentation_mask": json.dumps([best["mask_poly"]]),
                                "confidence":        best["confidence"],
                                "interaction_score": 0.0
                            })

            # write frame (with masks) to output
            writer.write(frame)
            frame_num += 1
    finally:
        # release the handles even if inference or drawing raised
        cap.release()
        writer.release()

# write CSV of annotations
out_csv = OUTPUT_FOLDER / "draft_annotations.csv"
pd.DataFrame(annotations).to_csv(out_csv, index=False)
print(f"Saved annotated video(s) to {OUTPUT_FOLDER}")
print(f"Saved annotations to {out_csv}")

[31mERROR: Could not find a version that satisfies the requirement detectron2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for detectron2[0m[31m
[0m

ModuleNotFoundError: No module named 'detectron2'

## **C) TensorFlow Object Detection API + DeepLab**

In [18]:
# Combine TensorFlow Object Detection API + DeepLab + DeepSORT + Mask Drawing
# === Install required packages (run once) ===
!pip install tensorflow tensorflow-hub tensorflow-object-detection-api deep-sort-realtime opencv-python pandas

from google.colab import drive
import tensorflow as tf
import tensorflow_hub as hub
from object_detection.utils import label_map_util
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2
import os
import json
import pandas as pd
from pathlib import Path
import numpy as np

# --- Configuration ---------------------------------------------------------
# Video locations on the mounted Google Drive.
VIDEO_FOLDER = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Input")
OUTPUT_FOLDER = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Output")

# Model assets: detector SavedModel, its label map, and the segmentation model.
OD_MODEL_PATH = Path("/content/drive/MyDrive/FreeFuse_Project/models/ssd_mobilenet_v2_coco/saved_model")
LABEL_MAP_PATH = Path("/content/drive/MyDrive/FreeFuse_Project/models/mscoco_label_map.pbtxt")
# NOTE(review): confirm this TF-Hub handle resolves to a loadable model whose
# output dict has a 'default' entry, which the processing loop indexes.
DEEPLAB_MODEL_URL = "https://tfhub.dev/tensorflow/deeplabv3/1"

# Detection / tracking knobs.
CONFIDENCE_THRESHOLD = 0.5  # detections scoring below this are discarded
DETECTION_INTERVAL = 5      # the models run on every Nth frame
MAX_TRACK_AGE = 30          # frames to keep a lost track (DeepSort max_age)
MIN_HITS = 3                # detections before confirming a track (DeepSort n_init)

# Overlay appearance; colours are given in BGR order.
MASK_COLOR, MASK_THICKNESS = (0, 255, 0), 2
TEXT_COLOR = (255, 255, 255)
TEXT_FONT, TEXT_SCALE, TEXT_THICKNESS = cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2

# --- Step 1: make Google Drive visible under /content/drive ----------------
drive.mount('/content/drive')

# --- Step 2: object detector and its class-id -> label lookup --------------
detect_fn = tf.saved_model.load(str(OD_MODEL_PATH))
category_index = label_map_util.create_category_index_from_labelmap(
    str(LABEL_MAP_PATH),
    use_display_name=True,
)

# --- Step 3: DeepLab semantic segmentation model ---------------------------
seg_model = hub.load(DEEPLAB_MODEL_URL)

# --- Step 4: DeepSORT tracker ----------------------------------------------
tracker = DeepSort(n_init=MIN_HITS, max_age=MAX_TRACK_AGE)

# IoU utility
def compute_iou(boxA, boxB):
    """Intersection-over-union of two corner-format boxes (x1, y1, x2, y2).

    Yields 0 when the combined area is not positive.
    """
    a_x1, a_y1, a_x2, a_y2 = boxA
    b_x1, b_y1, b_x2, b_y2 = boxB

    # Overlap extent along each axis; clamped to zero when the boxes are apart.
    span_x = max(0, min(a_x2, b_x2) - max(a_x1, b_x1))
    span_y = max(0, min(a_y2, b_y2) - max(a_y1, b_y1))
    shared = span_x * span_y

    combined = (a_x2 - a_x1) * (a_y2 - a_y1) + (b_x2 - b_x1) * (b_y2 - b_y1) - shared

    result = 0
    if combined > 0:
        result = shared / combined
    return result

# Annotation rows accumulated over every processed video; exported as one CSV below.
annotations = []

# ensure output paths exist
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing order (and therefore the CSV row order) reproducible.
for video_file in sorted(os.listdir(VIDEO_FOLDER)):
    if not video_file.lower().endswith(('.mp4','.mov','.avi')):
        continue

    cap = cv2.VideoCapture(str(VIDEO_FOLDER/video_file))
    if not cap.isOpened():
        print(f"Skipping {video_file}: could not open video")
        cap.release()
        continue

    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some containers report 0 (or NaN) fps; that would raise ZeroDivisionError
        # in the timestamp computation below. 30.0 is an arbitrary fallback.
        fps = 30.0
    width      = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height     = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_num  = 0
    video_name = Path(video_file).stem

    # The tracker is shared by all videos; drop tracks left over from the
    # previous file so identities do not leak from one video into the next.
    tracker.delete_all_tracks()

    # prepare video writer
    output_path = OUTPUT_FOLDER / video_file
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # run detection and segmentation at intervals
            if frame_num % DETECTION_INTERVAL == 0:
                # NOTE(review): timestamps are whole seconds, so every analysed frame
                # within the same second shares one frame_id / object_id prefix.
                timestamp_sec = int(frame_num / fps)
                frame_id      = f"{video_name}_{timestamp_sec:04d}"

                # OpenCV decodes frames as BGR; the TF models are fed RGB.
                # The tracker and the drawing code keep using the BGR frame.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                input_tensor = tf.convert_to_tensor(rgb_frame)[tf.newaxis, ...]

                # Object Detection inference
                detections = detect_fn(input_tensor)
                boxes_norm  = detections['detection_boxes'][0].numpy()
                classes     = detections['detection_classes'][0].numpy().astype(np.int32)
                scores      = detections['detection_scores'][0].numpy()

                # Semantic segmentation inference
                seg_input = tf.image.convert_image_dtype(rgb_frame, tf.uint8)[tf.newaxis, ...]
                seg_output = seg_model(seg_input)['default']  # presumably [1, H, W, num_classes] — TODO confirm
                seg_map = tf.argmax(seg_output, axis=-1)[0].numpy().astype(np.uint8)
                # The map is sliced with frame-space pixel boxes below, so bring
                # it to the frame resolution if the model returned another size.
                if seg_map.shape[:2] != (height, width):
                    seg_map = cv2.resize(seg_map, (width, height), interpolation=cv2.INTER_NEAREST)

                dets_for_tracker = []
                det_meta = []
                for box_norm, score, raw_cls in zip(boxes_norm, scores, classes):
                    conf = float(score)
                    if conf < CONFIDENCE_THRESHOLD:
                        continue

                    # convert normalized (ymin, xmin, ymax, xmax) to pixel
                    # coordinates clamped to the frame
                    ny1, nx1, ny2, nx2 = box_norm
                    x1 = min(max(int(nx1 * width), 0), width)
                    y1 = min(max(int(ny1 * height), 0), height)
                    x2 = min(max(int(nx2 * width), 0), width)
                    y2 = min(max(int(ny2 * height), 0), height)
                    if x2 <= x1 or y2 <= y1:
                        # degenerate box: the crop below would be empty
                        continue
                    cls_id = int(raw_cls)
                    name = category_index[cls_id]['name']

                    # crop semantic mask to detection box and binarize
                    # (any non-zero class label counts as foreground)
                    crop_mask = (seg_map[y1:y2, x1:x2] > 0).astype(np.uint8) * 255
                    contours, _ = cv2.findContours(crop_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                    # shift contour points by (x1,y1) back into frame coordinates
                    poly = []
                    if contours:
                        cnt = max(contours, key=cv2.contourArea)
                        cnt = cnt.reshape(-1, 2) + np.array([x1, y1])
                        poly = cnt.tolist()

                    # tracker input format: ([left, top, width, height], confidence, class)
                    dets_for_tracker.append([[x1, y1, x2-x1, y2-y1], conf, name])
                    det_meta.append({
                        'bbox': (x1, y1, x2, y2),
                        'MID':   f"/m/{cls_id:07d}",
                        'object_name': name,
                        'object_category': 'unknown',
                        'mask_poly': poly,
                        'confidence': conf,
                    })

                # update tracker (also called with an empty list so lost tracks age out)
                tracks = tracker.update_tracks(dets_for_tracker, frame=frame)
                if det_meta:
                    for trk in tracks:
                        if not trk.is_confirmed():
                            continue
                        tx1, ty1, tx2, ty2 = trk.to_tlbr()
                        track_id = trk.track_id

                        # pair the track with the current detection it overlaps most
                        best_iou, best = max(
                            ((compute_iou((tx1, ty1, tx2, ty2), m['bbox']), m) for m in det_meta),
                            key=lambda x: x[0]
                        )
                        # tracks whose matched detection has no polygon are neither
                        # drawn nor recorded
                        if best_iou > 0.3 and best['mask_poly']:
                            pts = np.array(best['mask_poly'], np.int32)
                            cv2.polylines(frame, [pts], isClosed=True,
                                          color=MASK_COLOR, thickness=MASK_THICKNESS)
                            # plain ints avoid passing numpy scalars to OpenCV as a point
                            label_pos = tuple(int(v) for v in pts[0])
                            cv2.putText(frame, best['object_name'], label_pos, TEXT_FONT,
                                        TEXT_SCALE, TEXT_COLOR, TEXT_THICKNESS, cv2.LINE_AA)

                            annotations.append({
                                'video_filename':  video_file,
                                'frame_id':        frame_id,
                                'track_id':        f"{video_name}_{track_id}",
                                'object_id':       f"{frame_id}_obj{track_id}",
                                'timestamp_sec':   timestamp_sec,
                                'image_width_px':  width,
                                'image_height_px': height,
                                'MID':             best['MID'],
                                'object_name':     best['object_name'],
                                'object_category': best['object_category'],
                                'x_min':           int(tx1),
                                'y_min':           int(ty1),
                                'x_max':           int(tx2),
                                'y_max':           int(ty2),
                                'segmentation_mask': json.dumps([best['mask_poly']]),
                                'confidence':      best['confidence'],
                                'interaction_score': 0.0
                            })

            # write processed frame to output video
            writer.write(frame)
            frame_num += 1
    finally:
        # release the handles even if inference or drawing raised
        cap.release()
        writer.release()

# write CSV annotations
out_csv = OUTPUT_FOLDER / 'draft_annotations.csv'
pd.DataFrame(annotations).to_csv(out_csv, index=False)
print(f"Saved annotated video(s) to {OUTPUT_FOLDER}")
print(f"Saved annotations to {out_csv}")


Collecting tensorflow-object-detection-api
  Downloading tensorflow_object_detection_api-0.1.1.tar.gz (577 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m577.4/577.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jupyter (from tensorflow-object-detection-api)
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting contextlib2 (from tensorflow-object-detection-api)
  Downloading contextlib2-21.6.0-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting twine (from tensorflow-object-detection-api)
  Downloading twine-6.1.0-py3-none-any.whl.metadata (3.7 kB)
Collecting jupyterlab (from jupyter->tensorflow-object-detection-api)
  Downloading jupyterlab-4.4.4-py3-none-any.whl.metadata (16 kB)
Collecting readme-renderer>=35.0 (from twine->tensorflow-object-detection-api)
  Downloading readme_renderer-44.0-py3-none-any.whl.metadata (2.8 kB)
Collecting rfc3986>=1.4.0 (from twine->tensorflow

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

## **D) MMDetection / MMTracking**

In [19]:
# Combine MMDetection + MMTracking + Mask Drawing
# === Install required packages (run once) ===
!pip install mmcv-full mmdet mmtrack opencv-python pandas

from google.colab import drive
import cv2
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from mmdet.apis import init_detector, inference_detector
from mmtrack.apis import init_model as init_mot, inference_mot

# === Parameters ===
VIDEO_FOLDER         = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Input")
OUTPUT_FOLDER        = Path("/content/drive/MyDrive/FreeFuse_Project/Videos/Output")
DET_CFG              = "/content/drive/MyDrive/FreeFuse_Project/configs/mmdet/mask_rcnn_r50_fpn_3x_coco.py"
DET_CKPT             = "/content/drive/MyDrive/FreeFuse_Project/checkpoints/mask_rcnn_r50_fpn_3x_coco.pth"
# NOTE(review): this file name says "bytertrack" while the checkpoint says
# "bytetrack" — probably a typo; confirm against the file actually on Drive.
MOT_CFG              = "/content/drive/MyDrive/FreeFuse_Project/configs/mmtrack/bytertrack_faster-rcnn_fpn_4e_mot17-private-half.py"
MOT_CKPT             = "/content/drive/MyDrive/FreeFuse_Project/checkpoints/bytetrack_faster-rcnn_fpn_mot17.pth"
CONFIDENCE_THRESHOLD = 0.5
DETECTION_INTERVAL   = 5      # analyze every Nth frame

# drawing settings
MASK_COLOR           = (0, 255, 0)    # BGR mask outline
MASK_THICKNESS       = 2              # mask polygon line thickness
TEXT_COLOR           = (255, 255, 255)# label color
TEXT_FONT            = cv2.FONT_HERSHEY_SIMPLEX
TEXT_SCALE           = 0.6
TEXT_THICKNESS       = 2

# === 1) Mount Google Drive ===
drive.mount('/content/drive')

# === 2) Initialize MMDetection & MMTracking ===
import torch  # local import: only needed here to pick the inference device

# Use the GPU when one is present; otherwise run on the CPU instead of
# failing on a hard-coded 'cuda:0'.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
det_model = init_detector(DET_CFG, DET_CKPT, device=DEVICE)
mot_model = init_mot(MOT_CFG, MOT_CKPT, device=DEVICE)

# utility to compute IoU
def compute_iou(boxA, boxB):
    """Return the IoU of two boxes, each unpacked as (x1, y1, x2, y2).

    Falls back to 0 when the union area is not greater than zero.
    """
    x1_a, y1_a, x2_a, y2_a = boxA
    x1_b, y1_b, x2_b, y2_b = boxB

    def _clamped_extent(low, high):
        # Length of an interval, or 0 when it is inverted (no overlap).
        return max(0, high - low)

    inter_w = _clamped_extent(max(x1_a, x1_b), min(x2_a, x2_b))
    inter_h = _clamped_extent(max(y1_a, y1_b), min(y2_a, y2_b))
    intersection = inter_w * inter_h

    size_a = (x2_a - x1_a) * (y2_a - y1_a)
    size_b = (x2_b - x1_b) * (y2_b - y1_b)
    union = size_a + size_b - intersection

    return intersection / union if union > 0 else 0

# Annotation rows accumulated over every processed video; exported as one CSV below.
annotations = []
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# processing order (and therefore the CSV row order) reproducible.
for video_file in sorted(os.listdir(VIDEO_FOLDER)):
    if not video_file.lower().endswith(('.mp4','.mov','.avi')):
        continue
    cap = cv2.VideoCapture(str(VIDEO_FOLDER / video_file))
    if not cap.isOpened():
        print(f"Skipping {video_file}: could not open video")
        cap.release()
        continue

    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some containers report 0 (or NaN) fps; that would raise ZeroDivisionError
        # in the timestamp computation below. 30.0 is an arbitrary fallback.
        fps = 30.0
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_num = 0
    video_name = Path(video_file).stem

    # setup writer
    out_path = OUTPUT_FOLDER / video_file
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(str(out_path), fourcc, fps, (w, h))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_num % DETECTION_INTERVAL == 0:
                # NOTE(review): timestamps are whole seconds, so every analysed frame
                # within the same second shares one frame_id / object_id prefix.
                timestamp = int(frame_num / fps)
                frame_id = f"{video_name}_{timestamp:04d}"

                # 1) Detect & segment with MMDetection (2.x API: a
                #    (bbox_results, mask_results) tuple of per-class lists)
                det_results = inference_detector(det_model, frame)
                bbox_results, mask_results = det_results

                # 2) Track with MMTracking. Per the MMTracking 0.x API,
                #    inference_mot takes (model, img, frame_id) and returns a dict
                #    whose 'track_bboxes' entry is a per-class list of arrays with
                #    rows [track_id, x1, y1, x2, y2, score]. frame_id 0 (first
                #    frame of each video) makes the MOT model reset its tracker.
                mot_result = inference_mot(mot_model, frame, frame_id=frame_num)
                per_class_tracks = [
                    np.asarray(rows, dtype=float).reshape(-1, 6)
                    for rows in mot_result['track_bboxes']
                ]
                if per_class_tracks:
                    tracked = np.concatenate(per_class_tracks, axis=0)
                else:
                    tracked = np.zeros((0, 6), dtype=float)
                track_ids    = tracked[:, 0].astype(int)  # N ids
                track_bboxes = tracked[:, 1:5]            # N x (x1, y1, x2, y2)

                # 3) Draw masks per track
                for cls_id, bboxes in enumerate(bbox_results):
                    for i, bbox in enumerate(bboxes):
                        score = float(bbox[4])
                        if score < CONFIDENCE_THRESHOLD:
                            continue
                        x1, y1, x2, y2 = map(int, bbox[:4])

                        # find corresponding track ID by IoU
                        best_iou, best_idx = 0, -1
                        for idx, tb in enumerate(track_bboxes):
                            iou = compute_iou((x1, y1, x2, y2), tb)
                            if iou > best_iou:
                                best_iou, best_idx = iou, idx
                        if best_iou < 0.3:
                            # no track overlaps this detection well enough
                            continue
                        track_id = int(track_ids[best_idx])

                        # extract mask polygon (largest external contour)
                        mask = mask_results[cls_id][i]  # binary mask
                        mask_u8 = (mask.astype(np.uint8) * 255)
                        contours, _ = cv2.findContours(mask_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        if not contours:
                            continue
                        pts = max(contours, key=cv2.contourArea).reshape(-1, 2)

                        # draw; plain ints avoid passing numpy scalars to OpenCV as a point
                        cv2.polylines(frame, [pts], True, MASK_COLOR, MASK_THICKNESS)
                        label_pos = tuple(int(v) for v in pts[0])
                        cv2.putText(frame, det_model.CLASSES[cls_id], label_pos, TEXT_FONT,
                                    TEXT_SCALE, TEXT_COLOR, TEXT_THICKNESS, cv2.LINE_AA)

                        # record
                        annotations.append({
                            'video_filename': video_file,
                            'frame_id': frame_id,
                            'track_id': f"{video_name}_{track_id}",
                            'object_id': f"{frame_id}_obj{track_id}",
                            'timestamp_sec': timestamp,
                            'image_width_px': w,
                            'image_height_px': h,
                            'MID': f"/m/{cls_id:07d}",
                            'object_name': det_model.CLASSES[cls_id],
                            'object_category': 'unknown',
                            'x_min': x1, 'y_min': y1, 'x_max': x2, 'y_max': y2,
                            'segmentation_mask': json.dumps([pts.tolist()]),
                            'confidence': score,
                            'interaction_score': 0.0
                        })

            writer.write(frame)
            frame_num += 1
    finally:
        # release the handles even if inference or drawing raised
        cap.release()
        writer.release()

# export CSV
out_csv = OUTPUT_FOLDER / 'draft_annotations.csv'
pd.DataFrame(annotations).to_csv(out_csv, index=False)
print(f"Saved videos to {OUTPUT_FOLDER}")
print(f"Saved annotations to {out_csv}")

Collecting mmcv-full
  Downloading mmcv-full-1.7.2.tar.gz (607 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m607.9/607.9 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mmdet
  Downloading mmdet-3.3.0-py3-none-any.whl.metadata (29 kB)
Collecting mmtrack
  Downloading mmtrack-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting addict (from mmcv-full)
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Collecting yapf (from mmcv-full)
  Downloading yapf-0.43.0-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting terminaltables (from mmdet)
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting attributee (from mmtrack)
  Downloading attributee-0.1.9.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dotty-dict (from mmtrac

ModuleNotFoundError: No module named 'mmdet'