In [1]:
!pip install opencv-python tensorflow torch torchvision scipy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [11]:
from google.colab import files

# Open the Colab upload dialog; the result maps each uploaded file name to its contents.
uploaded = files.upload()

# Keep the name of the first uploaded file; video_path is None when nothing was uploaded.
video_path = next(iter(uploaded)) if uploaded else None

Saving car1.mp4 to car1.mp4


In [26]:
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from collections import defaultdict
from scipy.spatial.distance import cdist
from google.colab import files
import os # Import os for file path handling

# Define a simple Centroid Tracker class with improvements
class CentroidTracker:
    """Track objects across frames by matching detections to predicted centroids.

    Every detection is a sequence ``(centroid_x, centroid_y, xmin, ymin, xmax, ymax)``.
    For each tracked object the tracker keeps a short history of centroids and
    boxes, extrapolates the next centroid from that history, and scores every
    (object, detection) pair with a weighted blend of

    * the predicted-centroid distance, normalized by the largest distance seen
      in the current frame, and
    * the box-size dissimilarity ``1 - (min_w * min_h) / (max_w * max_h)``.

    Each object then claims its lowest-scoring detection (objects are visited in
    registration order, first claim wins) when the score is below
    ``match_threshold``. Unclaimed detections become new objects; objects without
    a match accumulate a miss counter and are dropped once it exceeds
    ``max_frames_to_miss``.

    Args:
        max_dissimilarity: Stored on the instance but not consulted by the
            matching logic; kept so existing constructor calls keep working.
        max_frames_to_miss: Number of consecutive missed frames tolerated before
            an object is deregistered.
        history_size: Number of most recent centroids/boxes kept per object and
            the window used for velocity averaging. Values below 2 disable
            velocity prediction (the last centroid is used as the prediction).
        match_threshold: Upper bound (exclusive) on the combined score for a
            match to be accepted. NOTE(review): with non-degenerate boxes the
            combined score stays within [0, 1], so the default of 1.2 accepts
            every nearest candidate; lower it to actually gate matches.
    """

    # Weights used to blend normalized centroid distance with box-size dissimilarity.
    CENTROID_WEIGHT = 0.7
    BOX_WEIGHT = 0.3

    def __init__(self, max_dissimilarity=0.8, max_frames_to_miss=15, history_size=5,
                 match_threshold=1.2):
        self.next_object_id = 0
        self.objects = defaultdict(list)       # object ID -> recent centroids (oldest first)
        self.object_boxes = defaultdict(list)  # object ID -> recent boxes (oldest first)
        self.disappeared = defaultdict(int)    # object ID -> consecutive missed frames
        self.max_dissimilarity = max_dissimilarity
        self.max_frames_to_miss = max_frames_to_miss
        self.history_size = history_size
        self.match_threshold = match_threshold

    def register(self, detection):
        """Start tracking ``detection`` as a new object and return its ID.

        ``detection`` is ``(centroid_x, centroid_y, xmin, ymin, xmax, ymax)``.
        """
        object_id = self.next_object_id
        self.objects[object_id].append((detection[0], detection[1]))
        self.object_boxes[object_id].append(
            (detection[2], detection[3], detection[4], detection[5])
        )
        self.disappeared[object_id] = 0
        self.next_object_id += 1
        return object_id

    def deregister(self, object_id):
        """Forget ``object_id`` and all history recorded for it."""
        del self.objects[object_id]
        del self.object_boxes[object_id]
        del self.disappeared[object_id]

    def _snapshot(self):
        """Return ``{object_id: {'centroid': ..., 'box': ...}}`` with the latest entries."""
        return {
            object_id: {
                'centroid': centroids[-1],
                'box': self.object_boxes[object_id][-1],
            }
            for object_id, centroids in self.objects.items()
        }

    def _mark_missed(self, object_id):
        """Count one missed frame for ``object_id`` and drop it once over the limit."""
        self.disappeared[object_id] += 1
        if self.disappeared[object_id] > self.max_frames_to_miss:
            self.deregister(object_id)

    def _predict_centroid(self, history):
        """Extrapolate the next centroid from ``history`` (oldest first).

        With a full window of ``history_size`` (>= 2) entries the average
        per-frame velocity over that window is used; the sum of consecutive
        differences telescopes to ``last - first``. With a shorter history of at
        least two entries the last displacement is used. Otherwise the last
        centroid itself is returned.
        """
        if self.history_size >= 2 and len(history) >= self.history_size:
            steps = self.history_size - 1
        elif len(history) > 1:
            steps = 1
        else:
            # Also covers history_size < 2, where an averaging window would
            # contain no displacement at all (previously a division by zero).
            return history[-1]

        last_x, last_y = history[-1][0], history[-1][1]
        reference = history[-(steps + 1)]
        return (
            last_x + (last_x - reference[0]) / steps,
            last_y + (last_y - reference[1]) / steps,
        )

    def update(self, detections):
        """Advance the tracker by one frame.

        Args:
            detections: Sequence of
                ``(centroid_x, centroid_y, xmin, ymin, xmax, ymax)`` entries for
                the current frame; may be empty.

        Returns:
            dict mapping every currently tracked object ID to
            ``{'centroid': latest_centroid, 'box': latest_box}``.
        """
        if not self.objects:
            # Nothing is tracked yet: every detection starts a new track.
            for detection in detections:
                self.register(detection)
            return self._snapshot()

        # Copy the IDs: tracks may be deregistered while we iterate.
        object_ids = list(self.objects.keys())

        if len(detections) == 0:
            for object_id in object_ids:
                self._mark_missed(object_id)
            return self._snapshot()

        detection_centroids = np.array([(d[0], d[1]) for d in detections])
        detection_boxes = np.array([(d[2], d[3], d[4], d[5]) for d in detections])

        # Distance between where each track is expected to be and each detection.
        predicted_centroids = np.array(
            [self._predict_centroid(self.objects[object_id]) for object_id in object_ids]
        )
        centroid_distances = cdist(predicted_centroids, detection_centroids)

        # Box-size similarity (min_w * min_h) / (max_w * max_h), computed for all
        # (track, detection) pairs at once via broadcasting.
        latest_boxes = np.array(
            [self.object_boxes[object_id][-1] for object_id in object_ids], dtype=float
        )
        candidate_boxes = detection_boxes.astype(float)
        object_widths = latest_boxes[:, 2] - latest_boxes[:, 0]
        object_heights = latest_boxes[:, 3] - latest_boxes[:, 1]
        detection_widths = candidate_boxes[:, 2] - candidate_boxes[:, 0]
        detection_heights = candidate_boxes[:, 3] - candidate_boxes[:, 1]

        min_area = (
            np.minimum(object_widths[:, None], detection_widths[None, :])
            * np.minimum(object_heights[:, None], detection_heights[None, :])
        )
        max_area = (
            np.maximum(object_widths[:, None], detection_widths[None, :])
            * np.maximum(object_heights[:, None], detection_heights[None, :])
        )
        # Pairs whose larger box has no area get similarity 0 instead of dividing by zero.
        box_similarity = np.divide(
            min_area, max_area, out=np.zeros_like(max_area), where=max_area > 0
        )

        # Distances are scaled by the largest distance in this frame; the epsilon
        # keeps the division defined when every distance is zero.
        normalized_distances = centroid_distances / (centroid_distances.max() + 1e-6)
        combined_dissimilarity = (
            self.CENTROID_WEIGHT * normalized_distances
            + self.BOX_WEIGHT * (1 - box_similarity)
        )

        best_scores = np.min(combined_dissimilarity, axis=1)
        best_detections = np.argmin(combined_dissimilarity, axis=1)

        # A history length below 1 would make the trimming slice keep everything.
        keep = max(1, self.history_size)
        used_detections = set()

        # Greedy assignment: tracks are visited in registration order and a
        # detection can be claimed only once; a track whose preferred detection
        # is already taken counts as missed for this frame.
        for row, object_id in enumerate(object_ids):
            detection_index = int(best_detections[row])
            if best_scores[row] < self.match_threshold and detection_index not in used_detections:
                self.objects[object_id].append(detection_centroids[detection_index])
                self.object_boxes[object_id].append(detection_boxes[detection_index])
                self.objects[object_id] = self.objects[object_id][-keep:]
                self.object_boxes[object_id] = self.object_boxes[object_id][-keep:]
                self.disappeared[object_id] = 0
                used_detections.add(detection_index)
            else:
                self._mark_missed(object_id)

        # Detections nobody claimed become new tracks.
        for detection_index, detection in enumerate(detections):
            if detection_index not in used_detections:
                self.register(detection)

        return self._snapshot()

# --- COCO Class IDs Mapping ---
# This dictionary maps common object names to their COCO dataset class IDs.
# The SSD MobileNet V2 model is trained on this dataset.
# Class ID -> label, in ascending ID order. The ID sequence has gaps
# (e.g. 12, 26, 29, 30, 45, 66, 68, 69, 71, 83 are not present).
COCO_CLASS_NAMES = {
    # people
    1: 'person',
    # vehicles
    2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane',
    6: 'bus', 7: 'train', 8: 'truck', 9: 'boat',
    # street furniture
    10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign',
    14: 'parking meter', 15: 'bench',
    # animals
    16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep',
    21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe',
    # accessories
    27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase',
    # sports equipment
    34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite',
    39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard',
    42: 'surfboard', 43: 'tennis racket',
    # kitchenware
    44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork',
    49: 'knife', 50: 'spoon', 51: 'bowl',
    # food
    52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli',
    57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake',
    # furniture
    62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed',
    67: 'dining table', 70: 'toilet',
    # electronics
    72: 'tv', 73: 'laptop', 74: 'mouse', 75: 'remote',
    76: 'keyboard', 77: 'cell phone',
    # appliances
    78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator',
    # other indoor items
    84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors',
    88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush',
}

# Reverse lookup: label -> class ID.
COCO_CLASS_IDS = {label: class_id for class_id, label in COCO_CLASS_NAMES.items()}

# --- File Upload (Colab Specific) ---
# Upload your video file here. For example, upload 'video.mp4'.
print("Please upload your video file (e.g., video.mp4).")
uploaded = files.upload()
if not uploaded:
    # An empty result means no file was received; fail with a clear message
    # instead of an IndexError on the lookup below.
    raise RuntimeError("Error: No video file was uploaded.")
video_input_path = list(uploaded.keys())[0]
print(f"Video '{video_input_path}' uploaded.")

# Define the output video path and codec
output_video_path = '/content/output_tracked_combined.mp4' # Generic output name
fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for .mp4

# --- CONFIGURE YOUR TARGET OBJECTS HERE ---
# Choose which objects you want to track by their names.
# For example, to track 'person' and 'car':
# target_object_names = ['person', 'car']
# To track 'bus' and 'truck':
# target_object_names = ['bus', 'truck']
# To track all common vehicles:
target_object_names = ['person', 'car', 'bus', 'truck', 'motorcycle', 'bicycle'] # Example: track multiple vehicles and people

# Detections scoring at or below this confidence are ignored
confidence_threshold = 0.5

# Drawing style for boxes and labels (constant for the whole run)
color = (0, 255, 0) # Green color
thickness = 2
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.5
font_thickness = 1

# Re-initialize video capture
cap = cv2.VideoCapture(video_input_path)
out = None
frame_index = 0

try:
    # Raise instead of calling exit(): inside a notebook exit() is aimed at the
    # kernel rather than at this cell, and it would skip the cleanup below.
    if not cap.isOpened():
        raise RuntimeError("Error: Could not open video file.")

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the frame rate as a float: truncating e.g. 23.976 to 23 makes the
    # written video play slower than the source.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # NOTE(review): fallback for sources that report no usable frame rate;
        # 30.0 is an arbitrary default - adjust if the source rate is known.
        fps = 30.0
    print(f"Video opened successfully. Dimensions: {frame_width}x{frame_height}, FPS: {fps:.2f}")

    # Create VideoWriter object
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
    if not out.isOpened():
        raise RuntimeError(f"Error: Could not open '{output_video_path}' for writing.")

    # Load the pre-trained object detection model from TensorFlow Hub,
    # reusing a detector left over from an earlier run of this cell.
    if 'detector' not in globals():
        print("Loading object detection model...")
        model_url = 'https://tfhub.dev/tensorflow/ssd_mobilenet_v2/fpnlite_320x320/1'
        detector = hub.load(model_url)
        print("Model loaded.")

    # Convert chosen names to their corresponding COCO class IDs
    target_class_ids = set()
    for name in target_object_names:
        if name in COCO_CLASS_IDS:
            target_class_ids.add(COCO_CLASS_IDS[name])
        else:
            print(f"Warning: Class name '{name}' not found in COCO_CLASS_NAMES. It will not be tracked.")

    if not target_class_ids:
        raise RuntimeError("Error: No valid target object names provided or mapped to class IDs. Please check 'target_object_names'.")

    print(f"Will track objects with class IDs: {target_class_ids} ({target_object_names})")

    # Initialize the tracker with improved parameters
    tracker = CentroidTracker(max_dissimilarity=0.8, max_frames_to_miss=15)

    print("Starting video processing...")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_height_current, frame_width_current = frame.shape[:2]

        # OpenCV decodes frames in BGR channel order; convert to RGB for the
        # detector and keep the BGR frame for drawing/writing.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Convert frame to TensorFlow tensor and add a batch dimension
        input_tensor = tf.convert_to_tensor(rgb_frame, dtype=tf.uint8)[tf.newaxis, ...]

        # Run inference with device placement (using CPU as before)
        with tf.device('/CPU:0'):
            detections = detector(input_tensor)

        # Extract relevant detection information (bounding boxes, scores, classes)
        scores = detections['detection_scores'][0].numpy()
        boxes = detections['detection_boxes'][0].numpy()
        classes = detections['detection_classes'][0].numpy()

        # Filter detections based on the confidence threshold and selected target class IDs
        filtered_detections = []
        detected_class_names_in_frame = {} # filtered-detection index -> class name
        for score, box, raw_class in zip(scores, boxes, classes):
            class_id = int(raw_class)
            if score > confidence_threshold and class_id in target_class_ids:
                # Model box order is (ymin, xmin, ymax, xmax), unpacked here as
                # fractions that are scaled by the frame size when drawing.
                ymin, xmin, ymax, xmax = box
                centroid_x = (xmin + xmax) / 2
                centroid_y = (ymin + ymax) / 2
                # Store detection as (centroid_x, centroid_y, xmin, ymin, xmax, ymax)
                filtered_detections.append((centroid_x, centroid_y, xmin, ymin, xmax, ymax))

                # Store class name for display later
                detected_class_names_in_frame[len(filtered_detections) - 1] = COCO_CLASS_NAMES.get(
                    class_id, f"Unknown_{class_id}"
                )

        # Update the tracker with the filtered detections
        current_tracked_objects_state = tracker.update(filtered_detections)

        # Draw a box and ID label for every tracked object
        for obj_id, obj_state in current_tracked_objects_state.items():
            xmin_norm, ymin_norm, xmax_norm, ymax_norm = obj_state['box']

            # Scale bounding box coordinates back to frame dimensions
            xmin = int(xmin_norm * frame_width_current)
            ymin = int(ymin_norm * frame_height_current)
            xmax = int(xmax_norm * frame_width_current)
            ymax = int(ymax_norm * frame_height_current)

            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, thickness)

            # The tracker does not keep the class with each object, so only the ID is shown.
            text = f"ID: {obj_id}"

            text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness)
            text_x = xmin
            text_y = ymin - 10 # Position text slightly above the box
            if text_y < 10: # Ensure text is not drawn out of bounds upwards
                text_y = ymin + text_size[1] + 5

            cv2.putText(frame, text, (text_x, text_y), font, font_scale, color, font_thickness)

        # Write the annotated frame to the output video
        out.write(frame)

        frame_index += 1

    print(f"Finished processing {frame_index} frames.")
finally:
    # Release resources even when processing stops with an error
    cap.release()
    if out is not None:
        out.release()

print(f"Finished processing and saved the output video to {output_video_path}")

# Download the output video (Colab specific)
files.download(output_video_path)

Please upload your video file (e.g., video.mp4).


Saving car1.mp4 to car1 (6).mp4
Video 'car1 (6).mp4' uploaded.
Video opened successfully. Dimensions: 854x480, FPS: 23
Will track objects with class IDs: {1, 2, 3, 4, 6, 8} (['person', 'car', 'bus', 'truck', 'motorcycle', 'bicycle'])
Starting video processing...
Finished processing 299 frames.
Finished processing and saved the output video to /content/output_tracked_combined.mp4


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>