In [1]:
from google.colab import files

# Upload the video file from the local machine into the Colab working directory.
# NOTE(review): a later cell hard-codes 'street_scene.mp4' as the video path, so the
# uploaded file is expected to carry exactly that name.
uploaded = files.upload()

Saving street_scene.mp4 to street_scene.mp4


In [2]:
import torch
import torchvision
import cv2
import os

# Select the compute device first: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the pre-trained Faster R-CNN, place it on the selected device and switch it
# to evaluation (inference) mode in one chained expression.
# NOTE(review): `pretrained=True` is reported as deprecated by newer torchvision
# releases in favour of `weights=...` — confirm against the installed version.
model = (
    torchvision.models.detection
    .fasterrcnn_resnet50_fpn(pretrained=True)
    .to(device)
    .eval()
)

# COCO category names indexed by the label id the torchvision detection models emit.
# The id space has 91 slots (0 = background, 1..90 = COCO category ids); ids that COCO
# never assigned are kept as 'N/A' placeholders so that list index == predicted label.
# Dropping the placeholders shifts every name after 'fire hydrant' and makes the
# higher label ids fall outside the list.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

def detect_vehicles_and_draw_boxes(image, model, threshold=0.5):
    """Detect vehicles in a frame and draw labelled bounding boxes on it.

    Args:
        image: Frame as loaded by OpenCV; it is converted from BGR to RGB before
            inference. The boxes are drawn directly onto this array (in place).
        model: Detection model called as ``model(batch)[0]`` and expected to return
            a dict with ``'boxes'``, ``'labels'`` and ``'scores'`` entries.
        threshold: Detections with a score that is not strictly greater than this
            value are ignored.

    Returns:
        The same ``image`` object, annotated with a green rectangle and a
        ``"<label>: <score>"`` caption for every detected car, bus or truck.
    """
    vehicle_labels = ('car', 'bus', 'truck')
    box_color = (0, 255, 0)

    # Run on whichever device the given model lives on instead of relying on a
    # module-level `device` variable that may not match the model passed in.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    img_tensor = torch.from_numpy(img_rgb).float().permute(2, 0, 1).unsqueeze(0) / 255.0
    img_tensor = img_tensor.to(model_device)

    # Perform inference
    with torch.no_grad():
        predictions = model(img_tensor)[0]

    # No predictions: return the frame untouched
    if 'labels' not in predictions or len(predictions['labels']) == 0:
        return image

    # Move each result tensor to the host once rather than synchronising per
    # detection; `.tolist()` also yields plain Python numbers for OpenCV.
    labels = predictions['labels'].cpu().tolist()
    scores = predictions['scores'].cpu().tolist()
    boxes = predictions['boxes'].cpu().numpy().astype(int).tolist()

    # zip() stops at the shortest sequence, so mismatched lengths cannot index
    # out of range.
    for label_idx, score, box in zip(labels, scores, boxes):
        if not score > threshold:
            continue

        # Safeguard: skip label ids the category table cannot name
        if not 0 <= label_idx < len(COCO_INSTANCE_CATEGORY_NAMES):
            print(f"Warning: label_idx {label_idx} out of range for frame.")
            continue

        label = COCO_INSTANCE_CATEGORY_NAMES[label_idx]
        if label not in vehicle_labels:
            continue

        x1, y1, x2, y2 = box
        cv2.rectangle(image, (x1, y1), (x2, y2), box_color, 2)

        # Put the caption above the box, or just inside it when the box touches
        # the top edge and the caption would otherwise be drawn off-frame.
        text_y = y1 - 10 if y1 - 10 > 10 else y1 + 15
        cv2.putText(image, f'{label}: {score:.2f}', (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

    return image

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:03<00:00, 50.1MB/s]


In [3]:
import cv2
import os

# Path to the uploaded video file
video_path = 'street_scene.mp4'

# Create directories to store the extracted frames and the annotated frames
frames_dir = 'frames'
output_frames_dir = 'output_frames'
os.makedirs(frames_dir, exist_ok=True)
os.makedirs(output_frames_dir, exist_ok=True)

# Split video into frames
vidcap = cv2.VideoCapture(video_path)
if not vidcap.isOpened():
    # Fail here with a clear message instead of "extracting" zero frames and
    # crashing later when the frame list turns out to be empty.
    raise FileNotFoundError(f"Could not open video file: {video_path}")

count = 0
try:
    while True:
        success, image = vidcap.read()
        if not success:
            break  # End of stream (or a decode failure)

        frame_path = os.path.join(frames_dir, f"frame{count:04d}.jpg")
        # cv2.imwrite reports failure through its return value, not an exception
        if not cv2.imwrite(frame_path, image):
            raise IOError(f"Could not write frame to {frame_path}")
        count += 1
finally:
    # Always free the decoder/file handle, even if writing a frame failed
    vidcap.release()

print(f"Extracted {count} frames from the video.")

# Process frames (detect vehicles and draw bounding boxes).
# Frame names are rebuilt from `count` rather than taken from os.listdir(): the
# directory listing is unordered and would also pick up leftovers from earlier runs
# or non-image entries, which cv2.imread cannot decode.
for frame_index in range(count):
    frame_file = f"frame{frame_index:04d}.jpg"
    frame_path = os.path.join(frames_dir, frame_file)
    output_frame_path = os.path.join(output_frames_dir, frame_file)

    frame = cv2.imread(frame_path)
    if frame is None:
        # cv2.imread returns None (no exception) for missing or unreadable files
        print(f"Warning: could not read {frame_path}; skipping.")
        continue

    output_frame = detect_vehicles_and_draw_boxes(frame, model)
    cv2.imwrite(output_frame_path, output_frame)

print("Processed all frames and saved them with bounding boxes in 'output_frames'.")

# Recompile the processed frames into a video
output_video_path = 'output_street_scene.mp4'

# Use exactly the frames extracted in this run, in numeric order. Listing the
# directory would include stale files from earlier runs and sorts lexically,
# which misorders frames once the index outgrows its four-digit padding.
frame_files = [f"frame{frame_index:04d}.jpg" for frame_index in range(count)]
if not frame_files:
    raise RuntimeError("No frames available to build the output video.")

# Get frame size
frame_sample = cv2.imread(os.path.join(output_frames_dir, frame_files[0]))
if frame_sample is None:
    raise RuntimeError(
        f"Could not read {os.path.join(output_frames_dir, frame_files[0])} to determine the frame size."
    )
height, width, _ = frame_sample.shape

# Reuse the source video's frame rate so playback speed matches the input;
# fall back to 20 fps when the container does not report a usable value.
source_capture = cv2.VideoCapture(video_path)
source_fps = source_capture.get(cv2.CAP_PROP_FPS)
source_capture.release()
fps = source_fps if source_fps and source_fps > 0 else 20.0

# Initialize video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
if not out.isOpened():
    raise RuntimeError(f"Could not open video writer for {output_video_path}")

# Write processed frames to video
try:
    for frame_file in frame_files:
        frame_path = os.path.join(output_frames_dir, frame_file)
        frame = cv2.imread(frame_path)
        if frame is None:
            # cv2.imread signals failure with None; skip rather than crash the writer
            print(f"Warning: could not read {frame_path}; skipping.")
            continue
        out.write(frame)
finally:
    # Finalise the container even if a frame fails, so the file is not left corrupt
    out.release()

print(f"Output video saved as {output_video_path}")

Extracted 452 frames from the video.
Processed all frames and saved them with bounding boxes in 'output_frames'.
Output video saved as output_street_scene.mp4
