In [14]:
# !pip install torch torchvision opencv-python
# !pip install pandas
# !pip install requests
# !pip install --upgrade numpy

In [17]:
import torch
import cv2
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor

# Load YOLOv5 model
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Load MiDaS model
midas_model = torch.hub.load('intel-isl/MiDaS', 'MiDaS_small')
midas_transform = torch.hub.load('intel-isl/MiDaS', 'transforms').small_transform

# Function to estimate depth
def estimate_depth(image):

    input_image = midas_transform(image).unsqueeze(0)
    input_image = input_image.squeeze(1)  # Removes the second dimension
    with torch.no_grad():
        depth = midas_model(input_image)
    depth = torch.nn.functional.interpolate(
        depth.unsqueeze(1),
        size=image.shape[:2],
        mode='bicubic',
        align_corners=False
    ).squeeze()
    depth = depth.cpu().numpy()
    return depth

# Function to detect objects and estimate distances
def detect_objects_and_estimate_distances(image_path):
    # Load image
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Object detection
    results = yolo_model(image_rgb)
    detections = results.pandas().xyxy[0]

    # Depth estimation
    depth_map = estimate_depth(image_rgb)

    # Compile results
    objects_with_distances = []
    for _, row in detections.iterrows():
        xmin, ymin, xmax, ymax = map(int, [row['xmin'], row['ymin'], row['xmax'], row['ymax']])
        object_name = row['name']

        # Extract depth values within the bounding box
        object_depth = depth_map[ymin:ymax, xmin:xmax]
        median_depth = np.median(object_depth)

        # Append result
        objects_with_distances.append({
            "name": object_name,
            "distance": f"{median_depth:.2f}m"
        })

    return objects_with_distances

image_path = 'test3.png'
results = detect_objects_and_estimate_distances(image_path)
for result in results:
    print(f"Object: {result['name']}, Distance: {result['distance']}")

Using cache found in /Users/bereket/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-12-16 Python-3.9.6 torch-2.5.1 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
Using cache found in /Users/bereket/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /Users/bereket/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Loading weights:  None


Using cache found in /Users/bereket/.cache/torch/hub/intel-isl_MiDaS_master
  with amp.autocast(autocast):


Object: person, Distance: 235.74m
Object: person, Distance: 478.62m
Object: chair, Distance: 311.19m
Object: chair, Distance: 201.15m
Object: chair, Distance: 164.29m
Object: couch, Distance: 600.03m
Object: chair, Distance: 155.03m
Object: chair, Distance: 137.15m
Object: person, Distance: 218.52m
Object: chair, Distance: 144.23m
Object: chair, Distance: 150.07m
Object: cup, Distance: 680.78m
Object: chair, Distance: 124.76m
Object: couch, Distance: 127.83m
Object: chair, Distance: 175.12m
Object: person, Distance: 175.52m
Object: chair, Distance: 125.59m
Object: dining table, Distance: 381.76m
Object: person, Distance: 235.02m


In [19]:
import torch
import cv2
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor

# Load YOLOv5 model
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Load MiDaS model
midas_model = torch.hub.load('intel-isl/MiDaS', 'MiDaS_small')
midas_transform = torch.hub.load('intel-isl/MiDaS', 'transforms').small_transform

# Function to estimate depth
def estimate_depth(image):
    input_image = midas_transform(image).unsqueeze(0)
    input_image = input_image.squeeze(1)  # Removes the second dimension
    with torch.no_grad():
        depth = midas_model(input_image)
    depth = torch.nn.functional.interpolate(
        depth.unsqueeze(1),
        size=image.shape[:2],
        mode='bicubic',
        align_corners=False
    ).squeeze()
    depth = depth.cpu().numpy()
    return depth

# Function to detect objects, estimate distances, and annotate image
def detect_and_annotate(image_path, output_path):
    # Load image
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Object detection
    results = yolo_model(image_rgb)
    detections = results.pandas().xyxy[0]

    # Depth estimation
    depth_map = estimate_depth(image_rgb)

    # Annotate image with bounding boxes and labels
    for _, row in detections.iterrows():
        xmin, ymin, xmax, ymax = map(int, [row['xmin'], row['ymin'], row['xmax'], row['ymax']])
        object_name = row['name']

        # Extract depth values within the bounding box
        object_depth = depth_map[ymin:ymax, xmin:xmax]
        median_depth = np.median(object_depth)

        # Draw bounding box
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

        # Prepare label with object name and distance
        label = f"{object_name}: {median_depth:.2f}m"

        # Calculate label position
        label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        label_ymin = max(ymin, label_size[1] + 10)

        # Draw label background
        cv2.rectangle(image, (xmin, label_ymin - label_size[1] - 10), (xmin + label_size[0], label_ymin + 5), (0, 255, 0), cv2.FILLED)

        # Put label text above the bounding box
        cv2.putText(image, label, (xmin, label_ymin - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)

    # Save the annotated image
    cv2.imwrite(output_path, image)

# Example usage
image_path = 'test3.png'
output_path = 'annotated_test3.png'
detect_and_annotate(image_path, output_path)


Using cache found in /Users/bereket/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-12-16 Python-3.9.6 torch-2.5.1 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
Using cache found in /Users/bereket/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /Users/bereket/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Loading weights:  None


Using cache found in /Users/bereket/.cache/torch/hub/intel-isl_MiDaS_master
  with amp.autocast(autocast):
