# **Import Required Libraries**

In [1]:
!pip install mediapipe opencv-python torch torchvision torchaudio timm
!pip install matplotlib

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidi

In [2]:
import torch
import cv2
import mediapipe as mp
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# **Model Initialization**

In [3]:
midas = torch.hub.load('intel-isl/MiDaS', 'MiDaS_small')
midas_transforms = torch.hub.load('intel-isl/MiDaS', 'transforms')
transform = midas_transforms.small_transform

Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /root/.cache/torch/hub/master.zip


Loading weights:  None


Downloading: "https://github.com/rwightman/gen-efficientnet-pytorch/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_lite3-b733e338.pth
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt" to /root/.cache/torch/hub/checkpoints/midas_v21_small_256.pt
100%|██████████| 81.8M/81.8M [00:00<00:00, 137MB/s]
Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


In [4]:
# Set MiDaS to evaluation mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
midas.to(device)
midas.eval()

MidasNet_small(
  (pretrained): Module(
    (layer1): Sequential(
      (0): Conv2dSameExport(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
      (1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
      (3): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act1): ReLU6(inplace=True)
          (se): Identity()
          (conv_pw): Conv2d(32, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act2): Identity()
        )
      )
      (4): Sequential(
        (0): InvertedResidual(
          (conv_pw): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(144,

In [10]:
# Initialize MediaPipe BlazePose GHUM
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, model_complexity=2, enable_segmentation=False, smooth_landmarks=True)


Downloading model to /usr/local/lib/python3.11/dist-packages/mediapipe/modules/pose_landmark/pose_landmark_heavy.tflite


In [11]:
video_path = '/content/Kirolos_video.mp4'
cap = cv2.VideoCapture(video_path)

In [12]:
# Check if the video was opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Define output video writer
out = cv2.VideoWriter('output_with_3d_and_depth.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))


In [13]:
# Process video frames
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Convert frame to RGB for MediaPipe
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = pose.process(image_rgb)

    # Prepare frame for MiDaS depth estimation
    img_midas = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img_midas = transform(img_midas)
    if img_midas.dim() == 3:  # Ensure the input has batch dimension
        img_midas = img_midas.unsqueeze(0)
    img_midas = img_midas.to(device)

    # Perform depth estimation
    with torch.no_grad():
        prediction = midas(img_midas)
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=frame.shape[:2],
            mode='bicubic',
            align_corners=False
        ).squeeze()
    depth_map = prediction.cpu().numpy()

    # Normalize depth map for visualization
    depth_map_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    depth_colored = cv2.applyColorMap(depth_map_normalized, cv2.COLORMAP_MAGMA)

    if results.pose_world_landmarks and results.pose_landmarks:
        world_landmarks = results.pose_world_landmarks.landmark
        image_landmarks = results.pose_landmarks.landmark

        # Get 3D coordinates of left and right heels
        left_heel_3d = world_landmarks[mp_pose.PoseLandmark.LEFT_HEEL]
        right_heel_3d = world_landmarks[mp_pose.PoseLandmark.RIGHT_HEEL]

        # Convert 3D landmarks to numpy arrays
        left_coords = np.array([left_heel_3d.x, left_heel_3d.y, left_heel_3d.z])
        right_coords = np.array([right_heel_3d.x, right_heel_3d.y, right_heel_3d.z])

        # Calculate 3D Euclidean distance
        distance_3d_meters = np.linalg.norm(left_coords - right_coords)

        # Retrieve 2D coordinates for annotation
        left_heel_2d = image_landmarks[mp_pose.PoseLandmark.LEFT_HEEL]
        right_heel_2d = image_landmarks[mp_pose.PoseLandmark.RIGHT_HEEL]

        left_heel_coords_2d = (int(left_heel_2d.x * width), int(left_heel_2d.y * height))
        right_heel_coords_2d = (int(right_heel_2d.x * width), int(right_heel_2d.y * height))

        # Annotate frame with distance and keypoints
        annotated_frame = frame.copy()
        cv2.line(annotated_frame, left_heel_coords_2d, right_heel_coords_2d, (0, 255, 0), 3)
        cv2.putText(annotated_frame, f'Distance: {distance_3d_meters:.2f} m', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.circle(annotated_frame, left_heel_coords_2d, 5, (255, 0, 0), -1)
        cv2.circle(annotated_frame, right_heel_coords_2d, 5, (255, 0, 0), -1)

        # Overlay the depth map onto the frame
        overlay_combined = cv2.addWeighted(annotated_frame, 0.6, depth_colored, 0.4, 0)

        # Write the annotated frame to the output video
        out.write(overlay_combined)
    else:
        # If no pose is detected, write the frame with depth map
        out.write(cv2.addWeighted(frame, 0.6, depth_colored, 0.4, 0))

# Release resources
cap.release()
out.release()
print("Processing complete. Output saved as 'output_with_3d_and_depth.mp4'")


Processing complete. Output saved as 'output_with_3d_and_depth.mp4'
