In [1]:
!apt-get update
!apt-get install -y libosmesa6-dev libgl1-mesa-glx libglfw3
!pip install opencv-python-headless mediapipe==0.10.21 open3d==0.19.0 torch torchvision plotly
!pip install -U kaleido


Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,723 kB]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,369 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-sec

In [2]:
import cv2
import torch
import numpy as np
import mediapipe as mp
import open3d as o3d
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from torchvision import transforms
from IPython.display import display
import time

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Enable cuDNN benchmarking for optimal performance
torch.backends.cudnn.benchmark = True


Using device: cpu


In [3]:
model_type = "MiDaS_small"
midas = torch.hub.load("intel-isl/MiDaS", model_type)
midas.to(device)
midas.eval()

midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = (midas_transforms.small_transform
             if model_type == "MiDaS_small"
             else midas_transforms.default_transform)


Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /root/.cache/torch/hub/master.zip


Loading weights:  None


Downloading: "https://github.com/rwightman/gen-efficientnet-pytorch/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth" to /root/.cache/torch/hub/checkpoints/tf_efficientnet_lite3-b733e338.pth
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt" to /root/.cache/torch/hub/checkpoints/midas_v21_small_256.pt
100%|██████████| 81.8M/81.8M [00:02<00:00, 33.2MB/s]
Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


In [4]:
mp_pose = mp.solutions.pose
pose_estimator = mp_pose.Pose(
    static_image_mode=True,       # set True since we're processing a single image
    model_complexity=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)
pose_connections = mp_pose.POSE_CONNECTIONS

def backproject(u, v, depth_value, fx, fy, ppx, ppy):
    """
    Back-project pixel (u, v) with depth_value (meters) to 3D.
    """
    z = depth_value
    x = (u - ppx) * z / fx
    y = (v - ppy) * z / fy
    return np.array([x, y, z])


In [5]:
mp_pose = mp.solutions.pose
pose_estimator = mp_pose.Pose(
    static_image_mode=True,       # set True since we're processing a single image
    model_complexity=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)
pose_connections = mp_pose.POSE_CONNECTIONS

def backproject(u, v, depth_value, fx, fy, ppx, ppy):
    """
    Back-project pixel (u, v) with depth_value (meters) to 3D.
    """
    z = depth_value
    x = (u - ppx) * z / fx
    y = (v - ppy) * z / fy
    return np.array([x, y, z])


In [6]:
video_path = "/content/demo.mp4"  # Replace with your video file path

cap = cv2.VideoCapture(video_path)

# Suppose we want the frame at 4 seconds (if short video) or 4 minutes (240 seconds). Adjust as needed.
target_sec = 4  # 4.0 seconds from start. For 4 minutes, set to 240
cap.set(cv2.CAP_PROP_POS_MSEC, target_sec * 1000)  # Position in milliseconds

ret, frame = cap.read()
cap.release()

if not ret:
    raise ValueError("Could not retrieve frame at the specified time.")

print("Frame extracted successfully at", target_sec, "seconds.")


Frame extracted successfully at 4 seconds.


In [7]:
# Downscale factor for speed (optional)
downscale_factor = 1.0  # e.g., 0.5 to half the resolution
max_depth_meters = 5.0  # approximate maximum scene depth

original_height, original_width, _ = frame.shape
proc_width = int(original_width * downscale_factor)
proc_height = int(original_height * downscale_factor)

# Resize frame if desired
frame_proc = cv2.resize(frame, (proc_width, proc_height), interpolation=cv2.INTER_AREA)
frame_rgb = cv2.cvtColor(frame_proc, cv2.COLOR_BGR2RGB)

# --- Depth Estimation (MiDaS) ---
with torch.no_grad():
    input_batch = transform(frame_rgb).to(device)
    prediction = midas(input_batch)
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=(proc_height, proc_width),
        mode="bilinear",
        align_corners=False
    ).squeeze()
torch.cuda.synchronize()

depth_map = prediction.cpu().numpy()
# Normalize depth to [0,1]
depth_map_norm = cv2.normalize(depth_map, None, 0, 1, norm_type=cv2.NORM_MINMAX)

# --- Create Open3D Point Cloud ---
o3d_color = o3d.geometry.Image(frame_rgb)
o3d_depth = o3d.geometry.Image((depth_map_norm * 1000).astype(np.uint16))
rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
    o3d_color, o3d_depth,
    depth_scale=1000.0,
    convert_rgb_to_intensity=False
)

# Approximate camera intrinsics
fx = fy = proc_width
ppx = proc_width / 2
ppy = proc_height / 2

intrinsic = o3d.camera.PinholeCameraIntrinsic(proc_width, proc_height, fx, fy, ppx, ppy)
pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd, intrinsic)
# Adjust orientation
pcd.transform([[1, 0, 0, 0],
               [0, -1, 0, 0],
               [0, 0, -1, 0],
               [0, 0, 0, 1]])

# --- Pose Estimation (MediaPipe) ---
results = pose_estimator.process(frame_rgb)
keypoints_3d = []
if results.pose_landmarks:
    for landmark in results.pose_landmarks.landmark:
        u = int(landmark.x * proc_width)
        v = int(landmark.y * proc_height)
        u_clamped = np.clip(u, 0, proc_width - 1)
        v_clamped = np.clip(v, 0, proc_height - 1)
        # Relative depth at that pixel
        depth_val = depth_map_norm[v_clamped, u_clamped]
        # Convert to approximate meters
        depth_in_meters = depth_val * max_depth_meters
        keypoints_3d.append(backproject(u, v, depth_in_meters, fx, fy, ppx, ppy))
else:
    # If no landmarks, create dummy array
    keypoints_3d = [np.array([0, 0, 0]) for _ in range(33)]
keypoints_3d = np.array(keypoints_3d)

print("Depth estimation and pose detection complete.")


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
pts = np.asarray(pcd.points)
if len(pcd.colors) > 0:
    colors = np.asarray(pcd.colors)
else:
    colors = np.ones((pts.shape[0], 3))

# Create a scatter3d trace for the point cloud
pcd_trace = go.Scatter3d(
    x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
    mode='markers',
    marker=dict(
        size=1,
        color=['rgb({},{},{})'.format(int(c[0]*255), int(c[1]*255), int(c[2]*255))
               for c in colors],
        opacity=0.8
    ),
    name='Point Cloud'
)

# Create line traces for the skeleton
line_traces = []
for connection in pose_connections:
    start_idx, end_idx = connection
    if start_idx < len(keypoints_3d) and end_idx < len(keypoints_3d):
        p0 = keypoints_3d[start_idx]
        p1 = keypoints_3d[end_idx]
        line_trace = go.Scatter3d(
            x=[p0[0], p1[0]],
            y=[p0[1], p1[1]],
            z=[p0[2], p1[2]],
            mode='lines',
            line=dict(color='green', width=5),
            showlegend=False
        )
        line_traces.append(line_trace)

fig = go.Figure(data=[pcd_trace] + line_traces)
fig.update_layout(scene=dict(aspectmode='data'),
                  title="Digital Twin - Single Frame")
fig.show()
fig.write_image("output.png")


Buffered data was truncated after reaching the output size limit.