In [1]:
import cv2
import numpy as np
import torch


In [None]:
# Which MiDaS variant to run: "DPT_Large" (more accurate) or "MiDaS_small" (faster)
model_type = "DPT_Large"

# Pull the network from PyTorch Hub and place it on the GPU when one is
# available, falling back to the CPU otherwise
midas = torch.hub.load("intel-isl/MiDaS", model_type)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
midas.to(device)
midas.eval()  # switch the module to evaluation mode for inference

# Pick the preprocessing pipeline that goes with the selected variant:
# the DPT models use `dpt_transform`, every other choice uses `small_transform`
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = (
    midas_transforms.dpt_transform
    if model_type in ("DPT_Large", "DPT_Hybrid")
    else midas_transforms.small_transform
)


Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" to /root/.cache/torch/hub/checkpoints/dpt_large_384.pt
100%|██████████| 1.28G/1.28G [00:15<00:00, 86.2MB/s]
Using cache found in /root/.cache/torch/hub/intel-isl_MiDaS_master


In [None]:
# Largest horizontal pixel shift (in pixels) applied when synthesizing the
# second view; larger values give a stronger stereo effect
max_disp = 20

# Open the input video
input_video = "input.mp4"
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    raise ValueError("Error opening video file")

# Retrieve video properties (used for the output container and for resizing
# the depth prediction back to frame size)
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Prepare a VideoWriter for the output (side-by-side) video
output_video = "output_stereo.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
# Output width is doubled (left + right view)
out = cv2.VideoWriter(output_video, fourcc, fps, (width * 2, height))

# Constructing a VideoWriter does not raise when the codec, path, fps or frame
# size is unusable, so check explicitly instead of producing an empty file later.
if not out.isOpened():
    cap.release()  # do not leave the input handle open on failure
    raise ValueError(f"Error opening video writer for {output_video}")


In [None]:
# Fail fast when the handles from the setup cell are unusable (for example this
# cell was already run once and released them). Without these checks the loop
# would process nothing / write nothing and still report success.
if not cap.isOpened():
    raise RuntimeError("Video capture is not open; re-run the video setup cell first")
if not out.isOpened():
    raise RuntimeError("Video writer is not open; re-run the video setup cell first")

frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Processing {frame_count} frames...")

# The pixel-coordinate grids depend only on the frame size, so build them once
# instead of once per frame. The vertical map is the identity (rows never move).
xx, yy = np.meshgrid(np.arange(width), np.arange(height))
map_y = yy.astype(np.float32)

frame_index = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert the frame from BGR (OpenCV order) to RGB for the model
        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Preprocess the image and move it to the selected device
        input_batch = transform(img_rgb).to(device)

        # Run the MiDaS model without tracking gradients
        with torch.no_grad():
            prediction = midas(input_batch)

        # Resize the prediction to the original frame dimensions
        prediction_resized = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=(height, width),
            mode="bicubic",
            align_corners=False
        ).squeeze().cpu().numpy()

        # Normalize the prediction to [0, 1] per frame (epsilon avoids a
        # division by zero on a completely flat prediction)
        depth_min = prediction_resized.min()
        depth_max = prediction_resized.max()
        normalized_depth = (prediction_resized - depth_min) / (depth_max - depth_min + 1e-8)

        # Per-pixel horizontal shift: 0 where the normalized prediction is 1,
        # max_disp where it is 0.
        # NOTE(review): MiDaS is documented as predicting relative *inverse*
        # depth (larger = nearer). If so, the nearest pixels stay unshifted and
        # the farthest move by max_disp, i.e. the scene is placed behind the
        # screen plane. Changing this line without also flipping the sign in
        # map_x below would invert the perceived depth - confirm intent first.
        disparity = max_disp * (1 - normalized_depth)
        disparity = disparity.astype(np.float32)

        # cv2.remap is a backward mapping: output pixel x samples the source at
        # map_x, so subtracting the shift moves image content to the right.
        map_x = (xx - disparity).astype(np.float32)

        # Warp the original frame to generate the second view
        right_view = cv2.remap(frame, map_x, map_y, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)

        # Original frame on the left, warped frame on the right
        stereo_frame = np.concatenate((frame, right_view), axis=1)

        # Write the stereoscopic frame to the output video
        out.write(stereo_frame)

        frame_index += 1
        if frame_index % 10 == 0:
            print(f"Processed {frame_index}/{frame_count} frames", end='\r')
finally:
    # Always release the handles, even when a frame fails or the run is
    # interrupted, so the partially written output file is finalized.
    cap.release()
    out.release()

print("\nStereoscopic conversion complete!")


Processing 1238 frames...

Stereoscopic conversion complete!
