<a href="https://colab.research.google.com/github/ayagup/stablediffusion/blob/main/depth_estimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# %pip install "torch>=2.0.0" "transformers>=4.35.0" "accelerate>=0.24.0" "Pillow>=10.0.0" "opencv-python>=4.8.0" "matplotlib>=3.7.0" "numpy<2.0.0" "timm>=0.9.0"


/bin/bash: line 1: 2.0.0: No such file or directory
Note: you may need to restart the kernel to use updated packages.


In [None]:
# %pip install torch transformers accelerate Pillow opencv-python matplotlib "numpy<2.0.0" timm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
"""
Simple Depth Estimation Example
Quick test for monocular depth estimation using Hugging Face models
"""

import torch
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
from PIL import Image
import numpy as np
import os
import gc

# Suppress warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Memory optimization
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


def _load_rgb_image(image_path: str):
    """Open ``image_path`` (local file or http(s) URL) as an RGB PIL image."""
    if image_path.startswith(('http://', 'https://')):
        # Imported only on the URL path, as in the original code.
        import requests
        from io import BytesIO

        # A timeout keeps the notebook from hanging on an unresponsive host, and
        # raise_for_status surfaces HTTP errors here instead of letting PIL fail
        # later while trying to decode an error page.
        response = requests.get(image_path, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert('RGB')
    return Image.open(image_path).convert('RGB')


def _normalize_depth_to_uint8(prediction):
    """Linearly rescale a depth array to uint8 0-255; return (scaled, min, max)."""
    depth_min = prediction.min()
    depth_max = prediction.max()
    depth_range = depth_max - depth_min
    if depth_range > 0:
        scaled = (prediction - depth_min) / depth_range
    else:
        # Constant depth map: avoid 0/0 -> NaN followed by an undefined uint8 cast.
        scaled = np.zeros_like(prediction, dtype=np.float32)
    return (scaled * 255).astype(np.uint8), depth_min, depth_max


def simple_depth_estimation(
    image_path: str,
    output_path: str = "depth_map.png",
    model_name: str = "Intel/dpt-large",
    colormap: str = "viridis",
):
    """
    Estimate depth from a single image and save a colorized depth map.

    Args:
        image_path: Path to input image (local file or http(s) URL)
        output_path: Path to save the colorized depth map
        model_name: HuggingFace model name
        colormap: Matplotlib colormap name used for visualization

    Returns:
        Tuple ``(normalized_depth, depth_image)``:
        ``normalized_depth`` is the uint8 (0-255) depth array computed from the
        model's ``predicted_depth`` (not resized); ``depth_image`` is the
        colorized PIL image resized to the input image size and saved to
        ``output_path``.

    Raises:
        requests.RequestException: if ``image_path`` is a URL and the download
            fails, times out, or returns an HTTP error status.
    """

    # Clear GPU memory from any previous runs. Collect garbage first so that
    # tensors freed by the collector can actually be released from the cache.
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()
        print("üßπ Cleared GPU memory cache\n")

    # Check GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    num_gpus = torch.cuda.device_count() if device == "cuda" else 0

    print(f"\n{'='*60}")
    print("Simple Depth Estimation")
    print(f"{'='*60}")
    print(f"Device: {device}")
    if num_gpus > 0:
        print(f"GPUs: {num_gpus}")
        for i in range(num_gpus):
            print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"{'='*60}\n")

    # Load image
    print(f"Loading image: {image_path}")
    image = _load_rgb_image(image_path)
    print(f"Image size: {image.size[0]}x{image.size[1]}")

    # Load model and processor
    print(f"\nLoading model: {model_name}")
    print("(This will download the model on first run)\n")

    processor = AutoImageProcessor.from_pretrained(model_name)
    model = AutoModelForDepthEstimation.from_pretrained(model_name)

    # Optionally wrap for multi-GPU, then move to the selected device once.
    if device == "cuda":
        if num_gpus > 1:
            print(f"üöÄ Multi-GPU mode: distributing across {num_gpus} GPUs")
            # Use DataParallel for multi-GPU
            model = torch.nn.DataParallel(model)
        else:
            print("‚ö° Single GPU mode")
    model = model.to(device)

    model.eval()

    print("\nEstimating depth...")

    # Prepare inputs
    inputs = processor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_depth = outputs.predicted_depth

    # Post-process
    prediction = predicted_depth.squeeze().cpu().numpy()

    # Normalize to 0-255
    normalized_depth, depth_min, depth_max = _normalize_depth_to_uint8(prediction)

    print(f"Depth range: {depth_min:.2f} to {depth_max:.2f}")

    # Apply colormap. pyplot.get_cmap is used because matplotlib.cm.get_cmap is
    # deprecated and removed in newer matplotlib releases.
    import matplotlib.pyplot as plt

    cmap = plt.get_cmap(colormap)
    colored_depth = cmap(normalized_depth / 255.0)
    # Drop the alpha channel and convert RGBA floats in [0, 1] to uint8 RGB.
    colored_depth = (colored_depth[:, :, :3] * 255).astype(np.uint8)

    # Save
    depth_image = Image.fromarray(colored_depth)

    # Resize to match input image size
    depth_image = depth_image.resize(image.size, Image.LANCZOS)
    depth_image.save(output_path)

    print(f"\n‚úÖ Success!")
    print(f"Depth map saved to: {output_path}")
    print(f"Output size: {depth_image.size[0]}x{depth_image.size[1]}\n")

    return normalized_depth, depth_image


In [None]:
# if __name__ == "__main__":
#     import argparse

#     parser = argparse.ArgumentParser(description='Simple Depth Estimation')
#     parser.add_argument('--image', type=str, required=True,
#                         help='Path to input image')
#     parser.add_argument('--output', type=str, default='depth_map.png',
#                         help='Output depth map path')
#     parser.add_argument('--model', type=str, default='Intel/dpt-large',
#                         help='Model name (Intel/dpt-large, Intel/dpt-hybrid-midas, etc.)')
#     parser.add_argument('--colormap', type=str, default='viridis',
#                         help='Matplotlib colormap (viridis, plasma, inferno, magma)')

#     args = parser.parse_args()

# Kaggle preserves /kaggle/working as notebook output, but that directory does
# not exist in other environments (e.g. when opened through the Colab badge).
# Fall back to the current working directory so saving does not fail there.
output_dir = '/kaggle/working' if os.path.isdir('/kaggle/working') else os.getcwd()

try:
    simple_depth_estimation(
        image_path='https://cdn.getyourguide.com/image/format=auto,fit=contain,gravity=auto,quality=60,width=1440,height=650,dpr=2/tour_img/cbd0dea59e2a7432c2e279cc6af7fd14b8c67f31353c27c5f157ea07dc458b71.png',
        output_path=os.path.join(output_dir, 'depth_map.png'),
        model_name='Intel/dpt-large',
        colormap='viridis',
    )
except Exception as e:
    # Report the failure with a full traceback instead of aborting the run.
    print(f"\n‚ùå Error: {e}")
    import traceback
    traceback.print_exc()


üßπ Cleared GPU memory cache


Simple Depth Estimation
Device: cuda
GPUs: 1
  GPU 0: Tesla P100-PCIE-16GB

Loading image: https://cdn.getyourguide.com/image/format=auto,fit=contain,gravity=auto,quality=60,width=1440,height=650,dpr=2/tour_img/cbd0dea59e2a7432c2e279cc6af7fd14b8c67f31353c27c5f157ea07dc458b71.png
Image size: 1947x1300

Loading model: Intel/dpt-large
(This will download the model on first run)



preprocessor_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚ö° Single GPU mode

Estimating depth...
Depth range: 0.00 to 38.92

‚úÖ Success!
Depth map saved to: /kaggle/working/depth_map.png
Output size: 1947x1300



  cmap = cm.get_cmap(colormap)
