[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/SeedVR2-jupyter/blob/main/SeedVR2_jupyter.ipynb)

In [None]:
# Work from /content so the clone and both downloads below land there.
%cd /content
# Shallow-clone (--depth 1) the main branch of DAViD. GIT_LFS_SKIP_SMUDGE=1
# makes git-lfs leave pointer files instead of downloading LFS objects.
!GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main https://github.com/microsoft/DAViD

# Download the multi-task ONNX model and a sample input image. wget writes to
# the current directory, i.e. /content (not /content/DAViD).
!wget https://facesyntheticspubwedata.z6.web.core.windows.net/iccv-2025/models/multi-task-model-vitl16_384.onnx
!wget https://huggingface.co/uf/OmniAvatar_Assets/resolve/main/img.png

# Install the repository's Python requirements plus the GPU build of ONNX Runtime.
%cd /content/DAViD
!pip install -r requirement.txt
!pip install onnxruntime-gpu

In [None]:
%cd /content/DAViD

import os
import sys
from typing import Optional
import numpy as np
from PIL import Image

sys.path.append(os.path.join(os.getcwd(), "runtime"))

from depth_estimator import RelativeDepthEstimator
from multi_task_estimator import MultiTaskEstimator
from soft_foreground_segmenter import SoftForegroundSegmenter
from surface_normal_estimator import SurfaceNormalEstimator
from visualize import (
    create_concatenated_display,
    visualize_foreground,
    visualize_normal_maps,
    visualize_relative_depth_map,
)


def generate(
    image_path: str,
    multitask_model: Optional[str] = None,
    depth_model: Optional[str] = None,
    foreground_model: Optional[str] = None,
    normal_model: Optional[str] = None,
    output_path: Optional[str] = None,
    headless: bool = False,
) -> None:
    """Run the demo on one image with whichever models are available.

    Model paths that are given but do not exist on disk are reported and
    skipped, so one bad path no longer aborts a run that has other usable
    models. Errors are printed rather than raised; the function then returns.

    Args:
        image_path (str): Path to the input image.
        multitask_model (Optional[str]): Path to the multi-task ONNX model. Defaults to None.
        depth_model (Optional[str]): Path to the depth estimation ONNX model. Defaults to None.
        foreground_model (Optional[str]): Path to the foreground segmentation ONNX model. Defaults to None.
        normal_model (Optional[str]): Path to the surface normal estimation ONNX model. Defaults to None.
        output_path (Optional[str]): Directory to save output results; created if missing. Defaults to None.
        headless (bool): Run without GUI display (for headless servers). Defaults to False.
    """
    if not os.path.exists(image_path):
        print(f"Error: Image not found: {image_path}")
        return

    def _usable(model_path: Optional[str]) -> Optional[str]:
        """Return model_path if it names an existing file system entry, else None."""
        if not model_path:
            return None
        if not os.path.exists(model_path):
            print(f"Warning: Model not found, skipping: {model_path}")
            return None
        return model_path

    # Resolve availability once and use the resolved values everywhere below,
    # so a missing path can never reach a model loader.
    multitask_model = _usable(multitask_model)
    depth_model = _usable(depth_model)
    foreground_model = _usable(foreground_model)
    normal_model = _usable(normal_model)
    has_individual_models = bool(depth_model or foreground_model or normal_model)

    if not (multitask_model or has_individual_models):
        print("Error: At least one model must be provided and exist.")
        print("Available options:")
        print("  multitask_model: Multi-task model for all tasks")
        print("  depth_model: Individual depth estimation model")
        print("  foreground_model: Individual foreground segmentation model")
        print("  normal_model: Individual surface normal estimation model")
        return

    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Load with Pillow (RGB) and flip the channel axis to get BGR. Broad catch
    # is deliberate: this is the top-level entry point and any decode failure
    # should be reported, not raised.
    try:
        with Image.open(image_path) as pil_image:  # closes the file handle
            rgb = np.array(pil_image.convert("RGB"))
    except Exception as e:
        print(f"Error: Could not read the image from {image_path}: {e}")
        return
    # The reversed slice is a negative-stride view; materialize it so that
    # downstream code receives an ordinary contiguous array.
    # NOTE(review): whether the estimators/visualizers need this is not
    # visible here; the copy is harmless either way.
    image = np.ascontiguousarray(rgb[:, :, ::-1])

    results = {}

    if has_individual_models:
        results["individual"] = process_with_individual_models(
            image, depth_model, foreground_model, normal_model
        )
    if multitask_model:
        results["multitask"] = process_with_multitask_model(image, multitask_model)

    display_results(image, results, output_path, headless)


def process_with_individual_models(
    image: np.ndarray,
    depth_model: Optional[str] = None,
    foreground_model: Optional[str] = None,
    normal_model: Optional[str] = None,
):
    """Run each single-task model whose path was supplied.

    Args:
        image (np.ndarray): Input image passed unchanged to every estimator.
        depth_model (Optional[str]): ONNX path for relative depth; skipped when falsy.
        foreground_model (Optional[str]): ONNX path for soft foreground segmentation; skipped when falsy.
        normal_model (Optional[str]): ONNX path for surface normals; skipped when falsy.

    Returns:
        dict: Outputs keyed by "depth", "foreground" and "normal"; only the
        tasks that actually ran are present.
    """
    outputs = {}

    if depth_model:
        outputs["depth"] = RelativeDepthEstimator(
            onnx_model=depth_model, is_inverse=True
        ).estimate_relative_depth(image)

    if foreground_model:
        outputs["foreground"] = SoftForegroundSegmenter(
            onnx_model=foreground_model
        ).estimate_foreground_segmentation(image)

    if normal_model:
        outputs["normal"] = SurfaceNormalEstimator(
            onnx_model=normal_model
        ).estimate_normal(image)

    return outputs


def process_with_multitask_model(image: np.ndarray, multitask_model: str):
    """Run the multi-task ONNX model on the image and return all its task outputs.

    Args:
        image (np.ndarray): Input image handed to the estimator.
        multitask_model (str): Path to the multi-task ONNX model.

    Returns:
        Whatever MultiTaskEstimator.estimate_all_tasks returns for the image.
    """
    estimator = MultiTaskEstimator(
        is_inverse_depth=False,
        onnx_model=multitask_model,
    )
    all_task_outputs = estimator.estimate_all_tasks(image)
    return all_task_outputs


def display_results(
    image: np.ndarray,
    results: dict[str, dict],
    output_path: Optional[str] = None,
    headless: bool = False,
):
    """Build the result visualizations and save them to disk.

    Args:
        image (np.ndarray): Input image in BGR format.
        results (dict[str, dict]): Per-pipeline outputs under the optional
            keys "individual" and "multitask"; each value is a dict of task
            outputs.
        output_path (Optional[str]): Directory for all saved images. When
            falsy nothing is written. Defaults to None.
        headless (bool): When False, a notice is printed that GUI display is
            not supported. Defaults to False.
    """
    if "individual" in results:
        individual_result = display_single_model_results(
            image,
            results["individual"],
            prefix="Individual",
            output_path=output_path,
        )
        if output_path:
            save_image(
                individual_result,
                output_path=output_path,
                filename="individual_results.png",
            )
    if "multitask" in results:
        multitask_result = display_single_model_results(
            image,
            results["multitask"],
            prefix="Multi-task",
            output_path=output_path,
        )
        if output_path:
            save_image(
                multitask_result,
                output_path=output_path,
                filename="multitask_results.png",
            )

    if "individual" in results and "multitask" in results:
        # Only stack when both pipelines produced the same number of task
        # outputs (the original heuristic for "panels line up").
        if len(results["individual"]) == len(results["multitask"]):
            # Concatenate vertically (stack images)
            compare_results = np.vstack((individual_result, multitask_result))
            if output_path:
                save_image(
                    compare_results,
                    output_path=output_path,
                    filename="comparison_results.png",
                )

    if not headless:
        print("Headless mode disabled, but GUI display is not supported in this version.")
        print("Results have been saved to the output directory if specified.")


def display_single_model_results(image, model_results, prefix="", output_path=None):
    """
    Build a concatenated visualization for one pipeline's outputs.

    Each available task ("depth", "foreground", "normal") is visualized,
    optionally saved as its own PNG, and appended after the input image.

    Args:
        image (np.ndarray): Input image in BGR format.
        model_results (dict): Dictionary containing model outputs (e.g., 'depth', 'foreground', 'normal').
        prefix (str, optional): Prefix for visualization labels. Defaults to "".
        output_path (Optional[str]): Directory for the per-task images
            (depth_vis.png, foreground_vis.png, normal_vis.png). When falsy
            they are not written. Defaults to None. Previously these were
            always written to a hard-coded "output" directory.

    Returns:
        np.ndarray: Concatenated visualization image in BGR format.

    Raises:
        ValueError: If input image is invalid or model_results is empty.
    """
    if not isinstance(image, np.ndarray) or image.size == 0:
        raise ValueError("Invalid input image")
    if not model_results:
        raise ValueError("model_results dictionary is empty")

    visualizations = [image]
    labels = ["Input Image"]

    # May be None; the depth and normal visualizers receive it either way.
    foreground_mask = model_results.get("foreground")

    def _add_panel(panel, filename, label):
        """Optionally save one task visualization, then queue it for display."""
        # NOTE: filenames are fixed, so running both pipelines into the same
        # directory leaves only the later pipeline's per-task images.
        if output_path:
            save_image(panel, output_path=output_path, filename=filename)
        visualizations.append(panel)
        labels.append(label)

    if "depth" in model_results:
        _add_panel(
            visualize_relative_depth_map(image, model_results["depth"], foreground_mask),
            "depth_vis.png",
            f"{prefix} Depth Map",
        )

    if "foreground" in model_results:
        _add_panel(
            visualize_foreground(image, model_results["foreground"]),
            "foreground_vis.png",
            f"{prefix} Foreground Mask",
        )

    if "normal" in model_results:
        _add_panel(
            visualize_normal_maps(image, model_results["normal"], foreground_mask),
            "normal_vis.png",
            f"{prefix} Normal Map",
        )

    return create_concatenated_display(visualizations, labels, downscale=2)


def save_image(image, output_path, filename="individual_results.png"):
    """
    Write a BGR image array into an existing directory via Pillow.

    Args:
        image (np.ndarray): Non-empty image array in BGR channel order.
        output_path (str): Existing directory that receives the file.
        filename (str, optional): Name of the file to write. Defaults to "individual_results.png".

    Raises:
        ValueError: If output_path is not a directory, or image is not a
            non-empty numpy array.
    """
    directory_ok = os.path.isdir(output_path)
    if not directory_ok:
        raise ValueError(f"Output directory does not exist: {output_path}")

    image_ok = isinstance(image, np.ndarray) and image.size != 0
    if not image_ok:
        raise ValueError("Invalid image for saving")

    # Pillow expects RGB, so flip the channel axis before handing it over.
    rgb_pixels = image[:, :, ::-1]
    destination = os.path.join(output_path, filename)
    Image.fromarray(rgb_pixels).save(destination)

In [None]:
# The setup cell downloads img.png while the working directory is /content,
# so the sample image lives next to the ONNX model, not inside the DAViD clone.
# "output" is relative to the current directory (/content/DAViD after the
# %cd in the previous cell), which is where the result is read back from.
generate(
    image_path="/content/img.png",
    multitask_model="/content/multi-task-model-vitl16_384.onnx",
    depth_model=None,
    foreground_model=None,
    normal_model=None,
    output_path="output",
    headless=True,
)
Image.open("/content/DAViD/output/multitask_results.png")