In [None]:
%%capture

%cd /content
!GIT_LFS_SKIP_SMUDGE=1 git clone -b dev https://github.com/camenduru/InstantMesh
%cd /content/InstantMesh

!pip install pytorch-lightning==2.1.2 gradio==3.50.2 einops omegaconf torchmetrics webdataset accelerate tensorboard
!pip install PyMCubes trimesh rembg transformers diffusers==0.20.2 bitsandbytes imageio[ffmpeg] xatlas plyfile
!pip install git+https://github.com/NVlabs/nvdiffrast jax==0.4.19 jaxlib==0.4.19 ninja
!pip install gradio

In [None]:
import numpy as np
import rembg
from PIL import Image
from pytorch_lightning import seed_everything
from einops import rearrange
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from huggingface_hub import hf_hub_download
from src.utils.infer_util import remove_background, resize_foreground
import os
from torchvision.transforms import v2
from huggingface_hub import hf_hub_download
from omegaconf import OmegaConf
from PIL import Image
import numpy as np
from einops import rearrange, repeat
import tempfile
from tqdm import tqdm
import imageio
import torch
import gradio as gr
from src.utils.train_util import instantiate_from_config
from src.utils.camera_util import (FOV_to_intrinsics, get_zero123plus_input_cameras,get_circular_camera_poses,)
from src.utils.mesh_util import save_obj, save_obj_with_mtl


In [None]:
# Release unused cached GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()


In [None]:
import os
import gradio as gr
import torch
from PIL import Image
import numpy as np
from tqdm import tqdm
import tempfile
import imageio
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from huggingface_hub import hf_hub_download
from einops import rearrange
from rembg import remove, new_session
from omegaconf import OmegaConf
torch.cuda.empty_cache()
# Define the necessary classes and methods
class InstantMeshGenerator:
    """Multi-view image generator built on a ``zero123plus`` diffusers pipeline.

    The constructor loads the pipeline from ``model_path``, replaces its
    scheduler with an Euler-ancestral scheduler using trailing timestep
    spacing, overwrites the UNet weights with a checkpoint downloaded from the
    Hugging Face Hub and moves everything to ``device``.
    """

    def __init__(self, model_path, unet_repo_id, unet_filename, device='cuda'):
        """Build the pipeline and load the replacement UNet weights.

        Args:
            model_path: Hub id or local path given to ``DiffusionPipeline.from_pretrained``.
            unet_repo_id: Hugging Face model repo hosting the UNet checkpoint.
            unet_filename: File name of the UNet checkpoint inside ``unet_repo_id``.
            device: Torch device string the pipeline is moved to.
        """
        self.pipeline = DiffusionPipeline.from_pretrained(
            model_path, custom_pipeline="zero123plus", torch_dtype=torch.float16
        )
        self.pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
            self.pipeline.scheduler.config, timestep_spacing='trailing'
        )
        unet_ckpt_path = hf_hub_download(repo_id=unet_repo_id, filename=unet_filename, repo_type="model")
        # NOTE(review): torch.load unpickles the file; only point this at trusted repos.
        state_dict = torch.load(unet_ckpt_path, map_location='cpu')
        self.pipeline.unet.load_state_dict(state_dict, strict=True)
        self.device = torch.device(device)
        self.pipeline = self.pipeline.to(self.device)
        # rembg session, created on first use and then reused (see preprocess).
        self._rembg_session = None
        self.seed_everything(0)

    @staticmethod
    def seed_everything(seed):
        """Seed torch's global CPU RNG and, when CUDA is available, all CUDA RNGs."""
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def preprocess(self, input_image, do_remove_background):
        """Optionally strip the background from ``input_image``.

        Args:
            input_image: Image handed to ``rembg.remove``.
            do_remove_background: When falsy the image is returned untouched.

        Returns:
            The input image, or the background-removed image passed through
            ``resize_foreground(..., 0.85)``.
        """
        if not do_remove_background:
            return input_image
        # Reuse one rembg session per instance instead of building a new one
        # for every call. getattr keeps instances created without __init__ working.
        session = getattr(self, '_rembg_session', None)
        if session is None:
            session = self._rembg_session = new_session()
        input_image = remove(input_image, session=session)
        # presumably 0.85 is the target foreground ratio - see src.utils.infer_util
        return resize_foreground(input_image, 0.85)

    def generate_mvs(self, input_image, sample_steps, sample_seed):
        """Run the diffusion pipeline and return the multi-view grid image.

        Args:
            input_image: Conditioning image passed to the pipeline.
            sample_steps: Number of inference steps.
            sample_seed: Seed controlling the sampled noise.

        Returns:
            ``(z123_image, show_image)``: the raw pipeline output (a 3x2 tile
            grid) and the same tiles re-laid out as a 2x3 grid for display.
        """
        sample_seed = int(sample_seed)
        self.seed_everything(sample_seed)
        # A fresh torch.Generator always starts from torch's fixed default seed,
        # so it must be seeded explicitly for `sample_seed` to drive the noise.
        generator = torch.Generator(device=self.device).manual_seed(sample_seed)
        z123_image = self.pipeline(
            input_image,
            num_inference_steps=sample_steps,
            generator=generator,
        ).images[0]
        show_image = np.asarray(z123_image, dtype=np.uint8)
        show_image = torch.from_numpy(show_image)     # (960, 640, 3)
        # Split the 3-row x 2-column grid into six tiles, then lay them out
        # again as 2 rows x 3 columns.
        show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
        show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
        show_image = Image.fromarray(show_image.numpy())
        return z123_image, show_image

def check_input_image(input_image):
    """Reject a missing upload before the generation chain starts.

    Used as the first step of the ``submit.click(...)`` chain; raising here
    prevents the subsequent ``.success(...)`` steps from running.

    Args:
        input_image: Value of the input image component (``None`` when empty).

    Returns:
        ``input_image`` unchanged when present.

    Raises:
        gr.Error: If no image was uploaded.
    """
    if input_image is None:
        raise gr.Error("No image uploaded!")
    return input_image

# Shared InstantMeshGenerator, built on first use. Building it loads the whole
# diffusion pipeline, so it must not be rebuilt for every callback.
_MESH_GENERATOR = None


def _get_mesh_generator():
    """Return the shared InstantMeshGenerator, creating it on the first call."""
    global _MESH_GENERATOR
    if _MESH_GENERATOR is None:
        _MESH_GENERATOR = InstantMeshGenerator(
            model_path="sudo-ai/zero123plus-v1.2",
            unet_repo_id="TencentARC/InstantMesh",
            unet_filename="diffusion_pytorch_model.bin"
        )
    return _MESH_GENERATOR


def preprocess(input_image, do_remove_background):
    """Gradio callback: optionally remove the background of the uploaded image.

    Args:
        input_image: The uploaded image.
        do_remove_background: Whether to run background removal.

    Returns:
        The processed image, as returned by ``InstantMeshGenerator.preprocess``.
    """
    torch.cuda.empty_cache()
    return _get_mesh_generator().preprocess(input_image, do_remove_background)


def generate_mvs(processed_image, sample_steps, sample_seed):
    """Gradio callback: generate the multi-view images for ``processed_image``.

    Args:
        processed_image: Output of ``preprocess``.
        sample_steps: Number of diffusion inference steps.
        sample_seed: Sampling seed.

    Returns:
        ``(mv_image, mv_show_image)`` as returned by
        ``InstantMeshGenerator.generate_mvs``.
    """
    torch.cuda.empty_cache()
    mv_image, mv_show_image = _get_mesh_generator().generate_mvs(
        processed_image, sample_steps, sample_seed
    )
    return mv_image, mv_show_image

class InstantMeshProcessor:
    """Reconstruction stage: multi-view grid image -> orbit video + textured OBJ.

    The constructor reads ``configs/instant-mesh-base.yaml`` (relative to the
    current working directory), instantiates the model it describes, loads the
    ``instant_mesh_base.ckpt`` weights from the Hugging Face Hub and moves the
    model to the GPU when one is available.
    """

    def __init__(self):
        """Load configuration and model weights; prepare FlexiCubes geometry if used."""
        self.pipeline = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.config_path = 'configs/instant-mesh-base.yaml'
        self.config = OmegaConf.load(self.config_path)
        self.config_name = os.path.basename(self.config_path).replace('.yaml', '')
        self.model_config = self.config.model_config
        self.infer_config = self.config.infer_config
        self.model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_mesh_base.ckpt", repo_type="model")
        self.model = instantiate_from_config(self.model_config)
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
        checkpoint = torch.load(self.model_ckpt_path, map_location='cpu')['state_dict']
        # Keep only the generator weights, stripping their key prefix and
        # dropping the 'source_camera' entries.
        prefix = 'lrm_generator.'
        state_dict = {
            key[len(prefix):]: value
            for key, value in checkpoint.items()
            if key.startswith(prefix) and 'source_camera' not in key
        }
        self.model.load_state_dict(state_dict, strict=True)
        self.model = self.model.to(self.device).eval()
        self.IS_FLEXICUBES = self.config_name.startswith('instant-mesh')
        if self.IS_FLEXICUBES:
            self.model.init_flexicubes_geometry(self.device, fovy=30.0)

    @staticmethod
    def _frames_to_uint8(images):
        """Convert an (N, C, H, W) tensor with values in [0, 1] to (N, H, W, C) uint8.

        Values are clipped to [0, 255] *before* the integer cast so that
        out-of-range inputs saturate instead of wrapping around.
        """
        frames = images.detach().permute(0, 2, 3, 1).cpu().numpy()
        return np.clip(frames * 255, 0, 255).astype(np.uint8)

    def images_to_video(self, images, output_path, fps=30):
        """Encode a batch of frames as an H.264 video.

        Args:
            images: Tensor of frames indexed as (N, C, H, W).
            output_path: Destination video file; its directory is created if needed.
            fps: Frames per second of the output video.
        """
        output_dir = os.path.dirname(output_path)
        if output_dir:  # a bare file name has no directory to create
            os.makedirs(output_dir, exist_ok=True)
        imageio.mimwrite(output_path, self._frames_to_uint8(images), fps=fps, codec='h264')

    def get_render_cameras(self, batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
        """Build the cameras used to render the orbit video.

        Args:
            batch_size: Number of times the camera set is repeated along dim 0.
            M: Number of camera poses requested from ``get_circular_camera_poses``.
            radius: Radius passed to ``get_circular_camera_poses``.
            elevation: Elevation passed to ``get_circular_camera_poses``.
            is_flexicubes: When true, return the inverted camera-to-world
                matrices; otherwise return flattened extrinsics concatenated
                with flattened intrinsics built from a 30.0 field of view.

        Returns:
            A tensor of cameras with a leading ``batch_size`` dimension.
        """
        c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
        if is_flexicubes:
            return torch.linalg.inv(c2ws).unsqueeze(0).repeat(batch_size, 1, 1, 1)
        extrinsics = c2ws.flatten(-2)
        intrinsics = FOV_to_intrinsics(30.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
        return torch.cat([extrinsics, intrinsics], dim=-1).unsqueeze(0).repeat(batch_size, 1, 1)

    def make_mesh(self, mesh_fpath, planes):
        """Extract a textured mesh from ``planes`` and save it to ``mesh_fpath``.

        Args:
            mesh_fpath: Destination ``.obj`` path handed to ``save_obj_with_mtl``.
            planes: Model output of ``forward_planes``.

        Returns:
            ``mesh_fpath``.
        """
        with torch.no_grad():
            mesh_out = self.model.extract_mesh(planes, use_texture_map=True, **self.infer_config)
            vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
            save_obj_with_mtl(
                vertices.data.cpu().numpy(),
                uvs.data.cpu().numpy(),
                faces.data.cpu().numpy(),
                mesh_tex_idx.data.cpu().numpy(),
                tex_map.permute(1, 2, 0).data.cpu().numpy(),
                mesh_fpath,
            )
            print(f"Mesh with texmap saved to {mesh_fpath}")
        return mesh_fpath

    def make3d(self, images):
        """Reconstruct from the multi-view grid, render an orbit video and export a mesh.

        The video and the ``.obj`` are first written under ``/content/tmp`` and
        then copied to fixed locations inside ``/content/InstantMesh``
        (overwriting earlier results).

        Args:
            images: Multi-view grid image (3 rows x 2 columns of views).

        Returns:
            Path of the copied video, ``/content/InstantMesh/output_video.mp4``.
        """
        import shutil  # function-scope import: only this method needs it

        torch.cuda.empty_cache()
        images = np.asarray(images, dtype=np.float32) / 255.0
        images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()
        # Split the 3x2 grid into six separate views.
        images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)
        input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(self.device)
        render_cameras = self.get_render_cameras(batch_size=1, radius=4.5, elevation=20.0, is_flexicubes=self.IS_FLEXICUBES).to(self.device)
        images = images.unsqueeze(0).to(self.device)
        images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)

        # Reserve a unique .obj name in the scratch directory. `dir=` is used
        # instead of reassigning the process-wide tempfile.tempdir, and the
        # handle is closed right away (delete=False keeps the file on disk).
        directory = '/content/tmp'
        os.makedirs(directory, exist_ok=True)
        with tempfile.NamedTemporaryFile(suffix=".obj", delete=False, dir=directory) as tmp_obj:
            mesh_fpath = tmp_obj.name
        mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
        video_fpath = os.path.join(os.path.dirname(mesh_fpath), f"{mesh_basename}.mp4")

        with torch.no_grad():
            planes = self.model.forward_planes(images, input_cameras)
            # Cameras are rendered in chunks along dim 1.
            chunk_size = 20 if self.IS_FLEXICUBES else 1
            render_size = 384
            frames = []
            for start in tqdm(range(0, render_cameras.shape[1], chunk_size)):
                camera_chunk = render_cameras[:, start:start + chunk_size]
                if self.IS_FLEXICUBES:
                    frame = self.model.forward_geometry(planes, camera_chunk, render_size=render_size)['img']
                else:
                    frame = self.model.synthesizer(planes, cameras=camera_chunk, render_size=render_size)['images_rgb']
                frames.append(frame)
            frames = torch.cat(frames, dim=1)
            self.images_to_video(frames[0], video_fpath, fps=30)
            mesh_fpath = self.make_mesh(mesh_fpath, planes)

        # Plain-Python replacement for the former `!cp -f` shell magics, so the
        # method is valid Python and a failed copy raises instead of passing silently.
        output_video = '/content/InstantMesh/output_video.mp4'
        output_mesh = '/content/InstantMesh/output_model.obj'
        shutil.copyfile(video_fpath, output_video)
        shutil.copyfile(mesh_fpath, output_mesh)
        print("make3d is Doneeeeeeeeeeeeeeee")
        return output_video

    def path(self, img):
        """Gradio callback: run ``make3d`` on ``img`` and return the video path.

        Uses this instance's already-loaded model rather than constructing a
        new processor (and reloading the checkpoint) for every request.
        """
        torch.cuda.empty_cache()
        path = self.make3d(img)
        print(path)
        return path

# Create the Gradio interface.
# The reconstruction model is loaded once here; its bound `path` method is the
# last step of the click chain below.
mesh_processor = InstantMeshProcessor()

with gr.Blocks() as demo:
    with gr.Row(variant="panel"):
        # Left column: inputs and sampling controls.
        with gr.Column():
            with gr.Row():
                input_image = gr.Image(
                    label="Input Image",
                    image_mode="RGBA",
                    # Gradio 3.x spells this `source`; `sources` is the 4.x
                    # name and is ignored (with a warning) by the pinned 3.50.2.
                    source="upload",
                    width=256,
                    height=256,
                    type="pil",
                    elem_id="content_image",
                )
                processed_image = gr.Image(
                    label="Processed Image",
                    image_mode="RGBA",
                    width=256,
                    height=256,
                    type="pil",
                    interactive=False
                )
            with gr.Row():
                with gr.Group():
                    do_remove_background = gr.Checkbox(
                        label="Remove Background", value=True
                    )
                    sample_seed = gr.Number(value=42, label="Seed Value", precision=0)

                    sample_steps = gr.Slider(
                        label="Sample Steps",
                        minimum=30,
                        maximum=75,
                        value=75,
                        step=5
                    )

            with gr.Row():
                submit = gr.Button("Generate", elem_id="generate", variant="primary")

        # Right column: generated multi-view grid and the rendered video.
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    mv_show_images = gr.Image(
                        label="Generated Multi-views",
                        type="pil",
                        width=379,
                        interactive=False
                    )

                with gr.Column():
                    output_video = gr.Video(
                        label="video", format="mp4",
                        width=379,
                        autoplay=True,
                        interactive=False,
                    )

            # TODO: add a gr.Model3D output for the exported OBJ once the mesh
            # path is returned to the UI.

    # Carries the raw multi-view image from the diffusion step to reconstruction.
    mv_images = gr.State()

    # Each `.success` step runs only if the previous step finished without error.
    submit.click(fn=check_input_image, inputs=[input_image]).success(
        fn=preprocess,
        inputs=[input_image, do_remove_background],
        outputs=[processed_image],
    ).success(
        fn=generate_mvs,
        inputs=[processed_image, sample_steps, sample_seed],
        outputs=[mv_images, mv_show_images],
    ).success(
        fn=mesh_processor.path,
        inputs=[mv_images],
        outputs=[output_video],
    )

# Queueing is enabled via `.queue()`; the deprecated `enable_queue` launch
# argument is therefore not passed.
demo.queue(max_size=1000)
demo.launch(debug=True)


  input_image = gr.Image(


IMPORTANT: You are using gradio version 3.50.2, however version 4.29.0 is available, please upgrade.
--------


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


instant_mesh_base.ckpt:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['encoder.layer.0.adaLN_modulation.1.bias', 'encoder.layer.0.adaLN_modulation.1.weight', 'encoder.layer.1.adaLN_modulation.1.bias', 'encoder.layer.1.adaLN_modulation.1.weight', 'encoder.layer.10.adaLN_modulation.1.bias', 'encoder.layer.10.adaLN_modulation.1.weight', 'encoder.layer.11.adaLN_modulation.1.bias', 'encoder.layer.11.adaLN_modulation.1.weight', 'encoder.layer.2.adaLN_modulation.1.bias', 'encoder.layer.2.adaLN_modulation.1.weight', 'encoder.layer.3.adaLN_modulation.1.bias', 'encoder.layer.3.adaLN_modulation.1.weight', 'encoder.layer.4.adaLN_modulation.1.bias', 'encoder.layer.4.adaLN_modulation.1.weight', 'encoder.layer.5.adaLN_modulation.1.bias', 'encoder.layer.5.adaLN_modulation.1.weight', 'encoder.layer.6.adaLN_modulation.1.bias', 'encoder.layer.6.adaLN_modulation.1.weight', 'encoder.layer.7.adaLN_modulation.1.bias', 'encoder.layer.7.adaLN_modulation.1.w

preprocessor_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  demo.launch(debug=True,enable_queue=True)


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://06e3335a54012446dd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




model_index.json:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

(…)e_extractor_vae/preprocessor_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

(…)_extractor_clip/preprocessor_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/391 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

vision_encoder/config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/708 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/745 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/8 [00:00<?, ?it/s]

The config attributes {'dropout': 0.0, 'reverse_transformer_layers_per_block': None} were passed to UNet2DConditionModel, but are not expected and will be ignored. Please verify your config.json configuration file.


diffusion_pytorch_model.bin:   0%|          | 0.00/1.73G [00:00<?, ?B/s]

Downloading data from 'https://github.com/danielgatis/rembg/releases/download/v0.0.0/u2net.onnx' to file '/root/.u2net/u2net.onnx'.
100%|████████████████████████████████████████| 176M/176M [00:00<00:00, 151GB/s]


Loading pipeline components...:   0%|          | 0/8 [00:00<?, ?it/s]

The config attributes {'dropout': 0.0, 'reverse_transformer_layers_per_block': None} were passed to UNet2DConditionModel, but are not expected and will be ignored. Please verify your config.json configuration file.
  return F.conv2d(input, weight, bias, self.stride,


  0%|          | 0/75 [00:00<?, ?it/s]

  show_image = torch.from_numpy(show_image)     # (960, 640, 3)
Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['encoder.layer.0.adaLN_modulation.1.bias', 'encoder.layer.0.adaLN_modulation.1.weight', 'encoder.layer.1.adaLN_modulation.1.bias', 'encoder.layer.1.adaLN_modulation.1.weight', 'encoder.layer.10.adaLN_modulation.1.bias', 'encoder.layer.10.adaLN_modulation.1.weight', 'encoder.layer.11.adaLN_modulation.1.bias', 'encoder.layer.11.adaLN_modulation.1.weight', 'encoder.layer.2.adaLN_modulation.1.bias', 'encoder.layer.2.adaLN_modulation.1.weight', 'encoder.layer.3.adaLN_modulation.1.bias', 'encoder.layer.3.adaLN_modulation.1.weight', 'encoder.layer.4.adaLN_modulation.1.bias', 'encoder.layer.4.adaLN_modulation.1.weight', 'encoder.layer.5.adaLN_modulation.1.bias', 'encoder.layer.5.adaLN_modulation.1.weight', 'encoder.layer.6.adaLN_modulation.1.bias', 'encoder.layer.6.adaLN_modulation.1.weight', 'encoder.layer.7

Mesh with texmap saved to /content/tmp/tmpz4k6xj3c.obj
make3d is Doneeeeeeeeeeeeeeee
/content/InstantMesh/output_video.mp4
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://06e3335a54012446dd.gradio.live


