In [1]:
# Clear every user-defined name from the interactive namespace; -f skips the
# confirmation prompt.
%reset -f

In [2]:
# Recursively delete everything matched by /content/* without prompting.
# Destructive and irreversible: copy out anything worth keeping first.
!rm -rf /content/*

In [3]:
# Drop the import system's cached finder state so that modules which appeared on
# disk after the kernel started can be picked up by later imports.
import importlib
importlib.invalidate_caches()

In [4]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not
# currently using.
import torch
torch.cuda.empty_cache()

In [None]:
# WARNING: sends SIGKILL (-9) to PID -1, i.e. every process this user is allowed
# to signal. That is expected to take the notebook kernel down with it, so the
# runtime has to be reconnected afterwards. Run by hand only; leaving this in a
# "Run all" sequence will interrupt it.
!kill -9 -1

In [1]:
!pip install diffusers mediapipe>=0.10.8 transformers huggingface-hub omegaconf
!pip install einops opencv-python face-alignment decord ffmpeg-python
!pip install safetensors soundfile

# Fetch the LatentSync sources and make the checkout the working directory.
# The relative "configs/..." and "checkpoints/..." paths used further down are
# resolved against this directory, and the `latentsync.*` imports below are
# presumably found from here as well.
# NOTE(review): re-running this cell while already inside the checkout would
# clone a second, nested copy and cd into it — restart from /content instead.
!git clone https://github.com/a12343g/LatentSync.git
%cd LatentSync

import os
from google.colab import files
import torch
from omegaconf import OmegaConf
from diffusers import AutoencoderKL, DDIMScheduler
from latentsync.models.unet import UNet3DConditionModel
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
from latentsync.whisper.audio2feature import Audio2Feature
from diffusers.utils.import_utils import is_xformers_available
from accelerate.utils import set_seed
import ipywidgets as widgets

# Make sure the torch hub checkpoint cache directory exists
# (presumably where helper models fetched at inference time are stored — TODO confirm).
os.makedirs("/root/.cache/torch/hub/checkpoints", exist_ok=True)



# Local directory (inside the LatentSync checkout) that holds all model weights.
!mkdir -p checkpoints

# LatentSync UNet weights            -> checkpoints/latentsync_unet.pt
# Whisper "tiny" audio-encoder weights -> checkpoints/whisper/tiny.pt
# Stable Diffusion VAE (sd-vae-ft-mse) weights + config -> checkpoints/
#   (saved under the file names AutoencoderKL.from_pretrained("checkpoints") looks for)
!wget -P checkpoints/ https://huggingface.co/chunyu-li/LatentSync/resolve/main/latentsync_unet.pt
!wget -P checkpoints/whisper/ https://huggingface.co/chunyu-li/LatentSync/resolve/main/whisper/tiny.pt
!wget https://huggingface.co/stabilityai/sd-vae-ft-mse/resolve/main/diffusion_pytorch_model.safetensors -O checkpoints/diffusion_pytorch_model.safetensors
!wget https://huggingface.co/stabilityai/sd-vae-ft-mse/raw/main/config.json -O checkpoints/config.json




def perform_inference(video_path, audio_path, seed=1247, num_inference_steps=20, guidance_scale=1.0, output_path="output_video.mp4"):
    """Lip-sync ``video_path`` to ``audio_path`` with the LatentSync pipeline.

    Builds the scheduler, Whisper audio encoder, VAE and UNet from the files
    under ``configs/`` and ``checkpoints/`` (both relative to the current
    working directory), moves the assembled pipeline to CUDA, runs it and
    writes the result to ``output_path``.

    Args:
        video_path: Path of the input video, forwarded to the pipeline.
        audio_path: Path of the driving audio, forwarded to the pipeline.
        seed: Value passed to ``set_seed`` before the pipeline is run.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        output_path: Destination of the generated video. A companion mask
            video path is derived from it by inserting ``_mask`` before the
            file extension (``out.mp4`` -> ``out_mask.mp4``).

    Returns:
        ``output_path``, unchanged.
    """
    import os  # local import keeps the function self-contained

    # NOTE(review): the command-line cell in this notebook runs the same
    # checkpoint with "configs/unet/second_stage.yaml" — confirm which config
    # is intended here.
    config_path = "configs/unet/first_stage.yaml"
    inference_ckpt_path = "checkpoints/latentsync_unet.pt"
    # Must match the download location (`wget -P checkpoints/whisper/ ...`).
    whisper_model_path = "checkpoints/whisper/tiny.pt"

    config = OmegaConf.load(config_path)

    # Use half precision only on GPUs whose compute-capability major version
    # is above 7; everything else falls back to float32.
    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
    dtype = torch.float16 if is_fp16_supported else torch.float32

    scheduler = DDIMScheduler.from_pretrained("configs")

    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)

    # Load the VAE strictly from the local "checkpoints" directory.
    vae = AutoencoderKL.from_pretrained("checkpoints", torch_dtype=dtype, local_files_only=True)
    vae.config.scaling_factor = 0.18215
    vae.config.shift_factor = 0

    # The UNet is loaded on the CPU first and cast to the working dtype; it is
    # moved to the GPU together with the rest of the pipeline below.
    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        inference_ckpt_path,
        device="cpu",
    )

    unet = unet.to(dtype=dtype)

    if is_xformers_available():
        unet.enable_xformers_memory_efficient_attention()
        print('x_formers available!')

    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    set_seed(seed)

    # Derive the mask path from the extension rather than a blind
    # str.replace(".mp4", ...): that would return the output path itself when
    # the name has no ".mp4" (output and mask would collide) and would also
    # rewrite ".mp4" occurring inside a directory name.
    output_root, output_ext = os.path.splitext(output_path)
    mask_path = f"{output_root}_mask{output_ext}"

    pipeline(
        video_path=video_path,
        audio_path=audio_path,
        video_out_path=output_path,
        video_mask_path=mask_path,
        num_frames=config.data.num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        weight_dtype=dtype,
        width=config.data.resolution,
        height=config.data.resolution,
    )
    return output_path


Collecting face-alignment
  Downloading face_alignment-1.4.1-py2.py3-none-any.whl.metadata (7.4 kB)
Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->face-alignment)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->face-alignment)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->face-alignment)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->face-alignment)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu1

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

--2025-02-10 12:38:00--  https://huggingface.co/chunyu-li/LatentSync/resolve/main/latentsync_unet.pt
Resolving huggingface.co (huggingface.co)... 3.165.160.11, 3.165.160.59, 3.165.160.61, ...
Connecting to huggingface.co (huggingface.co)|3.165.160.11|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/cc/09/cc0953f842c11ec735bcfef4c46e35abbcfcfbdb1449e9014c1a358c63b5b054/63197c73d21ad55ddf2b6e5cc38d0a19a1e494317aefe2707c6b6c6fc952f3c7?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27latentsync_unet.pt%3B+filename%3D%22latentsync_unet.pt%22%3B&Expires=1739194500&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczOTE5NDUwMH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2NjLzA5L2NjMDk1M2Y4NDJjMTFlYzczNWJjZmVmNGM0NmUzNWFiYmNmY2ZiZGIxNDQ5ZTkwMTRjMWEzNThjNjNiNWIwNTQvNjMxOTdjNzNkMjFhZDU1ZGRmMmI2ZTVjYzM4ZDBhMTlhMWU0OTQzMTdhZWZlMjcwN2M2YjZjNmZjOTUyZjNjNz9yZXNwb25zZS1j

In [3]:
# Run the repository's command-line inference module on /content/v1.mp4 and
# /content/a1.wav, writing the result to /content/output_video.mp4.
# The config and checkpoint paths are relative, so this is expected to be run
# with the LatentSync checkout as the working directory.
# The trailing backslashes join the lines into a single shell command.
!python -m scripts.inference \
    --unet_config_path "configs/unet/second_stage.yaml" \
    --inference_ckpt_path "checkpoints/latentsync_unet.pt" \
    --guidance_scale 1.5 \
    --video_path "/content/v1.mp4" \
    --audio_path "/content/a1.wav" \
    --video_out_path "/content/output_video.mp4"

2025-02-10 12:45:05.844096: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739191505.878020    2710 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739191505.888438    2710 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Input video path: /content/v1.mp4
Input audio path: /content/a1.wav
Loaded checkpoint path: checkpoints/latentsync_unet.pt
  checkpoint = torch.load(fp, map_location=device)
config.json: 100% 547/547 [00:00<00:00, 3.61MB/s]
diffusion_pytorch_model.safetensors: 100% 335M/335M [00:01<00:00, 229MB/s]
  ckpt = torch.load(ckpt_path, map_location=device)
Initial seed: 1247
Downloading: "https://www.adrianbulat.com/downloads/python-fan/s3fd