# Running zeroscope locally to test the properties of the pipeline

In [1]:
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
import shutil
import os
# Pick the first CUDA GPU when one is visible to torch; otherwise run on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")



Using device: cuda:0


In [2]:
# Use half precision only when running on a GPU. The device cell allows a CPU
# fallback, and float16 inference is generally not usable on CPU, so fall back
# to float32 there.
pipe_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load the zeroscope checkpoint from a local folder (path relative to this notebook).
pipe = DiffusionPipeline.from_pretrained("../zeroscope_v2_576w", torch_dtype=pipe_dtype)
# Replace the checkpoint's scheduler with DPMSolverMultistepScheduler, reusing its config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()  # left disabled; the whole pipeline is moved to `device` instead
# Move the pipeline to the selected device. As the last expression of the cell,
# the returned pipeline is also displayed (its component configuration).
pipe.to(device)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

TextToVideoSDPipeline {
  "_class_name": "TextToVideoSDPipeline",
  "_diffusers_version": "0.20.2",
  "_name_or_path": "../zeroscope_v2_576w",
  "scheduler": [
    "diffusers",
    "DPMSolverMultistepScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet3DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [4]:
# Probe the pipeline's text encoder: encode a batch of two prompts and a single
# prompt, with classifier-free guidance disabled.
# NOTE(review): `_encode_prompt` is a private pipeline method; its signature may
# change between diffusers releases.
# The embeddings are placed on `device` (selected in the first cell) instead of
# a hard-coded "cuda:0", so the cell also works on the CPU fallback.
text_emb1 = pipe._encode_prompt(
    ["a dog", "cat"],
    device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=False,
)

# Same call with a single string prompt, kept for side-by-side comparison with
# the first row of `text_emb1`.
text_emb2 = pipe._encode_prompt(
    "a dog",
    device,
    num_images_per_prompt=1,
    do_classifier_free_guidance=False,
)

# Per the recorded output this is (2, 77, 1024): one row per prompt in the batch.
print(text_emb1.shape)
# Optional check (disabled): compare the embeddings at positions 1 and 0 of the
# second axis of the batched result.
# print(torch.allclose(text_emb1[:,1,:], text_emb1[:,0,:]))

torch.Size([2, 77, 1024])


In [6]:
import nvidia_smi

# NVML index of the card to inspect (hard-coded). NVML also exposes a call to
# enumerate all available cards, so this could be turned into a loop.
# NOTE(review): NVML's device numbering is not guaranteed to match torch's
# "cuda:N" numbering — confirm that index 1 is the card the pipeline runs on.
GPU_INDEX = 1

nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(GPU_INDEX)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

    # Values are printed exactly as NVML reports them (bytes).
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
finally:
    # Always release NVML, even if one of the queries above raises.
    nvidia_smi.nvmlShutdown()

Total memory: 25769803776
Free memory: 22704619520
Used memory: 3065184256


### Perform inference

In [10]:
## Perform inference
from utils import vid_to_gif

prompt = "a toddler is holding a hose up high and spraying water. The baby is over the lawn"

# Quick, low-cost generation: 10 denoising steps, 320x320 pixels, 24 frames.
video_frames = pipe(prompt, num_inference_steps=10, height=320, width=320, num_frames=24).frames
# export_to_video is called without a target path; the returned path is moved below.
video_path = export_to_video(video_frames)

# Move video to output folder (created on first use so shutil.move cannot fail
# on a missing directory).
output_dir = '../zeroscope_test_generations'
os.makedirs(output_dir, exist_ok=True)

# The prompt doubles as the file name; replace path separators so a prompt
# containing one cannot point outside (or into a sub-folder of) output_dir.
file_stem = prompt.replace(os.sep, '_')
output_path = os.path.join(output_dir, f'{file_stem}.mp4')
output_gif_filepath = os.path.join(output_dir, f'{file_stem}.gif')
shutil.move(video_path, output_path)
print(output_path)

# Make gif
vid_to_gif(output_path, output_gif_filepath, size=256)




  0%|          | 0/10 [00:00<?, ?it/s]

24 <class 'list'> <class 'numpy.ndarray'>
(320, 320, 3)
../zeroscope_test_generations/a baby is holding a hose up high and spraying water. The baby is over the lawn.mp4
MoviePy - Building file  ../zeroscope_test_generations/a baby is holding a hose up high and spraying water. The baby is over the lawn.gif
MoviePy - - Generating GIF frames.


                                                  

MoviePy - - File ready: ../zeroscope_test_generations/a baby is holding a hose up high and spraying water. The baby is over the lawn.gif.




### Testing RAM usage when loading all BOLDMoments videos at once

In [9]:
import os
import numpy as np


def used_ram_mb():
    """Return the "used" column of the `Mem:` row printed by `free -t -m`.

    The value is returned as the string printed by `free`. Requires the
    `free` command, i.e. a Linux host.
    """
    # The context manager closes the pipe; the original one-liner left it open.
    with os.popen('free -t -m') as free_output:
        mem_row = free_output.readlines()[1]  # row 0 is the column header
    return mem_row.split()[2]


n_frames_to_load = 45
size = 268
n_videos = 1102

# Show current RAM usage
print("RAM usage:", used_ram_mb(), "MB")

# Make an array of random values with the shape of the full video set:
# (videos, frames, channels, height, width). np.random.rand yields float64,
# so this deliberately allocates roughly 80 GB.
videos = np.random.rand(n_videos, n_frames_to_load, 3, size, size)

# DEBUG: Print memory size of videos array
print("Videos array size:", videos.nbytes/1024**2, "MB")

# Show current RAM usage after loading videos
print("RAM usage:", used_ram_mb(), "MB")

RAM usage: 23065 MB
Videos array size: 81522.03735351562 MB
RAM usage: 104675 MB
