#### A large amount of RAM is required to load the larger models. Running on a GPU speeds up inference.

In [1]:
import sys
import os
import torch
import numpy as np
from lavis.models import load_model_and_preprocess

import decord
from decord import VideoReader
from decord import cpu, gpu
decord.bridge.set_bridge('torch')

#### Load an example video

In [2]:
def load_video(vr, start_time, end_time, fps, num_frames=20):
    """Sample `num_frames` evenly spaced frames from a time window of a video.

    The window [start_time, end_time) is converted to frame indices with
    `fps` and clamped to the frames that actually exist in `vr`, so a window
    that overshoots the end of the video (or is empty/inverted) no longer
    produces out-of-range or backwards-running indices.

    Args:
        vr: Frame reader supporting `len(vr)` and `get_batch(list_of_int)`.
            `get_batch` must return a torch tensor (the notebook enables
            decord's torch bridge for this).
        start_time: Start of the window, in seconds.
        end_time: End of the window, in seconds (exclusive).
        fps: Frames per second used to convert seconds to frame indices.
        num_frames: Number of frames to sample; indices repeat when the
            window holds fewer distinct frames than requested.

    Returns:
        A float32 tensor: the batch returned by `get_batch` with its axes
        permuted by (3, 0, 1, 2). Assuming `get_batch` yields
        (frames, height, width, channels) -- TODO confirm against decord --
        the result is (channels, frames, height, width).

    Raises:
        ValueError: If `vr` contains no frames.
    """
    total = len(vr)
    if total == 0:
        raise ValueError("video contains no frames")

    # Clamp the first index into [0, total - 1].
    first = min(max(int(round(start_time * fps)), 0), total - 1)
    # The window end is exclusive, hence the -1; never fall before `first`
    # and never run past the last available frame.
    last = min(max(int(round(end_time * fps)) - 1, first), total - 1)

    select_frame_index = np.rint(np.linspace(first, last, num_frames)).astype(int).tolist()
    frames = vr.get_batch(select_frame_index).permute(3, 0, 1, 2).to(torch.float32)
    return frames

# Open the example clip with decord, using a CPU context.
file_path = "example/video.mp4"
vr = VideoReader(file_path, ctx=cpu(0))

# Length in seconds = number of frames / average frame rate.
fps = vr.get_avg_fps()
total_frames = len(vr)
duration = total_frames / fps

print("video_duration: {:.1f}, fps: {:.1f}".format(duration, fps))

video_duration: 70.3, fps: 24.0


In [3]:
# Select the compute device: CUDA when available, otherwise CPU.
# Always build a torch.device so `device` has the same type on both branches
# (previously the CPU fallback was the plain string "cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


#### Visualize the full video

In [4]:
from html import escape

from IPython.display import HTML

# Embed the clip referenced by `file_path` (the same path the VideoReader was
# opened with) rather than repeating the literal, so the player cannot drift
# out of sync with the video being analysed.
# The path is HTML-escaped because it is placed inside a quoted attribute.
HTML(f"""
<video width="640" height="480" controls>
  <source src="{escape(file_path, quote=True)}" type="video/mp4">
</video>
""")

#### Load pre-trained InstructBLIP model weights

In [5]:
# Build the model together with its matching visual preprocessors so that
# inference inputs can be prepared consistently; the third return value is
# not used in this notebook and is discarded.
# `memory_bank_length` and `num_frames` are set here at load time.
# For the larger variant, pass model_type="vicuna13b" instead of "vicuna7b".
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_vicuna_instruct_malmm",
    model_type="vicuna7b",
    is_eval=True,
    device=device,
    memory_bank_length=10,
    num_frames=20,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Load finetuned model weights

In [6]:
# By default the model loads its config from lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml.
# To load a finetuned checkpoint instead (for example, the weights finetuned on the ActivityNet-QA dataset),
# first set load_finetuned to True and point `finetuned` at the checkpoint path, then reload the model.

# load_finetuned: True
# finetuned: "saved_model/ActivityNet_qa/checkpoint_best.pth"

# model, vis_processors, _ = load_model_and_preprocess(
#     name="blip2_vicuna_instruct_malmm", model_type="vicuna7b", is_eval=True, device=device, memory_bank_length=10, num_frames=20,
# )

#### Instructed zero-shot video-to-language generation

In [10]:
# Sample 20 frames over the whole clip (start_time=0 up to the full duration),
# run them through the "eval" visual processor, move the result to `device`
# and add a leading dimension with unsqueeze(0).
video = (
    vis_processors["eval"](
        load_video(vr, start_time=0, end_time=duration, fps=fps, num_frames=20)
    )
    .to(device)
    .unsqueeze(0)
)
# The cell's last expression is the generated answer list.
model.generate(
    {
        "image": video,
        "prompt": "Question: what is the recipe of this video? Answer:",
    }
)

['scrambled eggs']

#### Online off-the-shelf setting with custom questions

In [8]:
# Only the first 37 seconds are sampled here, so the question about what
# happens next is asked without showing the model the rest of the clip.
video = (
    vis_processors["eval"](
        load_video(vr, start_time=0, end_time=37, fps=fps, num_frames=20)
    )
    .to(device)
    .unsqueeze(0)
)
model.generate(
    {
        "image": video,
        "prompt": "Question: what will happen for the next 5 seconds? Answer:",
    }
)

['eggs will be scrambled']

#### Generate multiple answers

In [9]:
# Sample the full clip again and request several answers via num_captions=5.
video = (
    vis_processors["eval"](
        load_video(vr, start_time=0, end_time=duration, fps=fps, num_frames=20)
    )
    .to(device)
    .unsqueeze(0)
)
model.generate(
    {
        "image": video,
        "prompt": "Question: what does this video show? Answer:",
    },
    num_captions=5,
)

['cooking', 'recipe', 'egg recipe', 'cooking eggs', 'cooking eggs on stove']