In [2]:
import warnings
# Suppress every Python warning raised from here on; note that this also hides deprecation notices.
warnings.filterwarnings("ignore")

In [3]:
import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr

from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import (
    Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
)
import decord
# Select decord's "torch" bridge so decoded frames are handed back as torch tensors.
decord.bridge.set_bridge('torch')

In [4]:
# NOTE(review): these wildcard imports are presumably kept for their import-time side
# effect of registering builders, models, processors, runners and tasks with `registry`
# (which is queried by name further down) - confirm before narrowing or removing them.
from video_llama.datasets.builders import *
from video_llama.models import *
from video_llama.processors import *
from video_llama.runners import *
from video_llama.tasks import *

In [5]:
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    The seed used is ``config.run_cfg.seed`` plus whatever ``get_rank()``
    returns.

    Args:
        config: Object exposing ``run_cfg.seed``.
    """
    base_seed = config.run_cfg.seed
    rank_seed = base_seed + get_rank()

    # Apply the same seed to each random number generator in turn.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(rank_seed)

    # Request deterministic cuDNN algorithms and turn off cuDNN benchmarking.
    cudnn.deterministic = True
    cudnn.benchmark = False

In [6]:
class AttrDict(dict):
    """A ``dict`` whose entries can also be read and written as attributes.

    The instance is installed as its own ``__dict__``, so ``d.key`` and
    ``d["key"]`` refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the mapping itself.
        self.__dict__ = self

In [7]:
# Attribute-style settings object that is handed to `Config` below.
args = AttrDict(
    {
        "cfg_path": "./eval_configs/video_llama_eval_only_vl_edited.yaml",
        "model_type": "llama_v2",
        "gpu_id": 0,
        "options": [],
    }
)
args

{'cfg_path': './eval_configs/video_llama_eval_only_vl_edited.yaml',
 'model_type': 'llama_v2',
 'gpu_id': 0,
 'options': []}

#### Load config

In [8]:
# Build the project configuration object from the settings held in `args`.
cfg = Config(args)

In [9]:
model_config = cfg.model_cfg
# Store the selected GPU index on the model config under `device_8bit`.
model_config.device_8bit = args.gpu_id
# Look up the model class that the registry associates with the configured architecture name.
model_cls = registry.get_model_class(model_config.arch)

In [10]:
# Show the `llama_model` path carried by the model config.
model_config.llama_model

'/work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/llama-2-7b-chat-hf/'

In [11]:
# Show the checkpoint path (`ckpt`) carried by the model config.
model_config.ckpt

'/work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/VL_LLaMA_2_7B_Pretrained.pth'

In [12]:
# Instantiate the model from its config, move it to the selected GPU,
# and switch it to evaluation mode.
model = (
    model_cls.from_config(model_config)
    .to(f"cuda:{args.gpu_id}")
    .eval()
)

Loading VIT
Loading VIT Done
Loading Q-Former


Using pad_token, but it is not set yet.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:36<00:00, 18.30s/it]


Load first Checkpoint: /work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/VL_LLaMA_2_7B_Pretrained.pth


In [13]:
# NOTE(review): the `train` visual-processor config of the `webvid` dataset is used
# here even though the notebook only runs inference - confirm this is intentional.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
# Resolve the processor class by its configured name and build it from the same config.
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

In [14]:
# Wrap the model and visual processor in a Chat helper bound to the same GPU as the model.
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))

In [15]:
# Display the Chat object as a quick check that it was constructed.
chat

<video_llama.conversation.conversation_video.Chat at 0x7f3407802850>

In [16]:
# Count parameters with Python's arbitrary-precision built-in `sum`. NumPy's integer
# accumulator is fixed-width and could silently wrap on platforms whose default NumPy
# integer is 32-bit, given a model with billions of parameters.
n_params = sum(p.numel() for p in model.parameters())
# Parameter count in millions.
n_params / 1e6

7851.579264

In [22]:
# Clip used for the first demo query (path is relative to the working directory).
video_path = "./examples/birthday.mp4"

In [18]:
# Begin a new conversation from a copy of the `conv_llava_llama_2` template.
chat_state = conv_llava_llama_2.copy()

In [19]:
# Inspect the freshly copied conversation (system prompt, roles, separators, messages).
chat_state

Conversation(system='You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.', roles=('USER', 'ASSISTANT'), messages=[], offset=0, sep_style=<SeparatorStyle.LLAMA_2: 3>, sep='<s>', sep2='</s>', skip_next=False, conv_id=None)

In [20]:
# Replace the template's default system prompt. Adjacent string literals are joined
# with nothing in between, so the space separating the two sentences has to be written
# explicitly (it was previously missing, yielding "...provides.Follow...").
chat_state.system = (
    "You are able to understand the visual content that the user provides. "
    "Follow the instructions carefully and explain your answers in detail."
)
# Passed to the upload call here and to chat.answer later; presumably populated in
# place with the processed video - confirm against Chat.upload_video_without_audio.
img_list = []
llm_message = chat.upload_video_without_audio(video_path, chat_state, img_list)

./examples/birthday.mp4


In [23]:
# Add the user's question to the conversation state; the reply is generated by chat.answer.
user_message = "What is this video showing?"
chat.ask(user_message, chat_state)

In [26]:
# Decoding settings forwarded to every chat.answer call in this notebook.
num_beams = 1
temperature = 1.0

In [27]:
# Generate the assistant's reply for the current conversation state.
# [0] keeps only the first element of the returned value, which is printed as text afterwards.
llm_message = chat.answer(
    conv=chat_state,
    img_list=img_list,
    num_beams=num_beams,
    temperature=temperature,
    max_new_tokens=300,
    max_length=2000,
)[0]

In [29]:
# Print the reply so its line breaks are rendered rather than shown as escape sequences.
print(llm_message)

Based on the frames provided in the video, it appears to be showing a young man sitting at a table with a laptop and a birthday cake. Here's a detailed explanation of each frame:
0.0 seconds: The video starts with the man sitting at a table, wearing a blue shirt and a yellow party hat. He's looking directly at the camera with a smile on his face.
2.1 seconds: The man reaches out and turns on the laptop.
4.2 seconds: He starts typing on the laptop and looks up, as if he's checking something.
6.2 seconds: The man types some more on the laptop and then leans back in his chair, looking relaxed.
8.3 seconds: He types on the laptop and then looks up again, this time with a big smile on his face. He's probably excited about something.
10.4 seconds: The man types on the laptop and then stands up, holding the laptop in one hand and the birthday cake in the other. He's probably getting ready to make a toast.
12.5 seconds: The man raises the laptop and the birthday cake, both of which are decorat

**Inference on an SSv2 video sample**

In [33]:
video_path = "../TimeBound.v1/sample_data/folding_paper.mp4"
# Fail early if the clip is missing (the path is relative to the working directory).
assert os.path.exists(video_path)

In [34]:
# Start a fresh conversation for the new clip from a copy of the template.
chat_state = conv_llava_llama_2.copy()
# Replace the template's default system prompt. Adjacent string literals are joined
# with nothing in between, so the space separating the two sentences has to be written
# explicitly (it was previously missing, yielding "...provides.Follow...").
chat_state.system = (
    "You are able to understand the visual content that the user provides. "
    "Follow the instructions carefully and explain your answers in detail."
)
# Passed to the upload call here and to chat.answer later; presumably populated in
# place with the processed video - confirm against Chat.upload_video_without_audio.
img_list = []
llm_message = chat.upload_video_without_audio(video_path, chat_state, img_list)

../TimeBound.v1/sample_data/folding_paper.mp4


In [36]:
# Two-way multiple-choice prompt. The triple-quoted literal is passed to chat.ask as
# written, including its leading newline and the four-space indentation of each line.
user_message = """
    Given this video, you have to select which is the option that correctly describes the video.
    (a) Someone unfolding a paper. (b) Someone folding a paper.

    You have to only answer (a) or (b).
"""
chat.ask(user_message, chat_state)


# Generate the reply, reusing `num_beams` and `temperature` defined in an earlier cell.
# [0] keeps only the first element of the returned value, which is printed as text below.
llm_message = chat.answer(
    conv=chat_state,
    img_list=img_list,
    num_beams=num_beams,
    temperature=temperature,
    max_new_tokens=300,
    max_length=2000,
)[0]

print(llm_message)

Based on the video provided, the correct option is (b) Someone folding a paper.

In the video, we can see a person holding a white sheet of paper and folding it in a sequence of 8 frames. The person starts by taking the paper and folding it in half lengthwise, then folds it in half again, creating a crease in the middle. The person then folds the paper in half a third time, creating another crease. The video ends with the person holding the folded paper in their hand.
Therefore, the correct option is (b) Someone folding a paper.
