In [1]:
import argparse
import os
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr

from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import (
    Chat, Conversation, default_conversation,SeparatorStyle,conv_llava_llama_2
)
import decord
# Select decord's 'torch' bridge for this kernel session.
# NOTE(review): presumably this makes decord return video frames as torch tensors
# instead of its native arrays — confirm against the decord documentation.
decord.bridge.set_bridge('torch')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Wildcard-import the video_llama subpackages.
# NOTE(review): presumably these imports exist for their side effects (populating
# `registry`, which later cells query via get_model_class / get_processor_class) —
# confirm before narrowing them to explicit imports.
from video_llama.datasets.builders import *
from video_llama.models import *
from video_llama.processors import *
from video_llama.runners import *
from video_llama.tasks import *

In [3]:
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    The seed used is ``config.run_cfg.seed + get_rank()``.
    NOTE(review): ``get_rank`` presumably returns the distributed process
    rank, giving each rank its own seed — confirm in ``dist_utils``.

    Args:
        config: Object exposing an integer at ``config.run_cfg.seed``.

    Returns:
        None. The function only mutates global RNG / cuDNN state.
    """
    base_seed = config.run_cfg.seed
    rank_seed = base_seed + get_rank()

    # Apply the same seed to every random number generator in use.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(rank_seed)

    # Request deterministic cuDNN kernels and turn off benchmark mode.
    cudnn.deterministic = True
    cudnn.benchmark = False

In [4]:
class AttrDict(dict):
    """A ``dict`` whose keys are also readable and writable as attributes.

    Accepts the same positional and keyword arguments as ``dict``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the instance's attribute storage, so
        # ``obj.key`` and ``obj["key"]`` always refer to the same entry.
        self.__dict__ = self

In [5]:
# Argument namespace handed to Config: config file path, model type, GPU index,
# and an (empty) list of extra options.
args = AttrDict(
    cfg_path="./eval_configs/video_llama_eval_only_vl_edited.yaml",
    model_type="llama_v2",
    gpu_id=0,
    options=[],
)
args

{'cfg_path': './eval_configs/video_llama_eval_only_vl_edited.yaml',
 'model_type': 'llama_v2',
 'gpu_id': 0,
 'options': []}

In [6]:
# Build the project Config object from the argument namespace defined above.
cfg = Config(args)

In [7]:
# Take the model section of the config and set its `device_8bit` field to the chosen GPU id.
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
# Ask the registry for the model class registered under the configured architecture name.
model_cls = registry.get_model_class(model_config.arch)


In [8]:
# Show the configured `llama_model` value (a local directory path in the recorded output).
model_config.llama_model

'/work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/llama-2-7b-chat-hf/'

In [9]:
# Show the configured `ckpt` value (a .pth checkpoint path in the recorded output).
model_config.ckpt

'/work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/VL_LLaMA_2_7B_Pretrained.pth'

In [10]:
# Disabled alternatives for overriding `model_config.llama_model`, kept for reference:
# a Hugging Face hub cache snapshot, the hub model id, or the local checkpoint folder.
# local_folder = "/users/piyush/.cache/huggingface/hub/models--DAMO-NLP-SG--Video-LLaMA-2-7B-Pretrained/"\
#     "snapshots/52407e33d301c6fbab629c4a98391905a02e849b/llama-2-7b-chat-hf"
# model_config.llama_model = "meta-llama/Llama-2-7b-chat-hf"
# local_folder = "/work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/llama-2-7b-chat-hf/"
# model_config.llama_model = local_folder

In [11]:
# Instantiate the model from its config, move it to the selected GPU,
# and keep the object returned by .eval() (inference mode).
model = (
    model_cls.from_config(model_config)
    .to('cuda:{}'.format(args.gpu_id))
    .eval()
)



Loading VIT
Loading VIT Done
Loading Q-Former


Using pad_token, but it is not set yet.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.78s/it]


Load first Checkpoint: /work/piyush/pretrained_checkpoints/LargeModels/VideoLLAMA/Video-LLaMA-2-7B-Pretrained/VL_LLaMA_2_7B_Pretrained.pth


In [12]:
# Read the `train` visual-processor entry of the `webvid` dataset config, then build the
# processor class that the registry maps to that entry's name.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

In [14]:
# Wrap the model and visual processor in a Chat helper, using the same device string as the model.
chat = Chat(model, vis_processor, device='cuda:{}'.format(args.gpu_id))

In [25]:
# Display the Chat object (the recorded output is its default repr).
chat

<video_llama.conversation.conversation_video.Chat at 0x7fbd0f043880>

In [19]:
# Add up the element counts of every tensor returned by model.parameters(),
# then display the total in millions.
n_params = np.sum(
    [tensor.numel() for tensor in model.parameters()]
)
n_params / 1e6

7851.579264

In [26]:
# Example inputs: the video file to load and a question about it.
# NOTE(review): `text` is not referenced by any later cell visible in this notebook export.
video_path = "./examples/birthday.mp4"
text = "What is this video showing?"

In [27]:
# Work on a copy of the `conv_llava_llama_2` conversation template.
chat_state = conv_llava_llama_2.copy()

In [28]:
# Inspect the copied conversation state (system prompt, roles, separators, empty message list).
chat_state

Conversation(system='You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.', roles=('USER', 'ASSISTANT'), messages=[], offset=0, sep_style=<SeparatorStyle.LLAMA_2: 3>, sep='<s>', sep2='</s>', skip_next=False, conv_id=None)

In [29]:
# Override the template's system prompt for this conversation.
chat_state.system =  "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
# Starts empty and is passed to the upload call.
# NOTE(review): presumably upload_video_without_audio appends the video's embeddings here — confirm in Chat.
img_list = []
# NOTE(review): the recorded run of this cell printed the video path and then ended with
# "RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR", so this call did not complete in that run.
llm_message = chat.upload_video_without_audio(video_path, chat_state, img_list)

./examples/birthday.mp4


RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR