Code Reference: https://huggingface.co/LanguageBind/Video-LLaVA-7B-hf

In [1]:
import os

# Report which conda environment the kernel is running in (None when the variable is unset).
print(os.environ.get("CONDA_DEFAULT_ENV"))

stable_env


In [2]:
import av
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

In [3]:
def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    Frames are returned in decode order; each requested index contributes at
    most one frame, and `indices` does not need to be sorted.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`list[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        IndexError: if `indices` is empty.
        ValueError: if none of the requested indices exists in the video stream.
    '''
    # A set gives O(1) membership tests inside the per-frame loop; `.tolist()` turns
    # NumPy scalars into plain Python numbers so they hash/compare like the loop counter.
    wanted = set(np.asarray(indices).ravel().tolist())
    if not wanted:
        raise IndexError("`indices` is empty: at least one frame index is required")
    # Use the true maximum rather than indices[-1] so unsorted input still works.
    last_wanted = max(wanted)

    decoded = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            # Everything requested has been seen; stop decoding early.
            break
        if i in wanted:
            decoded.append(frame.to_ndarray(format="rgb24"))

    if not decoded:
        raise ValueError(
            "none of the requested frame indices could be decoded from the video stream"
        )
    return np.stack(decoded)

### Load Model

In [4]:
# Model and processor come from the same Hub checkpoint.
MODEL_ID = "LanguageBind/Video-LLaVA-7B-hf"

# Weights are loaded as float16 with device_map="auto" and the "eager" attention implementation.
model = VideoLlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="eager",
)
processor = VideoLlavaProcessor.from_pretrained(MODEL_ID)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Display the loaded model's module structure as the cell output.
model

VideoLlavaForConditionalGeneration(
  (model): VideoLlavaModel(
    (video_tower): CLIPVisionModel(
      (vision_model): CLIPVisionTransformer(
        (embeddings): CLIPVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
          (position_embedding): Embedding(257, 1024)
        )
        (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0-23): 24 x CLIPEncoderLayer(
              (self_attn): CLIPAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True

In [None]:
# hf_hub_download can fetch a sample video from the Hugging Face Hub; the call is kept
# commented out below. Loading the video as an np.array and sampling frames happens in a later cell.
from huggingface_hub import hf_hub_download
# video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename='/home/aritrad/video-study/INTRO.mp4', repo_type="dataset")

### Set Video Path

In [None]:
# Local video file that is opened and sampled in the next cell.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
video_path = '/home/aritrad/MSR-Project/samples/4min-video.mp4'

In [None]:
# Number of evenly spaced frames to sample from the video.
num_sampled_frames = 16

# The context manager closes the container (and its underlying file) once decoding is done.
with av.open(video_path) as container:
    total_frames = container.streams.video[0].frames
    if total_frames <= 0:
        # PyAV reports 0 when the container does not store a frame count; the sampling
        # step below would then be zero, so fail early with a clear message.
        raise ValueError(f"Could not determine the number of frames in {video_path!r}")
    # Evenly spaced (fractional) positions truncated to integer frame indices.
    indices = np.arange(0, total_frames, total_frames / num_sampled_frames).astype(int)
    video = read_video_pyav(container, indices)

In [None]:
# Frame count reported by the first video stream of the opened container.
total_frames

In [None]:
# Frame indices that were selected for decoding.
indices

In [None]:
# Display the configuration object attached to the loaded model.
model.config

### Question 1 - 4 min Sample

In [None]:
# Question text; the next cell inserts it into the prompt template and appends the trailing "?".
question = 'How many disciplines does electrical engineering overlaps with as shown in the slides in the video'

In [None]:
# Recommended prompt layout for this model: a USER turn holding the <video>
# placeholder and the question, followed by "ASSISTANT:".
prompt = f"USER: <video>\n{question}? ASSISTANT:"
# Build the model inputs from the prompt and the sampled frames.
inputs = processor(videos=video, text=prompt, return_tensors="pt")

In [None]:
# Move the processor outputs to the CUDA device before calling generate.
inputs = inputs.to('cuda')

In [None]:
%%time
# Generation settings: cap the answer at 256 new tokens and ask for a dict-style
# result that also carries the hidden states.
generation_options = {
    "max_new_tokens": 256,
    "output_hidden_states": True,
    "return_dict_in_generate": True,
}
out = model.generate(**inputs, **generation_options)

In [None]:
# Turn the generated token ids back into text, dropping special tokens.
generated_text = processor.batch_decode(
    out.sequences,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)
# Show the segment that follows the first "ASSISTANT:" marker.
generated_text[0].split('ASSISTANT:')[1].strip()

### Question 2

In [None]:
# Recommended prompt layout for this model: a USER turn holding the <video>
# placeholder and the question, followed by "ASSISTANT:".
prompt = "USER: <video>\nWhat colour the sample turn to after adding vinegar?? ASSISTANT:"
# Build the model inputs from the prompt and the frames, then move them to the GPU.
inputs = processor(videos=video, text=prompt, return_tensors="pt").to('cuda')

In [None]:
%%time
# Request a dict-style output with hidden states, as the other questions in this notebook do,
# so that `out` carries more than bare token ids when it is handed to plot_activations.
out = model.generate(
    **inputs,
    max_new_tokens=256,
    output_hidden_states=True,
    return_dict_in_generate=True
)
# With return_dict_in_generate=True the token ids live under 'sequences'.
processor.batch_decode(out['sequences'], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [None]:
# NOTE(review): `plot_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_activations(out)

### Question 3

In [None]:
# Recommended prompt layout for this model: a USER turn holding the <video>
# placeholder and the question, followed by "ASSISTANT:".
prompt = "USER: <video>\nWhat is the nature of the pickles made: Acidic or basic? ASSISTANT:"
# Build the model inputs from the prompt and the frames, then move them to the GPU.
inputs = processor(videos=video, text=prompt, return_tensors="pt").to('cuda')

In [None]:
%%time
# Generation settings: cap the answer at 256 new tokens and ask for a dict-style
# result that also carries the hidden states.
generation_options = {
    "max_new_tokens": 256,
    "output_hidden_states": True,
    "return_dict_in_generate": True,
}
out = model.generate(**inputs, **generation_options)
# Decode the generated token ids; the resulting list is the cell output.
processor.batch_decode(
    out.sequences,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

In [None]:
# NOTE(review): `plot_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_activations(out)

In [None]:
# NOTE(review): `plot_all_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_all_activations(out, inputs, processor)

### Question 4

In [None]:
# Recommended prompt layout for this model: a USER turn holding the <video>
# placeholder and the question, followed by "ASSISTANT:".
prompt = "USER: <video>\nWill the vehicles in the video be safe to eat? ASSISTANT:"
# Build the model inputs from the prompt and the frames, then move them to the GPU.
inputs = processor(videos=video, text=prompt, return_tensors="pt").to('cuda')

In [None]:
%%time
# Generation settings: cap the answer at 256 new tokens and ask for a dict-style
# result that also carries the hidden states.
generation_options = {
    "max_new_tokens": 256,
    "output_hidden_states": True,
    "return_dict_in_generate": True,
}
out = model.generate(**inputs, **generation_options)
# Decode the generated token ids; the resulting list is the cell output.
processor.batch_decode(
    out.sequences,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

In [None]:
# NOTE(review): `plot_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_activations(out)

### Question 5

In [None]:
# Recommended prompt layout for this model: "USER: <video>\n<question> ASSISTANT:".
# The newline after <video> must be written as "\n"; a bare backslash before the question
# is an invalid escape sequence and would put a literal backslash into the prompt.
prompt = "USER: <video>\nWhich vehicle is present in the video? ASSISTANT:"
inputs = processor(text=prompt, videos=video, return_tensors="pt")
# Move the processor outputs to the CUDA device before calling generate.
inputs = inputs.to('cuda')

In [None]:
%%time
# Generation settings: cap the answer at 256 new tokens and ask for a dict-style
# result that also carries the hidden states.
generation_options = {
    "max_new_tokens": 256,
    "output_hidden_states": True,
    "return_dict_in_generate": True,
}
out = model.generate(**inputs, **generation_options)
# Decode the generated token ids; the resulting list is the cell output.
processor.batch_decode(
    out.sequences,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

In [None]:
# NOTE(review): `plot_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_activations(out)

### Question 6

In [None]:
# Recommended prompt layout for this model: a USER turn holding the <video>
# placeholder and the question, followed by "ASSISTANT:".
prompt = "USER: <video>\nWhat is the nature of the doughnuts made: Acidic or basic? ASSISTANT:"
# Build the model inputs from the prompt and the frames, then move them to the GPU.
inputs = processor(videos=video, text=prompt, return_tensors="pt").to('cuda')

In [None]:
%%time
# Generation settings: cap the answer at 256 new tokens and ask for a dict-style
# result that also carries the hidden states.
generation_options = {
    "max_new_tokens": 256,
    "output_hidden_states": True,
    "return_dict_in_generate": True,
}
out = model.generate(**inputs, **generation_options)
# Decode the generated token ids; the resulting list is the cell output.
processor.batch_decode(
    out.sequences,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

In [None]:
# NOTE(review): `plot_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_activations(out)

In [None]:
# NOTE(review): `plot_all_activations` is not defined in the visible part of this notebook —
# confirm that a cell defining it runs before this one.
plot_all_activations(out, inputs, processor)