In [None]:
import os 
# Report the active conda environment name (prints None when the variable is unset).
print(os.environ.get("CONDA_DEFAULT_ENV"))

In [None]:
import av
import torch
import numpy as np
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

In [None]:
def read_video_pyav(container, indices):
    '''
    Decode selected frames of the first video stream with the PyAV decoder.

    The stream is decoded sequentially from the start and decoding stops as
    soon as the largest requested index has been passed. Frames are returned
    in decode (ascending index) order regardless of the order of `indices`,
    and a frame index that is listed more than once is returned only once.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`list[int]`): Frame indices to decode (any iterable of
            integers, e.g. a list or a 1-D numpy array). Need not be sorted.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        IndexError: If `indices` is empty.
        ValueError: If none of the requested indices exist in the stream.
    '''
    # A set gives O(1) membership tests inside the decode loop and makes the
    # function independent of the ordering of `indices`.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise IndexError("indices must contain at least one frame index")
    last_wanted = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            # Nothing left to collect; stop decoding the rest of the stream.
            break
        if i in wanted:
            # Convert right away so only plain arrays are retained.
            frames.append(frame.to_ndarray(format="rgb24"))

    if not frames:
        raise ValueError(
            "none of the requested frame indices were found in the video stream"
        )
    return np.stack(frames)

In [None]:
# Instantiate the Video-LLaVA checkpoint with float16 weights and let
# device_map="auto" decide where the weights are placed.
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    device_map="auto",
    torch_dtype=torch.float16,
)
# The processor is loaded from the same checkpoint as the model.
processor = VideoLlavaProcessor.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf"
)

In [None]:
# Show the kernel's working directory. Plain Python is used instead of the bare
# `pwd` automagic so the cell does not depend on IPython's automagic setting
# or on `pwd` not being shadowed by a variable.
os.getcwd()

In [None]:
# Load the video as an np.array, uniformly sampling 8 frames
from huggingface_hub import hf_hub_download
# video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename='/home/aritrad/video-study/INTRO.mp4', repo_type="dataset")

### Set Video Path

In [None]:
# Path of the video to analyse. Set the VIDEO_LLAVA_VIDEO_PATH environment
# variable to point at a different file without editing this cell; otherwise
# the default sample path below is used.
video_path = os.environ.get(
    "VIDEO_LLAVA_VIDEO_PATH",
    '/home/aritrad/MSR-Project/samples/black-screen.mp4',
)

In [None]:
# Number of frames sampled uniformly across the clip.
NUM_FRAMES = 8

# Open the container in a context manager so the file is closed once the
# frames have been decoded.
with av.open(video_path) as container:
    total_frames = container.streams.video[0].frames
    if total_frames <= 0:
        # Without a frame count there is nothing to sample from, and a zero
        # step would make the index computation fail.
        raise ValueError(
            f"Cannot sample frames: {video_path!r} reports {total_frames} frames"
        )
    # linspace with endpoint=False yields exactly NUM_FRAMES evenly spaced
    # positions in [0, total_frames). When total_frames < NUM_FRAMES some
    # indices repeat after truncation, so fewer distinct frames are returned.
    indices = np.linspace(0, total_frames, num=NUM_FRAMES, endpoint=False).astype(int)
    video = read_video_pyav(container, indices)

### Question 1 - 4 min Sample

In [None]:
# Question text; it is interpolated into the prompt template in a later cell.
question = "When spinlocks in operating system should be used as per the video"

In [None]:
# For better results, we recommend to prompt the model in the following format
prompt = f"USER: <video>\n{question}? ASSISTANT:"
# The processor returns CPU tensors with float32 pixel values, while the model
# was loaded in half precision with device_map="auto". Move the inputs to the
# model's device and cast the floating-point tensors to its dtype explicitly
# (integer tensors such as input_ids keep their dtype).
inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device, model.dtype)

In [None]:
%%time
out = model.generate(**inputs, max_new_tokens=256)
generated_text = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
# Keep everything after the first "ASSISTANT:" marker (the decoded text is
# expected to start with the prompt). maxsplit=1 keeps the answer intact if it
# contains the marker again, and [-1] falls back to the full text instead of
# raising IndexError when the marker is missing.
generated_text[0].split('ASSISTANT:', 1)[-1].strip()

### Question 2

In [None]:
# For better results, we recommend to prompt the model in the following format
prompt = "USER: <video>\nWhat does the standish group report says? ASSISTANT:"
# The processor returns CPU tensors with float32 pixel values, while the model
# was loaded in half precision with device_map="auto". Move the inputs to the
# model's device and cast the floating-point tensors to its dtype explicitly
# (integer tensors such as input_ids keep their dtype).
inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device, model.dtype)

In [None]:
%%time
# Generate at most 256 new tokens for the prepared inputs.
out = model.generate(
    **inputs,
    max_new_tokens=256,
)
# Decode the generated ids to text; the resulting list is the cell output.
processor.batch_decode(
    out,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

### Question 3

In [None]:
# For better results, we recommend to prompt the model in the following format
prompt = "USER: <video>\nWhat is the IEEE definition of software engineering discussed in the video? ASSISTANT:"
# The processor returns CPU tensors with float32 pixel values, while the model
# was loaded in half precision with device_map="auto". Move the inputs to the
# model's device and cast the floating-point tensors to its dtype explicitly
# (integer tensors such as input_ids keep their dtype).
inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device, model.dtype)

In [None]:
%%time
# Generate at most 256 new tokens for the prepared inputs.
out = model.generate(
    **inputs,
    max_new_tokens=256,
)
# Decode the generated ids to text; the resulting list is the cell output.
processor.batch_decode(
    out,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

### Question 4

In [None]:
# For better results, we recommend to prompt the model in the following format
# The video placeholder must be followed by a real newline ("\n"), matching the
# other prompts in this notebook; "\W" is an invalid escape sequence that left
# a literal backslash in the prompt.
prompt = "USER: <video>\nWhich factors are contributing to the software crisis? ASSISTANT:"
# The processor returns CPU tensors with float32 pixel values, while the model
# was loaded in half precision with device_map="auto". Move the inputs to the
# model's device and cast the floating-point tensors to its dtype explicitly
# (integer tensors such as input_ids keep their dtype).
inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device, model.dtype)

In [None]:
%%time
# Generate at most 256 new tokens for the prepared inputs.
out = model.generate(
    **inputs,
    max_new_tokens=256,
)
# Decode the generated ids to text; the resulting list is the cell output.
processor.batch_decode(
    out,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)