In [None]:
import cv2
import os

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset 
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser
# Disabled alternative kept for reference: the 3B checkpoint with
# limit_mm_per_prompt={"image": 4} (presumably at most 4 images per prompt;
# confirm against the vLLM docs) and max_model_len=10000.
# llm = LLM("Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 4}, max_model_len=10000)
# Active engine: the 7B checkpoint with max_model_len=8000 and max_num_seqs=10.
# Every generation cell below reuses this single `llm` instance.
llm = LLM(
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        max_model_len=8000,
        max_num_seqs=10)


In [None]:
# Decoding settings shared by every llm.generate call in this notebook.
sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=256,
)

In [None]:
from datasets import load_dataset

# Load the "train" split of the HuggingFaceFV/finevideo dataset.
ds = load_dataset("HuggingFaceFV/finevideo", split="train")

# Grab the "mp4" field of the first row; the same field is written to disk
# with a binary file handle in the video-loading cell further down.
video = ds[0]["mp4"]

# Disabled alternative: load with num_proc=24. The original note puts the
# full dataset at roughly 600GB.
#dataset = load_dataset("HuggingFaceFV/finevideo/", split="train", num_proc=24)


In [None]:
# 1. Specify a dataset of video (100 videos)
# 2. Specify a prompt that asks the model to generate a description of the video * 100
# 3. You should be able to feed all of it together at once to the model to generate (DO NOT FEED ONE BY ONE)

In [None]:
from decord import VideoReader, cpu
from PIL import Image
import matplotlib.pyplot as plt 
from tqdm import tqdm
import numpy as np 

In [None]:

# Load the videos: dump each mp4 to disk, then sample one frame per second
# for `num_chunks` consecutive chunks of `time_per_chunk` seconds each.
dataset_path = "/home/ubuntu/temp/10k_vid_dataset/mp4s"
os.makedirs(dataset_path, exist_ok=True)

num_videos = 50      # number of dataset rows to try
num_chunks = 3       # consecutive chunks taken from the start of each video
time_per_chunk = 10  # seconds per chunk (one sampled frame per second)

videos = []
for i in tqdm(range(num_videos), total=num_videos):
    # Save the mp4 payload so decord can open it by path.
    video_path = os.path.join(dataset_path, f"video_{i}.mp4")
    with open(video_path, "wb") as f:
        f.write(ds[i]["mp4"])

    vid_dict = {"vid_path": video_path}
    try:
        # Opening the reader is inside the try so that one corrupt file
        # skips that video instead of aborting the whole loop.
        vr = VideoReader(video_path, ctx=cpu(0))

        # Use the real frame rate so that "one frame per second" holds for
        # videos that are not 30 fps; fall back to 30 if it is unusable.
        fps = vr.get_avg_fps()
        if not np.isfinite(fps) or fps <= 0:
            fps = 30.0

        for chunk_idx in range(num_chunks):
            seconds = np.arange(chunk_idx * time_per_chunk,
                                (chunk_idx + 1) * time_per_chunk)
            frame_idxs = np.round(seconds * fps).astype(np.int64)
            if frame_idxs[-1] >= len(vr):
                raise IndexError(
                    f"video has {len(vr)} frames, chunk {chunk_idx} needs frame {frame_idxs[-1]}"
                )
            vid_dict[f"vT{chunk_idx}"] = vr.get_batch(frame_idxs).asnumpy()
    except Exception as err:
        # Too-short or unreadable videos are dropped; say which one and why.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        print(f"Error processing {video_path}: {err}")
        continue

    # Only videos with every chunk decoded are kept.
    videos.append(vid_dict)

In [None]:
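# DATA STRUCTURE (as built by the cells in this notebook; <c> is the chunk index 0..num_chunks-1)
# videos = [{"vid_path": "path...",
#            "vT<c>": [frame1, frame2, frame3, ...],   # sampled frames of chunk <c>
#            "T<c>_description": question/answer dict describing chunk <c>,
#            "T<c>_timestep": question/answer dict about one sampled second of chunk <c>,
#            "T<c>_qa": question/answer dict with a model-generated question about chunk <c>,
#           }]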

In [None]:
from vllm.assets.video import VideoAsset

def generate_video_description(video_frame_list, prompt, num_chunks):
    """Run one batched generation over every (video, chunk) pair.

    Args:
        video_frame_list: sequence of dicts; each must hold the frames of
            chunk ``c`` under the key ``f"vT{c}"`` for ``c`` in
            ``range(num_chunks)``.
        prompt: either a single prompt string reused for every request, or a
            list/tuple of prompts indexed by
            ``video_index * num_chunks + chunk_index``. A sequence longer
            than needed is allowed; a shorter one raises ``IndexError``.
        num_chunks: number of chunks per video.

    Returns:
        A list with the first generated text of each output returned by
        ``llm.generate`` (callers assume these come back in request order,
        i.e. video-major, chunk-minor).

    Uses the module-level ``llm`` and ``sampling_params``.
    """
    # isinstance (rather than an exact type check) so that tuples of prompts
    # are treated as per-request prompts too.
    per_request_prompts = isinstance(prompt, (list, tuple))

    # All requests are built up front so they go to the engine as one batch.
    messages = [
        {
            "prompt": prompt[vid_idx * num_chunks + chunk_idx] if per_request_prompts else prompt,
            "multi_modal_data": {"video": dict_obj[f"vT{chunk_idx}"]},
        }
        for vid_idx, dict_obj in enumerate(video_frame_list)
        for chunk_idx in range(num_chunks)
    ]

    # Perform inference and log output.
    print("Starting inference on ", len(messages), " messages")
    outputs = llm.generate(messages, sampling_params=sampling_params)
    generated_texts = [output.outputs[0].text for output in outputs]
    print("Generated", len(generated_texts), "texts")
    return generated_texts
    

# Description Generation

In [None]:
# Chat-formatted prompt: a system turn, a user turn holding the video
# placeholder plus the instruction, then an open assistant turn.
prompt_describe = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
    "Describe what is shown in the video. <|im_end|>\n"
    "<|im_start|>assistant\n"
)

# One batched call covering every (video, chunk) pair.
generated_texts = generate_video_description(videos, prompt_describe, num_chunks)

# Results are consumed video-major / chunk-minor, so the flat index splits
# into (video index, chunk index) with a single divmod.
for idx, text in enumerate(generated_texts):
    list_idx, chunk_idx = divmod(idx, num_chunks)
    start_time = chunk_idx * time_per_chunk
    end_time = start_time + time_per_chunk
    print(f"Video {list_idx} chunk {chunk_idx} time {start_time} to {end_time} seconds")
    videos[list_idx][f"T{chunk_idx}_description"] = {
        "Q": f"What occurs between {start_time} and {end_time} seconds?",
        "A": text,
    }

In [None]:
# Spot-check: show the description entries stored on the last video.
last_video = videos[-1]
print(last_video["vid_path"])
for key, value in last_video.items():
    if "description" in key:
        print(f"{key}: {value}")

# Time Step Generation

In [None]:
import re

# Chat-formatted prompt; the literal "{time}" marker is substituted per
# request with a chunk-relative second.
prompt_timestep = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
    "What occurs at time {time} seconds?<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# One chunk-relative second per request. Sized from the videos that actually
# loaded (not `num_videos`) so it lines up one-to-one with the outputs.
sample_time_steps = np.random.randint(0, time_per_chunk, len(videos) * num_chunks)

prompt_list_timesteps = [
    prompt_timestep.replace("{time}", str(time_step))
    for time_step in sample_time_steps
]

generated_texts = generate_video_description(videos, prompt_list_timesteps, num_chunks)

for idx, text in enumerate(generated_texts):
    chunk_idx = idx % num_chunks
    list_idx = idx // num_chunks
    # Plain ints keep numpy scalars out of the stored strings/records.
    vid_time = int(sample_time_steps[idx])
    # The model saw a chunk-relative second; convert to a video-absolute one.
    calc_time = chunk_idx * time_per_chunk + vid_time
    # Rewrite only standalone occurrences of the chunk-relative number in the
    # answer. A plain str.replace would also corrupt digits inside other
    # numbers (e.g. turning "10" into "110" when vid_time is 1). This is
    # still a heuristic: any standalone equal number is rewritten.
    standalone_time = re.compile(rf"(?<![\d.]){vid_time}(?!\d|\.\d)")
    videos[list_idx][f"T{chunk_idx}_timestep"] = {
        # Key is "Q" (not "Q:") to match the description and QA entries.
        "Q": f"What occurs at time {calc_time} seconds?",
        "A": standalone_time.sub(str(calc_time), text),
    }

In [None]:
# Spot-check: list the keys of the first video and show its time-step entries.
first_video = videos[0]
print(first_video.keys())
for key, value in first_video.items():
    if "timestep" in key:
        print(f"{key}: {value}")

# QA

In [None]:
# Ask the model to invent one question per (video, chunk); the answers are
# generated in a separate pass.
prompt_gen_question = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
    "What is a vision understanding question you can ask about the visual content of the video. Only output the question<|im_end|>\n"
    "<|im_start|>assistant\n"
)

questions = generate_video_description(videos, prompt_gen_question, num_chunks)
# Keep the generic name bound as well, as the other generation cells do.
generated_texts = questions

In [None]:
# Template for the answering pass; the literal "{question}" marker is
# replaced with each generated question.
prompt_answer_question = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
    "{question}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# One prompt per generated question, in the same flat order.
prompt_qas = [prompt_answer_question.replace("{question}", question) for question in questions]

answers = generate_video_description(videos, prompt_qas, num_chunks)
generated_texts = answers

# Attach each question/answer pair to its video under the chunk-specific key.
for idx, a in enumerate(answers):
    list_idx, chunk_idx = divmod(idx, num_chunks)
    q = questions[idx]
    videos[list_idx][f"T{chunk_idx}_qa"] = {"Q": q, "A": a}

In [None]:
# Spot-check: print the QA pairs stored on the third-from-last video.
vid_set = videos[-3]
print(vid_set.keys())
for key, entry in vid_set.items():
    if "qa" not in key:
        continue
    print("Q", entry["Q"])
    print("A", entry["A"])
    print("______________")

In [None]:
# Spot-check: print everything except the raw frame arrays for one video.
vid_set = videos[-8]
print(vid_set.keys())
print(vid_set["vid_path"])
for key, value in vid_set.items():
    if "vT" in key:
        # Frame arrays live under keys containing "vT"; skip them.
        continue
    print(key, value)

In [None]:
#save to huggingface dictionary
from datasets import Dataset
root_path = "/home/ubuntu/temp/10k_vid_dataset/"
# Rebuild the records without the frame arrays (keys containing "vT") so
# only the path and the generated text entries are stored.
data_list = [
    {key: value for key, value in vid.items() if "vT" not in key}
    for vid in videos
]
# Build the Hugging Face dataset and write it under root_path.
dataset = Dataset.from_list(data_list)
dataset.save_to_disk(os.path.join(root_path, "finevideo_dataset"))