In [1]:
import os
import sys
import json

import torch
from tqdm import tqdm
from PIL import Image
import torch.nn.functional as F
from utils.video import read_frames_decord
from torchvision.transforms.v2 import PILToTensor
from models.modeling_encoders import AutoEncoder

import shared.utils as su

In [2]:
# Load model
model_path = "/work/piyush/experiments/CaRe/special_milestones/care-stage2-nli-27k-ego4d-3k"
encoder = AutoEncoder.from_pretrained(
    model_path,
    dtype=torch.float16,
    device_map='cuda:0',
)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading EncoderForQwen2VL from /work/piyush/experiments/CaRe/special_milestones/care-stage2-nli-27k-ego4d-3k


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Load data
data = "carebench"
data_path = "../data.json"
assert os.path.exists(data_path), f"{data_path} not found"
with open(data_path) as f:
    data_configs = json.load(f)
data_config = data_configs[data]
data_config

{'anno_path': '/scratch/shared/beegfs/piyush/datasets/CaReBench/json/metadata.json',
 'data_root': '/scratch/shared/beegfs/piyush/datasets/CaReBench/videos',
 'media_type': 'video'}

In [4]:
anno_path = data_config['anno_path']
with open(anno_path) as f:
    data = json.load(f)
len(data)

1000

In [5]:
num_frames = 32
trim30 = False

item = data[0]
video_path = f"{data_config['data_root']}/{item['video']}"
assert os.path.exists(video_path)

caption = item['caption']
# video = read_frames_decord(
#     video_path, num_frames=num_frames, trimmed30=trim30)

# caption, video.shape

In [6]:
with torch.no_grad():
    text_emb = encoder.encode_text(caption)
text_emb.shape

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


torch.Size([1, 3584])

In [7]:
duration = su.video.get_duration(video_path)

In [8]:
import decord
vr = decord.VideoReader(video_path)
vr[0].shape

torch.Size([720, 1280, 3])

In [11]:
256 * 1280/720

455.1111111111111

In [12]:
from models.modeling_basemodels import EOL_PROMPTS
from qwen_vl_utils import process_vision_info

messages = [{
    "role": "user",
    "content": [
        {"type": "video", "video": video_path, 'fps': 32. / duration, "resized_height": 256, "resized_width": 455},
        {"type": "text", "text": EOL_PROMPTS['video']},
    ],
}]
prompt = encoder.processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
print(prompt)
print(video_inputs[0].shape)

inputs = encoder.processor(
    text=[prompt],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    # **video_kwargs,
)
inputs = inputs.to("cuda")

video_reader_backend decord error, use torchvision as default, msg: 'Tensor' object has no attribute 'asnumpy'


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|video_pad|><|vision_end|><video>
Summary above video in one word:<|im_end|>
<|im_start|>assistant

torch.Size([32, 3, 252, 448])


In [13]:
with torch.inference_mode():
    output = encoder.model.generate(
        **inputs, max_new_tokens=1, output_hidden_states=True, return_dict_in_generate=True,
    )
    z = output.hidden_states[0][-1][:, -1, :].cpu()
z.shape

torch.Size([1, 3584])

In [None]:
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

In [16]:
prompt = encoder.video_eol_prompt
prompt = prompt.replace("<video>", "<|vision_start|><|video_pad|><|vision_end|>")
print(prompt)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|video_pad|><|vision_end|>
Summary above video in one word:<|im_end|>
<|im_start|>assistant



In [18]:
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4",
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

#In Qwen 2.5 VL, frame rate information is also input into the model to align with absolute time.
# Preparation for inference
text = encoder.processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(text)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|video_pad|><|vision_end|>Describe this video.<|im_end|>
<|im_start|>assistant



In [15]:
with torch.no_grad():
    video_emb = encoder.encode_vision(video.unsqueeze(0)).cpu()
video_emb.shape

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.25 GiB. GPU 0 has a total capacity of 44.47 GiB of which 2.79 GiB is free. Including non-PyTorch memory, this process has 41.68 GiB memory in use. Of the allocated memory 37.48 GiB is allocated by PyTorch, and 4.01 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)