In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Target device and checkpoint used throughout the notebook.
device = "cuda:0"
model_path = "DAMO-NLP-SG/VideoLLaMA3-2B"

# Loading options: run the checkpoint's own (remote) modelling code, map the
# empty-string key of the device map to `device`, load weights in bfloat16,
# and request the FlashAttention-2 attention implementation.
model_load_kwargs = dict(
    trust_remote_code=True,
    device_map={"": device},
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs)

# The processor comes from the same checkpoint, also with remote code enabled.
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
)



In [None]:
# Video entry handed to the processor; keys and values are passed through as-is.
video_spec = {
    "video_path": "/home/ubuntu/temp/mp4s/0.mp4",
    "fps": 1,
    "max_frames": 180,
    "start_time": 0,
    "end_time": 15,
}

# Multi-turn chat: system prompt, a user turn carrying the video plus a text
# request, a canned assistant reply, and the follow-up user question.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": video_spec},
            {"type": "text", "text": "Describe the video."},
        ],
    },
    {"role": "assistant", "content": "It is a police academy training video."},
    {"role": "user", "content": "What is the video about?"},
]

# Build model inputs; the generation prompt is appended so that
# `generate` continues with the assistant's answer.
processed = processor(
    conversation=conversation,
    add_system_prompt=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Move every tensor to the target device; non-tensor entries are kept unchanged.
inputs = {}
for key, value in processed.items():
    inputs[key] = value.to(device) if isinstance(value, torch.Tensor) else value

# Cast the pixel values to bfloat16 (the dtype the model was loaded with).
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=1024)
decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
response = decoded[0].strip()
print(response)

In [None]:
from transformers.cache_utils import DynamicCache
# Same conversation as the `generate` cell, repeated here so this cell can be
# run on its own.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": {"video_path": "/home/ubuntu/temp/mp4s/0.mp4", "fps": 1, "max_frames": 180, "start_time": 0, "end_time": 15}},
            {"type": "text", "text": "Describe the video."},
        ]
    },
    {"role": "assistant", "content": "It is a police academy training video."},
    {"role": "user", "content": "What is the video about?"},
]

# No generation prompt here: the prompt is only prefilled into the cache, and
# the tokens that start the assistant turn are fed by hand in a later cell.
inputs = processor(
    conversation=conversation,
    add_system_prompt=True,
    add_generation_prompt=False,
    return_tensors="pt"
)
# Move tensors to the target device; leave non-tensor entries untouched.
inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

# Fresh cache that the forward pass fills in place; later cells keep passing
# this same object. Re-running this cell starts again from an empty cache.
kv_cache = DynamicCache()

# Inference only: without no_grad the cached tensors and the returned logits
# would keep the whole autograd graph of the prompt alive on the GPU.
# The module is called directly (rather than `.forward`) so registered hooks run.
with torch.no_grad():
    # `a` keeps its original name in case other cells inspect the prefill output.
    a = model(**inputs, past_key_values=kv_cache, use_cache=True)

# Shape of the first cached tensor of the first layer (presumably the layer-0
# key states -- confirm against the DynamicCache indexing convention).
print(kv_cache[0][0].shape)

In [None]:
# Display the last 50 token ids of the prefilled prompt (all rows, trailing 50
# columns) -- presumably to check how the prompt ends before tokens are fed by hand.
inputs["input_ids"][:,-50:]

In [None]:
tokenizer = processor.tokenizer

# Greedily chosen token ids, stored as plain Python ints (not CUDA tensors).
token_output_list = []

# Token ids appended by hand to the already-cached prompt.
# NOTE(review): presumably "assistant" followed by a newline in this
# tokenizer's vocabulary, i.e. the tail of the generation prompt. Confirm with
# tokenizer.convert_ids_to_tokens([77091, 198]), and check whether the chat
# template also expects a role-start token before them.
input_ids = [[77091, 198]]

# Inference only: no_grad keeps the cache from retaining an autograd graph.
# NOTE: this extends `kv_cache` in place, so re-running the cell without
# re-running the prefill cell appends these tokens a second time.
with torch.no_grad():
    output = model(
        input_ids=torch.tensor(input_ids, device=device),
        past_key_values=kv_cache,
        use_cache=True,
    )
print(kv_cache[0][0].shape)

# Greedy choice at every fed position; only the prediction at the last
# position is the next token, kept with shape (batch, 1) so it can be fed back.
max_idx = torch.argmax(output.logits, dim=-1)
max_idx = max_idx[:, -1:]
print(max_idx)
# convert_ids_to_tokens is given a flat list of ints rather than a 2-D tensor.
print(tokenizer.convert_ids_to_tokens(max_idx[0].tolist()))
token_output_list.append(max_idx[0, 0].item())

In [None]:
# Greedy decoding for 10 more steps: feed the previously chosen token back in,
# reuse (and extend, in place) the shared KV cache, and pick the arg-max token.
# Runs under no_grad so the growing cache does not retain autograd graphs.
with torch.no_grad():
    for i in range(10):
        output = model(input_ids=max_idx, past_key_values=kv_cache, use_cache=True)
        # One token is fed per step, so this is the next-token id for each row.
        max_idx = torch.argmax(output.logits, dim=-1)
        # Pass a flat list of ints to the tokenizer rather than a 2-D tensor.
        print(tokenizer.convert_ids_to_tokens(max_idx[0].tolist()))
        print(max_idx)
        # Store a plain Python int so the list is cheap to show and decode.
        token_output_list.append(max_idx[0, 0].item())

In [None]:
# Display the token ids collected so far by the manual decoding cells.
token_output_list

In [None]:
# Turn the collected token ids back into text; each entry is first converted
# to a plain int so the tokenizer receives an ordinary list of ids.
collected_ids = [int(token_id) for token_id in token_output_list]
tokenizer.decode(collected_ids)