In [72]:
import os
import sys
import shutil
os.environ['TOKENIZERS_PARALLELISM'] = "False"

import torch
import pandas as pd
import numpy as np
import json
from torch.nn.functional import cosine_similarity
from utils.video import read_frames_decord
from IPython.display import display, Markdown, Latex

import shared.utils as su
from notebooks.eval_care_retrieval import load_model

In [3]:
from models.modeling_encoders import AutoEncoder

# Directory of the merged checkpoint that every later cell runs against.
# NOTE(review): absolute cluster path — needs adjusting when run elsewhere.
model_id = "/work/piyush/experiments/CaRe/Tarsier-7b/nli-9k+ego4d-1k/merged_checkpoint"
# IPython shell escape: list the checkpoint directory as a quick sanity check.
!ls $model_id

# Load the encoder wrapper; later cells use `encoder.model` and `encoder.processor`.
encoder = AutoEncoder.from_pretrained(model_id, device_map='auto')
# Report the parameter count of the wrapped model.
su.misc.num_params(encoder.model)

added_tokens.json		  model.safetensors.index.json
config.json			  preprocessor_config.json
generation_config.json		  processor_config.json
metrics				  special_tokens_map.json
model-00001-of-00003.safetensors  tokenizer.json
model-00002-of-00003.safetensors  tokenizer.model
model-00003-of-00003.safetensors  tokenizer_config.json


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading EncoderForTarsier from /work/piyush/experiments/CaRe/Tarsier-7b/nli-9k+ego4d-1k/merged_checkpoint
### do_image_padding is set as False, images will be resized directly!


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
TarsierForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

::: Number of total parameters in TarsierForConditionalGeneration: 7063.427M


In [4]:
# Load the NExT-QA annotation file `mc.csv` into a DataFrame.
# Each row carries a video id, a question, options a0..a4 and the answer index
# (see the `row` dump further down).
# NOTE(review): absolute cluster path — adjust `data_dir` when running elsewhere.
data_dir = "/scratch/shared/beegfs/piyush/datasets/NExTQA"
csv_path = f"{data_dir}/mc.csv"
df = pd.read_csv(csv_path)

In [70]:
from utils.video import read_frames_decord
from utils.model import transform_pixel_values
from torchvision.transforms.v2 import (
    ToPILImage,
)


def convert_to_prompt(messages):
    """
    Flatten chat-style messages into a single prompt string.

    Every message contributes "<ROLE>: ", followed by its content items in
    order ("video" items become the placeholder "<video>\n", "text" items
    contribute their text, any other item type is ignored), followed by a
    single space. The string "ASSISTANT: " is appended at the very end.

    Args:
        messages: List of dicts, each with a 'role' string and a 'content'
            list of dicts that carry a 'type' field (and 'text' for text items).

    Returns:
        The assembled prompt string.
    """
    segments = []

    for turn in messages:
        pieces = [f"{turn['role'].upper()}: "]

        for part in turn["content"]:
            kind = part["type"]
            if kind == "video":
                pieces.append("<video>\n")
            elif kind == "text":
                pieces.append(part["text"])

        # Single space separating this turn from whatever follows.
        pieces.append(" ")
        segments.append("".join(pieces))

    # Close with the cue for the model's reply.
    return "".join(segments) + "ASSISTANT: "


def generate_answer_for_videoqa(
    encoder, video_path, question, options, n_frames=16, generate_kwargs=None,
):
    """
    Generates an answer for VideoQA.

    The options are numbered ("0: ...", "1: ..."), packed together with the
    question into a single user message, flattened with `convert_to_prompt`,
    and sent through `encoder.model.generate` along with frames read from the
    video.

    Args:
        encoder: object exposing `.model` (with `.generate` and `.device`) and
            `.processor` (with `get_text_inputs`, `get_pixel_values` and
            `tokenizer`).
        video_path (str): video path; must exist on disk.
        question (str): question
        options (list[str]): list out all the options
        n_frames (int): number of frames requested from `read_frames_decord`.
        generate_kwargs (dict | None): keyword arguments forwarded to
            `encoder.model.generate`. When None, the notebook's deterministic
            decoding settings are used, so the function no longer depends on a
            global `generate_kwargs` being defined in some other cell.

    Returns:
        tuple[str, list[str]]: the decoded generation (prompt tokens stripped)
        and the list of indexed option strings ("<index>: <option>").

    Raises:
        AssertionError: if `video_path` does not exist.
    """
    assert os.path.exists(video_path), f"Video not found: {video_path}"

    if generate_kwargs is None:
        # Same settings as the notebook-level `generate_kwargs` cell.
        generate_kwargs = {
            "do_sample": False,
            "max_new_tokens": 128,
            "top_p": 1,
            "temperature": 0.,
            "use_cache": True,
        }

    # Convert options into a single string
    indexed_options = [f"{j}: {v}" for j, v in enumerate(options)]
    option_string = '\n'.join(indexed_options)
    # NOTE: the whitespace inside the triple-quoted text is part of the prompt
    # sent to the model; it is deliberately left untouched.
    # Only the item "type" (and "text") fields are read by `convert_to_prompt`.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "fps": 8.0,
                },
                {
                    "type": "text",
                    "text": f"""Answer the following question by choosing the right option from provided choices. \n
                    Question: {question} \n
                    Options: \n {option_string}
                    """
                },
            ],
        }
    ]

    # Convert into a single string prompt
    prompt = convert_to_prompt(messages)

    # Prepare video: add a batch dimension, transform, then convert the frames
    # of the (single) sample to PIL images.
    pixel_values = read_frames_decord(video_path, n_frames).unsqueeze(0)
    pixel_values = transform_pixel_values(pixel_values)
    to_image = ToPILImage()
    frames = [to_image(v) for v in pixel_values[0]]

    # One "<image>" placeholder per frame replaces the "<video>" placeholder.
    input_prompt = prompt.replace("<video>", "<image>" * len(frames))
    inputs = {
        "input_ids": encoder.processor.get_text_inputs(input_prompt),
        "pixel_values": encoder.processor.get_pixel_values(frames),
    }
    device = encoder.model.device
    inputs = {k: v.to(device) for k, v in inputs.items() if v is not None}

    # Run through model
    outputs = encoder.model.generate(
        **inputs,
        **generate_kwargs,
    )

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_len = inputs['input_ids'][0].shape[0]
    output_text = encoder.processor.tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True,
    )
    return output_text, indexed_options

In [80]:
# Inspect the currently sampled annotation row (`row` is assigned in the "Test on a sample" cell).
row

{'video': 9528960175,
 'frame_count': 4500,
 'width': 640,
 'height': 480,
 'question': 'where is the child at',
 'answer': 3,
 'qid': 7,
 'type': 'DL',
 'a0': 'playground',
 'a1': 'along a pathway',
 'a2': 'dining table',
 'a3': 'bedroom',
 'a4': 'bathtub'}

In [84]:
def show_keyvalue_markdown(key, value, color='black'):
    """Display `key: value` as Markdown, with the key wrapped in a <span> of the given CSS color."""
    rendered = f'<span style="color:{color}">{key}</span>: {value}'
    display(Markdown(rendered))

In [88]:
# Sanity-check the pipeline on one randomly drawn sample.
# (Assign a fixed index to `i`, e.g. 10, to look at a specific row instead.)
i = np.random.randint(len(df))
row = df.iloc[i].to_dict()

video_path = f"{data_dir}/NExTVideo/{row['video']}.mp4"
assert os.path.exists(video_path)
display(su.visualize.show_single_image_sequence(video_path, label=f"Video ID: {row['video']}"))

# Make sure the question ends with a question mark.
question = row['question']
question = question if question.endswith("?") else question + '?'

# Collect the five MCQ options a0..a4 in order.
options = [row[f'a{k}'] for k in range(5)]

show_keyvalue_markdown('**Question**', question, color='blue')
show_keyvalue_markdown('**Options**\n', options, color='blue')

generated_answer, indexed_options = generate_answer_for_videoqa(
    encoder=encoder,
    video_path=video_path,
    question=question,
    options=options,
    n_frames=16,
)

show_keyvalue_markdown('**Generated text**', generated_answer, color='red')

# `row['answer']` indexes into the options, so it selects the ground-truth entry.
answer_true = indexed_options[row['answer']]
show_keyvalue_markdown('**Ground truth**', answer_true, color='limegreen')

VBox(children=(HTML(value='Video ID: 11019529085'), Output()))

<span style="color:blue">**Question**</span>: where did the man get the microphone from?

<span style="color:blue">**Options**
</span>: ['laptop', 'lady in skirt', 'man in shirt', 'his teammates', 'microphone stand']

<span style="color:red">**Generated text**</span>: Answer: 4: microphone stand

<span style="color:limegreen">**Ground truth**</span>: 4: microphone stand

### Dev code

In [24]:
# Build the chat-style message list: one user turn holding the video and the text query.
# `option_string` is rebuilt here from `options` because the only other place it
# exists is as a local variable inside `generate_answer_for_videoqa`; without this
# line the cell raises NameError on a fresh kernel.
option_string = '\n'.join(f"{j}: {v}" for j, v in enumerate(options))
# NOTE: the whitespace inside the triple-quoted text is part of the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "fps": 8.0,
            },
            {
                "type": "text",
                "text": f"""Answer the following question by choosing the right option from provided choices. \n
                Question: {question} \n
                Options: \n {option_string}
                """
            },
        ],
    }
]
# `json` is already imported in the notebook's first cell.
print(json.dumps(messages, indent=1))

[
 {
  "role": "user",
  "content": [
   {
    "type": "video",
    "video": "/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4",
    "fps": 8.0
   },
   {
    "type": "text",
    "text": "Answer the following question by choosing the right option from provided choices. \n\n                Question: where is the child at? \n\n                Options: \n 0: playground\n1: along a pathway\n2: dining table\n3: bedroom\n4: bathtub\n                "
   }
  ]
 }
]


In [58]:
def convert_to_prompt(messages):
    """
    Flatten chat-style messages into a single prompt string.

    Every message contributes "<ROLE>: ", followed by its content items in
    order ("video" items become the placeholder "<video>\n", "text" items
    contribute their text, any other item type is ignored), followed by a
    single space. The string "ASSISTANT: " is appended at the very end.

    Args:
        messages: List of dicts, each with a 'role' string and a 'content'
            list of dicts that carry a 'type' field (and 'text' for text items).

    Returns:
        The assembled prompt string.
    """
    segments = []

    for turn in messages:
        pieces = [f"{turn['role'].upper()}: "]

        for part in turn["content"]:
            kind = part["type"]
            if kind == "video":
                pieces.append("<video>\n")
            elif kind == "text":
                pieces.append(part["text"])

        # Single space separating this turn from whatever follows.
        pieces.append(" ")
        segments.append("".join(pieces))

    # Close with the cue for the model's reply.
    return "".join(segments) + "ASSISTANT: "


# Flatten the dev `messages` into the prompt string and print it for inspection.
prompt = convert_to_prompt(messages)
print(prompt)

USER: <video>
Answer the following question by choosing the right option from provided choices. 

                Question: where is the child at? 

                Options: 
 0: playground
1: along a pathway
2: dining table
3: bedroom
4: bathtub
                 ASSISTANT: 


In [66]:
# Process video
from utils.video import read_frames_decord
from utils.model import transform_pixel_values
from torchvision.transforms.v2 import (
    ToPILImage,
)

# Dev version of the video -> generation pipeline, run step by step at top level.
# NOTE(review): relies on the globals `video_path`, `prompt`, `encoder` and on
# `generate_kwargs`, which is only defined in a later cell of this notebook.
n_frames = 16
# Read frames and add a leading batch dimension before the transform.
pixel_values = read_frames_decord(video_path, n_frames).unsqueeze(0)
pixel_values = transform_pixel_values(pixel_values)
nframes = pixel_values.shape[1]
to_image = ToPILImage()
# Convert every sample in the batch to a list of PIL images.
batched_frames = []
for batch in pixel_values:
    frames = [to_image(v) for v in batch]
    batched_frames.append(frames)

for frames in batched_frames:
    # One "<image>" placeholder per frame replaces the "<video>" placeholder.
    input_prompt = prompt.replace("<video>", "<image>"*len(frames))
    input_ids = encoder.processor.get_text_inputs(input_prompt)
    # `frames` is rebound from the PIL list to the processor's output here.
    frames = encoder.processor.get_pixel_values(frames)
    inputs = {
        "input_ids": input_ids,
        "pixel_values": frames
    }
    # Move every non-None input to the model's device.
    inputs = {k:v.to(encoder.model.device) for k,v in inputs.items() if v is not None}
    outputs = encoder.model.generate(
        **inputs,
        **generate_kwargs,
    )
    # Slice off the prompt tokens so only the generated continuation is decoded.
    output_text = encoder.processor.tokenizer.decode(
        outputs[0][inputs['input_ids'][0].shape[0]:], skip_special_tokens=True,
    )
# Prints the text from the last loop iteration.
print(output_text)

Answer: 3: bedroom


In [35]:
# Show the video path stored in the first content item of the dev `messages`.
print(messages[0]['content'][0]['video'])

/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4


In [39]:
from models.tarsier2.dataset.utils import format_one_sample

In [48]:
def process_one(
    model, processor, prompt, video_file, generate_kwargs, verbose=True,
):
    """
    Run a single (video, prompt) pair through `model.generate` and decode the reply.

    The sample is built with `format_one_sample(video_file, prompt)` and passed
    to `processor`; every tensor-valued entry of the result is moved to
    `model.device` and forwarded to `model.generate`.

    Args:
        model: model exposing `.device` and `.generate`.
        processor: callable applied to the formatted sample; must return a
            mapping that contains 'input_ids', and must expose
            `.processor.tokenizer` for decoding.
            NOTE(review): the recorded outputs in this notebook show that
            `encoder.processor.processor` is a `CustomImageProcessor` — confirm
            the processor passed in really provides `.processor.tokenizer`.
        prompt (str): text prompt for the sample.
        video_file (str): path to the video file.
        generate_kwargs (dict): keyword arguments forwarded to `model.generate`.
        verbose (bool): when True (default), print the formatted sample and the
            prompt text for debugging.

    Returns:
        str: decoded generation with the prompt tokens stripped.
    """
    sample = format_one_sample(video_file, prompt)
    if verbose:
        print(sample)
    batch_data = processor(sample)
    if verbose:
        # The original called `get_prompt_from_data_dict`, which is never
        # imported or defined in this notebook; print the prompt text directly.
        print(f"###Prompt:\n{prompt}")

    # Keep only tensor entries and move them to the model's device.
    model_inputs = {
        k: v.to(model.device)
        for k, v in batch_data.items()
        if isinstance(v, torch.Tensor)
    }
    outputs = model.generate(
        **model_inputs,
        **generate_kwargs,
    )

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_len = model_inputs['input_ids'][0].shape[0]
    output_text = processor.processor.tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True,
    )
    return output_text

In [50]:
# Decoding settings (sampling disabled) read as a global by the generation cells in this notebook.
generate_kwargs = {
    "do_sample": False,
    "max_new_tokens": 128,
    "top_p": 1,
    "temperature": 0.,
    "use_cache": True,
}
# Pull the video path and the raw question text back out of the dev `messages`.
input_file = messages[0]['content'][0]['video']
# NOTE(review): this rebinds the global `prompt` (previously the flattened
# "USER: ... ASSISTANT: " string) to just the text of the last content item.
prompt = messages[0]['content'][-1]['text']

# NOTE(review): in the recorded run this call stopped at an ipdb breakpoint inside
# the processor's `get_text_inputs`, where encoding the sample dict raised a TypeError.
pred = process_one(encoder.model, encoder.processor, prompt, input_file, generate_kwargs)
pred

{'messages': [{'role': 'user', 'content': [{'type': 'video', 'video': {'video_file': '/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4'}}, {'type': 'text', 'text': 'Answer the following question by choosing the right option from provided choices. \n\n                Question: where is the child at? \n\n                Options: \n 0: playground\n1: along a pathway\n2: dining table\n3: bedroom\n4: bathtub\n                '}]}, {'role': 'assistant', 'content': []}], 'task': 'video/QA'}
> [0;32m/users/piyush/projects/CaReBench/models/tarsier/processor.py[0m(139)[0;36mget_text_inputs[0;34m()[0m
[0;32m    138 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 139 [0;31m        [0mprompt_ids[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtokenizer[0m[0;34m.[0m[0mencode[0m[0;34m([0m[0mtext[0m[0;34m,[0m [0madd_special_tokens[0m[0;34m=[0m[

ipdb>  text


{'messages': [{'role': 'user', 'content': [{'type': 'video', 'video': {'video_file': '/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4'}}, {'type': 'text', 'text': 'Answer the following question by choosing the right option from provided choices. \n\n                Question: where is the child at? \n\n                Options: \n 0: playground\n1: along a pathway\n2: dining table\n3: bedroom\n4: bathtub\n                '}]}, {'role': 'assistant', 'content': []}], 'task': 'video/QA'}


ipdb>  self.tokenizer.encode(text, add_special_tokens=True)


*** TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]


ipdb>  self.tokenizer.encode('answer this qn', add_special_tokens=True)


[1, 1234, 445, 3855, 29876]


ipdb>  q


In [18]:
# Inspect the encoder's built-in video prompt template (shows the USER/ASSISTANT format it uses).
encoder.video_eol_prompt

'USER: <video>\nSummary above video in one word: ASSISTANT: '

In [15]:

# Preparation for inference
# NOTE(review): the recorded run of this cell failed with
# AttributeError: 'CustomImageProcessor' object has no attribute 'apply_chat_template'.
# `process_vision_info` and `processor` are also not defined anywhere in the
# visible notebook, so the later lines would fail as well.
text = encoder.processor.processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference
generated_ids = encoder.model.generate(**inputs, max_new_tokens=128)
# Strip the prompt tokens from each generated sequence before decoding.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

AttributeError: 'CustomImageProcessor' object has no attribute 'apply_chat_template'

In [14]:
# Inspect the inner processor object; the recorded output shows it is a `CustomImageProcessor`.
encoder.processor.processor

<models.tarsier.processor.CustomImageProcessor at 0x7f9e5b505810>