In [1]:
import os
import sys
import shutil
os.environ['TOKENIZERS_PARALLELISM'] = "False"

import torch
import pandas as pd
import numpy as np
import json
from torch.nn.functional import cosine_similarity
from utils.video import read_frames_decord
from IPython.display import display, Markdown, Latex

import shared.utils as su
from utils.video import read_frames_decord
from utils.model import transform_pixel_values
from torchvision.transforms.v2 import (
    ToPILImage,
)
# from notebooks.eval_care_retrieval import load_model

### Load model

In [2]:
from models.modeling_encoders import AutoEncoder

# model_id = "/work/piyush/experiments/CaRe/Tarsier-7b/nli-9k+ego4d-1k/merged_checkpoint"
model_id = "/work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint"
# !ls $model_id

encoder = AutoEncoder.from_pretrained(model_id, device_map='auto')
su.misc.num_params(encoder.model)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading EncoderForTarsier from /work/piyush/experiments/CaRe/Tarsier-7b/final-10112025/nli_9000+ego_1000+subj_replaced-seed_42/merged_checkpoint
### do_image_padding is set as False, images will be resized directly!


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
TarsierForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From ðŸ‘‰v4.50ðŸ‘ˆ onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

::: Number of total parameters in TarsierForConditionalGeneration: 7063.427M


### TVBench

In [3]:
data_dir = "/scratch/shared/beegfs/piyush/datasets/TVBench"
video_dir = f"{data_dir}/video"
csv_path = f"{data_dir}/all_except_ntu120vids.csv"
print(f"CSV file: {csv_path}")
assert os.path.exists(csv_path), f"CSV file not found: {csv_path}"
df = pd.read_csv(csv_path)
df.shape

CSV file: /scratch/shared/beegfs/piyush/datasets/TVBench/all_except_ntu120vids.csv


(2405, 13)

In [4]:
data_root = "/scratch/shared/beegfs/piyush/datasets/TVBench"
data_dir = f"{data_root}/json"
data_list = {
    "Action Count": ("action_count.json", f"{data_root}/video/action_count", "video", False),
    "Object Count": ("object_count.json", f"{data_root}/video/object_count", "video", False),
    "Action Sequence": ("action_sequence.json", f"{data_root}/video/action_sequence", "video", True),  # has start & end
    "Object Shuffle": ("object_shuffle.json", f"{data_root}/video/object_shuffle", "video", False),
    "Scene Transition": ("scene_transition.json", f"{data_root}/video/scene_transition", "video", False),
    "Action Localization": ("action_localization.json", f"{data_root}/video/action_localization", "video", True),  # has start & end
    "Action Antonym": ("action_antonym.json", f"{data_root}/video/action_antonym", "video", False),
    "Unexpected Action": ("unexpected_action.json", f"{data_root}/video/unexpected_action", "video", False),
    "Egocentric Sequence": ("egocentric_sequence.json", f"{data_root}/video/egocentric_sequence", "video", False),
    "Moving Direction": ("moving_direction.json", f"{data_root}/video/moving_direction", "video", False),
}

In [5]:
correct = 0
total = 0
res_list = []
acc_dict = {}

# df = {
#     'video_path': [],
#     'start_time': [],
#     'end_time': [],
#     'question': [],
#     'candidates': [],
#     'answer': [],
# }

df = []

for key in data_list:
    config = data_list[key]

    # Load JSON
    json_path = f"{data_dir}/{config[0]}"
    assert os.path.exists(json_path)
    data = su.io.load_json(json_path)

    video_dir = config[1]
    data_new = []
    for d in data:
        video_path = f"{video_dir}/{d['video']}"

        # Hacks
        if key == 'Action Antonym':
            video_path = video_path.replace(".avi", ".mp4")
            d['video'] = d['video'].replace(".avi", ".mp4")

        if os.path.exists(video_path):
            data_new.append(d)
        else:
            continue
    # print(f"Number of videos for {key}: ", len(data_new))

    for d in su.log.tqdm_iterator(data_new, desc=f"Filtering video for {key}:"):
        video_path = f"{video_dir}/{d['video']}"
        assert os.path.exists(video_path)

        assert 'question' in d
        assert 'candidates' in d
        assert 'answer' in d

        # if config[-1]:
        #     print(d)
        d['video_path'] = video_path
        d['key'] = key

        df.append(d)
df = pd.DataFrame(df)
df.shape

Filtering video for Action Count::   0%|          | 0/536 [00:00<?, ?it/s]

Filtering video for Object Count::   0%|          | 0/148 [00:00<?, ?it/s]

Filtering video for Action Sequence::   0%|          | 0/437 [00:00<?, ?it/s]

Filtering video for Object Shuffle::   0%|          | 0/225 [00:00<?, ?it/s]

Filtering video for Scene Transition::   0%|          | 0/185 [00:00<?, ?it/s]

Filtering video for Action Localization::   0%|          | 0/160 [00:00<?, ?it/s]

Filtering video for Action Antonym::   0%|          | 0/200 [00:00<?, ?it/s]

Filtering video for Unexpected Action::   0%|          | 0/82 [00:00<?, ?it/s]

Filtering video for Egocentric Sequence::   0%|          | 0/200 [00:00<?, ?it/s]

Filtering video for Moving Direction::   0%|          | 0/232 [00:00<?, ?it/s]

(2405, 13)

In [12]:
generate_kwargs = {
    "do_sample": False,
    "max_new_tokens": 1,
    "top_p": 1,
    "temperature": 0.,
    "use_cache": True,
}

In [43]:
def convert_to_prompt(messages):
    """
    Convert a list of message dictionaries to a prompt string.
    
    Args:
        messages: List of message dictionaries with 'role' and 'content' fields
        
    Returns:
        Formatted prompt string
    """
    prompt = ""
    
    for message in messages:
        role = message["role"].upper()
        prompt += f"{role}: "

        content_items = message["content"]
        for item in content_items:
            if item["type"] == "video":
                prompt += "<video>\n"
            elif item["type"] == "text":
                prompt += item["text"]
        
        prompt += " "
    
    prompt += "ASSISTANT: "
    
    return prompt


def generate_answer_for_videoqa(encoder, video_path, question, options, n_frames=16, generate_kwargs={}, verbose=False, start=None, end=None):
    """
    Generates an answer for VideoQA.

    Args:
        video_path (str): video path
        question (str): question
        options (list[str]): list out all the options
        n_frames (int): number of frames
        generate_kwargs (dict): additional kwargs for model.generate
    """
    assert os.path.exists(video_path)

    # Convert options into a single string
    # indexed_options = [f"{j}: {v}" for j, v in enumerate(options)]
    # option_string = '\n'.join(indexed_options)

    indices = ['A', "B", "C", "D"]
    indexed_options = [f"{j}: {v}" for j, v in zip(indices, options)]
    option_string = ' | '.join(indexed_options)

    # option_string = ' | '.join(options)
    messages = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Carefully watch the video and pay attention to the cause and sequence of events,"\
                            " the detail and movement of objects, and the action and pose of persons. Based on"\
                            "your observations, select the best option that accurately addresses the question.\n"
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "fps": 8.0,
                },
                {
                    "type": "text",
                    "text": f"""
                    Question: {question}
                    Only give the best option.

                    Options: {option_string}

                    Strictly pick only the letter coressponding to the best choice.
                    """
                },
            ],
        }
    ]

    # Convert into a single string prompt
    prompt = convert_to_prompt(messages)

    # Prepare video
    pixel_values = read_frames_decord(video_path, n_frames, start=start, end=end).unsqueeze(0)
    pixel_values = transform_pixel_values(pixel_values)
    nframes = pixel_values.shape[1]
    to_image = ToPILImage()
    batched_frames = []
    for batch in pixel_values:
        frames = [to_image(v) for v in batch]
        batched_frames.append(frames)

    # Run through model
    for frames in batched_frames:
        input_prompt = prompt.replace("<video>", "<image>"*len(frames))
        if verbose:
            print(input_prompt)
            print("=" * 60)
        input_ids = encoder.processor.get_text_inputs(input_prompt)
        frames = encoder.processor.get_pixel_values(frames)
        inputs = {
            "input_ids": input_ids,
            "pixel_values": frames,
        }
        inputs = {k:v.to(encoder.model.device) for k,v in inputs.items() if v is not None}
        outputs = encoder.model.generate(
            **inputs,
            **generate_kwargs,
        )
        # print(generate_kwargs)
        # print(outputs.shape)
        # print(outputs[0][inputs['input_ids'][0].shape[0]:].shape)
        output_text = encoder.processor.tokenizer.decode(
            outputs[0][inputs['input_ids'][0].shape[0]:], skip_special_tokens=True,
        )
        break # Safe to break since it is only a single sample
    return output_text

In [48]:
df = df.replace({np.nan: None})

In [49]:
# Run inference on a single sample
i = 100
i = 500
i = np.random.randint(len(df))
i = 0
row = df.iloc[i].to_dict()
row

{'video': 'video_8686.mp4',
 'question': 'The person makes sets of repeated actions. How many times did the person repeat the action in the last set?',
 'candidates': ['2', '3', '5', '4'],
 'answer': '4',
 'video_path': '/scratch/shared/beegfs/piyush/datasets/TVBench/video/action_count/video_8686.mp4',
 'key': 'Action Count',
 'is_seq': None,
 'question_id': None,
 'start': None,
 'end': None,
 'accurate_start': None,
 'accurate_end': None,
 'video_length': None}

In [50]:
generate_answer_for_videoqa(encoder, row['video_path'], row['question'], row['candidates'], n_frames=16, generate_kwargs=generate_kwargs, verbose=True, start=row['start'], end=row['end'])

SYSTEM: Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based onyour observations, select the best option that accurately addresses the question.
 USER: <image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image>

                    Question: The person makes sets of repeated actions. How many times did the person repeat the action in the last set?
                    Only give the best option.

                    Options: A: 2 | B: 3 | C: 5 | D: 4

                    Strictly pick only the letter coressponding to the best choice.
                     ASSISTANT: 


'cla'

In [19]:
# su.visualize.show_single_video(row['video_path'])

In [20]:
# frames = su.video.load_frames_linspace(row['video_path'], n=16)
# su.visualize.concat_images_with_border(frames)

In [52]:
iterator = su.log.tqdm_iterator(range(len(df)), desc='Generating answers')
index_to_letter = {0: "A", 1: "B", 2: "C", 3: "D"}
preds = []
trues = []
for i in iterator:
    row = df.iloc[i].to_dict()
    pred_option = generate_answer_for_videoqa(encoder, row['video_path'], row['question'], row['candidates'], n_frames=16, generate_kwargs=generate_kwargs, verbose=False, start=row['start'], end=row['end'])
    true_option = index_to_letter[row['candidates'].index(row['answer'])]

    preds.append(pred_option)
    trues.append(true_option)

Generating answers:   0%|          | 0/2405 [00:00<?, ?it/s]

In [56]:
(np.array(preds) == np.array(trues)).mean()

0.20291060291060292

In [59]:
df['pred'] = preds
df['true'] = trues

In [62]:
for k in df.key.unique():
    subdf = df[df.key == k]
    acc = np.round((subdf['pred'] == subdf['true']).mean() * 100., 2)
    print(k, acc)

Action Count 8.21
Object Count 16.89
Action Sequence 32.27
Object Shuffle 18.22
Scene Transition 35.68
Action Localization 13.12
Action Antonym 59.0
Unexpected Action 13.41
Egocentric Sequence 4.0
Moving Direction 5.6


### Old code

In [4]:
# Load NextQA dataset
data_dir = "/scratch/shared/beegfs/piyush/datasets/NExTQA"
csv_path = f"{data_dir}/mc.csv"
df = pd.read_csv(csv_path)

In [70]:
from utils.video import read_frames_decord
from utils.model import transform_pixel_values
from torchvision.transforms.v2 import (
    ToPILImage,
)


def convert_to_prompt(messages):
    """
    Convert a list of message dictionaries to a prompt string.
    
    Args:
        messages: List of message dictionaries with 'role' and 'content' fields
        
    Returns:
        Formatted prompt string
    """
    prompt = ""
    
    for message in messages:
        role = message["role"].upper()
        prompt += f"{role}: "
        
        content_items = message["content"]
        for item in content_items:
            if item["type"] == "video":
                prompt += "<video>\n"
            elif item["type"] == "text":
                prompt += item["text"]
        
        prompt += " "
    
    prompt += "ASSISTANT: "
    
    return prompt


def generate_answer_for_videoqa(encoder, video_path, question, options, n_frames=16):
    """
    Generates an answer for VideoQA.

    Args:
        video_path (str): video path
        question (str): question
        options (list[str]): list out all the options
        n_frames (int): number of frames
    """
    assert os.path.exists(video_path)

    # Convert options into a single string
    indexed_options = [f"{j}: {v}" for j, v in enumerate(options)]
    option_string = '\n'.join(indexed_options)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "fps": 8.0,
                },
                {
                    "type": "text",
                    "text": f"""Answer the following question by choosing the right option from provided choices. \n
                    Question: {question} \n
                    Options: \n {option_string}
                    """
                },
            ],
        }
    ]

    # Convert into a single string prompt
    prompt = convert_to_prompt(messages)

    # Prepare video
    pixel_values = read_frames_decord(video_path, n_frames).unsqueeze(0)
    pixel_values = transform_pixel_values(pixel_values)
    nframes = pixel_values.shape[1]
    to_image = ToPILImage()
    batched_frames = []
    for batch in pixel_values:
        frames = [to_image(v) for v in batch]
        batched_frames.append(frames)

    # Run through model
    for frames in batched_frames:
        input_prompt = prompt.replace("<video>", "<image>"*len(frames))
        input_ids = encoder.processor.get_text_inputs(input_prompt)
        frames = encoder.processor.get_pixel_values(frames)
        inputs = {
            "input_ids": input_ids,
            "pixel_values": frames,
        }
        inputs = {k:v.to(encoder.model.device) for k,v in inputs.items() if v is not None}
        outputs = encoder.model.generate(
            **inputs,
            **generate_kwargs,
        )
        output_text = encoder.processor.tokenizer.decode(
            outputs[0][inputs['input_ids'][0].shape[0]:], skip_special_tokens=True,
        )
        break # Safe to break since it is only a single sample
    return output_text, indexed_options

In [80]:
row

{'video': 9528960175,
 'frame_count': 4500,
 'width': 640,
 'height': 480,
 'question': 'where is the child at',
 'answer': 3,
 'qid': 7,
 'type': 'DL',
 'a0': 'playground',
 'a1': 'along a pathway',
 'a2': 'dining table',
 'a3': 'bedroom',
 'a4': 'bathtub'}

In [84]:
def show_keyvalue_markdown(key, value, color='black'):
    display(Markdown(f'<span style="color:{color}">{key}</span>: {value}'))

In [88]:
# Test on a sample
# i = 10
i = np.random.randint(len(df))
row = df.iloc[i].to_dict()

video_path = f"{data_dir}/NExTVideo/{row['video']}.mp4"
assert os.path.exists(video_path)
display(su.visualize.show_single_image_sequence(video_path, label=f"Video ID: {row['video']}"))

question = row['question']

# Add question mark
if not question.endswith("?"):
    question += '?'

# Prepare options for MCQ
options = [
    row['a0'], row['a1'], row['a2'], row['a3'], row['a4'],
]

show_keyvalue_markdown('**Question**', question, color='blue')
show_keyvalue_markdown('**Options**\n', options, color='blue')
# display(Markdown(f'<span style="color:blue">**Question**</span>: {question}'))
# display(Markdown(f'**Options**: \n {options}'))

generated_answer, indexed_options = generate_answer_for_videoqa(
    encoder=encoder, 
    video_path=video_path,
    question=question,
    options=options,
    n_frames=16,
)
# print(generated_answer)
# print(indexed_options)

show_keyvalue_markdown('**Generated text**', generated_answer, color='red')

answer_true = indexed_options[row['answer']]
show_keyvalue_markdown('**Ground truth**', answer_true, color='limegreen')

# display(Markdown(f"**Generated text**: {generated_answer}"))


# display(Markdown(f"**True answer**: {answer_true}"))

VBox(children=(HTML(value='Video ID: 11019529085'), Output()))

<span style="color:blue">**Question**</span>: where did the man get the microphone from?

<span style="color:blue">**Options**
</span>: ['laptop', 'lady in skirt', 'man in shirt', 'his teammates', 'microphone stand']

<span style="color:red">**Generated text**</span>: Answer: 4: microphone stand

<span style="color:limegreen">**Ground truth**</span>: 4: microphone stand

### Dev code

In [24]:
# Messages containing a images list as a video and a text query
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "fps": 8.0,
            },
            {
                "type": "text",
                "text": f"""Answer the following question by choosing the right option from provided choices. \n
                Question: {question} \n
                Options: \n {option_string}
                """
            },
        ],
    }
]
import json
print(json.dumps(messages, indent=1))

[
 {
  "role": "user",
  "content": [
   {
    "type": "video",
    "video": "/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4",
    "fps": 8.0
   },
   {
    "type": "text",
    "text": "Answer the following question by choosing the right option from provided choices. \n\n                Question: where is the child at? \n\n                Options: \n 0: playground\n1: along a pathway\n2: dining table\n3: bedroom\n4: bathtub\n                "
   }
  ]
 }
]


In [58]:
def convert_to_prompt(messages):
    """
    Convert a list of message dictionaries to a prompt string.
    
    Args:
        messages: List of message dictionaries with 'role' and 'content' fields
        
    Returns:
        Formatted prompt string
    """
    prompt = ""
    
    for message in messages:
        role = message["role"].upper()
        prompt += f"{role}: "
        
        content_items = message["content"]
        for item in content_items:
            if item["type"] == "video":
                prompt += "<video>\n"
            elif item["type"] == "text":
                prompt += item["text"]
        
        prompt += " "
    
    prompt += "ASSISTANT: "
    
    return prompt


prompt = convert_to_prompt(messages)
print(prompt)

USER: <video>
Answer the following question by choosing the right option from provided choices. 

                Question: where is the child at? 

                Options: 
 0: playground
1: along a pathway
2: dining table
3: bedroom
4: bathtub
                 ASSISTANT: 


In [66]:
# Process video
from utils.video import read_frames_decord
from utils.model import transform_pixel_values
from torchvision.transforms.v2 import (
    ToPILImage,
)

n_frames = 16
pixel_values = read_frames_decord(video_path, n_frames).unsqueeze(0)
pixel_values = transform_pixel_values(pixel_values)
nframes = pixel_values.shape[1]
to_image = ToPILImage()
batched_frames = []
for batch in pixel_values:
    frames = [to_image(v) for v in batch]
    batched_frames.append(frames)

for frames in batched_frames:
    input_prompt = prompt.replace("<video>", "<image>"*len(frames))
    input_ids = encoder.processor.get_text_inputs(input_prompt)
    frames = encoder.processor.get_pixel_values(frames)
    inputs = {
        "input_ids": input_ids,
        "pixel_values": frames
    }
    inputs = {k:v.to(encoder.model.device) for k,v in inputs.items() if v is not None}
    outputs = encoder.model.generate(
        **inputs,
        **generate_kwargs,
    )
    output_text = encoder.processor.tokenizer.decode(
        outputs[0][inputs['input_ids'][0].shape[0]:], skip_special_tokens=True,
    )
print(output_text)

Answer: 3: bedroom


In [35]:
print(messages[0]['content'][0]['video'])

/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4


In [39]:
from models.tarsier2.dataset.utils import format_one_sample

In [48]:
def process_one(
    model, processor, prompt, video_file, generate_kwargs,
):
    # inputs = processor(prompt, video_file, edit_prompt=True, return_prompt=True)
    sample = format_one_sample(video_file, prompt)
    print(sample)
    batch_data = processor(sample)
    print(f"###Prompt:\n{get_prompt_from_data_dict(sample)}")
    model_inputs = {}
    for k, v in batch_data.items():
        if not isinstance(v, torch.Tensor):
            continue
        model_inputs[k] = v.to(model.device)
    outputs = model.generate(
        **model_inputs,
        **generate_kwargs,
    )
    output_text = processor.processor.tokenizer.decode(
        outputs[0][model_inputs['input_ids'][0].shape[0]:], skip_special_tokens=True,
    )
    return output_text

In [50]:
generate_kwargs = {
    "do_sample": False,
    "max_new_tokens": 128,
    "top_p": 1,
    "temperature": 0.,
    "use_cache": True,
}
input_file = messages[0]['content'][0]['video']
prompt = messages[0]['content'][-1]['text']

pred = process_one(encoder.model, encoder.processor, prompt, input_file, generate_kwargs)
pred

{'messages': [{'role': 'user', 'content': [{'type': 'video', 'video': {'video_file': '/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4'}}, {'type': 'text', 'text': 'Answer the following question by choosing the right option from provided choices. \n\n                Question: where is the child at? \n\n                Options: \n 0: playground\n1: along a pathway\n2: dining table\n3: bedroom\n4: bathtub\n                '}]}, {'role': 'assistant', 'content': []}], 'task': 'video/QA'}
> [0;32m/users/piyush/projects/CaReBench/models/tarsier/processor.py[0m(139)[0;36mget_text_inputs[0;34m()[0m
[0;32m    138 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 139 [0;31m        [0mprompt_ids[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtokenizer[0m[0;34m.[0m[0mencode[0m[0;34m([0m[0mtext[0m[0;34m,[0m [0madd_special_tokens[0m[0;34m=[0m[

ipdb>  text


{'messages': [{'role': 'user', 'content': [{'type': 'video', 'video': {'video_file': '/scratch/shared/beegfs/piyush/datasets/NExTQA/NExTVideo/9528960175.mp4'}}, {'type': 'text', 'text': 'Answer the following question by choosing the right option from provided choices. \n\n                Question: where is the child at? \n\n                Options: \n 0: playground\n1: along a pathway\n2: dining table\n3: bedroom\n4: bathtub\n                '}]}, {'role': 'assistant', 'content': []}], 'task': 'video/QA'}


ipdb>  self.tokenizer.encode(text, add_special_tokens=True)


*** TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]


ipdb>  self.tokenizer.encode('answer this qn', add_special_tokens=True)


[1, 1234, 445, 3855, 29876]


ipdb>  q


In [18]:
encoder.video_eol_prompt

'USER: <video>\nSummary above video in one word: ASSISTANT: '

In [15]:

# Preparation for inference
text = encoder.processor.processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference
generated_ids = encoder.model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

AttributeError: 'CustomImageProcessor' object has no attribute 'apply_chat_template'

In [14]:
encoder.processor.processor

<models.tarsier.processor.CustomImageProcessor at 0x7f9e5b505810>