In [None]:
import os
import json
import torch
import asyncio
import aiofiles
import nest_asyncio
from tqdm import tqdm
from PIL import Image
from google.colab import drive
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# Patch asyncio so the notebook's event loop can be re-entered by the async
# code in this cell.
nest_asyncio.apply()

# Both the input dataset and the output folder live on Google Drive.
drive.mount('/content/drive', force_remount=True)

dataset_path = "/content/drive/MyDrive/mmml_project/mini_gqa.json"
output_folder = "/content/drive/MyDrive/mmml_project/outputs"
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, "mini_gqa_with_llava.json")

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default text encoding.
with open(dataset_path, "r", encoding="utf-8") as f:
    # Iterated record by record below; each record is read for its
    # "question" and "image_file" keys.
    data = json.load(f)

# Single source of truth for the checkpoint so the processor and the model
# can never be loaded from different ids by accident.
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = LlavaNextProcessor.from_pretrained(MODEL_ID)
model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to(device)

# Upper bound on how many records may be inside the inference section at once.
SEMAPHORE = asyncio.Semaphore(20)

async def process_record(record):
    """Ask LLaVA the record's question about the record's image.

    Args:
        record: Mapping with a "question" string and an "image_file" path.
            It is modified in place.

    Returns:
        The same ``record`` object with a "llava_response" key added. The value
        is the model's answer (newly generated tokens only, prompt excluded),
        or a string starting with "Error:" if the image is missing or any step
        of loading / preprocessing / generation fails.

    Note:
        ``model.generate`` is a blocking call, so no other coroutine makes
        progress while it runs; records are effectively handled one at a time.
    """
    question = record["question"]
    image_file = record["image_file"]

    if not os.path.exists(image_file):
        record["llava_response"] = "Error: Image not found"
        return record

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]

    async with SEMAPHORE:
        try:
            # Load inside the try so a corrupt/unreadable image is recorded as
            # a per-record error instead of aborting the whole run, and close
            # the file handle as soon as the pixels have been copied.
            with Image.open(image_file) as raw_image:
                image = raw_image.convert("RGB")

            prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

            # Cast floating-point inputs (the pixel values) to the model's
            # dtype; integer tensors such as input_ids are left untouched.
            inputs = processor(images=image, text=prompt, return_tensors="pt").to(
                device, model.dtype
            )

            with torch.no_grad():
                # Forward *everything* the processor produced. Passing only
                # input_ids/attention_mask drops pixel_values and image_sizes,
                # which means the model would answer without seeing the image.
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    # Explicit pad token: same value generate() would fall back
                    # to, but without emitting a warning for every record.
                    pad_token_id=processor.tokenizer.eos_token_id,
                )

            # generate() returns prompt + continuation; keep the continuation.
            prompt_length = inputs["input_ids"].shape[1]
            generated_text = processor.decode(
                output_ids[0][prompt_length:], skip_special_tokens=True
            )
            record["llava_response"] = generated_text.strip()
        except Exception as e:
            # Best effort: keep going and store the failure in-band.
            record["llava_response"] = f"Error: {e}"

    return record

async def process_dataset():
    """Run ``process_record`` over every record in ``data`` and save the result.

    The results keep the order of ``data`` and are written as indented JSON to
    ``output_path``; the path is printed once the file has been written.
    """
    # The bar is advanced when a record *finishes*. Wrapping the list
    # comprehension that merely creates the coroutines would fill the bar
    # instantly, before any inference has happened.
    with tqdm(total=len(data), desc="Processing LLaVA Responses") as progress:

        async def _tracked(record):
            # Process one record, then tick the progress bar.
            result = await process_record(record)
            progress.update(1)
            return result

        # gather() returns results in the order the awaitables were passed.
        updated_data = await asyncio.gather(*(_tracked(record) for record in data))

    async with aiofiles.open(output_path, "w") as f:
        await f.write(json.dumps(updated_data, indent=2))

    print(f"Updated dataset saved to: {output_path}")

# Top-level `await`: relies on the notebook kernel accepting `await` outside a
# function; in a plain Python script this would need asyncio.run(...) instead.
await process_dataset()

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open

Updated dataset saved to: /content/drive/MyDrive/mmml_project/outputs/mini_gqa_with_llava.json
