## Use the Fine-Tuned Llama 2 from the TIFA Benchmark Team to Generate Questions and Answers for VQA

Acknowledgements:
- https://www.kaggle.com/code/richolson/tifa-question-generation-qwen-vs-tuned-llama/notebook
- https://github.com/Yushi-Hu/tifa

Import dependencies

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import gc
import pandas as pd
import re

Load pipeline

In [None]:
# Hub id of the TIFA question-generation checkpoint; the tokenizer and the
# model weights are both loaded from this same repository.
TIFA_QG_MODEL_ID = "tifa-benchmark/llama2_tifa_question_generation"

tokenizer = AutoTokenizer.from_pretrained(TIFA_QG_MODEL_ID)

# Load the weights in bfloat16 to reduce memory use and leave device
# placement to the library (device_map="auto").
model = AutoModelForCausalLM.from_pretrained(
    TIFA_QG_MODEL_ID,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
llama_pipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Follow TIFA Prompt format

In [None]:
def create_qg_prompt(caption):
    """Build the TIFA question-generation prompt for one image description.

    The system instruction is wrapped in ``<<SYS>>`` markers inside an
    ``[INST]`` block, followed by the description. The prompt deliberately
    ends with ``Entities:`` so the model's completion starts from there.

    Args:
        caption: Image description to interpolate into the prompt.

    Returns:
        The fully formatted prompt string.
    """
    # The wording (including its grammar) is part of the prompt format and is
    # kept exactly as-is.
    system_message = (
        "Given an image description, generate one or two multiple-choice "
        "questions that verifies if the image description is correct.\n"
        "Classify each concept into a type (object, human, animal, food, "
        "activity, attribute, counting, color, material, spatial, location, "
        "shape, other), and then generate a question for each type.\n"
    )
    return (
        f"<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n"
        f"Description: {caption} [/INST] Entities:"
    )


# Smoke-test the prompt template on a single sample caption and display it.
test_caption = "a purple forest at dusk"
prompt = create_qg_prompt(caption=test_caption)
print(prompt)

Create TIFA VQA Dataset

In [None]:
# Compiled once instead of on every call. The question and the choices are
# each confined to a single line, so a "Q:" line that is not directly followed
# by a "Choices:" line cannot swallow the next question. Only the answer may
# run on (DOTALL) until the next line that starts with a word character, or
# the end of the text.
_QA_PATTERN = re.compile(
    r"Q: ([^\n]*)\nChoices: ([^\n]*)\nA: (.*?)(?=\n\w|$)", re.DOTALL
)


def extract_questions(output_text):
    """Parse ``Q:`` / ``Choices:`` / ``A:`` triples out of generated text.

    Args:
        output_text: Model completion to parse. ``None`` or an empty string
            yields no questions.

    Returns:
        A list of dicts, one per matched triple, with keys ``"question"``
        (str), ``"choices"`` (list of str, split on commas, with empty
        entries dropped) and ``"answer"`` (str). All values are stripped of
        surrounding whitespace.
    """
    if not output_text:
        return []

    questions = []
    for question, choices_str, answer in _QA_PATTERN.findall(output_text):
        # Trailing or doubled commas would otherwise produce empty choices.
        choices = [
            choice.strip() for choice in choices_str.split(",") if choice.strip()
        ]
        questions.append(
            {
                "question": question.strip(),
                "choices": choices,
                "answer": answer.strip(),
            }
        )

    return questions


def process_with_model(description, pipeline):
    """Generate and parse multiple-choice questions for one description.

    The description is turned into a prompt, passed to ``pipeline`` with
    sampling enabled (one returned sequence, ``max_length=512``), and the
    text after the prompt is cut at the first blank line before the
    question/choices/answer triples are extracted.

    Args:
        description: Image description to generate questions for.
        pipeline: Callable invoked as ``pipeline(prompt, **generation_kwargs)``
            whose first result exposes a ``"generated_text"`` entry.

    Returns:
        A tuple ``(questions, choices, answers)`` of three parallel lists.
        If anything raises during generation or parsing, the error is printed
        and three empty lists are returned.
    """
    qg_prompt = create_qg_prompt(description)

    try:
        generations = pipeline(
            qg_prompt, do_sample=True, max_length=512, num_return_sequences=1
        )

        # Drop the echoed prompt by slicing off its length.
        completion = generations[0]["generated_text"][len(qg_prompt) :]

        # Keep only the part before the first blank line (the whole text if
        # there is none).
        first_section = completion.partition("\n\n")[0]

        list_questions, list_choices, list_answer = [], [], []
        for item in extract_questions(first_section):
            list_questions.append(item["question"])
            list_choices.append(item["choices"])
            list_answer.append(item["answer"])

        return list_questions, list_choices, list_answer

    except Exception as e:
        # Best effort: one failing description is reported, not raised.
        print(f"Error processing: {e}")
        return [], [], []


def create_dataset(df, pipeline):
    """Run question generation over every row of ``df`` and collect results.

    Args:
        df: DataFrame with a ``"description"`` column.
        pipeline: Generation pipeline forwarded to ``process_with_model``.

    Returns:
        A DataFrame with columns ``description``, ``question``, ``choices``
        and ``answer``, one row per input row; the last three columns hold
        the per-description lists returned by ``process_with_model``.
        Returns ``None`` (after printing a notice) when no rows were
        processed.
    """
    columns = {"description": [], "question": [], "choices": [], "answer": []}

    for _, record in df.iterrows():
        caption = record["description"]
        print(f"Processing: {caption}")

        # Generate and parse the questions for this description.
        generated_questions, generated_choices, generated_answers = (
            process_with_model(caption, pipeline)
        )

        columns["description"].append(caption)
        columns["question"].append(generated_questions)
        columns["choices"].append(generated_choices)
        columns["answer"].append(generated_answers)

        print(f"  Extracted {len(generated_questions)} questions")

        # Release memory between rows.
        gc.collect()
        torch.cuda.empty_cache()

    if not columns["question"]:
        print("No results generated")
        return None

    return pd.DataFrame(columns)


# Only the first five descriptions are processed here.
description_df = pd.read_csv("data/descriptions.csv").head(5)
tifa_llama_predicted_questions = create_dataset(
    df=description_df, pipeline=llama_pipeline
)
# tifa_llama_predicted_questions.to_csv("data/descriptions_with_vqa.csv")