In [1]:
!pip install langchain langchain-community huggingface_hub roboflow pillow transformers requests tqdm pandas



# !pip install -q diffusers transformers accelerate peft



In [1]:
# Prompt for the caption-merging step.
# Placeholders filled at format time: {caption1}, {caption2} (the two input
# captions) and {entities} (extra entities to weave into the description).
# The text contains no other braces, so it is safe for str.format-style rendering.
prompt_template_llama2_general = """
You are an advanced language model tasked with merging detailed captions generated by two multimodal systems. Your goal is to create a unified, highly detailed, and factually accurate description of the image.

### Instructions:
- Combine the details from both captions, ensuring all critical elements (objects, actions, spatial relationships, and contextual nuances) are represented.
- Maintain a coherent and consistent tone throughout the output.
- Avoid contradictions or redundant details; instead, refine and enhance the combined information for clarity.
- Incorporate additional entities provided, enriching the description with fine-grained visual and contextual details.
- The resulting caption should serve as the most comprehensive and accurate summary of the image.

### Input Captions:
1. Caption 1: {caption1}
2. Caption 2: {caption2}

### Additional Entities:
- {entities}

### Output:
Write a single, unified caption that captures the full essence of the image, accurately representing all provided details and entities.
"""

# Prompt for the object-extraction step: asks the model to list, as JSON, the
# objects named in a caption that an object detector could find.
#
# The template is rendered with str.format-style substitution, so the only
# single-brace field must be {description}. The braces of the JSON example are
# doubled ({{ / }}) so they come out as literal braces instead of being parsed
# as a (missing) format field, which previously made .format() raise.
# The opening role sentence now describes the extraction task (it used to be a
# copy of the caption-merging role), and a stray '\n\n"' that left a lone
# double quote in the prompt has been removed.
prompt_template_llama2_object_detection = """
You are an advanced language model tasked with extracting objects from a detailed image description.
Below is a detailed description of an image. Your task is to extract and list every object mentioned in the description that can be reliably detected by an object detection model. Please follow these instructions carefully:

### Instructions:
1. Identify each object that is clearly described and can be recognized by typical object detection systems.
2. Be very cautious with numerical details attached to the objects.
3. Do not combine or generalize objects if different numbers are specified. Each unique instance should be explicitly listed.

### Detailed Description:
{description}

### Output:
List all identifiable objects from the detailed description. Return it in json with key named results
Given the description as an input, provide a response in this exact JSON schema:

{{
    "results": "<list of objects>"
}}
"""

# Prompt for the final refinement step: the caption is cross-checked against
# object-detection probabilities and likely hallucinations are removed.
# Placeholders filled at format time: {description} and {object_probabilities}.
# NOTE(review): the instructions use 0.1 as the example cut-off for both the
# "high" and the "low" probability case — confirm this threshold is intended.
final_caption_prompt_template = """
You are an advanced language model tasked with refining a detailed image caption. Your goal is to ensure that the caption is both highly detailed and free of hallucinations by cross-checking it against detection probabilities from an object detection model.

### Instructions:
1. Carefully analyze the provided detailed description of the image.
2. For each object in the description:
   - Cross-check its presence using the detection probabilities from the object detection model.
   - If the probability of detection for an object is high (e.g., above 0.1), retain the object in the caption.
   - If the probability of detection for an object is low (e.g., below 0.1), treat it as a potential hallucination and decide whether to remove it from the caption.
   - If the object is not present in the detection probabilities, leave it unchanged in the caption.
3. Ensure that the revised caption maintains logical flow, coherence, and a consistent tone after removing hallucinated objects.
4. Do not add any new objects or make assumptions beyond the provided information.

### Inputs:
- Detailed Description: {description}
- Detection Probabilities: {object_probabilities}

### Output:
Write a revised, highly detailed caption where all described objects are verified as detected and potential hallucinations are removed.
"""

In [2]:
# import json
import os
import pandas as pd
from tqdm import tqdm

# import pandas as pd

# Both caption dumps are read from the current working directory.
root_dir = os.getcwd()
path_to_kosmos, path_to_llava = (
    os.path.join(root_dir, file_name)
    for file_name in (
        "inference_results_kosmos_2.json",
        "inference_results_llava_first_step.json",
    )
)

# Load the Kosmos-2 results first, then the LLaVA results.
kosmos_res = pd.read_json(path_to_kosmos)
llava_res = pd.read_json(path_to_llava)

# Join the two result tables on the image key; clashing column names receive a
# "_kosmos" / "_llava" suffix. When an image appears more than once in the
# joined table, only its last row is kept.
first_step_df = (
    kosmos_res
    .merge(llava_res, on="image", suffixes=("_kosmos", "_llava"))
    .drop_duplicates(subset="image", keep="last")
)

# First joined row, kept as a sample record.
inputs = first_step_df.iloc[0]

In [3]:
from langchain.prompts import PromptTemplate
# from langchain_community.chat_models import ChatOllama
from langchain.llms import HuggingFacePipeline
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from transformers import pipeline
import timeit


# Model identifiers. `model_name` is the checkpoint id passed to from_pretrained
# further down; `local_llm` is not referenced in the cells visible here.
local_llm = "llama2:13b"
model_name = "meta-llama/Llama-2-7b-hf"

# One PromptTemplate per pipeline stage, each declaring the variables its
# template text expects.

# Stage 1: merge the two captions and the extra entities.
prompt_general = PromptTemplate(
    input_variables=["caption1", "caption2", "entities"],
    template=prompt_template_llama2_general,
)

# Stage 2: extract detectable objects from a description.
prompt_object_detection = PromptTemplate(
    input_variables=["description"],
    template=prompt_template_llama2_object_detection,
)

# Stage 3: refine the caption against detection probabilities.
prompt_final = PromptTemplate(
    input_variables=["description", "object_probabilities"],
    template=final_caption_prompt_template,
)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import timeit
import json
from tqdm import tqdm

# First step: merge the LLaVA and Kosmos captions (plus the extra entities) into
# a single caption per image with the causal LM.
#
# Changes relative to the previous version of this cell:
#   * the prompt is built with `prompt_general`; before, only the two raw
#     captions were tokenized as a text pair, so the instructions and the
#     entities never reached the model;
#   * `do_sample=True` is set so temperature / top_k / top_p actually apply
#     (they are ignored under the default greedy decoding);
#   * the attention mask is passed to `generate`;
#   * only the newly generated tokens are decoded, so the stored caption no
#     longer starts with an echo of the prompt;
#   * a missing "ASSISTANT:" marker no longer raises IndexError;
#   * the timing file is updated in place instead of being overwritten.

# Load the tokenizer and the model once; the model is moved to the GPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

# One {"image", "description_llama_first"} record per processed row.
first_step_responses = []

# Wall-clock timer for the whole step (generation plus result saving).
start = timeit.default_timer()

for _idx, inputs in tqdm(
    first_step_df.iterrows(),
    total=len(first_step_df),  # iterrows() has no length; give tqdm the total explicitly
    desc="First step llama2 generation",
):
    inputs_text = dict(
        # Keep the text after the first "ASSISTANT:" marker; if the marker is
        # absent the whole description is used.
        caption1=inputs["description_llava"].split("ASSISTANT:", 1)[-1].strip(),
        caption2=inputs["description_kosmos"],
        # The first element of each entity entry is used as its name.
        entities=", ".join([entity[0] for entity in inputs["entities"]]),
    )

    # Render the full instruction prompt and move the encoded tensors to the GPU.
    prompt_text = prompt_general.format(**inputs_text)
    inputs_enc = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    prompt_len = inputs_enc["input_ids"].shape[1]

    with torch.no_grad():  # inference only, no gradients needed
        output = model.generate(
            input_ids=inputs_enc["input_ids"],
            attention_mask=inputs_enc["attention_mask"],
            do_sample=True,
            temperature=0.2,
            max_new_tokens=100,
            top_k=50,
            top_p=0.95,
            # No pad token may be configured for this tokenizer; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )

    # output[0] holds prompt tokens followed by generated tokens; decode only the latter.
    response_first = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
    first_step_responses.append({
        "image": inputs["image"],
        "description_llama_first": response_first
    })

    # Checkpoint every 100 results so a crash does not lose all progress.
    if len(first_step_responses) % 100 == 0:
        with open("inference_llama_first.json", "w") as f:
            json.dump(first_step_responses, f, indent=4)

# Save final results
with open("inference_llama_first.json", "w") as f:
    json.dump(first_step_responses, f, indent=4)

# End timer
end = timeit.default_timer()

# Record the elapsed time while keeping timings written by other steps.
try:
    with open("inference_time.json") as f:
        inf_time = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    inf_time = {}
inf_time["inference_llama_first_time"] = end - start
with open("inference_time.json", "w") as f:
    json.dump(inf_time, f)


First step llama2 generation: 1035it [1:32:38,  5.38s/it]

In [None]:
# Second step: for every merged caption from the first step, ask the model to
# list the objects an object detector could find.
#
# Changes relative to the previous version of this cell:
#   * `do_sample=True` is set so temperature / top_k / top_p actually apply
#     (they are ignored under the default greedy decoding);
#   * the attention mask is passed to `generate`;
#   * only the newly generated tokens are decoded, so the stored answer no
#     longer contains the prompt (including its JSON schema example);
#   * the timing file is updated in place, so the first-step timing is kept;
#   * the loop variable no longer reuses the name `response_first`, which is a
#     string in the first-step cell but a dict here.

# Wall-clock timer for the whole step (generation plus result saving).
start = timeit.default_timer()

# One {"image", "objects_llama_second"} record per first-step result.
second_step_responses = []

for first_record in tqdm(first_step_responses, desc="Second step llama2 generation"):
    # Render the object-extraction prompt and move the encoded tensors to the GPU.
    formatted_description = prompt_object_detection.format(
        description=first_record["description_llama_first"]
    )
    inputs_enc = tokenizer(formatted_description, return_tensors="pt").to("cuda")
    prompt_len = inputs_enc["input_ids"].shape[1]

    with torch.no_grad():  # inference only, no gradients needed
        output = model.generate(
            input_ids=inputs_enc["input_ids"],
            attention_mask=inputs_enc["attention_mask"],
            do_sample=True,
            max_new_tokens=100,  # Control the number of tokens generated
            temperature=0.5,
            top_k=50,
            top_p=0.95,
            # No pad token may be configured for this tokenizer; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )

    # output[0] holds prompt tokens followed by generated tokens; decode only the latter.
    response_second = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()

    # Keep the image key so the result can be joined back to the other steps.
    second_step_responses.append({
        "image": first_record["image"],
        "objects_llama_second": response_second  # raw decoded text, not parsed JSON
    })

    # Checkpoint every 100 results so a crash does not lose all progress.
    if len(second_step_responses) % 100 == 0:
        with open("inference_llama_second.json", "w") as f:
            json.dump(second_step_responses, f, indent=4)

# Final result saving
with open("inference_llama_second.json", "w") as f:
    json.dump(second_step_responses, f, indent=4)

# End timer
end = timeit.default_timer()

# Record the elapsed time while keeping timings written by other steps.
try:
    with open("inference_time.json") as f:
        inf_time = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    inf_time = {}
inf_time["inference_llama_second_time"] = end - start
with open("inference_time.json", "w") as f:
    json.dump(inf_time, f)