In [1]:
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment
# (presumably this is where GROQ_API_KEY comes from — confirm).
load_dotenv()

True

In [3]:
# %pip install langchain-groq
import os

# os.getenv("GROQ_API_KEY")

In [4]:
# from langchain.tools import tool
# from langchain_core.messages import HumanMessage
# from langgraph.checkpoint.memory import MemorySaver
# from langgraph.prebuilt import create_react_agent


from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_groq import ChatGroq

from langchain_core.output_parsers import JsonOutputParser

In [5]:
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
import time


# Structured-output schema for the object-extraction chain: the LLM reply is
# parsed into this model via `llm.with_structured_output(ObjectsDetected)`.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring may be forwarded to the LLM as the schema description —
# confirm before adding one.
class ObjectsDetected(BaseModel):
    # Names of the objects extracted from the image description.
    objects_detected: List[str] = Field(description="List of detected objects")

        
# Chat model used by the chains in this notebook.
llm = ChatGroq(model="llama3-70b-8192", temperature=0.5)

# System prompt for the object-extraction step.
# The opening sentence previously described the caption-merging task (a
# copy-paste from the other prompt), which contradicted the instructions that
# follow; it now states the extraction task. Instruction 2 was reworded so it
# unambiguously asks the model to keep numerical details.
template = '''
You are an advanced language model tasked with extracting objects from a detailed description of an image.

Below is a detailed description of an image. Your task is to extract and list every object mentioned in the description that can be reliably detected by an object detection model. Please follow these instructions carefully:

### Instructions:
1. Identify each object that is clearly described and can be recognized by typical object detection systems.
2. Be very careful with numerical details attached to the objects; do not omit them.
3. Do not combine or generalize objects if different numbers are specified. Each unique instance should be explicitly listed.
'''
system_message = SystemMessagePromptTemplate.from_template(template)
# The only template variable is {description}: the text to extract objects from.
human_message = HumanMessagePromptTemplate.from_template("Here is description: {description}")

prompt_object_detection = ChatPromptTemplate.from_messages([system_message, human_message])
# Chain: prompt -> LLM constrained to return an `ObjectsDetected` instance.
model = prompt_object_detection | llm.with_structured_output(ObjectsDetected)



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [27]:
# Rate-limiter state shared by every call to invoke_with_rate_limit_per_minute.
request_counter = 0
last_reset_time = time.time()


def invoke_with_rate_limit_per_minute(description, requests_per_minute=29, time_window=60):
    """Run the object-extraction chain on ``description`` with client-side throttling.

    At most ``requests_per_minute`` requests are issued per ``time_window``
    seconds. When the budget for the current window is used up, the call
    blocks until the window ends and then proceeds.

    Args:
        description: Text passed to the chain as the ``description`` variable.
        requests_per_minute: Maximum number of requests allowed per window.
        time_window: Window length in seconds. Previously this name was
            referenced but never defined, so hitting the limit raised
            ``NameError`` instead of waiting.

    Returns:
        The ``objects_detected`` attribute of the chain's structured result.
    """
    global request_counter, last_reset_time

    current_time = time.time()
    elapsed = current_time - last_reset_time
    if elapsed >= time_window:
        # The previous window is over: start a fresh one.
        request_counter = 0
        last_reset_time = current_time
        elapsed = 0.0

    if request_counter >= requests_per_minute:
        # Budget exhausted: sleep for whatever is left of the current window.
        wait_time = max(0.0, time_window - elapsed)
        print(f"Rate limit reached. Please wait {wait_time:.2f} seconds.")
        time.sleep(wait_time)
        request_counter = 0
        last_reset_time = time.time()

    response = model.invoke({"description": description}).objects_detected

    # Count the request only after it has completed successfully.
    request_counter += 1

    return response

# Example usage
# NOTE(review): initialised as a dict here, but a later cell reloads this name
# from JSON and calls `.append` on it — presumably it should be a list; confirm.
first_llama_inference = {}

# Smoke-test the extraction chain on a short description (this invokes the LLM chain).
result = invoke_with_rate_limit_per_minute("there are 3 cars are driving on dirty route")


In [28]:
result

['cars', 'route']

In [6]:
# TODO: regenerate first_llama_responses with groq 80b llama
# Structured-output schema for the caption-merging chain: the LLM reply is
# parsed into this model via `llm.with_structured_output(DescriptionObject)`.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring may be forwarded to the LLM as the schema description —
# confirm before adding one.
class DescriptionObject(BaseModel):
    # The merged caption text returned by the model.
    description: str = Field(description="summarized description")

        
# System prompt for the caption-merging step: combine two captions plus extra
# entities into one description.
prompt_template_llama2_general = """
You are an advanced language model tasked with merging detailed captions generated by two multimodal systems. Your goal is to create a unified, highly detailed, and factually accurate description of the image.

### Instructions:
- Combine the details from both captions, ensuring all critical elements (objects, actions, spatial relationships, and contextual nuances) are represented.
- Maintain a coherent and consistent tone throughout the output.
- Avoid contradictions or redundant details; instead, refine and enhance the combined information for clarity.
- Incorporate additional entities provided, enriching the description with fine-grained visual and contextual details.
- The resulting caption should serve as the most comprehensive and accurate summary of the image.
"""

# NOTE(review): `template` is re-assigned here but is not referenced by the
# chain built below (the system message uses `prompt_template_llama2_general`).
template = '''
You are an advanced language model tasked with merging detailed captions generated by two multimodal systems. Your goal is to create a unified, highly detailed, and factually accurate description of the image.  

Below is a detailed description of an image. Your task is to extract and list every object mentioned in the description that can be reliably detected by an object detection model. Please follow these instructions carefully:  

### Instructions:  
1. Identify each object that is clearly described and can be recognized by typical object detection systems.  
2. Be very cautious with numerical details attached to the objects, don't avoid them.  
3. Do not combine or generalize objects if different numbers are specified. Each unique instance should be explicitly listed.  
'''
system_message = SystemMessagePromptTemplate.from_template(prompt_template_llama2_general)
# Template variables: {caption1}, {caption2} and {entities}.
human_message = HumanMessagePromptTemplate.from_template("""
### Input Captions:
1. Caption 1: {caption1}
2. Caption 2: {caption2}

### Additional Entities:
- {entities}""")

# NOTE(review): despite its name, `prompt_object_detection` now holds the
# caption-merging prompt, and `model` is rebound to the merging chain; both
# names were previously bound to the object-extraction chain.
prompt_object_detection = ChatPromptTemplate.from_messages([system_message, human_message])
# Chain: prompt -> LLM constrained to return a `DescriptionObject` instance.
model = prompt_object_detection | llm.with_structured_output(DescriptionObject)

# Rate-limiter state shared by every call to invoke_with_rate_limit_per_minute.
request_counter = 0
last_reset_time = time.time()


def invoke_with_rate_limit_per_minute(caption1, caption2, entities, requests_per_minute=29, time_window=60):
    """Run the caption-merging chain with client-side throttling.

    At most ``requests_per_minute`` requests are issued per ``time_window``
    seconds. When the budget for the current window is used up, the call
    blocks until the window ends and then proceeds.

    Args:
        caption1: Value for the ``caption1`` prompt variable.
        caption2: Value for the ``caption2`` prompt variable.
        entities: Value for the ``entities`` prompt variable.
        requests_per_minute: Maximum number of requests allowed per window.
        time_window: Window length in seconds. Previously this name was
            referenced but never defined, so hitting the limit raised
            ``NameError`` instead of waiting.

    Returns:
        The ``description`` attribute of the chain's structured result.
    """
    global request_counter, last_reset_time

    current_time = time.time()
    elapsed = current_time - last_reset_time
    if elapsed >= time_window:
        # The previous window is over: start a fresh one.
        request_counter = 0
        last_reset_time = current_time
        elapsed = 0.0

    if request_counter >= requests_per_minute:
        # Budget exhausted: sleep for whatever is left of the current window.
        wait_time = max(0.0, time_window - elapsed)
        print(f"Rate limit reached. Please wait {wait_time:.2f} seconds.")
        time.sleep(wait_time)
        request_counter = 0
        last_reset_time = time.time()

    payload = {"caption1": caption1, "caption2": caption2, "entities": entities}
    response = model.invoke(payload).description

    # Count the request only after it has completed successfully.
    request_counter += 1

    return response


In [7]:
import json
# Read the LLaVA first-step results straight from the file handle.
with open("../processed_data/inference_results_llava_first_step.json", mode="r") as f:
    inference_llava = json.load(f)

# Same for the Kosmos-2 results.
with open("../processed_data/inference_results_kosmos_2.json", mode="r") as f:
    inference_kosmos = json.load(f)

In [8]:
import pandas as pd
# Inner-join the two result sets on the image id; columns present in both
# inputs get a `_llava` / `_kosmos` suffix.
merged_df = pd.merge(
    pd.DataFrame(inference_llava),
    pd.DataFrame(inference_kosmos),
    on="image",
    suffixes=("_llava", "_kosmos"),
)

In [44]:

# Resume support: reload the descriptions written by earlier runs so that
# already-processed images can be skipped. On a first run the checkpoint file
# does not exist yet, so start from an empty list instead of failing with
# FileNotFoundError.
if os.path.exists("inference_llama_first_80b.json"):
    with open("inference_llama_first_80b.json", "r") as f:
        first_llama_inference = json.load(f)
else:
    first_llama_inference = []

# Unique image ids in first-seen order (dict keys keep insertion order).
# Unlike building a DataFrame first, this also works when the list is empty.
checked_images = list(dict.fromkeys(record["image"] for record in first_llama_inference))
len(checked_images)

609

In [45]:
from tqdm import tqdm

# Rows whose image does not have a merged description yet (computed once
# instead of building the same mask twice).
pending_rows = merged_df[~merged_df.image.isin(checked_images)]

try:
    for i, r in tqdm(pending_rows.iterrows(), total=len(pending_rows)):
        # Each entry of r["entities"] is indexable; only its first element is sent to the LLM.
        entities = [entity[0] for entity in r["entities"]]
        try:
            description = invoke_with_rate_limit_per_minute(r["description_llava"], r["description_kosmos"], entities)
        except Exception:
            # `except Exception` (not a bare `except`) so Ctrl-C still interrupts the run.
            # Persist progress *before* the long back-off so nothing is lost
            # if the kernel dies while sleeping.
            with open("inference_llama_first_80b.json", "w") as f:
                json.dump(first_llama_inference, f, indent=4)
            time.sleep(60 * 3)

            # Single retry; a second failure propagates to the caller.
            description = invoke_with_rate_limit_per_minute(r["description_llava"], r["description_kosmos"], entities)

        first_llama_inference.append({"image": r["image"], "description": description})
finally:
    # Always write the checkpoint, including when the retry fails or the loop
    # is interrupted, so the next run can resume from here.
    with open("inference_llama_first_80b.json", "w") as f:
        json.dump(first_llama_inference, f, indent=4)

In [48]:
# Recompute the list of processed images from the in-memory results and
# display how many images are covered.
checked_images = pd.DataFrame(first_llama_inference).image.unique().tolist()
len(checked_images)

611

In [49]:
# Write every merged description collected so far to the checkpoint file.
with open("inference_llama_first_80b.json", mode="w") as f:
    json.dump(first_llama_inference, f, indent=4)