In [6]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, validator
import dotenv
import os
from typing import List


from constituent_treelib import ConstituentTree, Language
# Define the language for the sentence as well as for the spaCy and benepar models
language = Language.English
# Define which specific SpaCy model should be used (default is Medium)
spacy_model_size = ConstituentTree.SpacyModelSize.Medium
# Create the pipeline (note, the required models will be downloaded and installed automatically)
nlp = ConstituentTree.create_pipeline(language, spacy_model_size)
# Your sentence


def get_constituents(sentence: str) -> list[str]:
    # Create the tree from where we are going to extract the desired noun phrases
    tree = ConstituentTree(sentence, nlp)
    all_phrases = tree.extract_all_phrases(min_words_in_phrases=1)
    return all_phrases["NP"]



caption = "a clock above a building with clouds and lightning background"
question = "what unpleasant emotion does this weather phenomenon evoke?"

caption_constituents = get_constituents(caption)
print(caption_constituents)

question_constituents = get_constituents(question)
print(question_constituents)


def get_all_possible_question_rewrites(question : str, 
                                       caption_constituents : list[str], 
                                       question_constituents : list[str]) -> list[str]:
    questions = []
    for q in question_constituents:
        for c in caption_constituents:
            new_sentence = question.replace(q, c)
            questions.append(new_sentence)

    # return all possible re-written questions
    return questions


questions = get_all_possible_question_rewrites(question, caption_constituents, question_constituents)
print(questions)

[nltk_data] Downloading package benepar_en3 to /Users/szczekulskij/.py
[nltk_data]     env/versions/3.10.15/envs/cognify/share/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['a clock above a building with clouds and lightning background', 'a clock', 'a building with clouds and lightning background', 'a building', 'clouds and lightning background', 'clouds', 'lightning background']
['this weather phenomenon']
['what unpleasant emotion does a clock above a building with clouds and lightning background evoke?', 'what unpleasant emotion does a clock evoke?', 'what unpleasant emotion does a building with clouds and lightning background evoke?', 'what unpleasant emotion does a building evoke?', 'what unpleasant emotion does clouds and lightning background evoke?', 'what unpleasant emotion does clouds evoke?', 'what unpleasant emotion does lightning background evoke?']


In [7]:
dotenv.load_dotenv()

class RelevanceAssessment(BaseModel):
    score: int
    reason: str

    @validator('score')
    def score_must_be_in_range(cls, value):
        if not 1 <= value <= 10:
            raise ValueError('Score must be between 1 and 10')
        return value

class BatchRelevanceAssessment(BaseModel):
    assessments: List[RelevanceAssessment]

parser = PydanticOutputParser(pydantic_object=BatchRelevanceAssessment)


def assess_relevance_batch(original_question: str, new_questions: List[str], model: ChatOpenAI) -> List[RelevanceAssessment]:
    """Assesses the relevance of a batch of new questions compared to the original."""

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", """You are an expert in language and reasoning. 
             Your task is to assess how well each new question retains the meaning and intent of the original question,
             considering a substitution that has been made.
             For each new question, compare the original question to the new question. Think step by step about whether the substitution makes sense
             in the context of the question and whether it preserves the original intent.
             Provide a score from 1 to 10 for each new question, where 1 means the new question is completely irrelevant to the original,
             and 10 means the new question is perfectly relevant and retains the original intent.
             Explain your reasoning for each question in a short sentence.

             Return a JSON array of objects, where each object has a "score" and a "reason" field.
             The scores must be between 1 and 10 (inclusive).
             Your response format should be: {format_instructions}
             """),
            ("human", "Original question: {original_question}\nNew questions:\n{new_questions}")
        ]
    )

    chain = prompt | model | parser

    # Format the new questions for the prompt
    formatted_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(new_questions)])

    input = {"original_question": original_question,
             "new_questions": formatted_questions,
             "format_instructions": parser.get_format_instructions()}

    output = chain.invoke(input)
    return output.assessments


def choose_the_best_questions(original_question: str, all_possible_sentences: List[str]) -> str:
    ''' Given all possible sentences, choose the one that is most relevant to the original question
      after considering the substitutions made.'''

    model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    sentence_assessments = assess_relevance_batch(original_question, all_possible_sentences, model)

    # Find the sentence with the highest relevance score
    best_sentence = None
    best_score = -1
    best_index = -1

    for i, assessment in enumerate(sentence_assessments):
        if assessment.score > best_score:
            best_score = assessment.score
            best_sentence = all_possible_sentences[i]
            best_index = i

    print(f"LLM chose index {best_index} with score {best_score} because: {sentence_assessments[best_index].reason}")

    if best_sentence is None:
        print("No sentences were assessed. Returning the first question.")
        return all_possible_sentences[0]

    return best_sentence

best_question = choose_the_best_questions(question, questions)
print(f"The best question is: {best_question}")

/var/folders/n2/5fn7vvhn3gb8s8nvtd_42ttc0000gn/T/ipykernel_50914/2964411331.py:7: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  @validator('score')


LLM chose index 4 with score 6 because: The clouds and lightning are related to weather, but the question lacks specificity about the phenomenon, reducing its relevance.
The best question is: what unpleasant emotion does clouds and lightning background evoke?


In [8]:
def reWriteQuery(caption: str, question: str) -> str:
    caption_constituents = get_constituents(caption)
    question_constituents = get_constituents(question)
    questions = get_all_possible_question_rewrites(question, caption_constituents, question_constituents)
    best_question = choose_the_best_questions(question, questions)
    return best_question

In [None]:
def VLMQueryRewritingCog(query, image):

    # 1. Call VLM to get caption of the image
    caption = ...

    # 2. Re-write the Query
    new_query = reWriteQuery(caption, query)

    # 3. Ensure that the new query is gonna be used in the future instead
    query = new_query
