## Imports

In [1]:
%pip install langchain
%pip install langchain-openai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Functions

In [6]:
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import pandas as pd 

class FactComparator:
    """Compare the facts stated in an answer with the facts stated in a context.

    Three LLM chains are applied in turn: pronouns are replaced by their
    referents, each text is split into a list of facts, and the two fact
    lists are compared. The comparison output is parsed into a
    ``ComparisonResult`` from which two percentages are derived:
    groundedness (share of answer facts also found in the context) and
    thoroughness (share of context facts also found in the answer).
    """

    def __init__(self, model):
        """Build the three chains and the comparison-output parser.

        Args:
            model: LLM object passed as ``llm`` to every ``LLMChain``.
        """
        self.model = model
        self.pronoun_chain = LLMChain(prompt=self._pronoun_prompt(), llm=model)
        self.parse_chain = LLMChain(prompt=self._parse_prompt(), llm=model)
        self.compare_chain = LLMChain(prompt=self._compare_prompt(), llm=model)
        self.parser = PydanticOutputParser(pydantic_object=ComparisonResult)

    def process_data(self, context, answer):
        """Run the full pipeline on one context/answer pair.

        Args:
            context: Reference text.
            answer: Text whose facts are checked against ``context``.

        Returns:
            Dict with the pronoun-resolved texts, the two fact lists as
            returned by the parse chain, and the parsed comparison under
            ``"comparison_result"``.
        """
        resolve_pronouns = self.pronoun_chain.run
        extract_facts = self.parse_chain.run

        # Call order matters for reproducing the same sequence of LLM requests:
        # both pronoun passes first, then both fact-extraction passes.
        context_resolved = resolve_pronouns(text=context)
        answer_resolved = resolve_pronouns(text=answer)
        context_facts = extract_facts(text=context_resolved)
        answer_facts = extract_facts(text=answer_resolved)

        raw_comparison = self.compare_chain.run(
            context_list=context_facts,
            answer_list=answer_facts,
        )
        parsed_comparison = self.parser.parse(raw_comparison)

        return dict(
            context_replace_pronouns=context_resolved,
            answer_replace_pronouns=answer_resolved,
            context_list=context_facts,
            answer_list=answer_facts,
            comparison_result=parsed_comparison,
        )

    def calculate_metrics(self, comparison_result):
        """Turn a comparison into groundedness/thoroughness percentages.

        Args:
            comparison_result: Object exposing ``facts_in_both``,
                ``facts_only_in_answer`` and ``facts_only_in_context``
                as sized collections.

        Returns:
            Dict with ``"groundedness"`` and ``"thoroughness"``; each is 0
            when the corresponding denominator is empty.
        """
        shared = len(comparison_result.facts_in_both)
        answer_only = len(comparison_result.facts_only_in_answer)
        context_only = len(comparison_result.facts_only_in_context)

        def shared_percentage(total):
            # Guard against dividing by zero when one side produced no facts.
            if total > 0:
                return shared / total * 100
            return 0

        return {
            "groundedness": shared_percentage(shared + answer_only),
            "thoroughness": shared_percentage(shared + context_only),
        }

    def process_data_list(self, data_list):
        """Score every pair in ``data_list`` and tabulate the results.

        Args:
            data_list: Iterable of mappings with ``'context'`` and
                ``'answer'`` keys.

        Returns:
            ``pandas.DataFrame`` with one row per input pair holding the
            inputs, the intermediate texts, the comma-joined fact groups and
            the two metrics.
        """
        passthrough_keys = (
            "context_replace_pronouns",
            "answer_replace_pronouns",
            "context_list",
            "answer_list",
        )

        rows = []
        for item in data_list:
            context, answer = item["context"], item["answer"]

            outcome = self.process_data(context, answer)
            comparison = outcome["comparison_result"]
            scores = self.calculate_metrics(comparison)

            # Insertion order below fixes the DataFrame column order.
            row = {"context": context, "answer": answer}
            row.update((key, outcome[key]) for key in passthrough_keys)
            row["facts_in_both"] = ", ".join(comparison.facts_in_both)
            row["facts_only_in_answer"] = ", ".join(comparison.facts_only_in_answer)
            row["facts_only_in_context"] = ", ".join(comparison.facts_only_in_context)
            row["groundedness"] = scores["groundedness"]
            row["thoroughness"] = scores["thoroughness"]
            rows.append(row)

        return pd.DataFrame(rows)

    @staticmethod
    def _pronoun_prompt():
        """Return the prompt that asks the model to replace pronouns in ``{text}``."""
        instructions = """
            Your task is to replace all the pronouns in the following text with the nouns they refer to:

            <text>
            {text}
            </text>

            The goal is to make the text more explicit and clear by replacing potentially ambiguous pronouns like "he", "she", "it", "they", "them", etc. with the specific nouns or names they refer to.

            For example:
            Original: John went to the store. He bought some milk.
            Pronoun replaced: John went to the store. John bought some milk.

            Here are the steps to complete this task:

            1. Carefully read the provided text and identify all the pronouns 
            2. For each pronoun, look back in the text to determine which noun or name it is referring to
            3. If the pronoun is part of a direct quote, do not replace it
            4. Replace each pronoun with the most recent noun or name it refers to
            5. If a pronoun does not have a clear referent noun or name, do not replace it
            6. Repeat this process until all the pronouns with clear referents have been replaced
            """
        return PromptTemplate(template=instructions, input_variables=["text"])

    @staticmethod
    def _parse_prompt():
        """Return the prompt that asks the model to list the facts in ``{text}``."""
        instructions = """
            Please parse the following text into a list of individual facts:

            <text>
            {text}
            </text>

            Read the text carefully. Your task is to break it down into the key facts it contains. Parse out each individual fact into a separate sentence, even if that means splitting up or rewording the original sentences. The goal is to have a clear, concise list of the core facts contained in the text.

            Output the parsed facts in a numbered list, with each fact written as a complete sentence on its own line. Use <facts> tags to demarcate the start and end of the list.
            """
        return PromptTemplate(template=instructions, input_variables=["text"])

    @staticmethod
    def _compare_prompt():
        """Return the prompt that compares ``{context_list}`` with ``{answer_list}``.

        The doubled braces in the template are literal braces of the expected
        output format, escaped so they are not treated as template variables.
        """
        instructions = """
            You will be comparing facts between a context and an answer to determine which facts are shared and which are unique to each.

            Here is the context:

            <context>
            {context_list}
            </context>

            And here is the answer: 

            <answer>
            {answer_list}
            </answer>

            Carefully analyze the facts presented in the context and answer, focusing on the semantic meaning rather than the exact wording.

            Then, output a dictionary with the following keys and corresponding lists of facts as values:

            1. "facts_in_both": A list of facts that are present in both the context and the answer

            2. "facts_only_in_answer": A list of facts that are only present in the answer 

            3. "facts_only_in_context": A list of facts that are only present in the context

            Remember, the facts do not need to be worded identically to be considered the same. Focus on whether the core meaning is shared or unique.

            Provide your results in this format:

            {{
                "facts_in_both": [
                    "Fact 1 present in both",
                    "Fact 2 present in both"
                ],
                "facts_only_in_answer": [
                    "Fact 1 only in answer",
                    "Fact 2 only in answer"  
                ],
                "facts_only_in_context": [
                    "Fact 1 only in context",
                    "Fact 2 only in context"
                ]
            }}
            """
        return PromptTemplate(
            template=instructions,
            input_variables=["context_list", "answer_list"],
        )


class ComparisonResult(BaseModel):
    # Parsed output of the comparison prompt: three groups of fact strings.
    # Every group falls back to an empty list when the model omits it.
    facts_in_both: list[str] = Field(
        description="List of facts present in both context and answer",
        default_factory=list,
    )
    facts_only_in_answer: list[str] = Field(
        description="List of facts only present in the answer",
        default_factory=list,
    )
    facts_only_in_context: list[str] = Field(
        description="List of facts only present in the context",
        default_factory=list,
    )

## Run on First Pair of Statements

In [19]:
# Score one context/answer pair and print every intermediate artefact.
model = OpenAI(temperature=0)
comparator = FactComparator(model)

context = "The quick brown fox jumps over the rock because he's happy. He was born in 2005. The hedgehog was born in 2010, but she's even happier than him."
answer = "The quick brown fox was born in 2005, and the hedgehog in 2010. The quick brown fox is not as happy as the hedgehog"

result = comparator.process_data(context, answer)
metrics = comparator.calculate_metrics(result["comparison_result"])

# (heading, key in `result`) pairs, printed in pipeline order.
report_sections = (
    ("Context with replaced pronouns:", "context_replace_pronouns"),
    ("\nAnswer with replaced pronouns:", "answer_replace_pronouns"),
    ("\nContext list:", "context_list"),
    ("\nAnswer list:", "answer_list"),
    ("\nComparison result:", "comparison_result"),
)
for heading, key in report_sections:
    print(heading)
    print(result[key])

print("\nMetrics:")
for line in (
    f"Groundedness: {metrics['groundedness']:.2f}%",
    f"Thoroughness: {metrics['thoroughness']:.2f}%",
):
    print(line)

Context with replaced pronouns:

            The quick brown fox jumps over the rock because the fox is happy. The fox was born in 2005. The hedgehog was born in 2010, but the hedgehog is even happier than the fox.

Answer with replaced pronouns:

            The quick brown fox was born in 2005, and the hedgehog in 2010. The quick brown fox is not as happy as the hedgehog.

Context list:

<facts>
1. The quick brown fox jumps over the rock because the fox is happy.
2. The fox was born in 2005.
3. The hedgehog was born in 2010.
4. The hedgehog is even happier than the fox.
</facts>

Answer list:

<facts>
1. The quick brown fox was born in 2005.
2. The hedgehog was born in 2010.
3. The quick brown fox is not as happy as the hedgehog.
</facts>

Comparison result:
facts_in_both=['The fox was born in 2005.', 'The hedgehog was born in 2010.'] facts_only_in_answer=['The quick brown fox is not as happy as the hedgehog.'] facts_only_in_context=['The quick brown fox jumps over the rock because t

## Run on another pair of statements

In [14]:
# Same walkthrough on a pair where the answer says far more than the context.
model = OpenAI(temperature=0)
comparator = FactComparator(model)

context = "To boil pasta, first bring a large pot of salted water to a rolling boil over high heat.."
answer = "To boil pasta, begin by filling a large pot with water, making sure there's enough to fully submerge the pasta. Bring the water to a rolling boil over high heat, then add salt to enhance the pasta's flavor. Once the water is boiling, carefully add the pasta, stirring gently to prevent sticking. Cook the pasta according to the package instructions or until it reaches your desired level of tenderness, usually around 8-12 minutes. To check for doneness, taste a piece of pasta—it should be tender but still slightly firm (al dente)."

result = comparator.process_data(context, answer)
metrics = comparator.calculate_metrics(result["comparison_result"])

# Each heading is followed on the next line by the matching pipeline output.
headings_by_key = {
    "context_replace_pronouns": "Context with replaced pronouns:",
    "answer_replace_pronouns": "\nAnswer with replaced pronouns:",
    "context_list": "\nContext list:",
    "answer_list": "\nAnswer list:",
    "comparison_result": "\nComparison result:",
}
for key, heading in headings_by_key.items():
    print(heading, result[key], sep="\n")

print(
    "\nMetrics:",
    f"Groundedness: {metrics['groundedness']:.2f}%",
    f"Thoroughness: {metrics['thoroughness']:.2f}%",
    sep="\n",
)

Context with replaced pronouns:

            To boil pasta, first bring a large pot of salted water to a rolling boil over high heat.

Answer with replaced pronouns:

To boil pasta, begin by filling a large pot with water, making sure there's enough water to fully submerge the pasta. Bring the water to a rolling boil over high heat, then add salt to enhance the pasta's flavor. Once the water is boiling, carefully add the pasta, stirring gently to prevent the pasta from sticking. Cook the pasta according to the package instructions or until the pasta reaches your desired level of tenderness, usually around 8-12 minutes. To check for doneness, taste a piece of pasta—it should be tender but still slightly firm (al dente).

Context list:

<facts>
1. To boil pasta, you must first bring a large pot of salted water to a rolling boil.
2. This should be done over high heat.
</facts>

Answer list:

<facts>
1. To boil pasta, begin by filling a large pot with water.
2. Make sure there's enough wat

## Run on a list of dictionaries - return DF

In [18]:
# Five context/answer pairs to score in one batch.
data_list = [
    {
        "context": "The quick brown fox jumps over the rock because he's happy. He was born in 2005. The hedgehog was born in 2010, but she's even happier than him.",
        "answer": "The quick brown fox was born in 2005, and the hedgehog in 2010. The quick brown fox is not as happy as the hedgehog",
    },
    {
        "context": "The sun is a star at the center of our solar system. It is about 93 million miles away from Earth. The sun is a hot ball of glowing gases that provides light and warmth to Earth.",
        "answer": "The sun is a star located approximately 93 million miles from Earth. It is the source of light and heat for our planet. The sun is not a solid object, but rather a sphere of hot glowing gases.",
    },
    {
        "context": "Birds are warm-blooded vertebrates that lay eggs and have feathers, wings, and beaks. There are over 10,000 species of birds worldwide. Some common bird species include sparrows, pigeons, and parrots.",
        "answer": "Birds are a diverse group of animals with feathers and wings. They are warm-blooded egg-laying vertebrates. The number of bird species globally exceeds 10,000. Pigeons, parrots, and sparrows are among the most familiar bird types.",
    },
    {
        "context": "The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was constructed from 1887 to 1889 and stands at a height of 324 meters. The tower is named after Gustave Eiffel, whose company designed and built it.",
        "answer": "The Eiffel Tower, found in Paris, France, is a lattice tower made of wrought iron. Built between 1887 and 1889, it reaches a height of 324 meters. Gustave Eiffel's company was responsible for the tower's design and construction, hence its name.",
    },
    {
        "context": "The Great Wall of China is a series of fortifications and walls built across the historical northern borders of ancient Chinese states and Imperial China. The most well-known sections were built during the Ming dynasty, which ruled from 1368 to 1644.",
        "answer": "The Great Wall of China, a series of walls and fortifications, was constructed along the northern borders of ancient Chinese states and Imperial China. The Ming dynasty, which lasted from 1368 to 1644, is responsible for the construction of the most famous sections of the wall.",
    },
]

model = OpenAI(temperature=0)
comparator = FactComparator(model)

# One DataFrame row per pair; the bare name on the last line displays it.
df = comparator.process_data_list(data_list)
df

Unnamed: 0,context,answer,context_replace_pronouns,answer_replace_pronouns,context_list,answer_list,facts_in_both,facts_only_in_answer,facts_only_in_context,groundedness,thoroughness
0,The quick brown fox jumps over the rock becaus...,"The quick brown fox was born in 2005, and the ...",\n The quick brown fox jumps over t...,\n The quick brown fox was born in ...,\n<facts>\n1. The quick brown fox jumps over t...,\n<facts>\n1. The quick brown fox was born in ...,"The fox was born in 2005., The hedgehog was bo...",The quick brown fox is not as happy as the hed...,The quick brown fox jumps over the rock becaus...,66.666667,50.0
1,The sun is a star at the center of our solar s...,The sun is a star located approximately 93 mil...,\n The sun is a star at the center ...,\n The sun is a star located approx...,\n<facts>\n1. The sun is a star.\n2. The sun i...,\n<facts>\n1. The sun is a star.\n2. The sun i...,"The sun is a star., The sun is a hot ball of g...",The sun is located approximately 93 million mi...,"The sun is at the center of our solar system.,...",25.0,40.0
2,Birds are warm-blooded vertebrates that lay eg...,Birds are a diverse group of animals with feat...,\n Birds are warm-blooded vertebrat...,\n Birds are a diverse group of ani...,\n<facts>\n1. Birds are warm-blooded vertebrat...,\n<facts>\n1. Birds are a diverse group of ani...,"Birds are warm-blooded vertebrates., Birds lay...",Birds are a diverse group of animals with feat...,"Some common bird species include sparrows., So...",60.0,66.666667
3,The Eiffel Tower is a wrought-iron lattice tow...,"The Eiffel Tower, found in Paris, France, is a...",\n The Eiffel Tower is a wrought-ir...,"\n The Eiffel Tower, found in Paris...",\n<facts>\n1. The Eiffel Tower is a wrought-ir...,\n<facts>\n1. The Eiffel Tower is located in P...,The Eiffel Tower is a wrought-iron lattice tow...,"The Eiffel Tower is located in Paris, France.,...",,50.0,100.0
4,The Great Wall of China is a series of fortifi...,"The Great Wall of China, a series of walls and...",\n The Great Wall of China is a ser...,"\n The Great Wall of China, a serie...",\n<facts>\n1. The Great Wall of China is a ser...,\n<facts>\n1. The Great Wall of China is a ser...,The Great Wall of China is a series of walls a...,The Ming dynasty is responsible for the constr...,The most well-known sections were built during...,75.0,75.0


## Inspect_AI Grader (not working)

In [32]:
from inspect_ai.solver import TaskState
from inspect_ai.util import resource
from inspect_ai.scorer._metric import Score
from inspect_ai.scorer._scorer import Scorer, Target, scorer
from inspect_ai.scorer._metrics import accuracy, bootstrap_std
from inspect_ai.model import ChatMessageUser, ModelOutput, ChatCompletionChoice, ChatMessage

def groundedness():
    """Return a callable that reads the "groundedness" entry of a value mapping."""
    # NOTE(review): inspect_ai metrics are normally handed the collected scores
    # rather than a single value mapping -- confirm this accessor matches the
    # Metric contract of the installed inspect_ai version.
    def read_groundedness(value):
        return value["groundedness"]

    return read_groundedness

def thoroughness():
    """Return a callable that reads the "thoroughness" entry of a value mapping."""
    # NOTE(review): inspect_ai metrics are normally handed the collected scores
    # rather than a single value mapping -- confirm this accessor matches the
    # Metric contract of the installed inspect_ai version.
    def read_thoroughness(value):
        return value["thoroughness"]

    return read_thoroughness

@scorer(metrics=[groundedness(), thoroughness()])
def fact_comparator_scorer(
    model: OpenAI,
) -> Scorer:
    """Create a scorer that grades a completion with ``FactComparator``.

    Args:
        model: LLM handed to ``FactComparator`` for all of its chains.

    Returns:
        An async ``score(state, target)`` callable. It treats ``target.text``
        as the context and ``state.output.completion`` as the answer, and
        returns a ``Score`` whose value is the groundedness/thoroughness dict.
    """
    comparator = FactComparator(model)
    # Intermediate pipeline outputs copied verbatim into the score metadata.
    traced_keys = (
        "context_replace_pronouns",
        "answer_replace_pronouns",
        "context_list",
        "answer_list",
    )

    async def score(state: TaskState, target: Target) -> Score:
        reference = target.text
        completion = state.output.completion

        outcome = comparator.process_data(reference, completion)
        comparison = outcome["comparison_result"]
        metrics = comparator.calculate_metrics(comparison)

        metadata = {"comparison_result": comparison.dict()}
        for key in traced_keys:
            metadata[key] = outcome[key]

        explanation = f"Groundedness: {metrics['groundedness']}, Thoroughness: {metrics['thoroughness']}"
        return Score(
            value=metrics,
            answer=completion,
            explanation=explanation,
            metadata=metadata,
        )

    return score

# Initialize your Langchain model
model = OpenAI(temperature=0.7)

# Define your evaluation data
eval_data = [
    {
        "context": "The capital of France is Paris.",
        "answer": "Paris is the capital city of France.",
    },
    # Add more evaluation samples
]

# Run the evaluation
results = []
for data in eval_data:
    state = TaskState(
        model=model.model_name,
        sample_id=data.get("id", ""),
        epoch=0,
        input=data["context"],
        choices=None,
        messages=[ChatMessageUser(content=data["context"])],
        output=ModelOutput(
            model=str(model.model_name),
            choices=[ChatCompletionChoice(message=ChatMessage(content=data["answer"]))],
        ),
        completed=True,
    )
    target = Target(text=data["context"])
    score = await fact_comparator_scorer(model).score(state, target)
    results.append(score)

# Process and display the evaluation results
for result in results:
    print(f"Groundedness: {result.value['groundedness']}")
    print(f"Thoroughness: {result.value['thoroughness']}")
    print(f"Explanation: {result.explanation}")
    print(f"Metadata: {result.metadata}")
    print("---")

TypeError: Cannot instantiate typing.Union