## Imports

In [1]:
%pip install langchain
%pip install ipywidgets
%pip install langchain-openai






[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Functions

In [6]:
import json
from typing import Any, Dict, Protocol, cast, runtime_checkable

from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    FunctionMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from langchain_core.messages import ToolCall as LCToolCall
from langchain_core.outputs import (
    ChatGeneration,
    ChatResult,
)
from pydantic.v1 import Field
from typing_extensions import override

from inspect_ai.model import (
    ChatMessage,
    ChatMessageAssistant,
    ChatMessageSystem,
    ChatMessageTool,
    ChatMessageUser,
    Content,
    ContentImage,
    ContentText,
    GenerateConfig,
    ModelName,
    ModelOutput,
    ToolCall,
    ToolChoice,
    ToolInfo,
    ToolParam,
    get_model,
)
from inspect_ai.solver import Generate, Solver, TaskState

@runtime_checkable
class LangChainAgent(Protocol):
    """Structural type for an async callable that runs a LangChain agent.

    ``langchain_solver`` invokes implementations with keyword arguments
    ``llm`` (the chat model to drive) and ``input`` (a dict carrying the
    ``"input"`` prompt text and the ``"chat_history"`` list).
    """

    async def __call__(
        self,
        llm: BaseChatModel,
        input: dict[str, Any],
    ) -> str | list[str | dict[str, Any]]:
        """Run the agent against ``llm`` using the values in ``input``."""


def langchain_solver(agent: LangChainAgent) -> Solver:
    """Build an Inspect solver that delegates generation to a LangChain agent.

    Args:
        agent: Async callable invoked with ``llm`` and ``input`` keyword
            arguments. Its return value is not used.

    Returns:
        An async ``solve(state, generate)`` function. It runs the agent through
        a fresh ``InspectChatModel`` and copies that model's recorded messages
        and output onto the task state.
    """

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # Bridge object that routes LangChain calls to the Inspect model API.
        bridge = InspectChatModel()

        # The prompt text is passed separately; the history is every message
        # after the first one in the task state.
        agent_input = {
            "input": state.user_prompt.text,
            "chat_history": as_langchain_chat_history(state.messages[1:]),
        }
        await agent(llm=bridge, input=agent_input)

        # The bridge recorded the last exchange; copy it onto the state.
        state.messages = bridge.messages
        state.output = bridge.output
        return state

    return solve


class InspectChatModel(BaseChatModel):
    """LangChain chat model that forwards generation to the Inspect model API.

    Only the async path (``_agenerate``) is implemented. After each call the
    converted input messages plus the first returned choice, and the raw model
    output, are kept on the instance so callers can copy them back onto the
    Inspect task state.
    """

    # track messages and model output so we can update
    # the inspect task state when we are complete
    messages: list[ChatMessage] = Field(default=[], exclude=True)
    output: ModelOutput = Field(default=ModelOutput(), exclude=True)

    @property
    def _llm_type(self) -> str:
        """Label for this model type, built from the ``api`` of ``get_model()``."""
        return f"Inspect ({ModelName(get_model()).api})"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Identifying parameters: the ``name`` of the model from ``get_model()``."""
        return {
            "model_name": str(ModelName(get_model()).name),
        }

    @override
    def _generate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Synchronous generation is unsupported.

        Raises:
            NotImplementedError: Always.
        """
        # inspect uses async exclusively
        raise NotImplementedError

    @override
    async def _agenerate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: AsyncCallbackManagerForLLMRun | None = None,
        **kwargs: dict[str, Any],
    ) -> ChatResult:
        """Generate a chat result by calling the Inspect model.

        Args:
            messages: LangChain messages to send; each is converted with
                ``as_inspect_message``.
            stop: Optional stop sequences, passed as ``stop_seqs``.
            run_manager: Not used by this implementation.
            **kwargs: Only the ``"tools"`` entry is read; other entries are
                ignored.

        Returns:
            A ``ChatResult`` with one ``ChatGeneration`` per returned choice.
        """
        # NOTE(review): ``**kwargs: dict[str, Any]`` annotates every keyword
        # value as a dict; ``Any`` (as in ``_generate``) looks intended — confirm.

        # extract tools from kwargs
        tools: list[ToolInfo] = []
        tool_choice: ToolChoice | None = None
        lc_tools = cast(list[dict[str, Any]] | None, kwargs.get("tools", None))
        if lc_tools:
            # each entry is read as {"function": {"name", "description", "parameters"}}
            tools = [
                ToolInfo(
                    name=tool["function"]["name"],
                    description=tool["function"]["description"],
                    params=as_inspect_tool_params(tool["function"]["parameters"]),
                )
                for tool in lc_tools
            ]
            # tool choice is set to "auto" whenever any tools are supplied
            tool_choice = "auto"

        # generate
        input = [as_inspect_message(message) for message in messages]
        result = await get_model().generate(
            input=input,
            tools=tools,
            tool_choice=tool_choice,
            config=GenerateConfig(stop_seqs=stop),
        )

        # track last messages / model output
        # (only the first choice's message is appended to the history)
        self.messages = input
        self.messages.append(result.choices[0].message)
        self.output = result

        # extract choices
        generations = [
            ChatGeneration(message=as_langchain_message(choice.message))
            for choice in result.choices
        ]

        # return
        return ChatResult(generations=generations)


def as_inspect_message(message: BaseMessage) -> ChatMessage:
    """Convert a LangChain message into the matching Inspect chat message.

    System, human, AI, tool and function messages are supported. AI tool calls
    become Inspect ``ToolCall`` objects, using the call name as the id when the
    call has no id. Function messages use their ``name`` as the tool call id.

    Raises:
        ValueError: If the message is of any other type.
    """
    if isinstance(message, SystemMessage):
        return ChatMessageSystem(content=as_inspect_content(message.content))

    if isinstance(message, HumanMessage):
        return ChatMessageUser(content=as_inspect_content(message.content))

    if isinstance(message, AIMessage):
        assistant_content = as_inspect_content(message.content)
        # No tool calls (missing or empty) is passed on as None.
        inspect_calls = None
        if message.tool_calls:
            inspect_calls = [
                ToolCall(
                    type="function",
                    function=lc_call["name"],
                    id=lc_call["id"] or lc_call["name"],
                    arguments=lc_call["args"],
                )
                for lc_call in message.tool_calls
            ]
        return ChatMessageAssistant(
            content=assistant_content,
            tool_calls=inspect_calls,
        )

    if isinstance(message, ToolMessage):
        return ChatMessageTool(
            content=as_inspect_content(message.content),
            tool_call_id=message.tool_call_id,
        )

    if isinstance(message, FunctionMessage):
        return ChatMessageTool(
            content=as_inspect_content(message.content),
            tool_call_id=message.name,
        )

    raise ValueError(f"Unexpected message type: {type(message)}")


def as_langchain_message(message: ChatMessage) -> BaseMessage:
    """Convert an Inspect chat message into the matching LangChain message.

    Assistant tool calls are emitted twice: as structured ``tool_calls`` and,
    when any exist, as JSON-encoded entries under
    ``additional_kwargs["tool_calls"]``. A tool message without a tool call id
    gets an empty string id.

    Raises:
        ValueError: If the message is of an unsupported type.
    """
    if isinstance(message, ChatMessageSystem):
        return SystemMessage(content=as_langchain_content(message.content))

    if isinstance(message, ChatMessageUser):
        return HumanMessage(content=as_langchain_content(message.content))

    if isinstance(message, ChatMessageAssistant):
        # Treat a missing tool call list the same as an empty one.
        inspect_calls = message.tool_calls or []

        extra: dict[str, Any] = {}
        if inspect_calls:
            extra["tool_calls"] = [
                {
                    "id": tc.id,
                    "name": tc.function,
                    "arguments": json.dumps(tc.arguments),
                }
                for tc in inspect_calls
            ]

        return AIMessage(
            content=as_langchain_content(message.content),
            tool_calls=[
                LCToolCall(id=tc.id, name=tc.function, args=tc.arguments)
                for tc in inspect_calls
            ],
            additional_kwargs=extra,
        )

    if isinstance(message, ChatMessageTool):
        return ToolMessage(
            content=as_langchain_content(message.content),
            tool_call_id=message.tool_call_id or "",
        )

    raise ValueError(f"Unexpected message type: {type(message)}")


def as_langchain_chat_history(messages: list[ChatMessage]) -> list[dict[str, Any]]:
    """Flatten Inspect messages into ``{"role", "content"}`` dictionaries.

    ``content`` is taken from each message's ``text`` attribute.
    """
    history: list[dict[str, Any]] = []
    for entry in messages:
        history.append({"role": entry.role, "content": entry.text})
    return history


def as_inspect_content(
    content: str | list[str | dict[str, Any]],
) -> str | list[Content]:
    """Convert LangChain message content into Inspect content.

    A plain string is returned unchanged. In a list, each string and each dict
    whose ``"type"`` is ``"text"`` becomes a ``ContentText``; any other dict
    becomes a ``ContentImage`` built from its ``"image"`` entry.
    """
    if isinstance(content, str):
        return content

    converted: list[Content] = []
    for part in content:
        if isinstance(part, str):
            converted.append(ContentText(text=part))
        elif part["type"] == "text":
            converted.append(ContentText(text=part["text"]))
        else:
            converted.append(ContentImage(image=part["image"]))
    return converted


def as_inspect_tool_params(parameters: dict[str, Any]) -> list[ToolParam]:
    """Convert a JSON-schema style ``parameters`` object into Inspect tool params.

    Args:
        parameters: Schema dict for a tool's arguments. ``"properties"`` maps
            parameter names to their schemas; ``"required"`` lists the names
            that must be supplied. Either key may be absent.

    Returns:
        One ``ToolParam`` per property, in schema order. A parameter is marked
        optional when its name is not in ``"required"``. The description falls
        back to the property's ``"title"`` when ``"description"`` is missing.
    """
    # ``required`` is optional in JSON Schema; a schema whose fields all have
    # defaults omits it, so a missing key means every parameter is optional.
    required = parameters.get("required") or ()
    properties = parameters.get("properties") or {}

    return [
        ToolParam(
            name=key,
            type=param["type"],
            description=param.get("description", param.get("title")),
            optional=key not in required,
        )
        for key, param in properties.items()
    ]


def as_langchain_content(
    content: str | list[Content],
) -> str | list[str | dict[str, Any]]:
    """Convert Inspect message content into LangChain message content.

    A plain string is returned unchanged. In a list, strings are kept and
    every other item is replaced by the result of its ``model_dump()``.
    """
    if isinstance(content, str):
        return content

    converted: list[str | dict[str, Any]] = []
    for part in content:
        converted.append(part if isinstance(part, str) else part.model_dump())
    return converted

In [11]:
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import pandas as pd

class FactComparator:
    """Compare the facts in an answer against the facts in a context.

    The pipeline makes five model calls per (context, answer) pair:

    1. Replace pronouns with their referents in the context and in the answer.
    2. Parse each rewritten text into a list of facts.
    3. Ask the model which facts are shared and which are unique, then parse
       that reply into a ``ComparisonResult``.

    ``calculate_metrics`` turns a comparison into two percentages:
    groundedness (share of answer facts also found in the context) and
    thoroughness (share of context facts also found in the answer).
    """

    def __init__(self, model):
        """Store the chat model and build the parser for the comparison reply.

        Args:
            model: Chat model exposing an async ``_agenerate(messages)`` method.
        """
        self.model = model
        self.parser = PydanticOutputParser(pydantic_object=ComparisonResult)

    async def __call__(self, context, answer):
        """Shorthand for ``process_data(context, answer)``."""
        return await self.process_data(context, answer)

    async def _ask(self, prompt_text):
        """Send one human message to the model and return the first generation's text."""
        response = await self.model._agenerate([HumanMessage(content=prompt_text)])
        return response.generations[0].text

    async def process_data(self, context, answer):
        """Run the full comparison pipeline for one context/answer pair.

        Args:
            context: Reference text.
            answer: Text to compare against the reference.

        Returns:
            A dict with keys ``context_replace_pronouns``,
            ``answer_replace_pronouns``, ``context_list``, ``answer_list``
            (raw model text) and ``comparison_result`` (the parsed
            ``ComparisonResult``).
        """
        pronoun_prompt = self._pronoun_prompt()
        parse_prompt = self._parse_prompt()
        compare_prompt = self._compare_prompt()

        # The calls are awaited one at a time, in the same order as before.
        context_replace_pronouns = await self._ask(pronoun_prompt.format(text=context))
        answer_replace_pronouns = await self._ask(pronoun_prompt.format(text=answer))

        context_list = await self._ask(parse_prompt.format(text=context_replace_pronouns))
        answer_list = await self._ask(parse_prompt.format(text=answer_replace_pronouns))

        comparison_text = await self._ask(
            compare_prompt.format(context_list=context_list, answer_list=answer_list)
        )
        comparison_result = self.parser.parse(comparison_text)

        return {
            "context_replace_pronouns": context_replace_pronouns,
            "answer_replace_pronouns": answer_replace_pronouns,
            "context_list": context_list,
            "answer_list": answer_list,
            "comparison_result": comparison_result,
        }

    def calculate_metrics(self, comparison_result):
        """Compute groundedness and thoroughness percentages.

        Args:
            comparison_result: Object with list attributes ``facts_in_both``,
                ``facts_only_in_answer`` and ``facts_only_in_context``.

        Returns:
            A dict with ``groundedness`` and ``thoroughness``. Each is a
            percentage, or ``0`` when the corresponding side has no facts.
        """
        facts_in_both_count = len(comparison_result.facts_in_both)
        facts_only_in_answer_count = len(comparison_result.facts_only_in_answer)
        facts_only_in_context_count = len(comparison_result.facts_only_in_context)

        total_answer_facts = facts_in_both_count + facts_only_in_answer_count
        total_context_facts = facts_in_both_count + facts_only_in_context_count

        groundedness = facts_in_both_count / total_answer_facts * 100 if total_answer_facts > 0 else 0
        thoroughness = facts_in_both_count / total_context_facts * 100 if total_context_facts > 0 else 0

        return {
            "groundedness": groundedness,
            "thoroughness": thoroughness,
        }

    def _build_row(self, context, answer, result):
        """Flatten one ``process_data`` result and its metrics into a table row."""
        comparison = result["comparison_result"]
        metrics = self.calculate_metrics(comparison)
        return {
            'context': context,
            'answer': answer,
            'context_replace_pronouns': result["context_replace_pronouns"],
            'answer_replace_pronouns': result["answer_replace_pronouns"],
            'context_list': result["context_list"],
            'answer_list': result["answer_list"],
            'facts_in_both': ', '.join(comparison.facts_in_both),
            'facts_only_in_answer': ', '.join(comparison.facts_only_in_answer),
            'facts_only_in_context': ', '.join(comparison.facts_only_in_context),
            'groundedness': metrics['groundedness'],
            'thoroughness': metrics['thoroughness']
        }

    def process_data_list(self, data_list):
        """Process many context/answer pairs synchronously.

        Each item is run with ``asyncio.run``. That function cannot be called
        while another event loop is running in the same thread, so inside an
        async context use ``await aprocess_data_list(...)`` instead.

        Args:
            data_list: Iterable of dicts with ``'context'`` and ``'answer'`` keys.

        Returns:
            A ``pandas.DataFrame`` with one row per item that succeeded. Items
            that fail are reported with ``print`` and skipped.
        """
        # Imported here so the method does not depend on another cell having
        # imported asyncio first.
        import asyncio

        results = []
        for data in data_list:
            context = data['context']
            answer = data['answer']

            try:
                result = asyncio.run(self.process_data(context, answer))
                results.append(self._build_row(context, answer, result))
            except Exception as e:
                # Best effort: one bad item must not abort the whole batch.
                print(f"Error processing data item: {data}")
                print(f"Exception: {e}")
                continue

        return pd.DataFrame(results)

    async def aprocess_data_list(self, data_list):
        """Async counterpart of ``process_data_list``.

        Takes the same input and returns the same DataFrame, and reports
        failures the same way, but awaits each item so it can be used from code
        that is already running inside an event loop.
        """
        results = []
        for data in data_list:
            context = data['context']
            answer = data['answer']

            try:
                result = await self.process_data(context, answer)
                results.append(self._build_row(context, answer, result))
            except Exception as e:
                print(f"Error processing data item: {data}")
                print(f"Exception: {e}")
                continue

        return pd.DataFrame(results)

    @staticmethod
    def _pronoun_prompt():
        """Prompt asking the model to replace pronouns in ``{text}`` with their referents."""
        return PromptTemplate(
            input_variables=["text"],
            template="""
            Your task is to replace all the pronouns in the following text with the nouns they refer to:

            <text>
            {text}
            </text>

        Please follow these steps to replace the pronouns in the text with the nouns they refer to:

        1. Read through the text carefully and identify all the pronouns (words like "he", "she", "it", "they", "them", etc.). If there are no pronouns, simply return the original text unchanged.

        2. For each pronoun you find, look back in the text to determine which specific noun or name that pronoun is referring to. The referent noun will generally be the most recent noun or name mentioned before the pronoun that matches the pronoun in number (singular/plural) and gender.

        3. If a pronoun is part of text inside quotation marks, do not replace that pronoun, as it is part of a direct quote.

        4. Replace each pronoun with the most recent matching noun or name that it refers to. 

        5. If it is unclear which noun a pronoun is referring to, or if no matching referent can be found, do not replace that pronoun. Leave it as is.

        6. Continue this process, scanning through the text and replacing appropriate pronouns with their referent nouns, until all pronouns with clear referents have been replaced.

        7. Return the modified text with pronouns replaced. Do not include any of your analysis or thought process, or a statement that no pronouns were found, ONLY return the text itself with pronouns replaced (or the original text if no changes were made).
            """,
        )

    @staticmethod
    def _parse_prompt():
        """Prompt asking the model to split ``{text}`` into a numbered list of facts."""
        return PromptTemplate(
            input_variables=["text"],
            template="""
            Please parse the following text into a list of individual facts:

            <text>
            {text}
            </text>

            Read the text carefully. Your task is to break it down into the key facts it contains. Parse out each individual fact into a separate sentence, even if that means splitting up or rewording the original sentences. The goal is to have a clear, concise list of the core facts contained in the text.

            Output the parsed facts in a numbered list, with each fact written as a complete sentence on its own line. Use <facts> tags to demarcate the start and end of the list.
            """,
        )

    @staticmethod
    def _compare_prompt():
        """Prompt asking the model to sort facts into shared, answer-only and context-only."""
        return PromptTemplate(
            input_variables=["context_list", "answer_list"],
            template="""
            You will be comparing facts between a context and an answer to determine which facts are shared and which are unique to each.

            Here is the context:

            <context>
            {context_list}
            </context>

            And here is the answer: 

            <answer>
            {answer_list}
            </answer>

            Carefully analyze the facts presented in the context and answer, focusing on the semantic meaning rather than the exact wording.

            Then, output a dictionary with the following keys and corresponding lists of facts as values:

            1. "facts_in_both": A list of facts that are present in both the context and the answer

            2. "facts_only_in_answer": A list of facts that are only present in the answer 

            3. "facts_only_in_context": A list of facts that are only present in the context

            Remember, the facts do not need to be worded identically to be considered the same. Focus on whether the core meaning is shared or unique.

            Provide your results in this format:

            {{
                "facts_in_both": [
                    "Fact 1 present in both",
                    "Fact 2 present in both"
                ],
                "facts_only_in_answer": [
                    "Fact 1 only in answer",
                    "Fact 2 only in answer"  
                ],
                "facts_only_in_context": [
                    "Fact 1 only in context",
                    "Fact 2 only in context"
                ]
            }}
            """,
        )


class ComparisonResult(BaseModel):
    # Parsed form of the model's comparison reply: three lists of fact
    # strings, each defaulting to an empty list.
    facts_in_both: list[str] = Field(
        default_factory=list,
        description="List of facts present in both context and answer",
    )
    facts_only_in_answer: list[str] = Field(
        default_factory=list,
        description="List of facts only present in the answer",
    )
    facts_only_in_context: list[str] = Field(
        default_factory=list,
        description="List of facts only present in the context",
    )

## Run on First Pair of Statements

In [12]:
import asyncio


%env INSPECT_EVAL_MODEL=openai/gpt-4
%env INSPECT_MODEL_NAME=openai/gpt-4

# Create an instance of InspectChatModel with the specified model
inspect_model = InspectChatModel()

# Create an instance of FactComparator with the InspectChatModel
comparator = FactComparator(inspect_model)


context = "The quick brown fox jumps over the rock because he's happy. He was born in 2005. The hedgehog was born in 2010, but she's even happier than him."
answer = "The quick brown fox was born in 2005, and the hedgehog in 2010. The quick brown fox is not as happy as the hedgehog"

# Run the asynchronous process_data method
result = await comparator(context, answer)

metrics = comparator.calculate_metrics(result["comparison_result"])

print("Context with replaced pronouns:")
print(result["context_replace_pronouns"])

print("Context with replaced pronouns:")
print(result["context_replace_pronouns"])

print("\nAnswer with replaced pronouns:")
print(result["answer_replace_pronouns"])

print("\nContext list:")
print(result["context_list"])

print("\nAnswer list:")
print(result["answer_list"])

print("\nComparison result:")
print(result["comparison_result"])

print("\nMetrics:")
print(f"Groundedness: {metrics['groundedness']:.2f}%")
print(f"Thoroughness: {metrics['thoroughness']:.2f}%")

env: INSPECT_EVAL_MODEL=openai/gpt-4
env: INSPECT_MODEL_NAME=openai/gpt-4
Context with replaced pronouns:
The quick brown fox jumps over the rock because the quick brown fox is happy. The quick brown fox was born in 2005. The hedgehog was born in 2010, but the hedgehog is even happier than the quick brown fox.
Context with replaced pronouns:
The quick brown fox jumps over the rock because the quick brown fox is happy. The quick brown fox was born in 2005. The hedgehog was born in 2010, but the hedgehog is even happier than the quick brown fox.

Answer with replaced pronouns:
The quick brown fox was born in 2005, and the hedgehog in 2010. The quick brown fox is not as happy as the hedgehog.

Context list:
<facts>
1. The quick brown fox jumps over the rock.
2. The quick brown fox is happy.
3. The quick brown fox was born in 2005.
4. The hedgehog was born in 2010.
5. The hedgehog is happier than the quick brown fox.
</facts>

Answer list:
<facts>
1. The quick brown fox was born in 2005.
2

## Run on Another Pair of Statements

In [11]:
%env INSPECT_EVAL_MODEL=openai/gpt-4
%env INSPECT_MODEL_NAME=openai/gpt-4

# Build the Inspect-backed chat model and wrap it in a FactComparator
inspect_model = InspectChatModel()
comparator = FactComparator(inspect_model)

context = "To boil pasta, first bring a large pot of salted water to a rolling boil over high heat.."
answer = "To boil pasta, begin by filling a large pot with water, making sure there's enough to fully submerge the pasta. Bring the water to a rolling boil over high heat, then add salt to enhance the pasta's flavor. Once the water is boiling, carefully add the pasta, stirring gently to prevent sticking. Cook the pasta according to the package instructions or until it reaches your desired level of tenderness, usually around 8-12 minutes. To check for doneness, taste a piece of pasta—it should be tender but still slightly firm (al dente)."

# Drive the async pipeline to completion, then derive the two metrics
result = asyncio.run(comparator.process_data(context, answer))
metrics = comparator.calculate_metrics(result["comparison_result"])

# (heading, result key) pairs, printed in order
sections = (
    ("Context with replaced pronouns:", "context_replace_pronouns"),
    ("\nAnswer with replaced pronouns:", "answer_replace_pronouns"),
    ("\nContext list:", "context_list"),
    ("\nAnswer list:", "answer_list"),
    ("\nComparison result:", "comparison_result"),
)
for heading, key in sections:
    print(heading)
    print(result[key])

print("\nMetrics:")
print(f"Groundedness: {metrics['groundedness']:.2f}%")
print(f"Thoroughness: {metrics['thoroughness']:.2f}%")

env: INSPECT_EVAL_MODEL=openai/gpt-4
env: INSPECT_MODEL_NAME=openai/gpt-4
Context with replaced pronouns:
There are no pronouns in the text:

"To boil pasta, first bring a large pot of salted water to a rolling boil over high heat."

Answer with replaced pronouns:
To boil pasta, begin by filling a large pot with water, making sure there's enough to fully submerge the pasta. Bring the water to a rolling boil over high heat, then add salt to enhance the pasta's flavor. Once the water is boiling, carefully add the pasta, stirring gently to prevent the pasta from sticking. Cook the pasta according to the package instructions or until the pasta reaches your desired level of tenderness, usually around 8-12 minutes. To check for doneness, taste a piece of pasta—the pasta should be tender but still slightly firm (al dente).

Context list:
<facts>
1. There are no pronouns in the given text.
2. The given text is about boiling pasta.
3. The first step to boil pasta is to bring a large pot of salt

## Run on a list of dictionaries and return a DataFrame

In [18]:
# Sample inputs for the batch run: each item pairs a reference 'context'
# with an 'answer' that restates it (the keys read by process_data_list).
data_list = [
    {
        'context': 'The quick brown fox jumps over the rock because he\'s happy. He was born in 2005. The hedgehog was born in 2010, but she\'s even happier than him.',
        'answer': 'The quick brown fox was born in 2005, and the hedgehog in 2010. The quick brown fox is not as happy as the hedgehog'
    },
    {
        'context': 'The sun is a star at the center of our solar system. It is about 93 million miles away from Earth. The sun is a hot ball of glowing gases that provides light and warmth to Earth.',
        'answer': 'The sun is a star located approximately 93 million miles from Earth. It is the source of light and heat for our planet. The sun is not a solid object, but rather a sphere of hot glowing gases.'
    },
    {
        'context': 'Birds are warm-blooded vertebrates that lay eggs and have feathers, wings, and beaks. There are over 10,000 species of birds worldwide. Some common bird species include sparrows, pigeons, and parrots.',
        'answer': 'Birds are a diverse group of animals with feathers and wings. They are warm-blooded egg-laying vertebrates. The number of bird species globally exceeds 10,000. Pigeons, parrots, and sparrows are among the most familiar bird types.'
    },
    {
        'context': 'The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was constructed from 1887 to 1889 and stands at a height of 324 meters. The tower is named after Gustave Eiffel, whose company designed and built it.',
        'answer': 'The Eiffel Tower, found in Paris, France, is a lattice tower made of wrought iron. Built between 1887 and 1889, it reaches a height of 324 meters. Gustave Eiffel\'s company was responsible for the tower\'s design and construction, hence its name.'
    },
    {
        'context': 'The Great Wall of China is a series of fortifications and walls built across the historical northern borders of ancient Chinese states and Imperial China. The most well-known sections were built during the Ming dynasty, which ruled from 1368 to 1644.',
        'answer': 'The Great Wall of China, a series of walls and fortifications, was constructed along the northern borders of ancient Chinese states and Imperial China. The Ming dynasty, which lasted from 1368 to 1644, is responsible for the construction of the most famous sections of the wall.'
    }
]

# Create an instance of InspectChatModel with the specified model
inspect_model = InspectChatModel()

# Create an instance of FactComparator with the InspectChatModel
comparator = FactComparator(inspect_model)

# One DataFrame row per item that processed successfully; the bare `df`
# on the last line displays the frame as the cell output.
df = comparator.process_data_list(data_list)
df

Unnamed: 0,context,answer,context_replace_pronouns,answer_replace_pronouns,context_list,answer_list,facts_in_both,facts_only_in_answer,facts_only_in_context,groundedness,thoroughness
0,The quick brown fox jumps over the rock becaus...,"The quick brown fox was born in 2005, and the ...",<text>\nThe quick brown fox jumps over the roc...,"The quick brown fox was born in 2005, and the ...",<facts>\n1. The quick brown fox jumps over the...,<facts>\n1. The quick brown fox was born in 20...,"The quick brown fox was born in 2005., The hed...",,"The quick brown fox jumps over the rock., The ...",100.0,60.0
1,The sun is a star at the center of our solar s...,The sun is a star located approximately 93 mil...,The sun is a star at the center of our solar s...,The sun is a star located approximately 93 mil...,<facts>\n1. The sun is a star.\n2. The sun is ...,<facts>\n1. The sun is a star.\n2. The sun is ...,"The sun is a star., The sun is approximately 9...","The sun is not a solid object., The sun is a s...",The sun is located at the center of our solar ...,71.428571,83.333333
2,Birds are warm-blooded vertebrates that lay eg...,Birds are a diverse group of animals with feat...,"The text does not contain any pronouns, so it ...","The text contains no pronouns, so it remains u...",<facts>\n1. Birds are warm-blooded vertebrates...,<facts>\n1. Birds are a diverse group of anima...,"Birds are warm-blooded vertebrates., Birds lay...",Birds are a diverse group of animals.,Birds have beaks.,88.888889,88.888889
3,The Eiffel Tower is a wrought-iron lattice tow...,"The Eiffel Tower, found in Paris, France, is a...",The Eiffel Tower is a wrought-iron lattice tow...,"The Eiffel Tower, found in Paris, France, is a...",<facts>\n1. The Eiffel Tower is a wrought-iron...,<facts>\n1. The Eiffel Tower is located in Par...,The Eiffel Tower is a wrought-iron lattice tow...,Gustave Eiffel's company was responsible for t...,,75.0,100.0
4,The Great Wall of China is a series of fortifi...,"The Great Wall of China, a series of walls and...",The Great Wall of China is a series of fortifi...,"The Great Wall of China, a series of walls and...",<facts>\n1. The Great Wall of China is a serie...,<facts>\n1. The Great Wall of China is a serie...,The Great Wall of China is a series of fortifi...,,,100.0,100.0


In [44]:
Value = Union[
    str | int | float | bool,
    list[str | int | float | bool],
    dict[str, str | int | float | bool],
]

class Score(BaseModel):
    """Result produced by a scorer.

    Args:
       value (Value): The score itself.
       answer (str | None): Answer extracted from the model output (optional).
       explanation (str | None): Explanation of the score (optional).
       metadata (dict[str,Any]): Extra metadata attached to the score.
    """

    # Score value.
    value: Value

    # Answer extracted from model output (optional).
    answer: str | None = Field(default=None)

    # Explanation of score (optional).
    explanation: str | None = Field(default=None)

    # Additional metadata related to the score.
    metadata: dict[str, Any] | None = Field(default=None)

    @property
    def text(self) -> str:
        """Return the score rendered as text (same as ``as_str()``)."""
        return self.as_str()

    def as_str(self) -> str:
        """Return the scalar score converted with ``str``."""
        scalar = self._as_scalar()
        return str(scalar)

    def as_int(self) -> int:
        """Return the scalar score converted with ``int``."""
        scalar = self._as_scalar()
        return int(scalar)

    def as_float(self) -> float:
        """Return the scalar score converted with ``float``."""
        scalar = self._as_scalar()
        return float(scalar)

    def as_bool(self) -> bool:
        """Return the scalar score converted with ``bool``."""
        scalar = self._as_scalar()
        return bool(scalar)

    def _as_scalar(self) -> str | int | float | bool:
        # Only str/int/float/bool values count as scalars; lists and dicts do not.
        if not isinstance(self.value, (str, int, float, bool)):
            raise ValueError("This score is not a scalar")
        return self.value



In [40]:
from typing import NamedTuple
from inspect_ai.scorer import scorer, Scorer, Target

# Pair of percentage metrics produced for one scored answer:
# (groundedness, thoroughness), both floats.
ScorerValue = NamedTuple(
    "ScorerValue",
    [
        ("groundedness", float),
        ("thoroughness", float),
    ],
)

def groundedness_thoroughness_metric(scores):
    """Average the groundedness and thoroughness values across scores.

    Args:
        scores: Sequence of objects whose ``value`` has ``groundedness`` and
            ``thoroughness`` attributes.

    Returns:
        A dict with the mean of each metric, or ``0`` for a metric when there
        are no values to average.
    """

    def _mean(values):
        # An empty list yields 0 rather than dividing by zero.
        return sum(values) / len(values) if values else 0

    grounded_values = [item.value.groundedness for item in scores]
    thorough_values = [item.value.thoroughness for item in scores]

    return {
        "groundedness": _mean(grounded_values),
        "thoroughness": _mean(thorough_values),
    }

@scorer(metrics=[groundedness_thoroughness_metric])
class FactComparatorScorer:
    """Scorer that rates an answer against a target using ``FactComparator``."""

    def __init__(self, model):
        """Keep the model and build the comparator that does the work."""
        self.model = model
        self.fact_comparator = FactComparator(model)

    async def __call__(self, answer: str, target: Target):
        """Score ``answer`` against ``target.text``.

        The target text is used as the context. The returned ``Score`` has the
        two metrics as its value and the stringified pipeline result as its
        explanation.
        """
        comparison = await self.fact_comparator.process_data(target.text, answer)
        computed = self.fact_comparator.calculate_metrics(comparison["comparison_result"])

        score_value = ScorerValue(computed["groundedness"], computed["thoroughness"])
        return Score(value=score_value, explanation=str(comparison))

In [47]:
# Assuming you have the necessary data
answer = "The fox is brown."
target_text = "The fox is brown."

# Create a Target object
target = Target(target_text)

# Create an instance of the scorer
model = InspectChatModel()
fact_comparator_scorer = FactComparatorScorer(model)

# Call the scorer
score = await fact_comparator_scorer(answer, target)

# Access the score value and explanation

scorer_value = score.value
groundedness = scorer_value[0]
thoroughness = scorer_value[0]
explanation = score.explanation

['100.0', '50.0']