## Imports

In [1]:
%pip install langchain
%pip install ipywidgets
%pip install langchain-openai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip




## Functions

In [5]:
import json
from typing import Any, Dict, Protocol, cast, runtime_checkable

from langchain_core.callbacks import AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage, BaseMessage, FunctionMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_core.messages import ToolCall as LCToolCall
from langchain_core.outputs import ChatGeneration, ChatResult
from pydantic.v1 import Field
from typing_extensions import override

from inspect_ai.model import (
    ChatMessage,
    ChatMessageAssistant,
    ChatMessageSystem,
    ChatMessageTool,
    ChatMessageUser,
    Content,
    ContentImage,
    ContentText,
    GenerateConfig,
    ModelName,
    ModelOutput,
    ToolCall,
    ToolChoice,
    ToolInfo,
    ToolParam,
    get_model,
)
from inspect_ai.solver import Generate, Solver, TaskState

@runtime_checkable
class LangChainAgent(Protocol):
    """Structural type for an agent object that is awaited with a chat model.

    Implementers expose an async ``__call__`` taking the chat model (``llm``)
    and an ``input`` mapping.
    """

    async def __call__(
        self,
        llm: BaseChatModel,
        input: dict[str, Any],
    ) -> str | list[str | dict[str, Any]]:
        """Run the agent against ``llm`` using the values in ``input``."""


def langchain_solver(agent: LangChainAgent) -> Solver:
    """Wrap a LangChain agent in a solver coroutine.

    The returned ``solve`` creates an ``InspectChatModel`` bridge, awaits the
    agent with the task's user prompt and chat history, and then copies the
    messages and model output recorded by the bridge onto the task state.
    """

    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # bridge through which the agent reaches the inspect model api
        bridge = InspectChatModel()

        # every message after the first is handed over as chat history
        agent_input = {
            "input": state.user_prompt.text,
            "chat_history": as_langchain_chat_history(state.messages[1:]),
        }
        await agent(llm=bridge, input=agent_input)

        # mirror what the bridge recorded back onto the task state
        state.messages = bridge.messages
        state.output = bridge.output
        return state

    return solve


class InspectChatModel(BaseChatModel):
    """LangChain chat model that forwards generation to ``get_model()``."""

    # The latest conversation and model output are kept on the instance so
    # the inspect task state can be updated once the agent has finished.
    messages: list[ChatMessage] = Field(default=[], exclude=True)
    output: ModelOutput = Field(default=ModelOutput(), exclude=True)

    @property
    def _llm_type(self) -> str:
        api = ModelName(get_model()).api
        return f"Inspect ({api})"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        name = ModelName(get_model()).name
        return {"model_name": str(name)}

    @override
    def _generate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: CallbackManagerForLLMRun | None = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Synchronous generation is not provided; inspect uses async exclusively."""
        raise NotImplementedError

    @override
    async def _agenerate(
        self,
        messages: list[BaseMessage],
        stop: list[str] | None = None,
        run_manager: AsyncCallbackManagerForLLMRun | None = None,
        **kwargs: dict[str, Any],
    ) -> ChatResult:
        """Generate a reply for ``messages`` through the inspect model.

        Tool definitions found under ``kwargs["tools"]`` are converted to
        ``ToolInfo`` objects, and ``stop`` is passed on as stop sequences.
        The converted input plus the first choice's message, and the raw
        model output, are stored on the instance.
        """
        # translate tool definitions passed through kwargs (if any)
        lc_tools = cast(list[dict[str, Any]] | None, kwargs.get("tools", None))
        tools: list[ToolInfo] = []
        tool_choice: ToolChoice | None = None
        if lc_tools:
            for lc_tool in lc_tools:
                spec = lc_tool["function"]
                tools.append(
                    ToolInfo(
                        name=spec["name"],
                        description=spec["description"],
                        params=as_inspect_tool_params(spec["parameters"]),
                    )
                )
            tool_choice = "auto"

        # run the generation against the inspect model
        inspect_messages = [as_inspect_message(item) for item in messages]
        result = await get_model().generate(
            input=inspect_messages,
            tools=tools,
            tool_choice=tool_choice,
            config=GenerateConfig(stop_seqs=stop),
        )

        # remember the conversation (with the first choice's reply) and output
        self.messages = inspect_messages
        self.messages.append(result.choices[0].message)
        self.output = result

        # hand every choice back to langchain as a generation
        generations = []
        for choice in result.choices:
            generations.append(
                ChatGeneration(message=as_langchain_message(choice.message))
            )
        return ChatResult(generations=generations)


def as_inspect_message(message: BaseMessage) -> ChatMessage:
    """Convert a LangChain message into the corresponding inspect chat message.

    Raises:
        ValueError: If ``message`` is not one of the handled message classes.
    """
    if isinstance(message, SystemMessage):
        return ChatMessageSystem(content=as_inspect_content(message.content))

    if isinstance(message, HumanMessage):
        return ChatMessageUser(content=as_inspect_content(message.content))

    if isinstance(message, AIMessage):
        content = as_inspect_content(message.content)
        tool_calls = None
        if message.tool_calls:
            tool_calls = [
                ToolCall(
                    type="function",
                    function=call["name"],
                    # a call without an id is identified by its function name
                    id=call["id"] or call["name"],
                    arguments=call["args"],
                )
                for call in message.tool_calls
            ]
        return ChatMessageAssistant(content=content, tool_calls=tool_calls)

    if isinstance(message, ToolMessage):
        return ChatMessageTool(
            content=as_inspect_content(message.content),
            tool_call_id=message.tool_call_id,
        )

    if isinstance(message, FunctionMessage):
        # function messages use their name as the tool call id
        return ChatMessageTool(
            content=as_inspect_content(message.content),
            tool_call_id=message.name,
        )

    raise ValueError(f"Unexpected message type: {type(message)}")


def as_langchain_message(message: ChatMessage) -> BaseMessage:
    """Convert an inspect chat message into the corresponding LangChain message.

    Assistant tool calls are emitted twice: as ``tool_calls`` on the
    ``AIMessage`` and, JSON-encoded, under ``additional_kwargs["tool_calls"]``.

    Raises:
        ValueError: If ``message`` is not one of the handled message classes.
    """
    if isinstance(message, ChatMessageSystem):
        return SystemMessage(content=as_langchain_content(message.content))

    if isinstance(message, ChatMessageUser):
        return HumanMessage(content=as_langchain_content(message.content))

    if isinstance(message, ChatMessageAssistant):
        calls = message.tool_calls or []

        additional_kwargs: dict[str, Any] = {}
        if calls:
            additional_kwargs["tool_calls"] = [
                {
                    "id": call.id,
                    "name": call.function,
                    "arguments": json.dumps(call.arguments),
                }
                for call in calls
            ]

        return AIMessage(
            content=as_langchain_content(message.content),
            tool_calls=[
                LCToolCall(id=call.id, name=call.function, args=call.arguments)
                for call in calls
            ],
            additional_kwargs=additional_kwargs,
        )

    if isinstance(message, ChatMessageTool):
        return ToolMessage(
            content=as_langchain_content(message.content),
            # a missing tool call id is passed on as an empty string
            tool_call_id=message.tool_call_id or "",
        )

    raise ValueError(f"Unexpected message type: {type(message)}")


def as_langchain_chat_history(messages: list[ChatMessage]) -> list[dict[str, Any]]:
    """Reduce each message to a ``{"role", "content"}`` dict using its text."""
    history: list[dict[str, Any]] = []
    for message in messages:
        history.append({"role": message.role, "content": message.text})
    return history


def as_inspect_content(
    content: str | list[str | dict[str, Any]],
) -> str | list[Content]:
    """Convert LangChain message content into inspect content.

    A plain string is returned as is. In a list, strings and dicts whose
    ``"type"`` is ``"text"`` become ``ContentText``; any other dict is read
    through its ``"image"`` key and becomes ``ContentImage``.
    """
    if isinstance(content, str):
        return content

    def convert(part: str | dict[str, Any]) -> Content:
        if isinstance(part, str):
            return ContentText(text=part)
        if part["type"] == "text":
            return ContentText(text=part["text"])
        return ContentImage(image=part["image"])

    return [convert(part) for part in content]


def as_inspect_tool_params(parameters: dict[str, Any]) -> list[ToolParam]:
    """Translate a JSON-schema style ``parameters`` object into tool params.

    Args:
        parameters: Schema mapping. Each entry of its ``properties`` is read
            for ``type`` and for ``description`` (falling back to ``title``);
            its ``required`` list, when present, marks mandatory params.

    Returns:
        One ``ToolParam`` per property, in the order the schema lists them.

    Raises:
        KeyError: If a property has no ``type`` entry.
    """
    # JSON Schema leaves out "required" when every property is optional and
    # "properties" when there are none, so neither key can be assumed.
    required = set(parameters.get("required") or ())
    properties = parameters.get("properties") or {}

    return [
        ToolParam(
            name=key,
            type=param["type"],
            description=param.get("description", param.get("title")),
            optional=key not in required,
        )
        for key, param in properties.items()
    ]


def as_langchain_content(
    content: str | list[Content],
) -> str | list[str | dict[str, Any]]:
    """Convert inspect content into LangChain message content.

    A plain string is returned as is; in a list, strings are kept and every
    other item is replaced by its ``model_dump()`` dict.
    """
    if isinstance(content, str):
        return content

    parts: list[str | dict[str, Any]] = []
    for item in content:
        parts.append(item if isinstance(item, str) else item.model_dump())
    return parts

In [18]:
import random
from inspect_ai import eval, Task, task
from inspect_ai.model import get_model
from inspect_ai.solver import TaskState, generate, system_message
from inspect_ai.scorer import Score, Scorer, Target, metric, scorer
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
import pandas as pd
import asyncio
from inspect_ai.dataset import Sample


class FactComparator:
    """Compare the facts of a context and an answer with the help of a chat model.

    Both texts are first split into fact lists by the model; the model is then
    asked which facts are shared or unique, and that reply is parsed into a
    ``ComparisonResult``.
    """

    def __init__(self, model):
        self.model = model
        self.parser = PydanticOutputParser(pydantic_object=ComparisonResult)

    async def __call__(self, context, answer):
        """Shorthand for ``process_data(context, answer)``."""
        return await self.process_data(context, answer)

    async def _generate_text(self, prompt_text):
        """Send ``prompt_text`` as one human message; return the first generation's text."""
        result = await self.model._agenerate([HumanMessage(content=prompt_text)])
        return result.generations[0].text

    async def process_data(self, context, answer):
        """Extract fact lists for both texts and compare them.

        Returns a dict with the raw ``context_list`` and ``answer_list``
        replies and the parsed ``comparison_result``.
        """
        context_list = await self._generate_text(
            self._parse_prompt().format(text=context)
        )
        answer_list = await self._generate_text(
            self._parse_prompt().format(text=answer)
        )

        comparison_reply = await self._generate_text(
            self._compare_prompt().format(
                context_list=context_list, answer_list=answer_list
            )
        )
        comparison_result = self.parser.parse(comparison_reply)

        return {
            "context_list": context_list,
            "answer_list": answer_list,
            "comparison_result": comparison_result,
        }

    def calculate_metrics(self, comparison_result):
        """Derive percentage metrics from a comparison result.

        ``groundedness`` is the share of answer facts that are also in the
        context; ``thoroughness`` is the share of context facts that are also
        in the answer. A side with no facts yields 0.
        """
        shared = len(comparison_result.facts_in_both)
        answer_total = shared + len(comparison_result.facts_only_in_answer)
        context_total = shared + len(comparison_result.facts_only_in_context)

        if answer_total > 0:
            groundedness = shared / answer_total * 100
        else:
            groundedness = 0

        if context_total > 0:
            thoroughness = shared / context_total * 100
        else:
            thoroughness = 0

        return {
            "groundedness": groundedness,
            "thoroughness": thoroughness,
        }

    @staticmethod
    def _parse_prompt():
        """Prompt template asking the model to split ``{text}`` into single facts."""
        template = """
            Here is a text that may contain one or more facts:

            <text>
            {text}
            </text>

            Please parse this text into a list of individual facts. If a sentence contains multiple facts, break it up into separate sentences as needed so that each sentence contains only one fact.

            If any of the facts contain pronouns and the pronoun reference is clear, replace the pronoun with the noun it refers to. If the pronoun reference is ambiguous, leave the pronoun as is.

        Return the final list of parsed and pronoun-replaced facts inside <facts> tags, with each fact on its own line. Do not include any additional commentary or explanation, including about pronoun changes, number of facts, or truth value of the facts.
        """
        return PromptTemplate(input_variables=["text"], template=template)

    @staticmethod
    def _compare_prompt():
        """Prompt template asking the model to sort facts into shared and unique lists."""
        template = """
            You will be comparing facts between a context and an answer to determine which facts are shared and which are unique to each.

            Here is the context:

            <context>
            {context_list}
            </context>

            And here is the answer: 

            <answer>
            {answer_list}
            </answer>

            Carefully analyze the facts presented in the context and answer, focusing on the semantic meaning rather than the exact wording.

            Then, output a dictionary with the following keys and corresponding lists of facts as values:

            1. "facts_in_both": A list of facts that are present in both the context and the answer

            2. "facts_only_in_answer": A list of facts that are only present in the answer 

            3. "facts_only_in_context": A list of facts that are only present in the context

            Remember, the facts do not need to be worded identically to be considered the same. Focus on whether the core meaning is shared or unique.  A fact in the context may be expressed in different terms in the answer, or multiple facts in one may combine to express a single fact in the other.

            Provide your results in this format:

            {{
                "facts_in_both": [
                    "Fact 1 present in both",
                    "Fact 2 present in both"
                ],
                "facts_only_in_answer": [
                    "Fact 1 only in answer",
                    "Fact 2 only in answer"  
                ],
                "facts_only_in_context": [
                    "Fact 1 only in context",
                    "Fact 2 only in context"
                ]
            }}
            """
        return PromptTemplate(
            input_variables=["context_list", "answer_list"],
            template=template,
        )


class ComparisonResult(BaseModel):
    """Parsed outcome of a fact comparison: shared facts and facts unique to each side."""

    facts_in_both: list[str] = Field(
        default_factory=list,
        description="List of facts present in both context and answer",
    )
    facts_only_in_answer: list[str] = Field(
        default_factory=list,
        description="List of facts only present in the answer",
    )
    facts_only_in_context: list[str] = Field(
        default_factory=list,
        description="List of facts only present in the context",
    )

## Run on First Pair of Statements

['The fox runs quickly.', "The fox's best friend is Sally.", 'Sally is a cat.']

In [23]:
# IPython magics: set the environment variables that name the model.
# NOTE(review): presumably get_model() resolves its model from these — confirm.
%env INSPECT_EVAL_MODEL=openai/gpt-4
%env INSPECT_MODEL_NAME=openai/gpt-4

# Build the LangChain-facing bridge. No model is passed in here; the bridge
# calls get_model() whenever it generates.
inspect_model = InspectChatModel()

# The comparator sends all of its prompts through the bridge model
comparator = FactComparator(inspect_model)


# Two statements sharing most facts but disagreeing on the fox's colour
context = "The fox is brown. It runs quickly. The fox's best friend is Sally, which is a cat."
answer = "The fox is tan. It runs fast. Its best friend is a cat. She's named Sally."

# Calling the comparator delegates to process_data; the top-level await
# relies on the notebook runtime.
result = await comparator(context, answer)

# Turn the parsed comparison into groundedness / thoroughness percentages
metrics = comparator.calculate_metrics(result["comparison_result"])


# Show the raw fact lists, the parsed comparison and the metrics
print("\nContext list:")
print(result["context_list"])

print("\nAnswer list:")
print(result["answer_list"])

print("\nComparison result:")
print(result["comparison_result"])

print("\nMetrics:")
print(f"Groundedness: {metrics['groundedness']:.2f}%")
print(f"Thoroughness: {metrics['thoroughness']:.2f}%")

env: INSPECT_EVAL_MODEL=openai/gpt-4
env: INSPECT_MODEL_NAME=openai/gpt-4

Context list:
<facts>
The fox is brown.
The fox runs quickly.
The fox's best friend is Sally.
Sally is a cat.
</facts>

Answer list:
<facts>
The fox is tan.
The fox runs fast.
The fox's best friend is a cat.
The cat's name is Sally.
</facts>

Comparison result:
facts_in_both=['The fox runs quickly.', "The fox's best friend is Sally.", 'Sally is a cat.'] facts_only_in_answer=['The fox is tan.'] facts_only_in_context=['The fox is brown.']

Metrics:
Groundedness: 75.00%
Thoroughness: 75.00%


In [73]:
from typing import Dict, Tuple
import pandas as pd
from ast import literal_eval

class ModelComparator:
    """Run a ``FactComparator`` on a statement pair and flatten the outcome into one row."""

    def __init__(self, model):
        # NOTE: ``model`` is accepted but not used; the bridge model is built
        # without arguments.
        self.inspect_model = InspectChatModel()
        self.comparator = FactComparator(self.inspect_model)

    async def run_and_compare(self, target_statement, input_statement):
        """Compare the two statements and return a flat result dict.

        The target statement is passed as the comparator's context and the
        input statement as its answer. Any exception raised while comparing
        is captured: every result field is then ``None`` and the message is
        stored under ``'Model Error'``.
        """
        try:
            result = await self.comparator(target_statement, input_statement)
            metrics = self.comparator.calculate_metrics(result["comparison_result"])
            groundedness, thoroughness = metrics['groundedness'], metrics['thoroughness']
            context_list, answer_list = result["context_list"], result["answer_list"]
            comparison = result["comparison_result"]
            error = None
        except Exception as exc:
            groundedness = thoroughness = None
            context_list = answer_list = comparison = None
            error = str(exc)

        # fact lists are only available when the comparison succeeded
        if comparison:
            in_both = comparison.facts_in_both
            only_answer = comparison.facts_only_in_answer
            only_context = comparison.facts_only_in_context
        else:
            in_both = only_answer = only_context = None

        return {
            'Groundedness (Model)': groundedness,
            'Thoroughness (Model)': thoroughness,
            'Context List': context_list,
            'Answer List': answer_list,
            'Facts in Both': in_both,
            'Facts Only in Answer': only_answer,
            'Facts Only in Context': only_context,
            'Model Error': error,
        }

def compare_metrics(cases: Dict[str, Dict[str, Tuple[str, str, Dict[str, float], str]]]):
    """Run the model comparison for every case and tabulate the results.

    Each value of ``cases`` is read as a mapping with the keys ``'input'``,
    ``'target'``, ``'true_metrics'`` (holding ``'groundedness'`` and
    ``'thoroughness'``) and ``'description'``. Every case is run to
    completion with ``asyncio.run`` before the next one starts.

    Returns:
        A DataFrame with one row per case: the case fields, the expected
        metrics and the columns produced by ``ModelComparator.run_and_compare``.
    """
    model_comparator = ModelComparator(model='openai/gpt-4')
    rows = []

    for case_name, case_data in cases.items():
        input_statement = case_data['input']
        target_statement = case_data['target']
        expected = case_data['true_metrics']
        description = case_data['description']

        model_results = asyncio.run(
            model_comparator.run_and_compare(target_statement, input_statement)
        )

        row = {
            'Case': case_name,
            'Input Statement': input_statement,
            'Target Statement': target_statement,
            'Description': description,
            'Groundedness (True)': expected['groundedness'],
            'Thoroughness (True)': expected['thoroughness'],
        }
        row.update(model_results)
        rows.append(row)

    return pd.DataFrame(rows)


In [61]:
# Example usage
# Hand-labelled statement pairs: each case carries the statement under test
# ("input"), the reference statement ("target"), the expected metric values
# and a short note on what the case exercises.
cases = {
    "case1": dict(
        input="The Sun is a medium-sized star. It's about 4.6 billion years old.",
        target="The sun is approximately 4.6 billion years old. It's a mid-sized star.",
        true_metrics=dict(groundedness=100, thoroughness=100),
        description="This is a basic use case with pronouns and mild rephrasing.",
    ),
    "case2": dict(
        input="The Sun, a medium-sized star, is located at the center of our Solar System and is approximately 4.6 billion years old.",
        target="The sun is a mid-sized star which has existed for about 4.6 billion years.",
        true_metrics=dict(groundedness=67, thoroughness=100),
        description="This is a basic use case with mild rephrasing.",
    ),
    "case3": dict(
        input="Sally is Rachel's cat.",
        target="Sally is a cat. Rachel is her owner.",
        true_metrics=dict(groundedness=100, thoroughness=100),
        description="This case involves simple restructuring and clarification.",
    ),
    "case4": dict(
        input="Sally is larger than Stan.",
        target="Stan is smaller than Sally.",
        true_metrics=dict(groundedness=100, thoroughness=100),
        description="This case demonstrates a change in comparative perspective.",
    ),
    "case5": dict(
        input="the average temperature today is 20 degrees celsius.",
        target="the mean temperature today is 68 degrees fahrenheit.",
        true_metrics=dict(groundedness=100, thoroughness=100),
        description="This case involves unit conversion and synonym use.",
    ),
    # NOTE(review): the description below repeats case5's although both
    # statements here use celsius — confirm the intended wording.
    "case6": dict(
        input="the average temperature today is 20 degrees celsius.",
        target="the average temperature today is 50 degrees celsius.",
        true_metrics=dict(groundedness=0, thoroughness=0),
        description="This case involves unit conversion and synonym use.",
    ),
    "case7": dict(
        input="The company has an ATO now, so they have been sanctioned by the government and you can work with them.",
        target="The company has been sanctioned by the government in response to recent lawbreaking activity.",
        true_metrics=dict(groundedness=0, thoroughness=0),  # Contextual misuse
        description='This case uses "sanctioned" in a way that highlights its dual meaning: approved or penalized.',
    ),
    # Add more cases as needed
}


# IPython magics: set the environment variables that name the model.
%env INSPECT_EVAL_MODEL=openai/gpt-4
%env INSPECT_MODEL_NAME=openai/gpt-4

import asyncio

# Run every case through the model and collect one DataFrame row per case
df = compare_metrics(cases)

env: INSPECT_EVAL_MODEL=openai/gpt-4
env: INSPECT_MODEL_NAME=openai/gpt-4


In [62]:
df

Unnamed: 0,Case,Input Statement,Target Statement,Description,Groundedness (True),Thoroughness (True),Groundedness (Model),Thoroughness (Model),Context List,Answer List,Facts in Both,Facts Only in Answer,Facts Only in Context,Model Error
0,case1,The Sun is a medium-sized star. It's about 4.6...,The sun is approximately 4.6 billion years old...,This is a basic use case with pronouns and mil...,100,100,100.0,100.0,<facts>\nThe sun is approximately 4.6 billion ...,<facts>\nThe Sun is a medium-sized star.\nThe ...,[The sun is approximately 4.6 billion years ol...,[],[],
1,case2,"The Sun, a medium-sized star, is located at th...",The sun is a mid-sized star which has existed ...,This is a basic use case with mild rephrasing.,67,100,66.666667,100.0,<facts>\nThe sun is a mid-sized star.\nThe sun...,<facts>\nThe Sun is a medium-sized star.\nThe ...,"[The sun is a mid-sized star., The sun has exi...",[The Sun is located at the center of our Solar...,[],
2,case3,Sally is Rachel's cat.,Sally is a cat. Rachel is her owner.,This case involves simple restructuring and cl...,100,100,100.0,100.0,<facts>\nSally is a cat.\nRachel is the owner ...,<facts>\nSally is Rachel's cat.\n</facts>,"[Sally is a cat., Rachel is the owner of Sally.]",[],[],
3,case4,Sally is larger than Stan.,Stan is smaller than Sally.,This case demonstrates a change in comparative...,100,100,100.0,100.0,<facts>\nStan is smaller than Sally.\n</facts>,<facts>\nSally is larger than Stan.\n</facts>,"[Stan is smaller than Sally, Sally is larger t...",[],[],
4,case5,the average temperature today is 20 degrees ce...,the mean temperature today is 68 degrees fahre...,This case involves unit conversion and synonym...,100,100,100.0,100.0,<facts>\nThe mean temperature today is 68 degr...,<facts>\nThe average temperature today is 20 d...,[The mean/average temperature today is 68 degr...,[],[],
5,case6,the average temperature today is 20 degrees ce...,the average temperature today is 50 degrees ce...,This case involves unit conversion and synonym...,0,0,0.0,0.0,<facts>\nThe average temperature today is 50 d...,<facts>\nThe average temperature today is 20 d...,[],[The average temperature today is 20 degrees c...,[The average temperature today is 50 degrees c...,
6,case7,"The company has an ATO now, so they have been ...",The company has been sanctioned by the governm...,"This case uses ""sanctioned"" in a way that high...",0,0,33.333333,50.0,<facts>\nThe company has been sanctioned by th...,<facts>\nThe company has an ATO now.\nThe comp...,[The company has been sanctioned by the govern...,"[The company has an ATO now, You can work with...",[The sanction was in response to recent lawbre...,


In [None]:

# Define your samples

# Labelled statement pairs as inspect samples. `Sample` has no `true_metrics`
# or `description` field, so these extras are carried in its `metadata`
# mapping instead of being passed as unknown keyword arguments.
samples = [
    Sample(
        input="The Sun is a medium-sized star. It's about 4.6 billion years old.",
        target="The sun is approximately 4.6 billion years old. It's a mid-sized star.",
        metadata={
            "true_metrics": {'groundedness': 100, 'thoroughness': 100},
            "description": "This is a basic use case with pronouns and mild rephrasing.",
        },
        id="case1"
    ),
    Sample(
        input="The Sun, a medium-sized star, is located at the center of our Solar System and is approximately 4.6 billion years old.",
        target="The sun is a mid-sized star which has existed for about 4.6 billion years.",
        metadata={
            "true_metrics": {'groundedness': 67, 'thoroughness': 100},
            "description": "This is a basic use case with mild rephrasing.",
        },
        id="case2"
    ),
    Sample(
        input="Sally is Rachel's cat.",
        target="Sally is a cat. Rachel is her owner.",
        metadata={
            "true_metrics": {'groundedness': 100, 'thoroughness': 100},
            "description": "This case involves simple restructuring and clarification.",
        },
        id="case3"
    ),
    Sample(
        input="Sally is larger than Stan.",
        target="Stan is smaller than Sally.",
        metadata={
            "true_metrics": {'groundedness': 100, 'thoroughness': 100},
            "description": "This case demonstrates a change in comparative perspective.",
        },
        id="case4"
    ),
    Sample(
        input="the average temperature today is 20 degrees celsius.",
        target="the mean temperature today is 68 degrees fahrenheit.",
        metadata={
            "true_metrics": {'groundedness': 100, 'thoroughness': 100},
            "description": "This case involves unit conversion and synonym use.",
        },
        id="case5"
    ),
    Sample(
        input="the average temperature today is 20 degrees celsius.",
        target="the average temperature today is 50 degrees celsius.",
        metadata={
            "true_metrics": {'groundedness': 0, 'thoroughness': 0},
            "description": "This case involves incorrect unit conversion and synonym use.",
        },
        id="case6"
    ),
    Sample(
        input="The company has an ATO now, so they have been sanctioned by the government and you can work with them.",
        target="The company has been sanctioned by the government in response to recent lawbreaking activity.",
        metadata={
            "true_metrics": {'groundedness': 0, 'thoroughness': 0},
            "description": "This case uses 'sanctioned' in a way that highlights its dual meaning: approved or penalized.",
        },
        id="case7"
    )
    # Add more samples as needed
]

In [84]:
# System prompt placed in front of every sample's question
SYSTEM_MESSAGE = "Please answer the question being asked."

# Question/reference pairs for the eval. `Sample` has no `description` field,
# so the note on each sample is carried in its `metadata` mapping.
samples = [
    Sample(
        input="How old is the sun?",
        target="The sun is approximately 4.6 billion years old. It's a mid-sized star.",
        metadata={"description": "Very basic question."},
        id="case1"
    ),
    Sample(
        input="What is the capital of France?",
        target="The capital of France is Paris.",
        metadata={"description": "Basic geography question."},
        id="case2"
    ),
    Sample(
        input="Explain the theory of relativity.",
        target="The theory of relativity, developed by Albert Einstein, includes the special and general theories. Special relativity introduces a consistent explanation for the speed of light, and general relativity provides a description of gravity as a curvature of spacetime caused by mass.",
        metadata={"description": "Complex scientific concept."},
        id="case3"
    ),
    Sample(
        input="What is photosynthesis?",
        target="Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize nutrients from carbon dioxide and water. It typically involves the green pigment chlorophyll and generates oxygen as a byproduct.",
        metadata={"description": "Basic biological process."},
        id="case4"
    ),
    Sample(
        input="Who wrote 'Pride and Prejudice'?",
        target="'Pride and Prejudice' was written by Jane Austen.",
        metadata={"description": "Question about literature."},
        id="case5"
    )
]


In [83]:
class FactComparatorScorer:
    """Score a task state by comparing the model's output with the target text."""

    def __init__(self, model):
        self.model = model
        self.fact_comparator = FactComparator(model)

    async def __call__(self, state: TaskState, target: Target | Sample):
        """Compare the model output against ``target`` and return a ``Score``.

        Args:
            state: Task state; the text of its first output choice is used,
                falling back to ``state.input`` when there is no such choice.
            target: Object whose ``target`` attribute holds the reference
                text, either as a string or as a list of strings.

        Returns:
            A ``Score`` whose value is a dict with ``groundedness`` and
            ``thoroughness``, and whose explanation holds the raw comparison
            data followed by the model output.
        """
        # Only a missing output/choice triggers the fallback; anything else
        # (including cancellation) must propagate.
        try:
            context = state.output.choices[0].message.text
        except (AttributeError, IndexError):
            context = state.input

        # The reference may be a single string or a list of strings; a list
        # is joined so the prompt receives text rather than a list repr.
        raw_target = target.target
        target_text = raw_target if isinstance(raw_target, str) else "".join(raw_target)

        # NOTE(review): the model output is passed as the comparator's
        # "context" and the target as its "answer", the reverse of how
        # ModelComparator.run_and_compare calls it — confirm which side
        # groundedness/thoroughness should be measured against.
        result = await self.fact_comparator.process_data(context, target_text)
        metrics = self.fact_comparator.calculate_metrics(result["comparison_result"])

        scorer_value = {
            "groundedness": metrics["groundedness"],
            "thoroughness": metrics["thoroughness"],
        }

        explanation = str(result) + f"\nModel Output: {context}"

        return Score(
            value=scorer_value,
            explanation=explanation,
        )
        
@metric
def thoroughness():
    """Metric: mean of ``metadata["thoroughness"]`` over all scores.

    Scores without metadata contribute zero but still count towards the
    denominator. An empty score list yields 0.0 instead of dividing by zero.
    """
    def metric(scores: list[Score]) -> float:
        if not scores:
            return 0.0
        total = sum(
            (
                float(item.metadata["thoroughness"])
                for item in scores
                if item.metadata is not None
            ),
            0.0,
        )
        return total / float(len(scores))
    return metric

@metric
def groundedness():
    """Metric: mean of ``metadata["groundedness"]`` over all scores.

    Scores without metadata contribute zero but still count towards the
    denominator. An empty score list yields 0.0 instead of dividing by zero.
    """
    def metric(scores: list[Score]) -> float:
        if not scores:
            return 0.0
        total = sum(
            (
                float(item.metadata["groundedness"])
                for item in scores
                if item.metadata is not None
            ),
            0.0,
        )
        return total / float(len(scores))
    return metric

    
@scorer(metrics=[groundedness(), thoroughness()])
def fact_comparator_scorer(model) -> Scorer:
    """Build a scorer that rates each sample on groundedness and thoroughness.

    Args:
        model: Not used; every scoring call builds its own
            ``InspectChatModel``. Kept so existing callers keep working.

    Returns:
        An async ``score`` function. Its ``Score`` has a ``"G:<g> : T:<t>"``
        value string, the model completion as answer, the comparison
        explanation, and metadata holding both metric values (read by the
        ``groundedness`` / ``thoroughness`` metrics) plus the explanation
        under ``"stuff"``.
    """

    async def score(state: TaskState, target: Target) -> Score:
        # Run the fact comparison through a fresh bridge model
        comparator_scorer = FactComparatorScorer(InspectChatModel())
        comparison = await comparator_scorer(state, target)

        grounded_score = comparison.value['groundedness']
        thorough_score = comparison.value['thoroughness']

        return Score(
            value=f"G:{grounded_score} : T:{thorough_score}",
            answer=state.output.completion,
            # surface the comparison details instead of a placeholder string
            explanation=comparison.explanation,
            metadata={
                "thoroughness": thorough_score,
                "groundedness": grounded_score,
                "stuff": comparison.explanation,
            },
        )

    return score


Output()

In [None]:
@task
def my_eval():
    """Task: answer each sample after the system message, then score with the fact comparator."""
    steps = [
        system_message(SYSTEM_MESSAGE),
        generate(),
    ]
    fact_scorer = fact_comparator_scorer(model=get_model())
    return Task(dataset=samples, plan=steps, scorer=fact_scorer)

# Run the evaluation against openai/gpt-4 when executed as the main module
if __name__ == "__main__":
    eval(my_eval(), model="openai/gpt-4")