<img src="https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg" width="250"/>

# Defining a Custom Metric in Opik.

In this lesson, we will define a custom metric called Factuality. You can use OpenAI or open source models via LiteLLM.

# Imports & Configuration

In [1]:
! pip install comet-ml opik openai litellm --quiet

In [2]:
from opik import Opik, track
from opik.evaluation import evaluate
from opik.integrations.openai import track_openai
from opik.evaluation.metrics import base_metric, score_result
import openai
import os
from datetime import datetime
from getpass import getpass
import litellm
from litellm.integrations.opik.opik import OpikLogger
from opik.opik_context import get_current_span_data
from opik.evaluation.models import litellm_chat_model

opik_logger = OpikLogger()
# In order to log LiteLLM traces to Opik, you will need to set the Opik callback
litellm.callbacks = [opik_logger]


# Define project name to enable tracing
os.environ["OPIK_PROJECT_NAME"] = "food_chatbot_eval"

* 'fields' has been removed


In [3]:
# opik configs
if "OPIK_API_KEY" not in os.environ:
    os.environ["OPIK_API_KEY"] = getpass("Enter your Opik API key: ")

Enter your Opik API key: ··········


In [None]:
# openai configs
#if "OPENAI_API_KEY" not in os.environ:
#    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


In [4]:
import opik

opik.configure(use_local=False)

OPIK: Opik is already configured. You can check the settings by viewing the config file at /root/.opik.config


# Templates & Context

In [5]:
# menu items
menu_items = """
Menu: Kids Menu
Food Item: Mini Cheeseburger
Price: $6.99
Vegan: N
Popularity: 4/5
Included: Mini beef patty, cheese, lettuce, tomato, and fries.

Menu: Appetizers
Food Item: Loaded Potato Skins
Price: $8.99
Vegan: N
Popularity: 3/5
Included: Crispy potato skins filled with cheese, bacon bits, and served with sour cream.

Menu: Appetizers
Food Item: Bruschetta
Price: $7.99
Vegan: Y
Popularity: 4/5
Included: Toasted baguette slices topped with fresh tomatoes, basil, garlic, and balsamic glaze.

Menu: Main Menu
Food Item: Grilled Chicken Caesar Salad
Price: $12.99
Vegan: N
Popularity: 4/5
Included: Grilled chicken breast, romaine lettuce, Parmesan cheese, croutons, and Caesar dressing.

Menu: Main Menu
Food Item: Classic Cheese Pizza
Price: $10.99
Vegan: N
Popularity: 5/5
Included: Thin-crust pizza topped with tomato sauce, mozzarella cheese, and fresh basil.

Menu: Main Menu
Food Item: Spaghetti Bolognese
Price: $14.99
Vegan: N
Popularity: 4/5
Included: Pasta tossed in a savory meat sauce made with ground beef, tomatoes, onions, and herbs.

Menu: Vegan Options
Food Item: Veggie Wrap
Price: $9.99
Vegan: Y
Popularity: 3/5
Included: Grilled vegetables, hummus, mixed greens, and a wrap served with a side of sweet potato fries.

Menu: Vegan Options
Food Item: Vegan Beyond Burger
Price: $11.99
Vegan: Y
Popularity: 4/5
Included: Plant-based patty, vegan cheese, lettuce, tomato, onion, and a choice of regular or sweet potato fries.

Menu: Desserts
Food Item: Chocolate Lava Cake
Price: $6.99
Vegan: N
Popularity: 5/5
Included: Warm chocolate cake with a gooey molten center, served with vanilla ice cream.

Menu: Desserts
Food Item: Fresh Berry Parfait
Price: $5.99
Vegan: Y
Popularity: 4/5
Included: Layers of mixed berries, granola, and vegan coconut yogurt.
"""


In [6]:
# prompt template for the Factuality metric
prompt_template = """
###INSTRUCTIONS###

You are a helpful assistant who should evaluate if a food chatbot's response is factual given user requests and a menu (delimited by +++++). Output 1 if the chatbot response is factually answering the user message and 0 if it doesn't.

+++++
{menu_items}
+++++

###EXAMPLE OUTPUT FORMAT###
{{
    "value": 0,
    "reason": "The response is not factually answering the user question."
}}

###INPUTS:###
{user_message}

###RESPONSE:###
{chatbot_response}
"""


In [7]:
question_template = """Answer a question about the following menu:

# MENU
{menu}

# QUESTION
{question}
"""

# Dataset

In [8]:
# Create or get the dataset
client = Opik()
dataset = client.get_or_create_dataset(name="foodchatbot_eval")

## Optional: Download Dataset From Comet

If you have not previously created the `foodchatbot_eval` dataset in your Opik workspace, run the following code to download the dataset as a Comet Artifact and populate your Opik dataset.

If you have already created the `foodchatbot_eval` dataset, you can skip to the next section

In [None]:
import comet_ml

In [None]:
experiment = comet_ml.start(project_name="foodchatbot_eval")

logged_artifact = experiment.get_artifact(artifact_name="foodchatbot_eval",
                                        workspace="examples")
local_artifact = logged_artifact.download("./")
experiment.end()

In [None]:
import csv
import json
# Read the CSV file and insert items into the dataset
with open('./foodchatbot_clean_eval_dataset.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        index, question, response = row
        item = {
            "index": index,
            "question": question,
            "response": response
        }

        dataset.insert([item])

# Build Your Application --> if using OpenAI

In [None]:
# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)
class LLMClient:
  def __init__(self, client_type: str ="openai", model: str ="gpt-4o-mini"):
    self.client_type = client_type
    self.model = model

    if self.client_type == "openai":
      self.client = track_openai(openai.OpenAI())

    else:
      self.client = None

  # LiteLLM query function
  def _get_litellm_response(self, query: str, system: str = "You are a helpful assistant."):
    messages = [
        {"role": "system", "content": system },
        { "role": "user", "content": query }
    ]

    response = litellm.completion(
        model=self.model,
        messages=messages
    )

    return response.choices[0].message.content

  # OpenAI query function - use **kwargs to pass arguments like temperature
  def _get_openai_response(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
    messages = [
        {"role": "system", "content": system },
        { "role": "user", "content": query }
    ]

    response = self.client.chat.completions.create(
        model=self.model,
        messages=messages,
        **kwargs
    )

    return response.choices[0].message.content


  def query(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
    if self.client_type == 'openai':
      return self._get_openai_response(query, system, **kwargs)

    else:
      return self._get_litellm_response(query, system)





In [None]:
# Initialize your client!

llm_client = LLMClient()

# Evaluation  --> using OpenAI

In [None]:
# Define the Factuality Metric
class Factuality(base_metric.BaseMetric):
    def __init__(self, name: str):
        self.name = name

    def score(self, input: str, output: str, context: str, reference: str):
        response = llm_client.query(prompt_template.format(menu_items=context, user_message=input, chatbot_response=output))

        response = eval(response)

        return score_result.ScoreResult(
            value=response["value"],
            name=self.name,
            reason=response["reason"]
        )


In [None]:
@track
def chatbot_application(input: str) -> str:
    response = llm_client.query(question_template.format(menu=menu_items, question=input))
    return response


In [None]:
# Define the evaluation task
def evaluation_task(x: DatasetItem):
    return {
        "input": x['question'],
        "output": chatbot_application(x['question']),
        "context": menu_items,
        "reference": x['response']
    }


In [None]:
client = Opik()

In [None]:
# Define the metrics
metrics = [Factuality("Factuality")]

In [None]:
# Run evaluation
experiment_name = "gpt-4o-mini" + "_" + dataset.name + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

evaluation = evaluate(
    experiment_name=experiment_name,
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_config={
        "model": "gpt-4o-mini"
    }
)

# Build Application with LiteLLM llama3.2 and Evaluate

In [9]:
# Hugging Face Configs to access meta-llama-3.2 model
if "HF_TOKEN" not in os.environ:
  os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face Key: ")

Enter your Hugging Face Key: ··········


In [10]:
# meta-llama from HuggingFace
MODEL = "huggingface/meta-llama/Llama-3.2-3B-Instruct"

In [21]:
# Define the Factuality Metric
class Factuality(base_metric.BaseMetric):
    def __init__(self, name: str, model: str = "huggingface/meta-llama/Llama-3.2-3B-Instruct"):
        self.name = name
        self.llm_client = litellm_chat_model.LiteLLMChatModel(model_name=model)

    def score(self, input: str, output: str, context: str, reference: str = None, **kwargs):
        # Generate response from the LLM
        response = self.llm_client.generate_string(prompt_template.format(menu_items=context, user_message=input, chatbot_response=output))
        response = eval(response)

        return score_result.ScoreResult(
            value=response["value"],
            name=self.name,
            reason=response["reason"]
        )


In [22]:
@track
def chatbot_application(input: str) -> str:
    response = litellm.completion(
        model=MODEL,
        messages=[
            {"role":"system", "content":"You are a helpful assistant."},
            {"role":"user", "content":question_template.format(menu=menu_items, question=input)}
        ]
    )
    return response.choices[0].message.content

In [23]:
# Define the evaluation task
def evaluation_task(x):                       # (x: DatasetItem):
    return {
        "input": x['question'],
        "output": chatbot_application(x['question']),
        "context": menu_items,
        "reference": x['response']
    }

In [24]:
# Define the metrics
metrics = [Factuality("Factuality")]

In [25]:
# Run evaluation
experiment_name = MODEL + "_" + dataset.name + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

evaluation = evaluate(
    experiment_name=experiment_name,
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_config={
        "model": MODEL
    }
)

Evaluation:  19%|█▉        | 11/57 [00:02<00:04,  9.93it/s]OPIK: Failed to compute metric Factuality. Score result will be marked as failed.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/opik/evaluation/scorer.py", line 37, in _score_test_case
    result = metric.score(**score_kwargs)
  File "<ipython-input-21-a4dc4d035c48>", line 10, in score
    response = eval(response)
  File "<string>", line 2
    Is the Vegan Beyond Burger a popular choice?
       ^^^
SyntaxError: invalid syntax
Evaluation:  26%|██▋       | 15/57 [00:02<00:04,  8.53it/s]OPIK: Failed to compute metric Factuality. Score result will be marked as failed.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/opik/evaluation/scorer.py", line 37, in _score_test_case
    result = metric.score(**score_kwargs)
  File "<ipython-input-21-a4dc4d035c48>", line 10, in score
    response = eval(response)
  File "<string>", line 2
    The response claims that there