<img src="https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg" width="250"/>

# Evaluation with Opik

In this exercise, you'll implement a basic evaluation pipeline with Opik. You can use OpenAI models or open-source models via LiteLLM.

# Imports & Configuration

In [1]:
! pip install opik openai comet_ml litellm --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/303.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.5/303.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/710.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m710.6/710.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m5.5/6.6 MB[0m [31m167.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.6/6.6 MB[0m [31m160.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━

In [2]:
# Imports & Configuration
import opik
from opik import Opik, track
from opik.evaluation import evaluate
from opik.evaluation.metrics import (IsJson)
from opik.integrations.openai import track_openai
import openai
import os
from datetime import datetime
from getpass import getpass
import litellm
from litellm.integrations.opik.opik import OpikLogger
from opik.opik_context import get_current_span_data

# Build the LiteLLM -> Opik callback logger.
# NOTE(review): this runs before the later cell that sets OPIK_API_KEY and calls
# opik.configure(); the '403 Forbidden' errors logged by OpikLogger in the
# evaluation output may be caused by the logger being built without
# credentials — confirm.
opik_logger = OpikLogger()
# In order to log LiteLLM traces to Opik, you will need to set the Opik callback
litellm.callbacks = [opik_logger]

# Define project name to enable tracing
# NOTE(review): set after the logger above was constructed — verify whether the
# logger picks this value up.
os.environ["OPIK_PROJECT_NAME"] = "food_chatbot_eval"

* 'fields' has been removed


In [4]:
# Opik configuration
# Ask for the API key only when the environment does not already provide it.
if "OPIK_API_KEY" not in os.environ:
  os.environ["OPIK_API_KEY"] = getpass("Enter your Opik API key: ")

opik.configure()

# Re-create and re-register the LiteLLM callback now that credentials exist.
# The logger built in the imports cell was constructed before the API key and
# workspace were available, and the evaluation run logged
# "OpikLogger failed to send batch - 403 Forbidden".
# NOTE(review): presumably OpikLogger reads its credentials at construction
# time — confirm against the LiteLLM Opik integration.
opik_logger = OpikLogger()
litellm.callbacks = [opik_logger]

Enter your Opik API key: ··········
Do you want to use "bluemusk" workspace? (Y/n)y


OPIK: Configuration saved to file: /root/.opik.config


In [None]:
# OpenAI configuration (ignore if you're using LiteLLM)
# Prompt for the key only when the environment does not already carry one.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Model name for the OpenAI path; a later cell reassigns MODEL for LiteLLM.
MODEL = "gpt-4o-mini"

In [5]:
# Hugging Face Configs to access meta-llama-3.2 model
# Prompt for the token only when HF_TOKEN is absent from the environment.
if os.environ.get("HF_TOKEN") is None:
  os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face Key: ")

Enter your Hugging Face Key: ··········


In [6]:
# Opik client used below to create/fetch the evaluation dataset.
client = Opik()

In [8]:
# LiteLLM model identifier passed to litellm.completion() and recorded in the
# experiment name/config. This overrides any MODEL set in the OpenAI cell.
MODEL = "huggingface/meta-llama/Llama-3.2-1B-Instruct"

# Dataset

In [7]:
# Create or get the dataset: the "foodchatbot_eval" dataset is the one passed
# to evaluate() below and populated by the optional download section.
dataset = client.get_or_create_dataset(name="foodchatbot_eval")

## Optional: Download Dataset From Comet

If you have not previously created the `foodchatbot_eval` dataset in your Opik workspace, run the following code to download the dataset as a Comet Artifact and populate your Opik dataset.

If you have already created the `foodchatbot_eval` dataset, you can skip to the next section

In [None]:
import comet_ml

In [None]:
# Open a Comet experiment so the artifact can be fetched through it.
experiment = comet_ml.start(project_name="foodchatbot_eval")

# Look up the "foodchatbot_eval" artifact in the "examples" workspace.
logged_artifact = experiment.get_artifact(
    artifact_name="foodchatbot_eval",
    workspace="examples",
)

# Download the artifact into the current working directory.
local_artifact = logged_artifact.download("./")

# Close the experiment once the download call has returned.
experiment.end()

In [None]:
import csv
import json
# Read the CSV file and insert items into the dataset.
# Rows are collected first and sent in a single insert() call instead of one
# call per row. Blank lines (which csv.reader yields as empty lists) are
# skipped so they do not break the three-column unpacking below.
# NOTE(review): every non-blank row is inserted as-is; if the CSV has a header
# row it will be inserted as an item too — confirm against the file.
items = []
with open('./foodchatbot_clean_eval_dataset.csv', newline='', encoding='utf-8') as csvfile:
    for row in csv.reader(csvfile):
        if not row:
            continue
        index, question, response = row
        items.append({
            "index": index,
            "question": question,
            "response": response
        })

# Nothing to send when the file contained no rows.
if items:
    dataset.insert(items)

# Templates & Prompts

In [9]:
# Menu items: the restaurant menu text that is substituted into the prompt
# template and also returned as the "context" field of each evaluation record.
# Each entry lists the menu section, item name, price, vegan flag (Y/N),
# popularity rating, and what is included.
menu_items = """
Menu: Kids Menu
Food Item: Mini Cheeseburger
Price: $6.99
Vegan: N
Popularity: 4/5
Included: Mini beef patty, cheese, lettuce, tomato, and fries.

Menu: Appetizers
Food Item: Loaded Potato Skins
Price: $8.99
Vegan: N
Popularity: 3/5
Included: Crispy potato skins filled with cheese, bacon bits, and served with sour cream.

Menu: Appetizers
Food Item: Bruschetta
Price: $7.99
Vegan: Y
Popularity: 4/5
Included: Toasted baguette slices topped with fresh tomatoes, basil, garlic, and balsamic glaze.

Menu: Main Menu
Food Item: Grilled Chicken Caesar Salad
Price: $12.99
Vegan: N
Popularity: 4/5
Included: Grilled chicken breast, romaine lettuce, Parmesan cheese, croutons, and Caesar dressing.

Menu: Main Menu
Food Item: Classic Cheese Pizza
Price: $10.99
Vegan: N
Popularity: 5/5
Included: Thin-crust pizza topped with tomato sauce, mozzarella cheese, and fresh basil.

Menu: Main Menu
Food Item: Spaghetti Bolognese
Price: $14.99
Vegan: N
Popularity: 4/5
Included: Pasta tossed in a savory meat sauce made with ground beef, tomatoes, onions, and herbs.

Menu: Vegan Options
Food Item: Veggie Wrap
Price: $9.99
Vegan: Y
Popularity: 3/5
Included: Grilled vegetables, hummus, mixed greens, and a wrap served with a side of sweet potato fries.

Menu: Vegan Options
Food Item: Vegan Beyond Burger
Price: $11.99
Vegan: Y
Popularity: 4/5
Included: Plant-based patty, vegan cheese, lettuce, tomato, onion, and a choice of regular or sweet potato fries.

Menu: Desserts
Food Item: Chocolate Lava Cake
Price: $6.99
Vegan: N
Popularity: 5/5
Included: Warm chocolate cake with a gooey molten center, served with vanilla ice cream.

Menu: Desserts
Food Item: Fresh Berry Parfait
Price: $5.99
Vegan: Y
Popularity: 4/5
Included: Layers of mixed berries, granola, and vegan coconut yogurt.
"""


In [12]:
# User-message template for the chatbot. It has two str.format() placeholders:
# {menu} (filled with menu_items) and {question} (filled with the user's
# question).
prompt_template = """Answer a question about the following menu:

# MENU
{menu}

# QUESTION
{question}
"""

In [11]:
# function call of llama3 using litellm
@track
def chatbot_application(input: str) -> str:
    """Answer a question about the menu using the LiteLLM model in ``MODEL``.

    Args:
        input: The user's question. It is substituted into ``prompt_template``
            together with ``menu_items`` to form the user message.

    Returns:
        The message content of the first choice in the completion response.
    """
    response = litellm.completion(
        model=MODEL,
        messages=[
            {"role":"system", "content":"You are a helpful assistant."},
            {"role":"user", "content":prompt_template.format(menu=menu_items, question=input)}
        ],
        # Hand the current @track span to the Opik callback so the LiteLLM call
        # is attached to this function's trace, per the Opik/LiteLLM integration
        # guide, instead of being logged as an unrelated trace.
        metadata={
            "opik": {
                "current_span_data": get_current_span_data(),
            },
        },
    )
    return response.choices[0].message.content

# Evaluation

In [13]:
# Define the evaluation task
def evaluation_task(x):
    """Run the chatbot on one dataset item and build the record to be scored.

    Args:
        x: A dataset item; its 'question' and 'response' keys are read.

    Returns:
        A dict with the question as "input", the chatbot answer as "output",
        the menu text as "context", and the item's 'response' as "reference".
    """
    question = x['question']
    answer = chatbot_application(question)
    return dict(
        input=question,
        output=answer,
        context=menu_items,
        reference=x['response'],
    )


In [14]:
# Scoring metrics applied to the output of every evaluation task.
metrics = [IsJson()]

# Experiment name: "<model>_<dataset name>_<timestamp>" so each run is distinct.
experiment_name = "_".join(
    [MODEL, dataset.name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")]
)

# Run the evaluation over the dataset and record the model in the config.
evaluation = evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_name=experiment_name,
    experiment_config={"model": MODEL},
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

OPIK: Started logging traces to the "food_chatbot_eval" project at https://www.comet.com/opik/bluemusk/redirect/projects?name=food_chatbot_eval.
Evaluation:   2%|▏         | 1/56 [00:15<14:04, 15.35s/it][92m18:26:18 - LiteLLM:ERROR[0m: opik.py:111 - OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 528, in post
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 509, in post
    response.raise_for_status()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_models.py", line 763, in 

ERROR:LiteLLM:OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 528, in post
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 509, in post
    response.raise_for_status()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_models.py", line 763, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://d

ERROR:LiteLLM:OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 528, in post
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 509, in post
    response.raise_for_status()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_models.py", line 763, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://d

[92m18:26:22 - LiteLLM:ERROR[0m: opik.py:111 - OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 528, in post
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 509, in post
    response.raise_for_status()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_models.py", line 763, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
Fo

* Refresh the Experiments section in Comet Opik to see the newly created evaluation and its metrics. You can annotate traces with different metrics/feedback scores from the Projects section of the Opik UI. This process is called human-in-the-loop annotation.
Once a feedback score (Quality, for example) has been annotated for a question in the Projects section, the human feedback score is also shown for that question in the Experiments section.

* Note: in your personal project, the `context` field returned by the `evaluation_task` function should contain the retrievals.

* The annotated scores (e.g. Quality) for each question and response are averaged, and the average score for the Quality metric is shown in the Opik UI.

# The section below is not necessary unless you are using OpenAI

# LLM Application


In [None]:
# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)
class LLMClient:
    """Minimal chat client that routes a query to OpenAI or to LiteLLM.

    Attributes are set in ``__init__``:
        client_type: "openai" selects the OpenAI path; any other value selects
            the LiteLLM path.
        model: Model name passed to the selected backend.
        client: ``track_openai(openai.OpenAI())`` for the OpenAI path,
            otherwise ``None``.
    """

    def __init__(self, client_type: str = "openai", model: str = "gpt-4"):
        self.client_type = client_type
        self.model = model

        if self.client_type == "openai":
            self.client = track_openai(openai.OpenAI())
        else:
            # The LiteLLM path calls the module-level litellm API directly.
            self.client = None

    @staticmethod
    def _build_messages(query: str, system: str) -> list:
        """Return the two-message chat payload (system prompt, then user query)."""
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": query},
        ]

    # LiteLLM query function - use **kwargs to pass arguments like temperature
    def _get_litellm_response(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
        """Send the query through ``litellm.completion`` and return the reply text.

        Extra keyword arguments are forwarded to ``litellm.completion``.
        """
        response = litellm.completion(
            model=self.model,
            messages=self._build_messages(query, system),
            **kwargs
        )

        return response.choices[0].message.content

    # OpenAI query function - use **kwargs to pass arguments like temperature
    def _get_openai_response(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
        """Send the query through the OpenAI client and return the reply text.

        Extra keyword arguments are forwarded to ``chat.completions.create``.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=self._build_messages(query, system),
            **kwargs
        )

        return response.choices[0].message.content

    def query(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
        """Answer ``query`` with the configured backend.

        Args:
            query: The user message.
            system: The system prompt.
            **kwargs: Extra completion arguments; forwarded to whichever
                backend is selected (previously dropped on the LiteLLM path).

        Returns:
            The message content of the first choice in the response.
        """
        if self.client_type == 'openai':
            return self._get_openai_response(query, system, **kwargs)

        return self._get_litellm_response(query, system, **kwargs)





In [None]:
# Build the client with the default client_type ("openai") and the current MODEL.
# NOTE(review): an earlier LiteLLM cell reassigns MODEL to a Hugging Face model
# id; make sure MODEL holds an OpenAI model name before running this cell, or
# pass client_type explicitly — confirm intended usage.
llm_client = LLMClient(model=MODEL)

In [None]:
@track
def chatbot_application(input: str) -> str:
    """Answer a question about the menu through ``llm_client``.

    Args:
        input: The user's question, substituted into ``prompt_template``
            together with ``menu_items``.

    Returns:
        Whatever ``llm_client.query`` returns for the formatted prompt.
    """
    prompt = prompt_template.format(menu=menu_items, question=input)
    return llm_client.query(prompt)



# Evaluation

In [None]:
# Define the evaluation task
def evaluation_task(x):
    """Run the chatbot on one dataset item and build the record to be scored.

    Args:
        x: A dataset item; its 'question' and 'response' keys are read.

    Returns:
        A dict with the question as "input", the chatbot answer as "output",
        the menu text as "context", and the item's 'response' as "reference".
    """
    question = x['question']
    answer = chatbot_application(question)
    return dict(
        input=question,
        output=answer,
        context=menu_items,
        reference=x['response'],
    )


In [None]:
#dataset = client.get_or_create_dataset(name="foodchatbot_eval")

In [None]:
# Scoring metrics applied to the output of every evaluation task.
metrics = [IsJson()]

# Experiment name: "<model>_<dataset name>_<timestamp>" so each run is distinct.
experiment_name = "_".join(
    [MODEL, dataset.name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")]
)

# Run the evaluation over the dataset and record the model in the config.
evaluation = evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_name=experiment_name,
    experiment_config={"model": MODEL},
)