<img src="https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg" width="250"/>

# LLM-Based Evaluation with Opik

In this exercise, you'll be evaluationg LLM applications with LLM-as-a-judge metrics. You can use OpenAI or open source models via LiteLLM. To make the exercise a little more exciting, you'll be running your evaluations using HaluBench, the popular hallucination dataset.

# Imports & Configuration

In [1]:
%pip install opik openai comet_ml litellm --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/303.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.5/303.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m710.6/710.6 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m980.3/980.3 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import opik
from opik import Opik, track
from opik.evaluation import evaluate
from opik.evaluation.metrics import (Hallucination, AnswerRelevance)
from opik.integrations.openai import track_openai
import openai
import os
from datetime import datetime
from getpass import getpass
import litellm
from litellm.integrations.opik.opik import OpikLogger
from opik.opik_context import get_current_span_data

opik_logger = OpikLogger()
# In order to log LiteLLM traces to Opik, you will need to set the Opik callback
litellm.callbacks = [opik_logger]

# Define project name to enable tracing
os.environ["OPIK_PROJECT_NAME"] = "llm-based-eval"

* 'fields' has been removed


In [3]:
# opik configs
if "OPIK_API_KEY" not in os.environ:
    os.environ["OPIK_API_KEY"] = getpass("Enter your Opik API key: ")

opik.configure()

Enter your Opik API key: ··········
Do you want to use "bluemusk" workspace? (Y/n)y


OPIK: Configuration saved to file: /root/.opik.config


In [5]:
# Hugging Face Configs to access meta-llama-3.2 model
if "HF_TOKEN" not in os.environ:
  os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face Key: ")

Enter your Hugging Face Key: ··········


In [None]:
# OpenAI configuration (ignore if you're using LiteLLM)
#if "OPENAI_API_KEY" not in os.environ:
#    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")
# MODEL = "gpt-4o-mini"

In [4]:
# Opik client and Model
MODEL = "huggingface/meta-llama/Llama-3.2-1B-Instruct"
client = Opik()

# Prompts & Templates

In [6]:
prompt_template = """Use the following context to answer my question:

### CONTEXT:
{context}

### QUESTION:
{question}
"""

# Dataset

In [7]:
# Create dataset
dataset = client.get_or_create_dataset(
    name="HaluBench", description="HaluBench dataset"
)

OPIK: Created a "HaluBench" dataset at https://www.comet.com/opik/bluemusk/redirect/datasets?name=HaluBench.


In [8]:
import pandas as pd

df = pd.read_parquet(
    "hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
df.head()

Unnamed: 0,id,passage,question,answer,label,source_ds
0,d3fb4c3c-d21b-480a-baa0-98d6d0d17c1d,Hoping to rebound from the road loss to the Ch...,Which team scored the longest field goal kick ...,"['Rams', 'second', 'Marc Bulger', 'Kevin Curtis']",FAIL,DROP
1,8603663e-c53b-46db-a482-a867f12ff3b4,"As of the census of 2000, there were 218,590 p...",How many percent were not Irish?,87.1,FAIL,DROP
2,c63a73e5-2c91-489b-bd24-af150ddfa82c,Hoping to rebound from the road loss to the Ch...,How many yards was the second longest field go...,42,FAIL,DROP
3,52db14ed-5426-46ec-b0ae-4ef843b2d692,Hoping to rebound from their tough overtime ro...,How long was the last touchdown?,18-yard,FAIL,DROP
4,31b36417-aad1-412c-b0e5-9c1faaed233f,"As of the census of 2000, there were 218,590 p...",How many in percent from the census weren't Ir...,87.1,FAIL,DROP


In [11]:
cleaned_ds = df.drop(['answer', 'label', 'source_ds', 'id'], axis=1).iloc[0:100]
cleaned_ds.head()

Unnamed: 0,passage,question
0,Hoping to rebound from the road loss to the Ch...,Which team scored the longest field goal kick ...
1,"As of the census of 2000, there were 218,590 p...",How many percent were not Irish?
2,Hoping to rebound from the road loss to the Ch...,How many yards was the second longest field go...
3,Hoping to rebound from their tough overtime ro...,How long was the last touchdown?
4,"As of the census of 2000, there were 218,590 p...",How many in percent from the census weren't Ir...


In [12]:
dataset.insert(cleaned_ds.to_dict('records'))

In [13]:
# read the above dataset
dataset.to_pandas().head()

Unnamed: 0,passage,question,id
0,"Trying to snap a two-game skid, the Bills flew...",How many games had the Bills won before this g...,01944236-a6fe-7de8-a55b-0993b307fbf4
1,1564: The city of Ryazan posad was burned.:47 ...,What was burned first: city of Ryazan or subur...,01944236-a6fd-76bd-ba40-badafa549570
2,"As of the census of 2000, there were 218,590 p...",How many percent were not Italian?,01944236-a6fc-7468-b0af-fd3aa8c87132
3,"As of the census of 2000, there were 218,590 p...",Which group from the census is smaller: German...,01944236-a6fb-7404-98fa-4af3ae6390f2
4,"In week 6, the Lions hosted the NFC West Divis...",How many field goals between 20 and 30 yards w...,01944236-a6fa-715e-af4b-1131599ce150


# LLM Application

In [14]:
# Experiment function call of llama3 using litellm
@track
def chatbot_application(question: str, context: str) -> str:
    response = litellm.completion(
        model=MODEL,
        messages=[
            {"role":"system", "content":"You are a helpful assistant."},
            {"role":"user", "content":prompt_template.format(context=context, question=question)}
        ]
    )
    return response.choices[0].message.content

* below codes is for OpenAI and Lite LLM combo

In [None]:
# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)
#class LLMClient:
#  def __init__(self, client_type: str ="openai", model: str ="gpt-4"):
#    self.client_type = client_type
#    self.model = model

#    if self.client_type == "openai":
#      self.client = track_openai(openai.OpenAI())

#    else:
#      self.client = None

  # LiteLLM query function
#  def _get_litellm_response(self, query: str, system: str = "You are a helpful assistant."):
#    messages = [
#        {"role": "system", "content": system },
#        { "role": "user", "content": query }
#    ]

#    response = litellm.completion(
#        model=self.model,
#        messages=messages
#    )

#    return response.choices[0].message.content

  # OpenAI query function - use **kwargs to pass arguments like temperature
#  def _get_openai_response(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
#    messages = [
#        {"role": "system", "content": system },
#        { "role": "user", "content": query }
#    ]

#    response = self.client.chat.completions.create(
#        model=self.model,
#        messages=messages,
#        **kwargs
#    )

#    return response.choices[0].message.content


#  def query(self, query: str, system: str = "You are a helpful assistant.", **kwargs):
#    if self.client_type == 'openai':
#      return self._get_openai_response(query, system, **kwargs)

#    else:
#      return self._get_litellm_response(query, system)


# llm_client = LLMClient(model=MODEL)


@track
#def chatbot_application(question: str, context: str) -> str:
#    response = llm_client.query(prompt_template.format(context=context, question=question))
#    return response


# Evaluation

In [15]:
# Define the evaluation task
def evaluation_task(x):
    return {
        "input": x['question'],
        "output": chatbot_application(x['question'], x['passage']),
        "context": x['passage']
    }


In [16]:
# Retrieve the dataset
client = Opik()

In [17]:
# Define the metrics
metrics = [Hallucination(), AnswerRelevance()]

# experiment_name
experiment_name = MODEL + "_" + dataset.name + "_" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# run evaluation
evaluation = evaluate(
    experiment_name=experiment_name,
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_config={
        "model": MODEL
    }
)

Evaluation:   0%|          | 0/96 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/litellm_core_utils/exception_mapping_utils.py", line 355, in exception_type
    raise AuthenticationError(
litellm.exceptions.AuthenticationError: litellm.AuthenticationError: AuthenticationError: OpenAIException - The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
[92m19:36:44 - LiteLLM:ERROR[0m: opik.py:111 - OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom

[92m19:36:47 - LiteLLM:ERROR[0m: opik.py:111 - OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 528, in post
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 509, in post
    response.raise_for_status()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_models.py", line 763, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/traces/batch'
Fo

ERROR:LiteLLM:OpikLogger failed to send batch - Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/spans/batch'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/litellm/integrations/opik/opik.py", line 102, in _sync_send
    response = self.sync_httpx_client.post(
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 528, in post
    raise e
  File "/usr/local/lib/python3.10/dist-packages/litellm/llms/custom_httpx/http_handler.py", line 509, in post
    response.raise_for_status()
  File "/usr/local/lib/python3.10/dist-packages/httpx/_models.py", line 763, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://www.comet.com/opik/api/v1/private/spans/batch'
For more information check: https://dev

In [18]:
# debugging why above experiment failed
import httpx

response = httpx.post(
    "https://www.comet.com/opik/api/v1/private/traces/batch",
    headers={"Authorization": "Bearer YOUR_API_KEY"},
    json={"your_payload": "data"}
)
print(response.status_code, response.text)


403 {"code":403,"message":"User not allowed to access workspace"}


In [19]:
import httpx

api_key = os.environ["OPIK_API_KEY"]
payload = {"your_payload": "data"}

print("API Key:", api_key)
print("Payload:", payload)

response = httpx.post(
    "https://www.comet.com/opik/api/v1/private/traces/batch",
    headers={"Authorization": f"Bearer {api_key}"},
    json=payload
)
print("Response Status Code:", response.status_code)
print("Response Text:", response.text)


API Key: wkP6mtUG4vmMZp8yU4uJSXQh6
Payload: {'your_payload': 'data'}
Response Status Code: 403
Response Text: {"code":403,"message":"User not allowed to access workspace"}
