In [1]:
import json
import os

from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
# Load environment variables from the project-level .env file one directory up.
load_dotenv(dotenv_path="../.env")

# Read the key once; later cells reuse `open_api_key` for their curl requests.
open_api_key = os.environ.get("OPENAI_API_KEY")

# Keyword arguments for the SDK client, kept in a dict so they are easy to inspect or extend.
openAI_params = dict(api_key=open_api_key)
client = OpenAI(**openAI_params)

#### Evaluating Model Performance
Evaluations (often called **evals**) test model outputs to ensure they meet the style and content criteria that you specify. Writing evals to understand how your LLM applications are performing against your expectations, especially when upgrading or trying new models, is an essential component of building reliable applications.

Broadly, there are three steps to build and run evals for your LLM applications.
1. Describe the task to be done as an eval.
2. Run your eval with test inputs (a prompt and input data)
3. Analyze the results, then iterate and improve on your prompt.

#### Create an Eval for a task
Creating an eval begins by describing a task to be done by a model. <br>
Let's say that we would like to use a model to classify the contents of IT support tickets into one of the three categories: <br>
`Hardware`, `Software`, `Other` <br>

In [2]:
# Developer prompt: constrains the model to answer with a single category word.
instructions = """
You are an expert in categorizing IT support tickets. Given the support 
ticket below, categorize the request into one of "Hardware", "Software", 
or "Other". Respond with only one of those words.
"""

# Sample ticket used to sanity-check the prompt before building an eval around it.
ticket = "My monitor won't turn on - help!"

# The developer message carries the instructions; the user message carries the ticket text.
chat_messages = [
    dict(role="developer", content=instructions),
    dict(role="user", content=ticket),
]
completion = client.chat.completions.create(model="gpt-4.1", messages=chat_messages)

# Print just the text of the first returned choice.
first_choice = completion.choices[0]
print(first_choice.message.content)


Hardware


Let's set up an eval to test this behaviour. An Eval needs two key ingredients:
1. A schema for the test data you will use along with the eval (`data_source_config`)
2. The criteria that determine if the model output is correct (`testing_criteria`)

##### Explanation of `data_source_config`
Running this eval will require a test data set that represents the type of data you expect your prompt to work with.

In the `data_source_config` parameter, we specify that each **item** in the data set will conform to a JSON Schema with two properties.

1. `ticket_text`: a string of text with the contents of a support ticket
2. `correct_label`: a "ground truth" output that the model should match, provided by a human.

Since we will be referencing a **sample** in our test criteria (the output generated by the model given our prompt), we also set `include_sample_schema` to `true`.

##### Explanation of `testing_criteria`
In our `testing_criteria`, we define how we will conclude whether the model output satisfies our requirement for each item in the data set. <br>

In this case, we just want the model to output one of three category strings based on the input ticket. The string it outputs should exactly match the human-labelled `correct_label` field in our test data. So in this case, we want to use a `string_check` grader to evaluate the output.

In the test configuration, we will introduce template syntax, represented by the {{ and }} brackets below. This is how we will insert dynamic content into the test for this eval.

- {{ item.correct_label }} refers to the ground truth value in our test data.
- {{ sample.output_text }} refers to the content we will generate from a model to evaluate our prompt - we'll show how to do that when we actually kick off the eval run.

In [14]:
OPENAI_API_KEY = open_api_key
payload = '''
{
    "name": "IT Ticket Categorization",
    "data_source_config": {
        "type": "custom",
        "item_schema": {
            "type": "object",
            "properties": {
                "ticket_text": { "type": "string" },
                "correct_label": { "type": "string" }
            },
            "required": ["ticket_text", "correct_label"]
        },
        "include_sample_schema": true
    },
    "testing_criteria": [
        {
            "type": "string_check",
            "name": "Match output to human label",
            "input": "{{ sample.output_text }}",
            "operation": "eq",
            "reference": "{{ item.correct_label }}"
        }
    ]
}
'''

!curl https://api.openai.com/v1/evals \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -H "Content-Type: application/json" \
    -d '{payload}'

{
  "object": "eval",
  "id": "eval_6806679cddc88190bcc3f0af70f51954",
  "data_source_config": {
    "type": "custom",
    "schema": {
      "type": "object",
      "properties": {
        "item": {
          "type": "object",
          "properties": {
            "ticket_text": {
              "type": "string"
            },
            "correct_label": {
              "type": "string"
            }
          },
          "required": [
            "ticket_text",
            "correct_label"
          ]
        },
        "sample": {
          "type": "object",
          "properties": {
            "model": {
              "type": "string"
            },
            "choices": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "message": {
                    "type": "object",
                    "properties": {
                      "role": {
                        "type": "string",
            

##### Upload the Data to the OpenAI Platform

In [5]:
# Upload the JSONL test data for use with evals.
# The context manager closes the local file handle once the upload call returns;
# the original left the handle open for the lifetime of the kernel.
with open("tickets.jsonl", "rb") as tickets_fh:
    file = client.files.create(
        file=tickets_fh,
        purpose="evals"
    )

print(file)

FileObject(id='file-Sewn6QzobSEu8PN2uVN95t', bytes=268, created_at=1745243859, filename='tickets.jsonl', object='file', purpose='evals', status='processed', expires_at=None, status_details=None)


##### Creating an eval Run

In [15]:
import json

# Replace these with your actual values
OPENAI_API_KEY = open_api_key
EVAL_ID = "eval_6806679cddc88190bcc3f0af70f51954"
FILE_ID = "file-Sewn6QzobSEu8PN2uVN95t"

# Create the payload as a Python dictionary
payload = {
    "name": "Categorization text run",
    "data_source": {
        "type": "completions",
        "model": "gpt-4o",
        "input_messages": {
            "type": "template",
            "template": [
                {
                    "role": "developer",
                    "content": "You are an expert in categorizing IT support tickets. Given the support ticket below, categorize the request into one of \"Hardware\", \"Software\", or \"Other\". Respond with only one of those words."
                },
                {
                    "role": "user",
                    "content": "{{ item.ticket_text }}"
                }
            ],
        },
        "source": {
            "type": "file_id",
            "id": FILE_ID
        }
    }
}

# Write JSON to a temporary file
with open("payload.json", "w") as f:
    json.dump(payload, f)

# Use curl to send the request
!curl https://api.openai.com/v1/evals/{EVAL_ID}/runs \
    -H "Authorization: Bearer {OPENAI_API_KEY}" \
    -H "Content-Type: application/json" \
    -d @payload.json

{
  "object": "eval.run",
  "id": "evalrun_680667a50908819088339c61e0255831",
  "eval_id": "eval_6806679cddc88190bcc3f0af70f51954",
  "report_url": "https://platform.openai.com/evaluations/eval_6806679cddc88190bcc3f0af70f51954?project_id=proj_ReG0HO1UluDmyfH07rDrLIO1&run_id=evalrun_680667a50908819088339c61e0255831",
  "status": "queued",
  "model": "gpt-4o",
  "name": "Categorization text run",
  "created_at": 1745250213,
  "result_counts": {
    "total": 0,
    "errored": 0,
    "failed": 0,
    "passed": 0
  },
  "per_model_usage": null,
  "per_testing_criteria_results": null,
  "data_source": {
    "type": "completions",
    "source": {
      "type": "file_id",
      "id": "file-Sewn6QzobSEu8PN2uVN95t"
    },
    "input_messages": {
      "type": "template",
      "template": [
        {
          "type": "message",
          "role": "developer",
          "content": {
            "type": "input_text",
            "text": "You are an expert in categorizing IT support tickets. Given 

#### Analyze the results

In [16]:
OPENAI_API_KEY = open_api_key

### Get the status of your run
!curl https://api.openai.com/v1/evals/eval_68064ec8b2048190b2df5ddc0a823fae/runs/evalrun_68064ee45f648190a3c6cb11c5c20fa2 \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -H "Content-Type: application/json"

{
  "object": "eval.run",
  "id": "evalrun_68064ee45f648190a3c6cb11c5c20fa2",
  "eval_id": "eval_68064ec8b2048190b2df5ddc0a823fae",
  "report_url": "https://platform.openai.com/evaluations/eval_68064ec8b2048190b2df5ddc0a823fae?project_id=proj_ReG0HO1UluDmyfH07rDrLIO1&run_id=evalrun_68064ee45f648190a3c6cb11c5c20fa2",
  "status": "completed",
  "model": "gpt-4o",
  "name": "Categorization text run",
  "created_at": 1745243876,
  "result_counts": {
    "total": 3,
    "errored": 0,
    "failed": 0,
    "passed": 3
  },
  "per_model_usage": [
    {
      "model_name": "gpt-4o-2024-08-06",
      "invocation_count": 3,
      "prompt_tokens": 175,
      "completion_tokens": 6,
      "total_tokens": 181,
      "cached_tokens": 0
    }
  ],
  "per_testing_criteria_results": [
    {
      "testing_criteria": "Match output to human label-2afa301c-c45c-4581-bb56-6b4fff095203",
      "passed": 3,
      "failed": 0
    }
  ],
  "data_source": {
    "type": "completions",
    "source": {
      "type"