# LLMs in production - Evals Workshop - by Weights & Biases
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/altryne/llm-evals-workshop/blob/main/eval.ipynb) [![Weights & Biases](https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-gradient.svg)](https://wandb.me/weave-workshop-jan)




# Intro
This notebook accompanies a workshop that walks you through common patterns in building evaluations for LLMs, and useful rules of thumb to follow when doing so using [W&B Weave](https://wandb.me/weave-workshop-jan).

We'll explore the following methodology for productizing robust LLM applications: 

![three](https://gist.github.com/user-attachments/assets/0d51de65-8ec7-4cc5-a102-5a13229f5531)


Make sure to set your WANDB_API_KEY (get your key from [here](https://wandb.ai/authorize)) and OPENAI_API_KEY or GEMINI_API_KEY in the environment variables.

If you're running in Colab, set the variables in the keys section on the left. 

If you want to explore on your own, find the `TODO:` comments, replace them with your own code, then run the cell.

Prepared by [Alex Volkov](https://twitter.com/altryne)

In [None]:
# Install and read in required packages
try:
    import google.colab
    !git clone -q --branch main https://github.com/altryne/llm-evals-workshop
    %cd llm-evals-workshop
except ImportError:
    pass

print('⏳ Installing packages')
%pip install -q uv #TODO: alex figure this out
!uv pip install -q --system weave gradio set-env-colab-kaggle-dotenv tqdm ipywidgets requests openai google-generativeai pillow
print('✅ Packages installed')

In [7]:
%%capture
%load_ext gradio

import gradio as gr
from PIL import Image
import requests 
import io
from set_env import set_env
import json
from jinja2 import Environment, FileSystemLoader
from datetime import datetime
import random
import os
from openai import OpenAI
from dotenv import load_dotenv
import weave
from weave.flow.annotation_spec import AnnotationSpec

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()
# NOTE(review): set_env presumably resolves each key from Colab/Kaggle secrets
# or the .env file and exports it to os.environ — confirm against the
# set-env-colab-kaggle-dotenv package docs.
set_env("WANDB_API_KEY")
set_env("OPENAI_API_KEY")
set_env("GEMINI_API_KEY")
set_env("OPENROUTER_API_KEY")

# initialize weave
# The returned client is kept as `weave_api`; later cells use it to look up
# calls (get_call), create feedback and query calls (weave_api.server.*).
weave_api = weave.init('jan-evals-workshop')

# initialize annotations for this project
# Categorical label chosen by the human labeler. The published name
# "doomer_or_boomer" is the suffix used in the feedback type
# "wandb.annotation.doomer_or_boomer" when feedback is created and queried.
annotation = weave.publish(AnnotationSpec(
    name="Doomer or Boomer",
    description="Doomer or Boomer or Neither",
    field_schema={ "type": "string", "enum": ["Doomer", "Boomer", "Neither"],},
), "doomer_or_boomer")

# Free-text rationale for the label; referenced later as
# "wandb.annotation.reason".
annotation_reason = weave.publish(AnnotationSpec(
    name="Reason",
    description="Reason why you chose this value, write before clicking.",
    field_schema={ "type": "string"},
), "reason")

In [21]:
# Initialize our LLM client. Gemini and OpenRouter are both reached through the
# OpenAI SDK by pointing `base_url` at their OpenAI-compatible endpoints, so the
# rest of the notebook only ever talks to `client` / `model`.
API_PROVIDER = 'OpenRouter' # @param ["Gemini", "OpenAI", "OpenRouter"]
if API_PROVIDER == 'Gemini':
    client = OpenAI(
        api_key=os.getenv("GEMINI_API_KEY"),
        base_url="https://generativelanguage.googleapis.com/v1beta/",
    )
    model = "gemini-2.0-flash-exp"
elif API_PROVIDER == 'OpenRouter':
    client = OpenAI(
        api_key=os.getenv("OPENROUTER_API_KEY"),
        base_url="https://openrouter.ai/api/v1",
    )
    model = "openai/chatgpt-4o-latest"
    # model = "google/gemini-flash-1.5-exp"
    # model = "deepseek/deepseek-chat"
else:
    # Any other value falls back to OpenAI; OpenAI() is given no key here and
    # is expected to pick up OPENAI_API_KEY from the environment.
    client = OpenAI()
    # Same model as the OpenRouter branch, without the "openai/" vendor prefix
    # (was misspelled "chatgpt-4o-lates", which is not a valid model id).
    model = "chatgpt-4o-latest"

# Load the Jinja2 environment
env = Environment(loader=FileSystemLoader('templates'))
template = env.get_template('post.html.jinja')

# Load replies data
def load_replies(paths=("data/replies_alpin.json", "data/replies_daniel.json")):
    """Load and concatenate the replies stored in the thread dump files.

    Args:
        paths: JSON files to read, in order. Each file must contain an object
            with the list of replies under ``["thread"]["replies"]``.
            Defaults to the two dumps shipped with the workshop.

    Returns:
        One flat list holding the replies of every file, in file order.

    Raises:
        FileNotFoundError: if one of the files does not exist.
        KeyError: if a file lacks the ``thread`` / ``replies`` keys.
        json.JSONDecodeError: if a file is not valid JSON.
    """
    replies = []
    for path in paths:
        # Posts and display names may contain emoji and other non-ASCII text;
        # decode as UTF-8 explicitly instead of relying on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            replies.extend(json.load(f)['thread']['replies'])
    return replies


def _fetch_avatar(avatar_url, timeout=10):
    """Download an avatar and return it as a PIL image, or None if unavailable."""
    if not avatar_url:
        return None
    try:
        # Without a timeout a stalled CDN request would hang the UI handler.
        response = requests.get(avatar_url, timeout=timeout)
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    except (requests.RequestException, OSError) as err:
        # PIL raises OSError subclasses for unreadable images. The avatar is
        # optional context, so a failed download must not block the analysis.
        print(f"Could not load avatar {avatar_url}: {err}")
        return None


def get_random_post_and_analyze():
    """Pick a random reply, analyze it with the LLM and render it as HTML.

    Returns:
        A 4-tuple ``(post_html, analysis, weave_call_id, reason)``:
        the rendered post template, the LLM analysis text, the id of the Weave
        call that produced the analysis (``None`` when untraced), and an empty
        string used to clear the reason textbox for the next post.
    """
    post = random.choice(load_replies())['post']
    author = post['author']
    record = post['record']

    # Format the post data for the template.
    # fromisoformat() only accepts a trailing 'Z' from Python 3.11 on, so the
    # UTC offset is spelled out explicitly.
    created_at = datetime.fromisoformat(record['createdAt'].replace('Z', '+00:00'))
    formatted_date = created_at.strftime('%b %d, %Y, %I:%M %p')

    # Convert AT URI to bsky.app URL: the post id is the last path segment.
    post_id = post['uri'].rsplit('/', 1)[-1]
    post_url = f"https://bsky.app/profile/{author['handle']}/post/{post_id}"

    # Analyze the post. Avatar and display name are optional on a profile, so
    # neither is allowed to raise KeyError here.
    avatar_pil = _fetch_avatar(author.get('avatar'))
    analysis, weave_call_id = analyze_post_sentiment(
        avatar_pil, author.get('displayName', ''), record['text']
    )

    post_data = {
        'author': author,
        'created_at': formatted_date,
        'text': record['text'],
        'like_count': post.get('likeCount', 0),
        'repost_count': post.get('repostCount', 0),
        'has_image': False,
        'post_url': post_url
    }

    return template.render(**post_data), analysis, weave_call_id, ''


def submit_feedback(user_selection, reason, weave_call_id):
    """Attach the labeler's annotation to a traced call, then load the next post.

    Args:
        user_selection: Label picked by the labeler ("Doomer", "Boomer" or
            "Neither"); stored under the ``doomer_or_boomer`` annotation.
        reason: Optional free-text rationale; only stored when non-empty.
        weave_call_id: Id of the Weave call that produced the analysis shown to
            the labeler. May be ``None`` when the analysis was not traced.

    Returns:
        The tuple returned by ``get_random_post_and_analyze()`` so the UI moves
        on to a fresh post.
    """
    if not weave_call_id:
        # analyze_post_sentiment returns None as the id when there is no
        # current Weave call; there is nothing to attach feedback to, and
        # get_call(None) would fail inside the button handler.
        print("No Weave call id for this post - feedback was not recorded.")
        return get_random_post_and_analyze()

    call = weave_api.get_call(weave_call_id)
    project_id = weave_api._project_id()
    call_ref_uri = call.ref.uri()

    def _annotate(feedback_type, annotation_ref, value):
        # Both annotations share the same request shape; only these three vary.
        weave_api.server.feedback_create(
            {
                "project_id": project_id,
                "weave_ref": call_ref_uri,
                "feedback_type": feedback_type,
                "annotation_ref": annotation_ref.uri(),
                "payload": {"value": value},
            }
        )

    if reason:
        print("reason", reason)
        _annotate("wandb.annotation.reason", annotation_reason, reason)

    _annotate("wandb.annotation.doomer_or_boomer", annotation, user_selection)

    # Ready to analyze the next post
    return get_random_post_and_analyze()


# 1. Tracing LLM calls with Weave

#### Why Tracing is Important for LLM Application Reliability

In building reliable LLM-based applications, having a clear view into
how your system behaves is crucial. That’s where “tracing” comes in.

1. **Detailed Interaction Records**:
   Tracing captures all the inputs, prompts, responses, and any user feedback.
   By preserving this detailed record, you always have the context needed to
   debug unexpected or incorrect results.

2. **Rapid Issue Diagnosis**:
   With thorough traces, you can pinpoint issues faster—often without
   needing direct access to remote systems. Simply reviewing the logs can
   reveal how a certain response was triggered.

3. **Collaboration and Sharing**:
   Traces can be shared with both technical and non-technical stakeholders.
   This not only streamlines collaboration but also ensures everyone is
   working off the same “source of truth” when investigating bugs
   or brainstorming improvements.

4. **Outlier Spotting and Performance Tuning**:
   By tracking calls at scale, you can detect when responses deviate
   dramatically from the norm, troubleshoot any failures, and identify
   potential performance bottlenecks.

5. **Facilitates Product Evolution**:
   As you enhance or expand your LLM application, comprehensive
   tracing data helps you make more informed decisions about what to
   improve, remove, or refine.

With W&B Weave, comprehensive tracing is just 1 line of code, and offers features such as:
- Syntax highlighting specific to your use-case (Markdown, JSON, etc.)
- Ability to share links with other members of your team
- Ability to filter traces by function name, input, output, etc.

If you need to trace existing code, you can use the `@weave.op` decorator to trace the function.  

![CleanShot 2024-04-08 at 14 15 40@2x](https://gist.github.com/assets/463317/4e9ada49-572f-47d9-91e1-55ab72b2a476)

In [None]:
#TODO: Add tracing to this function - then see how this function is traced in the Weave UI

def analyze_post_sentiment(avatar, displayName, text):
    """Ask the LLM whether a post's author is a Doomer, a Boomer or Neither.

    Args:
        avatar: The author's avatar; accepted for tracing/context but not used
            in the prompt below.
        displayName: The author's display name, included in the prompt.
        text: The post text to classify.

    Returns:
        A tuple ``(analysis, weave_call_id)``: the raw LLM answer and the id of
        the current Weave call, or ``None`` for the id when this function is
        not running inside a traced call.
    """
    # Prompt for OpenAI to analyze the sentiment
    prompt = f"""
    Analyze the following Bluesky post and determine if the author is a [Doomer, Boomer, or Neither]. 
    Be concise and to the point. Answer with just one word (DOOMER, BOOMER, or NEITHER) followed by a brief explanation.
    \n\n {displayName}: "{text}"
    """

    # TODO: Add some more context to the prompt
    # prompt = f"""Analyze the following Bluesky post and determine if the author is a:
    # - DOOMER (someone who hates AI and uses derogatory language)
    # - BOOMER (someone who doesn't understand AI and asks to remove their data)
    # - NEITHER (neutral or positive response)
    
    # Post: {displayName}: "{text}"
    
    # Respond with just one word (DOOMER, BOOMER, or NEITHER) followed by a brief explanation.
    # """
    
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5
    )
    try:
        current_call = weave.require_current_call()
        weave_call_id = current_call.id
    except Exception:
        # require_current_call() raises when the function is not traced.
        # Catch Exception rather than using a bare `except`, so Ctrl-C and
        # SystemExit are not swallowed along with it.
        weave_call_id = None
    
    return response.choices[0].message.content, weave_call_id

# Lets test this out without tracing first
analysis, weave_call_id = analyze_post_sentiment("","Alex","I hate AI")

print(analysis, weave_call_id)

We can see that even without `@weave.op`, since Weave is initialized, it will still trace the LLM call made inside the function and store it in the Weave project, as it automatically detects that we use the OpenAI client. However, if we add `@weave.op`, we can get even more detail and instrument our existing code with Weave.



# 2. User Feedback & Annotations

Collecting user feedback is a crucial way to improve your LLM applications. There's a reason that every chatbot you use has 👍/👎 and a text box to leave feedback. This is one of the best ways for those labs to understand and improve their models and align them to user preferences.

![text](https://cln.sh/JGMBxMtH+)

Users don't have to be external as well, as you develop your application, marking traces as "good" or "bad", and adding why, is a great way to kick start your initial evaluation dataset with working and non-working examples. 

Additionally, after logging hundreds of thousands of traces, they will all start looking the same, so additional context, like your users' feedback, will greatly improve your ability to look at your data and find the outliers.

Weave supports collecting user Feedback in the UI and also via the API so you can collect it from your users and also leave it yourself while looking at your data. 

![text](https://cln.sh/X6fFHD8t+)

Read more about feedback [here](https://weave-docs.wandb.ai/guides/tracking/feedback)




# 2.1 Doomer or Boomer App - Annotations by example

Unlike user feedback, annotations are a more structured way to classify responses, to help create a dataset of golden answers and reasons or rationales for those answers. All of the major companies use Scale.ai for this and pay them a LOT of money, but you don't have to right away; you can start small, by yourself or with your team.

Let's see how we can kickstart a simple dataset of annotations by a practical example.

![image](https://gist.github.com/user-attachments/assets/a8537545-e070-4c8e-9988-2a8a905b9d2c)

To simulate a real world scenario, we'll build a simple app that will allow you to annotate a few posts. 

In our case, we're pretending to work at a company that's trying to build an AI classifier for Bluesky posts. We're humans that work in the company and are helping it to align and finetune models for AI moderation. 

We've compiled replies from BlueSky users on 2 posts that collected publicly available data from BlueSky to train AI models (BlueSky data is public), which led to a lot of hate from users on BlueSky.

We're going to build a simple app that will use an LLM to classify the replies into 3 categories: `Doomer`, `Boomer`, or `Neither`. 

`Doomer`: Someone who hates AI, and uses derogatory language towards the author of the post because of their hate for AI and their data being used for AI  
`Boomer`: Someone who doesn't understand AI, and copy-pastes a request to remove their data from the dataset  
`Neither`: Folks who reply neutral or positive to the post.

At first, our LLM will not have enough context about the task, so it won't be able to reliably classify the replies. A human is needed to annotate with additional context, and you are that human.

Launch the app and go through a few posts, annotating each with the correct classification and a reason for your choice. We'll later use this data to train an LLM to classify the replies.

In [32]:
# %%blocks
# Create a Gradio Blocks app
# NOTE(review): presumably stops Weave from printing a link for every traced
# call while the app is running — confirm against the Weave settings docs.
os.environ['WEAVE_PRINT_CALL_LINK'] = 'false'
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Add a title and description
    gr.Markdown("""
    # 🦋 Doomer or Boomer
    Our AI analyzes bluesky replies and posts to determine if the author is a doomer or a boomer.  
    Source of data: Replies to a post by a BlueSky user that compiled a dataset of posts, which went viral and generated a lot of hate on BlueSky.  
    These are replies and comments on 2 posts that collected a dataset of posts of BlueSky users to train AI models (BlueSky data is public)
    """)
    
    with gr.Row():
        # Left column: the rendered post, the skip button and labeling guidance.
        with gr.Column(scale=2):
            post_html = gr.HTML()
            next_post_btn = gr.Button("Skip Post & Analyze Another", variant="primary")
            gr.Markdown(f"""
            #### Instructions for labeler: 
            `Doomer`: Someone who hates AI, and uses derogatory language towards the author of the post because of thier hate for AI and their data being used for AI  
            `Boomer`: Someone who doesn't understand AI, and copy-pastes a request to remove their data from the dataset  
            `Neither`: Folks who reply neutral or positive to the post.
            
            See your Weave project & traces [here](https://wandb.ai/{weave_api._project_id()})
            """)
        
        # Right column: the LLM's analysis plus the human annotation controls.
        with gr.Column(scale=1):
            analysis_output = gr.Textbox(
                label="Analysis Results",
                placeholder="Analysis will appear here...",
                lines=4
            )
            # Holds the Weave call id of the post currently shown, so the label
            # buttons can pass it to submit_feedback.
            weave_call_id_state = gr.State()
            
            # Replace dropdown with three buttons
            reason_input = gr.Textbox(label="Add reason and click",placeholder="Reason why you chose this value, write before clicking.", lines=2)
            with gr.Row():
                doomer_btn = gr.Button("Doomer 😡", variant="huggingface")
                boomer_btn = gr.Button("Boomer 👵", variant="primary")
                neither_btn = gr.Button("Neither 🤷")

            
    # Set up event handler for combined next/analyze
    # The outputs list matches, in order, the 4-tuple returned by
    # get_random_post_and_analyze (html, analysis, call id, cleared reason).
    next_post_btn.click(fn=get_random_post_and_analyze, outputs=[post_html, analysis_output, weave_call_id_state, reason_input])
    
    # Each label button calls submit_feedback with its own constant label
    # (wrapped in gr.State) as `user_selection`, followed by the typed reason
    # and the current call id; submit_feedback returns the next post's values.
    doomer_btn.click(
    fn=submit_feedback,
    inputs=[gr.State("Doomer"), reason_input, weave_call_id_state],
    outputs=[post_html, analysis_output, weave_call_id_state, reason_input]
    )
    boomer_btn.click(
        fn=submit_feedback,
        inputs=[gr.State("Boomer"), reason_input, weave_call_id_state],
        outputs=[post_html, analysis_output, weave_call_id_state, reason_input]
    )
    neither_btn.click(
        fn=submit_feedback,
        inputs=[gr.State("Neither"), reason_input, weave_call_id_state],
        outputs=[post_html, analysis_output, weave_call_id_state, reason_input]
    )

    
    # Initialize with first post and analysis
    # This runs one LLM analysis while the UI is being built and seeds the
    # components' initial values with the result.
    post_html.value, analysis_output.value, weave_call_id_state.value, reason_input.value = get_random_post_and_analyze()

demo.launch()

* Running on local URL:  http://127.0.0.1:7882

To create a public link, set `share=True` in `launch()`.




## 2.1 Building a dataset from annotated calls

Now that we've annotated at least 10-20 examples, we can build our first evaluation dataset! 

![text](https://cln.sh/dyBq4QXD+)

Step 1: Filter calls in Weave UI by only those with annotations not empty

Step 2: Use the Export -> Use Python button to get code to extract a list of filtered annotated calls

Step 3: Convert the calls to a clean evaluation dataset (and optionally publish to Weave)



In [33]:

@weave.op
def get_annotated_calls():
    """Build and publish a dataset from the annotated analyze_post_sentiment calls.

    Queries the project for ``analyze_post_sentiment`` calls whose
    ``doomer_or_boomer`` and ``reason`` annotation values are both non-empty,
    flattens every call into one row and publishes the rows as the Weave
    dataset ``doomer_or_boomer_dataset``.

    Returns:
        The list of row dicts. Each row has ``input``, ``displayName`` and
        ``llm_classification``, plus ``human_annotation`` / ``reason`` for the
        feedback entries found on the call.
    """
    # Weave API call to get all calls filtered by annotations not empty (with reasons)
    resp = weave_api.server.calls_query_stream({
        "project_id": weave_api._project_id(),
        "filter": {"op_names": [f"weave:///{weave_api._project_id()}/op/analyze_post_sentiment:*"]},
        "query": {"$expr":{"$and":[{"$not":[{"$eq":[{"$getField":"feedback.[wandb.annotation.doomer_or_boomer].payload.value"},{"$literal":""}]}]},{"$not":[{"$eq":[{"$getField":"feedback.[wandb.annotation.reason].payload.value"},{"$literal":""}]}]}]}},
        "sort_by": [{"field":"started_at","direction":"desc"}],
        "include_feedback": True,
    })

    # Which dataset column each annotation type is stored under.
    feedback_columns = {
        'wandb.annotation.doomer_or_boomer': 'human_annotation',
        'wandb.annotation.reason': 'reason',
    }

    # Iterate over the calls, clean up and publish as a dataset we can version and reference later.
    dataset = []
    for call in resp:
        call_dict = dict(call)
        inputs = call_dict.get('inputs') or {}
        output = call_dict.get('output')
        row = {
            "input": inputs.get('text'),
            "displayName": inputs.get('displayName'),
            # analyze_post_sentiment returns (analysis, weave_call_id); keep
            # only the analysis. Calls with no output (e.g. failed ones) get
            # None instead of raising.
            "llm_classification": output[0] if output else None,
        }
        summary = call_dict.get('summary') or {}
        feedback_items = (summary.get('weave') or {}).get('feedback') or []
        for feedback in feedback_items:
            column = feedback_columns.get(feedback.get("feedback_type"))
            if column is not None:
                row[column] = (feedback.get('payload') or {}).get('value')

        dataset.append(row)

    weave.publish(weave.Dataset(name="doomer_or_boomer_dataset", rows=dataset))
    return dataset

dataset = get_annotated_calls()

📦 Published to https://wandb.ai/thursdai/jan-evals-workshop/weave/objects/doomer_or_boomer_dataset/versions/kPkJew7ifAQDTiskCKUeYZPAjSagSILxHY0Ze9a72i8


## 2.2 Storing Datasets within Weave

If you'd like to store your own dataset and name them, it's very easy to do so, and then you get a "ref" to the dataset that's stored in our system. Weave datasets are versioned, which means you can reference them in your code by a URL or a ref, and either point to the latest version or a specific version. 

Using `refs` is a great way to make your code reproducible and versioned.

![CleanShot 2025-01-07 at 16 12 35@2x](https://gist.github.com/user-attachments/assets/e2d02340-cc0f-41e8-8d97-957b08611d08)


Here's an example of the dataset we just created, and how we can reuse it in our evaluations.

In [46]:
# TODO: replace this dataset with your own using the dataset link above and looking at the "use" tab
doomer_or_boomer_dataset = weave.ref("weave:///thursdai/jan-evals-workshop/object/doomer_or_boomer_dataset:iioNDY6XVmLzYnKbJucvrSDRdzaICkEw7nHxVOMTJ0E").get()

import pandas as pd
df = pd.DataFrame(doomer_or_boomer_dataset.rows)
df.head(10)


Unnamed: 0,input,displayName,llm_classification,reason,human_annotation
0,Take me the fuck out of your shitty dataset or...,mc_raney42,NEITHER. The post expresses frustration with d...,Calling dataset is shitty and threatening lega...,Doomer
1,Thank you for this work! Freedom of informatio...,spencer.,NEITHER. The post expresses a general positive...,user is agreeing with the action taken in the ...,Neither
2,What the fuck is wrong with you?,Jimmy R,NEITHER. The post is too short and lacks any s...,the negativity is based on AI hate,Doomer
3,Chode.,E. Perkins 🎃,NEITHER. The post is too short and lacks any g...,personal attack based on AI hate,Doomer
4,May all your devices and servers get EMP'd,,DOOMER - Expresses a desire for catastrophic t...,personal attack based on AI hate,Doomer
5,Great job <3 - the dataset will help to build ...,Raahul Dutta,NEITHER. The post is positive and forward-look...,this user likes AI,Neither
6,I request that any of my data that is containe...,Joseph O,NEITHER. The post expresses concern about data...,copy paste request that means nothing feels li...,Boomer
7,all replies here:\nme when posting something p...,Flutter 🛷🎄,NEITHER. The post uses internet culture humor ...,making fun of other people replying,Neither
8,"Hi, hope you get banned, shithead!",Andy,"NEITHER. The post is just aggressive, not indi...",personal attack based on AI hate,Doomer
9,The fact you thought this was okay is astonish...,Spacekat9,NEITHER. The post expresses disapproval but la...,thinking this wasn't ok because this was AI re...,Doomer


# Step 3 : Evaluations 
### Components of an Evaluation Dataset

Evaluations generally consist of four key elements:
- An **input prompt** that serves as the basis for the model's completion. This prompt often includes a set of variable inputs that are inserted into a prompt template during testing.
- The **output** generated by the model in response to the input prompt.
- A **"gold standard" answer** used as a reference for assessing the model's output. This can be an exact match that the output must replicate, or an exemplary answer that provides a benchmark for scoring.
- A **score**, determined by one of the scoring approaches outlined below, which indicates the model's performance on the question.

#TODO: Look at the dataset and try to match the input, output, gold standard, and score for each row

## Evaluation Grading Approaches
Evaluations can be time-consuming and costly in two main areas: creating questions and gold standard answers, and the scoring/grading process itself.  
Developing questions and ideal answers is often a one-time fixed cost, albeit potentially time-intensive if a suitable dataset is not readily available (consider leveraging an LLM to generate questions!). However, scoring is a recurring expense incurred each time the evaluation is conducted, which is likely to be frequent. Therefore, designing evaluations that can be scored efficiently and economically should be a central priority.

![](https://gist.github.com/assets/463317/e970bb03-9552-4712-ba12-727b89928e3b)

There are three primary methods for grading (scoring) evaluations:
- **Programmatic scoring:** This approach involves using standard code (primarily string matching and regular expressions) to assess the model's outputs. Common techniques include checking for an exact match against an answer or verifying the presence of key phrase(s) in a string. Programmatic scoring is the most optimal method when feasible, as it is extremely fast and highly reliable. However, not all evaluations are amenable to this style of scoring.
- **Human in the loop:** In this approach, a human reviewer examines the model-generated answer, compares it to the gold standard, and assigns a score. While manual scoring is the most versatile method, applicable to nearly any task, it is also exceptionally slow and costly, especially for large-scale evaluations. Designing evaluations that necessitate manual scoring should be avoided whenever possible.
- **Model-based scoring AKA LLM as a judge:** LLMs (especially Claude, GPT-4o, Gemini) are really good at grading themselves (or even the outputs of other LLMs), especially in a wide range of tasks that traditionally needed human judgement, like tone in creative writing, accuracy in open-ended questions, or classification. This model-based scoring is accomplished by creating a _scorer prompt_ for an LLM.

Let's explore an example of each

## 3.1 Programmatic scoring 

Here we have a simple programmatic eval that will try and check if the LLM had the right answer.

In [None]:
## Create a programmatic scorer that will compare the ground truth to the LLM answer and check if it is correct
import weave
from weave import Evaluation


def programmatic_scorer(output: str, human_annotation: str):
    """Score a prediction by strict equality with the human label.

    The comparison is exact and case-sensitive against the label
    (Doomer, Boomer, Neither). This is expected to fail on the raw LLM
    answers because the model replies with a label plus an explanation,
    never with the bare label alone.

    Raises:
        ValueError: if either the model output or the annotation is empty.
    """
    if not (output and human_annotation):
        raise ValueError("Model output or human annotation is empty")
    is_exact_match = output == human_annotation
    return {"match": is_exact_match}

# TODO: change the programmatic scorer (commented below) to check if the output includes the reason string (Doomer, Boomer, Neither)
# check for lower case and upper case, and check if more than one of the options is present, meaning that LLM wasn't sure
# add the programmatic scorer to the evaluation

# def programmatic_scorer(output: str, human_annotation: str):
#     # check if model_output includes the human_annotation only once 
#     if human_annotation.lower() in output.lower():
#         #possible match, now lets check if the model_output includes any of the other options but not the human_annotation
#         for option in ["doomer", "boomer", "neither"]:
#             if option.lower() in output.lower() and option.lower() != human_annotation.lower():
#                 return {"match": False}
#         return {"match": True}
#     return {"match": False}

# Use the scorer defined in this cell; the previous name
# `score_against_ground_truth` is not defined anywhere in the notebook and
# raised NameError.
evaluation = Evaluation(
    dataset=doomer_or_boomer_dataset, scorers=[programmatic_scorer]
)

@weave.op()
def function_to_evaluate(input: str):
    """Return the LLM classification already recorded for ``input``.

    Stands in for a real model call: the LLM was already run when the dataset
    was built, so the stored ``llm_classification`` of the matching row is
    returned instead of calling the model again.

    Args:
        input: The post text, i.e. the ``input`` column of a dataset row.

    Returns:
        The ``llm_classification`` value of the first row whose ``input``
        matches.

    Raises:
        LookupError: if no row of the evaluated dataset has this input.
    """
    # here's where you would add your LLM call and return the output
    # Look the row up in the dataset that is actually being evaluated
    # (doomer_or_boomer_dataset) rather than the separately built `dataset`
    # list: the two can hold different rows, which made `row[0]` raise
    # IndexError.
    for row in doomer_or_boomer_dataset.rows:
        if row['input'] == input:
            return row.get('llm_classification')
    raise LookupError(f"No dataset row found for input: {input!r}")

await evaluation.evaluate(function_to_evaluate)

### Structured outputs with programmatic scorers

The above example likely gave us a score of 0, because LLMs like to talk, and comparing that via a simple string match is not going to work. 

Programmatic scorers work great when we have structured outputs and we know exactly what to expect from LLMs. Let's recreate our LLM calls for the same questions with structured outputs so we can compare the LLM output directly to the human annotation and see if we can get a better score.

In [None]:
import os
os.environ['WEAVE_PARALLELISM'] = '5'
os.environ['WEAVE_PRINT_CALL_LINK'] = 'true'

@weave.op()
def structured_llm_call(input: str, displayName: str):
    """Classify one post with the labeler instructions included in the prompt.

    Args:
        input: The post text to classify.
        displayName: The author's display name, shown before the text.

    Returns:
        The raw text content of the model's first completion choice.
    """
    prompt = f"""
    Analyze the following Bluesky post and determine if the author is a [Doomer, Boomer, or Neither]. 
    Be concise and to the point. Answer with just one word (DOOMER, BOOMER, or NEITHER) followed by a brief explanation.
    here's more context:
    #### Instructions for labeler: 
    `Doomer`: Someone who hates AI, and uses derogatory language towards the author of the post because of thier hate for AI and their data being used for AI  
    `Boomer`: Someone who doesn't understand AI, and copy-pastes a request to remove their data from the dataset  
    `Neither`: Folks who reply neutral or positive to the post.
    
    Text to Classify: 
    \n\n {displayName}: "{input}"
    """

    #TODO: add a request for structured output in JSON format
    # prompt += """
    # Respond in JSON format with this exact schema   {{
    #     "classification": "DOOMER" | "BOOMER" | "NEITHER",
    #     "reason": "string"
    # }}
    # """

    #TODO: request a stricter JSON 
    # prompt += """
    #     with no backticks or quotes or anything else, just valid JSON or I lose my job  
    # """

    # Single-turn conversation: the whole task is carried by one user message.
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0.5,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def programmatic_scorer(output: str, human_annotation: str):
    """Score a JSON model output against the human annotation.

    The output is expected to be a JSON object with a ``classification``
    field; it is compared case-insensitively with ``human_annotation``
    (Doomer, Boomer, Neither).

    Args:
        output: Raw model output, expected to be a JSON document.
        human_annotation: The label assigned by the human annotator.

    Returns:
        ``{"match": True}`` when the classification equals the annotation
        (ignoring case); ``{"match": False}`` otherwise, including when the
        JSON is valid but has no usable ``classification`` string.

    Raises:
        ValueError: if the output is empty or is not valid JSON.
    """
    if not output:
        raise ValueError("Model output is empty")
    try:
        parsed = json.loads(output)
    except (TypeError, ValueError) as err:
        # json.JSONDecodeError is a ValueError subclass; TypeError covers a
        # non-string output. Chain the cause so the parse error stays visible.
        raise ValueError("Model output is not valid JSON") from err

    # Valid JSON of the wrong shape (a list, a missing or non-string field) is
    # a wrong answer from the model, not a crash in the scorer.
    classification = parsed.get('classification') if isinstance(parsed, dict) else None
    if not isinstance(classification, str):
        return {"match": False}

    return {"match": classification.lower() == human_annotation.lower()}

# Use the JSON-aware scorer defined in this cell; the previous name
# `extract_and_score_json` is not defined anywhere in the notebook and raised
# NameError.
new_evaluation = Evaluation(
    dataset=doomer_or_boomer_dataset, scorers=[programmatic_scorer]
)

await new_evaluation.evaluate(structured_llm_call)

# 3.2 HITL - Human in the loop evaluation grading

Programmatic scoring is great for many reasons, cheap to get started with, can run very fast and can be very reliable, but cannot cover open ended questions or tasks that require analysis or judgement. 

For example, did the LLM follow the instructions it was given, did it hallucinate, was it verbose or concise, etc.

To judge those outputs we can use human graders, to provide "golden answers", which is what we did above with the annotation example! 

The downside of HITL is that it's slow, expensive, and not scalable (unless you have a lot of money in the bank). 

HITL is a great way to kickstart an evaluation dataset and then extrapolate from it with an LLM. 

# 3.3 LLM as a Judge - use another LLM to grade your LLM outputs

Having to manually grade the above eval every time is going to get very annoying very fast, especially if the eval is a more realistic size (dozens, hundreds, or even thousands of questions). Luckily, there's a better way! 

We can actually have an LLM do the grading for us. We'll use a teacher model to grade the LLM outputs of a "student" model (in this case the LLM we're using for our production system is the student). 

There are a few issues with this approach to be aware of: 
 - LLMs are not great at numerical scoring (e.g. 1-5) 
 - The order of candidate responses matters
 - Foundational models tend to prefer their own outputs over those of other models
 - LLMs prefer longer responses and "style" over accuracy



In [None]:
#TODO - Alex - align this eval with the above examples
# We start by defining a "grader prompt" template.
def build_grader_prompt(answer, rubric):
    """Build the chat messages that ask a grader LLM to judge one answer.

    Args:
        answer: The assistant answer to grade; placed inside <answer> tags.
        rubric: The criteria for a correct answer; placed inside <rubric> tags.

    Returns:
        A one-element message list (a single ``user`` message) instructing the
        grader to reason inside <thinking> tags and to output 'correct' or
        'incorrect' inside <correctness> tags.
    """
    grader_text = f"""You will be provided an answer that an assistant gave to a question, 
    and a rubric that instructs you on what makes the answer correct or incorrect.
    
    Here is the answer that the assistant gave to the question.
    <answer>{answer}</answer>
    
    Here is the rubric on what makes the answer correct or incorrect.
    <rubric>{rubric}</rubric>
    
    An answer is correct if it entirely meets the rubric criteria, and is otherwise incorrect.
    First, think through whether the answer is correct or incorrect based on the rubric inside <thinking></thinking> tags. 
    Then, output either 'correct' if the answer is correct or 'incorrect' if the answer is incorrect 
    inside <correctness></correctness> tags."""

    return [{'role': 'user', 'content': grader_text}]

# Now we define the full grade_completion function.
import re

def grade_completion(output, golden_answer, model_name=FAST_MODEL_NAME):
    """Grade one model output against its golden answer using a grader LLM.

    Args:
        output: The model output to grade.
        golden_answer: The reference answer, passed to the grader as the rubric.
        model_name: Name of the grader model handed to ``get_completion``.

    Returns:
        The stripped text found between the <correctness> tags of the grader's
        reply.

    Raises:
        ValueError: if the grader reply contains no <correctness> tags.
    """
    grader_messages = build_grader_prompt(output, golden_answer)
    grader_reply = get_completion(grader_messages, model_name=model_name)
    # Only the verdict matters; the <thinking> section is ignored. DOTALL lets
    # the verdict span line breaks inside the tags.
    verdict = re.search(r'<correctness>(.*?)</correctness>', grader_reply, re.DOTALL)
    if verdict is None:
        raise ValueError("Did not find <correctness></correctness> tags.")
    return verdict.group(1).strip()

# Run the grader function on our outputs and print the score.
# NOTE(review): `outputs`, `eval`, `tqdm`, `FAST_MODEL_NAME` and
# `SMART_MODEL_NAME` are not defined or imported in any cell visible here —
# define them before running this cell. `eval` must be a sized sequence of
# questions with a 'golden_answer' key, paired by position with `outputs`
# (the name shadows the builtin eval).

grades = []
for output, question in tqdm(zip(outputs, eval), total=len(eval), desc=f'Running eval using {FAST_MODEL_NAME}'):
    grade = grade_completion(output, question['golden_answer'], model_name=FAST_MODEL_NAME)
    grades.append(grade)

# Share of outputs the grader labeled exactly 'correct', as a percentage.
print(f"{FAST_MODEL_NAME} Score: {grades.count('correct')/len(grades)*100}%") 

# Run the grader function on our outputs and print the score using the smart model
# Same outputs and questions as above; only the grader model changes.
grades = []
for output, question in tqdm(zip(outputs, eval), total=len(eval), desc=f'Running eval using {SMART_MODEL_NAME}'):
    grade = grade_completion(output, question['golden_answer'], model_name=SMART_MODEL_NAME)
    grades.append(grade)

print(f"{SMART_MODEL_NAME} Score: {grades.count('correct')/len(grades)*100}%") 


Running eval using claude-3-haiku-20240307:   0%|          | 0/3 [00:00<?, ?it/s]

claude-3-haiku-20240307 Score: 33.33333333333333%


Running eval using claude-3-opus-20240229:   0%|          | 0/3 [00:00<?, ?it/s]

claude-3-opus-20240229 Score: 33.33333333333333%



# 3.3 HITL - Human in the loop evaluation grading

--- iteration on evals to improve them ---


