# LLM Evals Workshop - by Weights & Biases
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/altryne/llm-evals-workshop/blob/main/eval.ipynb) [![Weights & Biases](https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-gradient.svg)](https://wandb.me/weave-workshop-jan)

This notebook demonstrates how to create, run and track LLM evaluations using [W&B Weave](https://wandb.me/weave-workshop-jan). We'll explore different evaluation techniques and how to analyze the results.

Make sure to set `WANDB_API_KEY` (get your key [here](https://wandb.ai/authorize)) and either `OPENAI_API_KEY` or `GEMINI_API_KEY` as environment variables.

If you're running in Colab, add these keys in the Secrets panel (the key icon in the left sidebar).

In [23]:
# Install the required packages. When running in Colab, first clone the workshop
# repo so the `data/` and `templates/` folders used by later cells are available.
try:
    import google.colab
    !git clone --branch main https://github.com/altryne/llm-evals-workshop
    %cd llm-evals-workshop
except ImportError:
    # google.colab is not importable, so this is not Colab: skip the clone.
    # Presumably the notebook is then run from inside a local checkout of the repo.
    pass

print('⏳ Installing packages')
%pip install uv #TODO: alex figure this out
%uv pip install -q weave gradio set-env-colab-kaggle-dotenv tqdm ipywidgets requests openai google-generativeai pillow
print('✅ Packages installed')

⏳ Installing packages
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
✅ Packages installed


In [27]:
%load_ext gradio
import gradio as gr
from PIL import Image
import requests 
import io
from set_env import set_env
import json
from jinja2 import Environment, FileSystemLoader
from datetime import datetime
import random
import os
from openai import OpenAI
from dotenv import load_dotenv
import weave
from weave.flow.annotation_spec import AnnotationSpec

# Load a local .env file (if any), then resolve each API key the notebook needs.
load_dotenv()
for env_key in ("WANDB_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY"):
    set_env(env_key)

# Start Weave tracing for this project; the returned client is reused below
# for fetching calls and attaching feedback.
weave_api = weave.init('jan-evals-workshop')

# Annotation spec for the human label: one of three fixed choices.
annotation = weave.publish(
    AnnotationSpec(
        name="Doomer or Boomer",
        description="Doomer or Boomer or Neither",
        field_schema={
            "type": "string",
            "enum": ["Doomer", "Boomer", "Neither"],
        },
    ),
    "doomer_or_boomer",
)

# Annotation spec for the free-text justification that accompanies the label.
annotation_reason = weave.publish(
    AnnotationSpec(
        name="Reason",
        description="Reason why you chose this value, write before clicking.",
        field_schema={"type": "string"},
    ),
    "reason",
)

The gradio extension is already loaded. To reload it, use:
  %reload_ext gradio
📦 Published to https://wandb.ai/thursdai/evals-workshop/weave/objects/doomer_or_boomer/versions/MfppDkza1qvK772eNZWIU1XwwZbtwGQ8UQWWEcyZlfc
📦 Published to https://wandb.ai/thursdai/evals-workshop/weave/objects/reason/versions/Z3Do6YnUa9YHEuELfGyZtJt7JTkgb30oVBv04U4HWyc


In [32]:
# Initialize an OpenAI SDK client that talks to Gemini: it authenticates with
# GEMINI_API_KEY and sends requests to Google's generativelanguage endpoint.
# NOTE(review): Google's OpenAI-compatibility docs list the base URL with a
# trailing `openai/` path segment — confirm this shorter URL is still accepted.
client = OpenAI(
    api_key=os.getenv("GEMINI_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/",
)

# Load the Jinja2 environment rooted at the local `templates` folder and fetch
# the template used to render a single post as HTML.
env = Environment(loader=FileSystemLoader('templates'))
template = env.get_template('post.html.jinja')

# Load replies data
def load_replies():
    """Return the replies from both bundled thread dumps as a single list.

    Reads ``data/replies_alpin.json`` and ``data/replies_daniel.json`` (paths
    relative to the current working directory) and concatenates the lists
    found under ``thread.replies`` in each file, in that order.

    Raises:
        FileNotFoundError: if either file is missing.
        KeyError: if a file lacks the ``thread`` / ``replies`` keys.
    """
    reply_files = ('data/replies_alpin.json', 'data/replies_daniel.json')
    replies = []
    for path in reply_files:
        # Decode explicitly as UTF-8 so post text with emoji or accented
        # characters does not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            replies.extend(json.load(f)['thread']['replies'])
    return replies

@weave.op
def analyze_post_sentiment(avatar, displayName, text):
    """Ask the model whether a Bluesky post's author is a Doomer, Boomer or Neither.

    Args:
        avatar: not referenced in the prompt; it is only an input of this op
            (presumably so Weave records it with the traced call).
        displayName: author name shown in front of the post text in the prompt.
        text: the post body to classify.

    Returns:
        A ``(model_reply, weave_call_id)`` tuple: the raw text of the first
        completion choice and the id of the Weave call wrapping this invocation.
    """
    # Deliberately minimal prompt: the labels are named but not defined.
    user_prompt = f"""
    Analyze the following Bluesky post and determine if the author is a [Doomer, Boomer, or Neither]. 
    Be concise and to the point. Answer with just one word (DOOMER, BOOMER, or NEITHER) followed by a brief explanation.
    \n\n {displayName}: "{text}"
    """

    # TODO: Add some more context to the prompt
    # prompt = f"""Analyze the following Bluesky post and determine if the author is a:
    # - DOOMER (someone who hates AI and uses derogatory language)
    # - BOOMER (someone who doesn't understand AI and asks to remove their data)
    # - NEITHER (neutral or positive response)

    # Post: {displayName}: "{text}"

    # Respond with just one word (DOOMER, BOOMER, or NEITHER) followed by a brief explanation.
    # """

    completion = client.chat.completions.create(
        model="gemini-2.0-flash-exp",
        messages=[{"role": "user", "content": user_prompt}],
        temperature=0.5
    )

    # The call id is handed back so feedback can later be attached to this exact trace.
    call_id = weave.require_current_call().id
    reply_text = completion.choices[0].message.content
    return reply_text, call_id

def _fetch_avatar(author):
    """Return the author's avatar as a PIL image, or ``None`` if it cannot be loaded.

    The avatar is passed to the analysis op but is not part of the prompt, so a
    missing or broken avatar should not stop a post from being analyzed.
    """
    avatar_uri = author.get('avatar')
    if not avatar_uri:
        # The author record has no avatar URL.
        return None
    try:
        # Bound the wait so a stalled download cannot freeze the UI callback.
        avatar_response = requests.get(avatar_uri, timeout=10)
        avatar_response.raise_for_status()
        return Image.open(io.BytesIO(avatar_response.content))
    except (requests.RequestException, OSError):
        # Network/HTTP failure, or bytes that PIL cannot identify as an image.
        return None


def get_random_post_and_analyze():
    """Pick a random reply, classify it with the LLM and render it as HTML.

    Returns:
        A 4-tuple ``(post_html, analysis, weave_call_id, '')``. The trailing
        empty string is the value used to clear the "reason" textbox in the UI.
    """
    post = random.choice(load_replies())
    post_view = post['post']
    author = post_view['author']
    record = post_view['record']

    # Format the post data for the template
    created_at = datetime.fromisoformat(record['createdAt'].replace('Z', '+00:00'))
    formatted_date = created_at.strftime('%b %d, %Y, %I:%M %p')

    # Convert AT URI to bsky.app URL: the post id is the last path segment.
    post_id = post_view['uri'].rsplit('/', 1)[-1]
    post_url = f"https://bsky.app/profile/{author['handle']}/post/{post_id}"

    # Fall back to the handle when the author has no display name set.
    display_name = author.get('displayName') or author['handle']

    # Analyze the post
    analysis, weave_call_id = analyze_post_sentiment(
        _fetch_avatar(author), display_name, record['text']
    )

    post_data = {
        'author': author,
        'created_at': formatted_date,
        'text': record['text'],
        'like_count': post_view.get('likeCount', 0),
        'repost_count': post_view.get('repostCount', 0),
        'has_image': False,
        'post_url': post_url,
    }

    return template.render(**post_data), analysis, weave_call_id, ''


def submit_feedback(user_selection, reason, weave_call_id):
    """Attach the labeler's verdict to a traced call, then load the next post.

    Args:
        user_selection: the chosen label, stored as the ``doomer_or_boomer`` annotation.
        reason: optional free text; when non-empty it is printed and stored as
            the ``reason`` annotation before the label is written.
        weave_call_id: id of the Weave call the feedback is attached to.

    Returns:
        Whatever ``get_random_post_and_analyze()`` returns, so the UI moves on
        to a fresh post.
    """
    call = weave_api.get_call(weave_call_id)

    def _annotate(feedback_type, spec_ref, value):
        # Both annotations share the same request shape; only these three fields vary.
        weave_api.server.feedback_create(
            {
                "project_id": weave_api._project_id(),
                "weave_ref": call.ref.uri(),
                "feedback_type": feedback_type,
                "annotation_ref": spec_ref.uri(),
                "payload": {"value": value},
            }
        )

    if reason:
        print("reason", reason)
        _annotate("wandb.annotation.reason", annotation_reason, reason)

    _annotate("wandb.annotation.doomer_or_boomer", annotation, user_selection)

    # Ready to analyze the next post
    return get_random_post_and_analyze()


In [52]:
# %%blocks
# Labeling UI: shows one post with the model's verdict and lets a human annotate it.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header
    gr.Markdown("""
    # 🦋 Doomer or Boomer
    Our AI analyzes bluesky replies and posts to determine if the author is a doomer or a boomer.  
    Source of data: Replies to a post by a BlueSky user that compiled a dataset of posts, which went viral and created a lot of noise on BlueSky.  
    """)

    with gr.Row():
        # Left column: the rendered post, a skip button and the labeling guide.
        with gr.Column(scale=2):
            post_html = gr.HTML()
            next_post_btn = gr.Button("Skip Post & Analyze Another", variant="primary")
            gr.Markdown(f"""
            #### Instructions for labeler: 
            `Doomer`: Someone who hates AI, and uses derogatory language towards the author of the post.  
            `Boomer`: Someone who doesn't understand AI, and copy-pastes a request to remove their data from the dataset  
            `Neither`: Folks who reply neutral or positive to the post.
            
            See your Weave project & traces [here](https://wandb.ai/{weave_api._project_id()})
            """)

        # Right column: model output, optional reason and the three label buttons.
        with gr.Column(scale=1):
            analysis_output = gr.Textbox(
                label="Analysis Results",
                placeholder="Analysis will appear here...",
                lines=4
            )
            # Holds the id of the Weave call behind the post currently on screen.
            weave_call_id_state = gr.State()

            reason_input = gr.Textbox(label="Add reason and click",placeholder="Reason why you chose this value, write before clicking.", lines=2)
            with gr.Row():
                doomer_btn = gr.Button("Doomer 😡", variant="huggingface")
                boomer_btn = gr.Button("Boomer 👵", variant="primary")
                neither_btn = gr.Button("Neither 🤷")

    # Every handler refreshes the same four components.
    refresh_targets = [post_html, analysis_output, weave_call_id_state, reason_input]

    # Skip: load and analyze another post without recording feedback.
    next_post_btn.click(fn=get_random_post_and_analyze, outputs=refresh_targets)

    # Label buttons: each submits its own fixed label plus the typed reason.
    for label_btn, label in (
        (doomer_btn, "Doomer"),
        (boomer_btn, "Boomer"),
        (neither_btn, "Neither"),
    ):
        label_btn.click(
            fn=submit_feedback,
            inputs=[gr.State(label), reason_input, weave_call_id_state],
            outputs=refresh_targets,
        )

    # Populate the UI with a first analyzed post before launching.
    post_html.value, analysis_output.value, weave_call_id_state.value, reason_input.value = get_random_post_and_analyze()

demo.launch()

🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e37-2931-7263-b7f3-ad4386dd6823
* Running on local URL:  http://127.0.0.1:7873

To create a public link, set `share=True` in `launch()`.




🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e37-5156-7e70-9b78-7bdb3b8edc1b
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e37-7409-7000-a3a9-f36315658c1f
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e37-7f35-78d1-aec5-958a8432c9c7
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e48-7895-7532-a5c0-b52e8ae4d436
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e48-83fc-7f92-922a-626210d6ba19


## Building a dataset from Weave filtered calls

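Step 1: Filter the calls in Weave down to those whose annotations are not empty (the code below does this for you).

Step 2: Clean up the filtered calls and publish them as a Weave dataset that can be versioned and referenced later.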

In [34]:
@weave.op
def get_annotated_calls():
    """Build and publish a dataset from the human-annotated sentiment calls.

    Queries this project's ``analyze_post_sentiment`` calls whose label and
    reason annotations are both non-empty, newest first, and turns each call
    into a row with the keys ``input``, ``llm_classification`` and, when the
    matching feedback is present, ``human_annotation`` and ``reason``.
    Calls that have no output are skipped.

    Returns:
        The list of row dicts, which is also published to Weave as the
        ``doomer_or_boomer_dataset`` dataset.
    """
    # Weave API call to get all calls filtered by annotations not empty (with reasons)
    resp = weave_api.server.calls_query_stream({
        "project_id": weave_api._project_id(),
        "filter": {"op_names": [f"weave:///{weave_api._project_id()}/op/analyze_post_sentiment:*"]},
        "query": {"$expr":{"$and":[{"$not":[{"$eq":[{"$getField":"feedback.[wandb.annotation.doomer_or_boomer].payload.value"},{"$literal":""}]}]},{"$not":[{"$eq":[{"$getField":"feedback.[wandb.annotation.reason].payload.value"},{"$literal":""}]}]}]}},
        "sort_by": [{"field":"started_at","direction":"desc"}],
        "include_feedback": True,
    })

    # Iterate over the calls, clean up and publish as a dataset we can version and reference later.
    rows = []
    for call in resp:
        call_dict = dict(call)

        output = call_dict.get('output')
        if not output:
            # No output recorded for this call, so there is no classification to evaluate.
            continue

        row = {
            "input": (call_dict.get('inputs') or {}).get('text'),
            # The op returns (model_reply, weave_call_id); keep only the reply.
            "llm_classification": output[0],
        }

        summary = call_dict.get('summary') or {}
        feedback_items = (summary.get('weave') or {}).get('feedback') or []
        for feedback in feedback_items:
            feedback_type = feedback.get("feedback_type")
            value = (feedback.get('payload') or {}).get('value')
            if feedback_type == 'wandb.annotation.doomer_or_boomer':
                row["human_annotation"] = value
            elif feedback_type == 'wandb.annotation.reason':
                row["reason"] = value

        rows.append(row)

    weave.publish(weave.Dataset(name="doomer_or_boomer_dataset", rows=rows))
    return rows

# Run the op once: this publishes the dataset and keeps the rows in `dataset`
# for the evaluation cells below.
dataset = get_annotated_calls()

📦 Published to https://wandb.ai/thursdai/evals-workshop/weave/objects/doomer_or_boomer_dataset/versions/o8WQyHC2ms95z3W3LfW6PZ5otKIpszLwHEt02blJREk
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943e1c-19e0-7471-b913-24846efcb437


## Step 3 : Evaluations 
## 3.1 Programmatic evaluations 

Here is a simple programmatic eval that checks whether the LLM's answer matches the human annotation.

In [None]:
## Create a programmatic scorer that will compare the ground truth to the LLM answer and check if it is correct
from weave import Evaluation

def score_against_ground_truth(model_output: str, human_annotation: str):
    """Score one row by strict equality between the model output and the human label.

    Returns a dict with a single boolean ``match`` entry.
    """
    # The human label is a bare word (Doomer, Boomer, Neither). This eval is
    # expected to fail because the LLM adds an explanation and never returns
    # just the label.
    is_exact_match = model_output == human_annotation
    return {"match": is_exact_match}

# TODO: change this scorer to check if the model_output includes the reason string (Doomer, Boomer, Neither)
# check for lower case and upper case, and check if more than one of the options is present, meaning that LLM wasn't sure
# add the programmatic scorer to the evaluation

# def programmatic_scorer(model_output: str, human_annotation: str):
#     # check if model_output includes the human_annotation only once 
#     if human_annotation.lower() in model_output.lower():
#         #possible match, now lets check if the model_output includes any of the other options but not the human_annotation
#         for option in ["doomer", "boomer", "neither"]:
#             if option.lower() in model_output.lower() and option.lower() != human_annotation.lower():
#                 return {"match": False}
#         return {"match": True}
#     return {"match": False}

# Evaluate over the annotated rows built earlier, scoring each one with the
# exact-match scorer defined above.
evaluation = Evaluation(
    dataset=dataset, scorers=[score_against_ground_truth]
)

@weave.op()
def function_to_evaluate(input: str):
    """Return the stored LLM classification for the dataset row matching ``input``.

    Normally this is where the LLM call would go. The LLM was already called
    when the dataset was collected, so the recorded answer is looked up
    instead. If several rows share the same input, the first one is used.
    """
    matching_rows = [candidate for candidate in dataset if candidate['input'] == input]
    first_match = matching_rows[0]
    return first_match.get('llm_classification')

# Top-level `await` is valid here because this runs in a notebook cell; in a
# plain Python script it would have to be wrapped in an async entry point.
await evaluation.evaluate(function_to_evaluate)


## 3.2 LLM as a judge evaluations 

--- iteration on evals to improve them ---

## 3.3 Human-in-the-loop aligned evaluations 
