# LLM Evals Workshop
# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/altryne/llm-evals-workshop/blob/main/eval.ipynb) [![Weights & Biases](https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-gradient.svg)](https://wandb.ai)

This notebook demonstrates how to create, run and track LLM evaluations using [W&B Weave](https://wandb.ai/site/weave). We'll explore different evaluation techniques and how to analyze the results.

In [1]:
# Install and read in required packages, plus create an anthropic client.
try:
    import google.colab
    !git clone --branch main https://github.com/altryne/llm-evals-workshop
    %cd llm-evals-workshop
except ImportError:
    pass

print('⏳ Installing packages')
%pip install uv #TODO: alex figure this out
%uv pip install -q weave gradio set-env-colab-kaggle-dotenv tqdm ipywidgets requests openai google-generativeai
print('✅ Packages installed')

⏳ Installing packages
Note: you may need to restart the kernel to use updated packages.
✅ Packages installed


In [2]:
%load_ext gradio
import gradio as gr
from set_env import set_env
import json
from jinja2 import Environment, FileSystemLoader
from datetime import datetime
import random
import os
from openai import OpenAI
from dotenv import load_dotenv
import weave
from weave.flow.annotation_spec import AnnotationSpec

load_dotenv()
set_env("WANDB_API_KEY")
set_env("OPENAI_API_KEY")
set_env("GEMINI_API_KEY")

# initialize weave
weave_api = weave.init('evals-workshop')
# initialize a basic annotation in this project
annotation = weave.publish(AnnotationSpec(
    name="Doomer or Boomer",
    description="Doomer or Boomer or Neither",
    field_schema={
        "type": "string",
        "enum": ["Doomer", "Boomer", "Neither"],
    },
), "doomer_or_boomer")

weave version 0.51.27 is available!  To upgrade, please run:
 $ pip install weave --upgrade
Logged in as Weights & Biases user: altryne.
View Weave data at https://wandb.ai/thursdai/evals-workshop/weave
📦 Published to https://wandb.ai/thursdai/evals-workshop/weave/objects/doomer_or_boomer/versions/MfppDkza1qvK772eNZWIU1XwwZbtwGQ8UQWWEcyZlfc


In [20]:


# Initialize OpenAI client
client = OpenAI()

# Load the Jinja2 environment
env = Environment(loader=FileSystemLoader('templates'))
template = env.get_template('post.html.jinja')

# Load replies data
def load_replies():
    replies = []
    # Load replies from both files
    with open('data/replies_alpin.json', 'r') as f:
        data = json.load(f)
        replies.extend(data['thread']['replies'])
    with open('data/replies_daniel.json', 'r') as f:
        data = json.load(f)
        replies.extend(data['thread']['replies'])
    return replies

@weave.op
def analyze_post_sentiment(text):
    # Prompt for OpenAI to analyze the sentiment
    prompt = "Analyze the following Bluesky post and determine if the author is a [Doomer, Boomer, or Neither]"

    # TODO: Add some more context to the prompt
    # prompt = f"""Analyze the following Bluesky post and determine if the author is a:
    # - DOOMER (someone who hates AI and uses derogatory language)
    # - BOOMER (someone who doesn't understand AI and asks to remove their data)
    # - NEITHER (neutral or positive response)
    
    # Post: {text}
    
    # Respond with just one word (DOOMER, BOOMER, or NEITHER) followed by a brief explanation.
    # """
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=100
    )
    current_call = weave.require_current_call()
    weave_call_id = current_call.id
    
    return response.choices[0].message.content, weave_call_id

def get_random_post_and_analyze():
    replies = load_replies()
    post = random.choice(replies)
    
    # Format the post data for the template
    created_at = datetime.fromisoformat(post['post']['record']['createdAt'].replace('Z', '+00:00'))
    formatted_date = created_at.strftime('%b %d, %Y, %I:%M %p')
    
    # Convert AT URI to bsky.app URL
    at_uri = post['post']['uri']
    _, _, author_did, _, post_id = at_uri.split('/')
    post_url = f"https://bsky.app/profile/{post['post']['author']['handle']}/post/{post_id}"
    
    # Analyze the post
    analysis, weave_call_id = analyze_post_sentiment(post['post']['record']['text'])
    
    post_data = {
        'author': post['post']['author'],
        'created_at': formatted_date,
        'text': post['post']['record']['text'],
        'like_count': post['post'].get('likeCount', 0),
        'repost_count': post['post'].get('repostCount', 0),
        'has_image': False,
        'post_url': post_url
    }
    
    return template.render(**post_data), analysis, weave_call_id


def submit_feedback(user_selection, weave_call_id):
    """
    Example function that could send user feedback (the user_selection)
    and the weave_call_id to your Weave (or any other) API.
    """
    call = weave_api.get_call(weave_call_id)
    # print(call)
    # print("user_selection", user_selection)
    # resp = call.feedback.add('annotation.doomer_or_boomer', {"value": user_selection})
    # print(type(resp))

    resp = weave_api.server.feedback_create(
        {
            "project_id": weave_api._project_id(),
            "weave_ref": call.ref.uri(),
            "feedback_type": "wandb.annotation.doomer_or_boomer",
            "annotation_ref": annotation.uri(),
            "payload": {"value": user_selection},
        }
    )
    print(resp)
    # Just returning a quick debug message for now:
    return f"Feedback sent for call {weave_call_id} with user selection: {user_selection}"


In [21]:
# %%blocks
# Create a Gradio Blocks app
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Add a title and description
    gr.Markdown("# 🦋 Doomer or Boomer")
    gr.Markdown("""
    Our AI analyzes bluesky replies and posts to determine if the author is a doomer or a boomer.  
    Source of data: Replies to a post by a BlueSky user that compiled a dataset of posts, which went viral and created a lot of noise on BlueSky.  
    `Doomer`: Someone who hates AI, and uses derogatory language towards the author of the post.  
    `Boomer`: Someone who doesn't understand AI, and copy-pastes a request to remove their data from the dataset  
    `Neither`: Folks who reply neutral or positive to the post.
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            post_html = gr.HTML()
            next_post_btn = gr.Button("Next Post & Analyze", variant="primary")
        
        with gr.Column(scale=1):
            analysis_output = gr.Textbox(
                label="Analysis Results",
                placeholder="Analysis will appear here...",
                lines=4
            )
            weave_call_id_state = gr.State()
            
            # Add a dropdown for the user to give feedback
            user_feedback = gr.Dropdown(
                label="Your Feedback",
                choices=["Doomer", "Boomer", "Neither"],
                value="Neither"
            )

            submit_feedback_btn = gr.Button("Send Annotation")
    
    # Set up event handler for combined next/analyze
    next_post_btn.click(fn=get_random_post_and_analyze, outputs=[post_html, analysis_output, weave_call_id_state])
    
    # 2) The "Send Feedback" button calls the function that receives both user selection
    #    and the hidden weave_call_id, and returns a message to be shown in the analysis textbox
    submit_feedback_btn.click(
        fn=submit_feedback,
        inputs=[user_feedback, weave_call_id_state],
        outputs=analysis_output
    )
    
    # Initialize with first post and analysis
    post_html.value, analysis_output.value, weave_call_id_state.value = get_random_post_and_analyze()

demo.launch()

🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3b-de19-7bc0-972a-b8c8c9485bfc
* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3b-eafe-7bf1-a6b7-12a83c407c19
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3c-0a96-7773-9ff1-3ba87753a3ff
id='01943d3c-26ee-72e0-99de-d4f2bec19023' created_at=datetime.datetime(2025, 1, 6, 20, 8, 59, 886431, tzinfo=TzInfo(UTC)) wb_user_id='VXNlcjoxNzQ4MTAx' payload={}
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3c-2505-7332-b093-906e816eb28a
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3c-2c9a-7d93-8195-58162f8e9e71
id='01943d3c-5c5b-7b42-9eab-4741ad409de8' created_at=datetime.datetime(2025, 1, 6, 20, 9, 13, 563847, tzinfo=TzInfo(UTC)) wb_user_id='VXNlcjoxNzQ4MTAx' payload={}
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3c-5a8f-7c71-af6b-a214ff695b5c
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d3c-71d6-7df2-94b3-3e8f68565857
id='01943d3c-8754-7320-9a85-284212f9425b' created_at=datetime.datetime(2025, 1, 6, 20, 9, 24, 564246, tzinfo=TzInfo(UTC)) wb_user_id='VXNlcjoxNzQ4MTAx' 

In [14]:
call = weave_api.get_call("01943c72-aff2-7bc3-b8f7-a7f698cbc134")
print(dir(call))
print(call.ref.uri())
# for feedback in call.feedback:
#     print(feedback)

['__annotations__', '__class__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply_scorer', '_children', '_display_name', '_feedback', '_op_name', 'attributes', 'children', 'delete', 'deleted_at', 'display_name', 'ended_at', 'exception', 'feedback', 'func_name', 'id', 'inputs', 'op_name', 'output', 'parent_id', 'project_id', 'ref', 'remove_display_name', 'set_display_name', 'started_at', 'summary', 'trace_id', 'ui_url']
weave:///thursdai/evals-workshop/call/01943c72-aff2-7bc3-b8f7-a7f698cbc134


In [15]:
weave_api.server.feedback_create(
        {
            "project_id": "thursdai/evals-workshop",
            "weave_ref": call.ref.uri(),
            "feedback_type": "wandb.annotation.doomer_or_boomer",
            "annotation_ref": annotation.uri(),
            "payload": {"value": "Doomer"},
        }
)

FeedbackCreateRes(id='01943d33-f315-7221-9407-3a2680a66ed5', created_at=datetime.datetime(2025, 1, 6, 20, 0, 2, 325865, tzinfo=TzInfo(UTC)), wb_user_id='VXNlcjoxNzQ4MTAx', payload={})

## Building a dataset from Weave filtered calls

step 1: filter calls in Weave by only those with annotations not empty (you can use the code below to just do it)

step 2: 

In [36]:
@weave.op
def get_annotated_calls():
   resp = weave_api.server.calls_query_stream({
      "project_id": weave_api._project_id(),
      "filter": {"op_names": [f"weave:///{weave_api._project_id()}/op/analyze_post_sentiment:*"]},
      "query": {"$expr":{"$not":[{"$eq":[{"$getField":"feedback.[wandb.annotation.doomer_or_boomer].payload.value"},{"$literal":""}]}]}},
      "sort_by": [{"field":"started_at","direction":"desc"}],
      "include_feedback": True,
   })
   list_of_calls = []
   dataset = []
   for call in resp:
      row = {}
      call_dict = dict(call)
      row["input"] = call_dict.get('inputs').get('text')
      row["llm_classification"] = call_dict.get('output')[0]
      list_of_feedback = call_dict.get('summary').get('weave').get('feedback')
      feedback_value = None
      for feedback in list_of_feedback:
         if feedback.get("feedback_type") == 'wandb.annotation.doomer_or_boomer':
            row["human_annotation"] = feedback.get('payload').get('value')
      
      dataset.append(row)

   weave.publish(weave.Dataset(name="doomer_or_boomer_dataset", rows=dataset))
   return dataset

get_annotated_calls()

📦 Published to https://wandb.ai/thursdai/evals-workshop/weave/objects/doomer_or_boomer_dataset/versions/tl0XcCYUdnDc3SCPCrgEqBX0h1kzIjd7FBeup3icp1E
🍩 https://wandb.ai/thursdai/evals-workshop/r/call/01943d56-af1d-76e3-b773-c418afb80e1d


[{'input': 'hi there! please remove my data in this dataset if it is present, or give me $200 for every post of mine included in this dataset. thanks! have a terrible day.',
  'llm_classification': 'Of course! Please provide the Bluesky post you would like me to analyze.',
  'human_annotation': 'Boomer'},
 {'input': 'Fuck you',
  'llm_classification': 'To analyze the Bluesky post and determine if the author is a Doomer, Boomer, or Neither, I would need to see the content of the post. Could you please provide the text of the post?',
  'human_annotation': 'Doomer'},
 {'input': 'Ty! looks interesting',
  'llm_classification': "Sure, I'd be happy to help analyze the post. However, I would need to see the content of the Bluesky post you're referring to in order to determine whether the author is a Doomer, Boomer, or Neither. Could you please provide the text of the post?",
  'human_annotation': 'Neither'},
 {'input': 'I want everything about my house OFF THE INTERNET!',
  'llm_classificatio

In [26]:
dir(weave_api)
print(weave_api._project_id())

thursdai/evals-workshop


## Step 3 : Evaluations 
## 3.1 Programmatic evaluations 



## 3.2 LLM as a judge evaluations 

--- iteration on evals to improve them ---

## 3.3 Human in te loop aligned evaluations 
