In [4]:
import dotenv
# Load variables from a local .env file into the process environment
# (presumably the source of LOGFIRE_READ_TOKEN read in the next cell — confirm).
# The value displayed under this cell is the call's return value.
dotenv.load_dotenv()

True

In [6]:
import os
from logfire.query_client import LogfireQueryClient

# Read token for the Logfire query API, taken from the environment.
read_token = os.getenv('LOGFIRE_READ_TOKEN')
if not read_token:
    # Fail here with a clear message instead of building a client around a
    # missing token and hitting an opaque error on the first query.
    raise RuntimeError(
        "LOGFIRE_READ_TOKEN is not set; add it to the environment or .env file "
        "before creating the Logfire query client."
    )

# Shared client used by every query cell in this notebook.
logfire_query_client = LogfireQueryClient(read_token=read_token)

In [44]:
# The five most recent `streamlit_session` spans, newest first.
recent_sessions_sql = """
    SELECT
        trace_id,
        start_timestamp,
        duration
    FROM records
    WHERE span_name = 'streamlit_session'
    ORDER BY start_timestamp DESC
    LIMIT 5
    """
trace_rows = logfire_query_client.query_json_rows(sql=recent_sessions_sql)


In [47]:
# Keep only the trace IDs, in the same (newest-first) order the query returned.
trace_ids = [
    session_row['trace_id']
    for session_row in trace_rows['rows']
]
trace_ids

['019c8f3f6447f880e1a12852e646af68',
 '019c8f3d662b1302eeb88ce03a1504a2',
 '019c8f3a23e4cc6bd259a5a00f708b29',
 '019c8f2d3ef001f77db918166936d315',
 '019c8f2c899df265457bc2814ffe975b']

In [21]:
# Trace pinned for the single-trace exploration below. It is the third entry
# of the `trace_ids` list displayed above; it is hard-coded, so it does not
# follow the "latest sessions" query when that is re-run.
trace_id = '019c8f3a23e4cc6bd259a5a00f708b29'

In [22]:
# Most recent `agent run` span of the pinned trace: its message history plus
# the input/output token counts stored in the span attributes.
# `trace_id` is interpolated straight into the SQL text, so it must be a
# trusted value.
agent_run_sql = f"""
    SELECT
        attributes->'pydantic_ai.all_messages' as all_messages,
        attributes->>'gen_ai.usage.input_tokens' as input_tokens,
        attributes->>'gen_ai.usage.output_tokens' as output_tokens
    FROM records
    WHERE trace_id = '{trace_id}'
      AND span_name = 'agent run'
    ORDER BY start_timestamp DESC
    LIMIT 1
    """
run_row = logfire_query_client.query_json_rows(sql=agent_run_sql)

In [25]:
# First (and, given LIMIT 1, only) row of the agent-run query.
# NOTE(review): despite the name, this is the whole result row (columns
# `all_messages`, `input_tokens`, `output_tokens`); the message list itself is
# under the 'all_messages' key. Indexing [0] fails if the query matched no rows.
all_messages = run_row['rows'][0]

In [27]:
# Show the first message of the recorded conversation (the user prompt in the output below).
all_messages['all_messages'][0]

{'role': 'user',
 'parts': [{'type': 'text',
   'content': 'How do I customize my LLM judge prompts?\n\n'}]}

In [28]:
import sys

# Make modules in the parent directory importable (the later cells import
# `trace_replay` and `models`). The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate '..' entry.
if '..' not in sys.path:
    sys.path.append('..')

In [29]:
import trace_replay

In [30]:
# Fetch the pinned trace through the project's `trace_replay` helper, reusing
# the same query client as the manual SQL cells above.
trace = trace_replay.fetch_trace(trace_id, logfire_query_client)

In [32]:
# Sanity check: the helper's first message matches the one from the manual query above.
trace.all_messages[0]

{'role': 'user',
 'parts': [{'type': 'text',
   'content': 'How do I customize my LLM judge prompts?\n\n'}]}

In [40]:
from models import RAGResponse
# Rebuild a run result from the fetched trace, asking for the output to be
# typed as RAGResponse (the next cell reads `run.output.answer`).
run = trace_replay.trace_to_run_result(trace, output_type=RAGResponse)

In [43]:
# Display the answer text carried by the reconstructed run's output.
run.output.answer

"Common errors when using an LLM as a judge include:\n\n1. **Inaccurate Responses**: The LLM may produce incorrect evaluations due to insufficient context or ambiguous prompts. It's essential to provide clear and specific criteria in the evaluation prompts.\n\n2. **Overfitting to Training Data**: If the LLM has been trained on biased or unrepresentative datasets, it may favor evaluations based on that training, leading to skewed results.\n\n3. **Difficulty with Open-ended Prompts**: LLMs may struggle with open-ended evaluations where specific reference answers are not available. In such scenarios, establishing clear, custom criteria is vital to guide the evaluations.\n\n4. **Lack of Detailed Reasoning**: Sometimes, LLMs may fail to provide sufficient reasoning behind their judgments, making it difficult to understand why a particular evaluation was made. Always encourage detailed explanations in the prompts.\n\n5. **Variability in Output**: Due to the non-deterministic nature of LLMs, 

In [55]:
# Fetch all recent traces in one call; the result is consumed via `.values()`
# below, so it is a mapping (presumably keyed by trace ID — confirm in trace_replay).
traces = trace_replay.fetch_traces(trace_ids, logfire_query_client)

In [62]:
# Convert every fetched trace into a run result, in the iteration order of
# `traces.values()`. A comprehension is used so the loop variable does not
# leak into the notebook namespace and overwrite the single-trace `trace` /
# `run` objects created in earlier cells.
# NOTE(review): unlike the single-trace cell, no `output_type` is passed here,
# so the helper's default applies — confirm that is intended.
runs = [
    trace_replay.trace_to_run_result(fetched_trace)
    for fetched_trace in traces.values()
]

In [64]:
import pickle

In [66]:
# Persist the reconstructed runs so later sessions can skip the Logfire queries.
with open('../data/logs.bin', 'wb') as logs_file:
    pickle.dump(runs, logs_file)

In [67]:
# Reload the runs written by the previous cell (this rebinds `runs`).
# NOTE: pickle.load can execute arbitrary code; only load this file when it
# was produced locally by this notebook, never from an untrusted source.
with open('../data/logs.bin', 'rb') as f_in:
    runs = pickle.load(f_in)