In [None]:
import sys
import os
from datetime import datetime
import uuid
import asyncio

# Make the sibling `backend` directory importable from this notebook.
# The lookup is relative to the kernel's working directory: `backend` is
# expected next to it, i.e. under the working directory's parent.
current_dir = os.getcwd()
backend_path = os.path.abspath(os.path.join(os.path.dirname(current_dir), 'backend'))

# Guarded so that re-running the cell does not stack duplicate entries.
if backend_path not in sys.path:
    sys.path.insert(0, backend_path)
    # Mirror the change into the environment. Only non-empty parts are joined:
    # when PYTHONPATH was unset, a trailing separator would leave an empty
    # entry, which Python interprets as "current directory".
    os.environ['PYTHONPATH'] = os.pathsep.join(
        part for part in (backend_path, os.environ.get('PYTHONPATH', '')) if part
    )

In [None]:
from langfuse import Langfuse
from src.utils.llm import TextAnalyzer, PromptManager

In [None]:
# Tags attached to every trace produced by this notebook, so eval traffic can be filtered.
TRACE_TAGS = ["eval"]
# User id recorded on every trace produced by this notebook.
TRACE_USER_ID = "dev"

In [None]:
# Shared clients used by every cell below. Order matters: each object is
# constructed from the one created on the previous line.
# NOTE(review): Langfuse() is called without arguments — presumably it picks up
# its credentials/host from the environment; confirm before running.
langfuse = Langfuse()
prompt_manager = PromptManager(langfuse) # load latest version of prompts
# Analyzer that run_evals() calls via `analyze_single_feature`.
llm = TextAnalyzer(prompt_manager)

In [None]:
def evaluation_function(input, expected_output, output):
    """Score one model output by exact match against the expected output.

    Args:
        input: The dataset item's input. Accepted for signature parity with
            the call site; not used by the exact-match comparison.
        expected_output: The reference answer for the item.
        output: The value produced by the model.

    Returns:
        dict with two keys:
            "value": 1 when `output == expected_output`, otherwise 0.
            "comment": an empty string (optional slot for reasoning).
    """
    if output == expected_output:
        score = 1
    else:
        score = 0

    # The comment is optional; left empty because exact match needs no explanation.
    return {"value": score, "comment": ""}

In [None]:
async def run_evals(
    prompt_name,
    dataset_name,
    score_name,
    run_name,
    prompt_label="latest",
    run_description="",
    max_parallel=5
):
    """Run one evaluation experiment over every item of a Langfuse dataset.

    For each dataset item the function calls `llm.analyze_single_feature` on
    the item's input, links the item to the run under fresh trace/generation
    ids, scores the output with `evaluation_function`, and records that score
    through `langfuse.score`. Items are processed concurrently, at most
    `max_parallel` at a time.

    Args:
        prompt_name: Passed as `feature` to `llm.analyze_single_feature`.
        dataset_name: Name handed to `langfuse.get_dataset`.
        score_name: Name under which each item's score is recorded.
        run_name: Prefix of the run name; a `_YYYYmmdd_HHMMSS` local-time
            suffix is appended so repeated runs get distinct names.
        prompt_label: Label passed to `prompt_manager.load_prompt_templates`.
        run_description: Free-text description stored with each item link.
        max_parallel: Maximum number of items processed concurrently (>= 1).

    Raises:
        ValueError: If `max_parallel` is smaller than 1 (a zero-sized
            semaphore would block every item forever).
        BaseException: The first failure, in dataset order, raised while
            processing an item. It is re-raised only after all items have
            finished and the Langfuse client has been flushed.
    """
    if max_parallel < 1:
        raise ValueError(f"max_parallel must be at least 1, got {max_parallel!r}")

    prompt_manager.load_prompt_templates(prompt_label=prompt_label)
    dataset = langfuse.get_dataset(dataset_name)
    run_name = run_name + "_" + datetime.now().strftime("%Y%m%d_%H%M%S")

    semaphore = asyncio.Semaphore(max_parallel)

    async def process_item(item):
        # The semaphore caps how many items are in flight at once.
        async with semaphore:
            # Ids are generated up front so the same trace/generation can be
            # referenced by the LLM call, the dataset-run link and the score.
            trace_id = str(uuid.uuid4())
            generation_id = str(uuid.uuid4())
            text = item.input

            output = await llm.analyze_single_feature(
                feature=prompt_name,
                prompt_templates=prompt_manager.prompts,
                text=text,
                trace_id=trace_id,
                trace_user_id=TRACE_USER_ID,
                generation_id=generation_id,
                tags=TRACE_TAGS,
            )

            # Linking by explicit ids, hence no trace/observation object is passed.
            item.link(
                trace_or_observation=None,
                run_name=run_name,
                run_description=run_description,
                trace_id=trace_id,
                observation_id=generation_id,
            )

            evaluation_result = evaluation_function(
                item.input,
                item.expected_output,
                output,
            )

            langfuse.score(
                name=score_name,
                value=evaluation_result["value"],
                trace_id=trace_id,
                observation_id=generation_id,
                comment=evaluation_result["comment"],
            )

    tasks = [process_item(item) for item in dataset.items]
    try:
        # return_exceptions=True lets every item finish even when some fail,
        # instead of surfacing the first error while the rest keep running
        # unobserved in the background.
        results = await asyncio.gather(*tasks, return_exceptions=True)
    finally:
        # Flush the langfuse client so all data recorded during the run is
        # sent to the server, whether the run succeeded, failed or was cancelled.
        langfuse.flush()

    # Surface the first failure (dataset order) now that everything is flushed.
    for result in results:
        if isinstance(result, BaseException):
            raise result

In [None]:
# Launch one evaluation run with the settings below. The bare top-level
# `await` relies on the notebook kernel allowing awaits at cell level.
await run_evals(
    prompt_name="sarcasm",    # Feature/prompt to evaluate: "sarcasm", "genre", "manipulation" etc.
    dataset_name="sarcasm",   # Dataset name in langfuse
    score_name="accuracy",    # Score name in langfuse
    run_name="run",           # Any meaningful name for experiment; a timestamp suffix is appended
    prompt_label="latest",    # "latest" or "production"
    run_description="",       # Optional, useful to add details
    max_parallel=5            # Max parallel requests, defaults to 5
)