In [4]:
import json
from typing import List, Dict, Any

# Load the conversation from a JSON file
# with open(r"C:\Users\AakashAI\Desktop\Repositories\Sales Agent\20250413_220608_conversation.json", "r", encoding="utf-8") as file:
#     conversation = json.load(file)


In [6]:
def clean_conversation(raw_convo: List[Dict]) -> List[Dict]:
    """Reduce raw chat messages to the fields needed for evaluation.

    Args:
        raw_convo: Message dicts. Each one must carry a ``"role"`` key; the
            keys ``"content"``, ``"tool_calls"``, ``"function_call"`` and
            ``"name"`` are optional.

    Returns:
        One dict per input message containing:
          * ``"role"`` - copied from the message.
          * ``"content"`` - the message content, or ``"[No content]"`` when
            it is missing or empty.
          * ``"tool_calls"`` - only when the message requested tools: a list
            of ``{"tool": name, "args": arguments}`` dicts built from
            ``tool_calls`` followed by the legacy ``function_call``, if any.
          * ``"tool_name"`` / ``"tool_output"`` - only for ``"tool"`` messages.

    Raises:
        KeyError: If a message has no ``"role"`` key.
    """
    cleaned = []
    for msg in raw_convo:
        entry = {"role": msg["role"]}

        # Handle content (None and "" both collapse to the placeholder)
        entry["content"] = msg.get("content") or "[No content]"

        # Handle tool_calls from assistant. ``or {}`` also covers a call whose
        # "function" key is present but null, which would otherwise crash.
        tool_calls = []
        for call in msg.get("tool_calls") or []:
            function = call.get("function") or {}
            tool_calls.append({
                "tool": function.get("name", ""),
                "args": function.get("arguments", ""),
            })

        # Handle function_call (legacy): appended rather than assigned so it
        # never discards tool calls collected from the same message.
        legacy_call = msg.get("function_call")
        if legacy_call:
            tool_calls.append({
                "tool": legacy_call.get("name", ""),
                "args": legacy_call.get("arguments", ""),
            })

        if tool_calls:
            entry["tool_calls"] = tool_calls

        # Handle tool output
        if msg["role"] == "tool":
            entry["tool_name"] = msg.get("name", "")
            entry["tool_output"] = msg.get("content", "")

        cleaned.append(entry)
    return cleaned

# clean_conversation(conversation)

In [14]:
from openai import OpenAI

# Shared OpenAI client used by run_eval. No credentials are passed here, so it
# presumably reads them from the environment (e.g. OPENAI_API_KEY) - confirm.
client = OpenAI()
def run_eval(input_path: str):
    """Evaluate one saved conversation with the LLM judge.

    Loads the conversation JSON at ``input_path``, normalises it with
    ``clean_conversation``, embeds it in the evaluation prompt and sends that
    prompt to the ``gpt-4o`` chat model (temperature 0, non-streaming) through
    the module-level ``client``.

    Args:
        input_path: Path to a JSON file holding the raw conversation.

    Returns:
        The raw text of the model's reply, which is also printed. The prompt
        asks for a JSON object, but the reply is not parsed or validated here.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        raw_convo = json.load(f)
    cleaned = clean_conversation(raw_convo)
    # The JSON template below must itself be valid JSON (commas between all
    # fields, none after the last) because the model is told to copy it.
    eval_prompt = f"""You are evaluating a conversation between a Sales AI Assistant and an SDR.

    Instructions to the assistant:
    - Only use tools when needed.
    - Never hallucinate (make up) information.
    - Ask for missing info when required (e.g., lead name, product).
    - Don't re-query the same thing unnecessarily.

    Below is a snippet of the conversation. Respond in the following JSON format:
    STRICTLY follow the format and do not add any extra text like "Here is the evaluation" or "The evaluation is" or ```json```.
    {{
    "tool_use_correct": true/false,
    "tool_use_reason": "...",
    "hallucination": true/false,
    "hallucination_reason": "...",
    "missing_info_handling": "good" / "bad" / "not_applicable",
    "missing_info_reason": "...",
    "redundant_tool_call": true/false,
    "redundant_tool_reason": "...",
    "out_of_scope": true/false,
    "out_of_scope_reason": "..."
    }}

    Conversation:
    {cleaned}
    """
    messages = [
        {"role": "system", "content": "You are an expert evaluator."},
        {"role": "user", "content": eval_prompt},
    ]
    response = client.chat.completions.create(
        messages=messages,
        model="gpt-4o",
        stream=False,
        temperature=0,
    )

    response_content = response.choices[0].message.content
    print(response_content)
    # Returned as well as printed so callers can persist or parse the verdict.
    return response_content
# Imports come first so this cell works on a fresh kernel: `os` is needed to
# build folder_path below.
import os
from pathlib import Path

# Run the eval for all files in the folder.
# Path components are joined with "/" instead of a backslash literal so the
# same location resolves on Windows and on POSIX systems.
folder_path = Path(os.getcwd()).parent / "app" / "saved_conversations"
if not folder_path.is_dir():
    print(f"Folder not found: {folder_path}")
else:
    # Each conversation is evaluated exactly once (every run_eval call is a
    # model request); sorted() keeps the order stable between runs.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".json"):
            run_eval(os.path.join(folder_path, file))
# run_eval(r"C:\Users\AakashAI\Desktop\Repositories\Sales Agent\20250413_220608_conversation.json")

{
    "tool_use_correct": true,
    "tool_use_reason": "The assistant correctly used the run_primary_llm_query tool to retrieve leads from Bombay and to list people who work at universities.",
    "hallucination": false,
    "hallucination_reason": "The assistant did not provide any information that was not supported by the conversation or tool outputs.",
    "missing_info_handling": "good",
    "missing_info_reason": "The assistant appropriately asked for missing information (lead's first name, last name, and product) when it was required to draft an email.",
    "redundant_tool_call": false,
    "redundant_tool_reason": "The assistant did not make any redundant tool calls. Each tool call was necessary for the requests made by the user.",
    "out_of_scope": true,
    "out_of_scope_reason": "The assistant answered a question about the capital of Maharashtra, which is outside the scope of its instructions to assist with sales-related tasks."
}
{
    "tool_use_correct": true,
    "tool_

In [None]:
import json
import os
from pathlib import Path
from collections import Counter

def load_evaluations(folder: Path):
    """
    Collect evaluation records from every ``.json`` file directly in ``folder``.

    A file whose top-level value is a dict contributes that dict; a file whose
    top-level value is a list contributes each of its elements. Any other
    top-level value is ignored, as are files without a ``.json`` suffix.

    Returns the collected records as a single flat list.
    """
    collected = []
    json_names = (name for name in os.listdir(folder) if name.endswith(".json"))
    for name in json_names:
        with open(folder / name, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        if isinstance(payload, dict):
            collected.append(payload)
        elif isinstance(payload, list):
            # Flatten: one file may hold several evaluations.
            collected.extend(payload)
    return collected

def _flag_is_true(value):
    """Interpret an evaluation flag, treating string values by their text."""
    if isinstance(value, str):
        # A non-empty string such as "false" is truthy in Python, so strings
        # are compared by content instead of by truthiness.
        return value.strip().lower() == "true"
    return bool(value)


def aggregate_metrics(evals):
    """Summarise a list of evaluation dicts into aggregate metrics.

    Args:
        evals: Evaluation dicts. The boolean fields ``tool_use_correct``,
            ``hallucination``, ``redundant_tool_call`` and ``out_of_scope``
            count as False when absent; string values count as True only when
            they spell "true" (case-insensitive). ``missing_info_handling``
            counts as ``"not_applicable"`` when absent.

    Returns:
        A dict with ``total_evaluations``, one ``*_percentage`` entry per
        boolean field (share of all evaluations, rounded to two decimals) and
        ``missing_info_handling_breakdown`` (value -> count). When ``evals``
        is empty a message is printed and an empty dict is returned.
    """
    total = len(evals)
    if total == 0:
        print("No evaluations found!")
        return {}

    def percentage(field):
        # Share of evaluations whose `field` flag is set, as a percentage.
        hits = sum(1 for ev in evals if _flag_is_true(ev.get(field, False)))
        return round(hits / total * 100, 2)

    # Count the different responses for missing_info_handling ("good", "bad", "not_applicable")
    missing_info_counter = Counter(ev.get("missing_info_handling", "not_applicable") for ev in evals)

    metrics = {
        "total_evaluations": total,
        "tool_use_correct_percentage": percentage("tool_use_correct"),
        "hallucination_percentage": percentage("hallucination"),
        "redundant_tool_call_percentage": percentage("redundant_tool_call"),
        "out_of_scope_percentage": percentage("out_of_scope"),
        "missing_info_handling_breakdown": dict(missing_info_counter)
    }
    return metrics

# Adjust the folder where your evaluation results are saved:
from pathlib import Path

# Path components are joined with "/" instead of a backslash literal so the
# same location resolves on Windows and on POSIX systems.
eval_folder = Path(os.getcwd()).parent / "evaluation" / "eval_folder"
# is_dir() rather than exists(): a plain file at this path cannot be listed.
if not eval_folder.is_dir():
    print(f"Evaluation folder not found: {eval_folder}")
else:
    evaluations = load_evaluations(eval_folder)
    agg_metrics = aggregate_metrics(evaluations)
    print("Final Aggregate Evaluation Metrics:")
    for key, value in agg_metrics.items():
        print(f"{key}: {value}")


Final Aggregate Evaluation Metrics:
total_evaluations: 84
tool_use_correct_percentage: 0.0
hallucination_percentage: 0.0
redundant_tool_call_percentage: 0.0
out_of_scope_percentage: 0.0
