In [None]:
import json
import os
from typing import List, Dict, Any
from pathlib import Path





In [None]:
load_conversation_files(r"app\saved_conversations")

[[{'role': 'user', 'content': 'get me leads from bombay'},
  {'role': 'assistant',
   'content': '[No content]',
   'tool_calls': [{'tool': 'run_primary_llm_query',
     'args': '{"nl_query": "Get me leads from Bombay"}'}]},
  {'role': 'tool',
   'content': '[{"Lead Number": 660522, "Lead Stage": "Qualified", "City": "Mumbai", "Company": "Dr. Ram Manohar Lohia Avadh University /ugc", "Lead Source": "Google", "Email": NaN}, {"Lead Number": 659357, "Lead Stage": "Unreachable", "City": "Mumbai", "Company": "Yogeshsadarang@yahoo.in", "Lead Source": "Google", "Email": "Yogeshsadarang@yahoo.in"}, {"Lead Number": 655287, "Lead Stage": "Not Interested", "City": "Mumbai", "Company": "vkaradkar@in.imshealth.com", "Lead Source": "Direct Traffic", "Email": "vkaradkar@in.imshealth.com"}, {"Lead Number": 654061, "Lead Stage": "Qualified", "City": "Mumbai", "Company": "rakshata.nikam@sharekhan.com", "Lead Source": "Direct Traffic", "Email": "rakshata.nikam@sharekhan.com"}, {"Lead Number": 652178, "Le

In [41]:
from openai import OpenAI
evals = []
client = OpenAI()
folder = Path(os.getcwd()).parent / r"app\saved_conversations"
CONVERSATION_FILES = []
def run_eval(folder: str) -> List[Dict[str, Any]]:

    eval_prompt_base = """
    
    You are evaluating a conversation between a Sales AI Assistant and a user.
    Context : The Sales AI Assistant is designed to assist users in search and analysis of leads, generating sales emails and sending them. 
    The assistant will not provide any other information or perform any other tasks outside of this context.

    #INPUT
    Conversation:
    {{conversation}}

    # EVALUATION CRITERIA AND SCORING RUBRIC
    Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
    <evaluation_criteria>
    Check for correctness of tool usage, hallucinations, missing information handling, redundant tool calls, and out-of-scope responses.
    </evaluation_criteria>
    
    <scoring_rubric>
    True/False for each of the following:
    Tools used in the conversation:
    - run_primary_llm_query: "run_primary_llm_query" is a function that runs a query to the primary LLM. it requires a query.
    - generate_sales_email: "generate_sales_email" is a function that generates a sales email. It requires lead number, product, first name, last name from the user.
    - send_email: "send_email" is a function that sends an email. It requires subject, body, and recipient email address from the user.
    - tool_use_correctness: Did the assistant use the correct tool for the task it was trying to accomplish? If yes, assign true. If no tools were needed, assign true. If assistant correctly asked for information from the user, assign true. If the assistant used a tool incorrectly, assign false.
    - tool_use_reason: Explain why the tool usage is correct or incorrect.
    - hallucination: Did the assistant provide any hallucinated information? If yes, assign true, otherwise false.
    - hallucination_reason: Explain why the information is considered hallucinated or not.
    - redundant_tool_call: Did the assistant make any redundant tool calls? If yes, assign true, otherwise false.
    - redundant_tool_reason: Explain why the tool call is considered redundant or not.
    - out_of_scope: Did the assistant provide any out-of-scope information in context of the role? If yes, assign true, otherwise false.
    - out_of_scope_reason: Explain why the information is considered out-of-scope or not.
    </scoring_rubric>

    Below is a snippet of the conversation. Respond in the following JSON format:
    STRICTLY follow the format and do not add any extra text like "Here is the evaluation" or "The evaluation is" or ```json```.
    {{
    "tool_use_correctness": true/false,
    "tool_use_reason": "...",
    "hallucination": true/false,
    "hallucination_reason": "...",
    "redundant_tool_call": true/false,
    "redundant_tool_reason": "..."
    "out_of_scope": true/false,
    "out_of_scope_reason": "...",
    }}

    
    """
    #CLEAN THE CONVERSATION FILES FOR EVALUATION
    def clean_conversation(raw_convo: List[Dict]) -> List[Dict]:
    

        cleaned = []
        #clean the conversation by removing system prompt
        for msg in raw_convo[1:]:
            entry = {"role": msg["role"]}

            # Handle content
            entry["content"] = msg.get("content") or "[No content]"

            # Handle tool_calls from assistant
            if "tool_calls" in msg and msg["tool_calls"]:
                entry["tool_calls"] = [
                    {
                        "tool": call.get("function", {}).get("name", ""),
                        "args": call.get("function", {}).get("arguments", "")
                    }
                    for call in msg["tool_calls"]
                ]

            # Handle function_call (legacy)
            if "function_call" in msg and msg["function_call"]:
                entry["tool_calls"] = [{
                    "tool": msg["function_call"].get("name", ""),
                    "args": msg["function_call"].get("arguments", "")
                }]

            # Handle tool output
            if msg["role"] == "tool":
                entry["tool_name"] = msg.get("name", "")
                entry["tool_output"] = msg.get("content", "")

            cleaned.append(entry)
        return cleaned
    
    #LOAD CONVERSATION FILES
    def load_conversation_files() :
        for file in os.listdir(folder):
            if file.endswith(".json"):
                file_path = folder / file
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    data = clean_conversation(data)
                    CONVERSATION_FILES.append(data)
                    
    load_conversation_files()

    for conversation in CONVERSATION_FILES:
        print("Conversation")
        # Prepare the evaluation prompt with the cleaned conversation
        eval_prompt = eval_prompt_base.replace("{{conversation}}", json.dumps(conversation))

        # Prepare the messages for the API call
    
        messages = [
                    {"role": "system", "content": "You are an expert evaluator."},
                    {"role": "user", "content": eval_prompt},
                ]
        response = client.chat.completions.create(
            messages=messages,
            model="gpt-4o",
            stream=False,
            temperature=0,

        )

        response_content = response.choices[0].message.content
        # Parse the response content to extract the JSON data
        try:
            response_json = json.loads(response_content)
            evals.append(response_json)
        except json.JSONDecodeError:
            print("Failed to parse JSON response for conversation:")
            return None
        print(response_json)
run_eval(folder)

Conversation
{'tool_use_correctness': False, 'tool_use_reason': "The assistant used the 'run_primary_llm_query' tool correctly to retrieve leads from Bombay and to list people who work at universities. However, it failed to use the 'generate_sales_email' tool when the user requested to draft an email for lead 588124. Instead, it asked for additional information, which was not necessary as the tool should have been used.", 'hallucination': False, 'hallucination_reason': "The assistant did not provide any hallucinated information. All responses were based on the data retrieved from the tool or were within the scope of the assistant's knowledge.", 'redundant_tool_call': False, 'redundant_tool_reason': 'The tool calls made by the assistant were necessary for retrieving the requested information about leads and universities.', 'out_of_scope': True, 'out_of_scope_reason': 'The assistant provided out-of-scope information when it answered the question about the capital of Maharashtra. This inf

In [43]:
import json
import os
from pathlib import Path
from collections import Counter

def aggregate_metrics(evals):
    total = len(evals)
    if total == 0:
        print("No evaluations found!")
        return {}
    
    # Compute counts of True values from boolean fields
    tool_use_correct_count = sum(1 for ev in evals if ev.get("tool_use_correctness", False))
    hallucination_count = sum(1 for ev in evals if ev.get("hallucination", False))
    redundant_tool_call_count = sum(1 for ev in evals if ev.get("redundant_tool_call", False))
    out_of_scope_count = sum(1 for ev in evals if ev.get("out_of_scope", False))
    
    # Count the different responses for missing_info_handling ("good", "bad", "not_applicable")
    missing_info_counter = Counter(ev.get("missing_info_handling", "not_applicable") for ev in evals)
    
    metrics = {
        "total_evaluations": total,
        "tool_use_correct_percentage": round(tool_use_correct_count / total * 100, 2),
        "hallucination_percentage": round(hallucination_count / total * 100, 2),
        "redundant_tool_call_percentage": round(redundant_tool_call_count / total * 100, 2),
        "out_of_scope_percentage": round(out_of_scope_count / total * 100, 2),
        "missing_info_handling_breakdown": dict(missing_info_counter)
    }
    return metrics

agg_metrics = aggregate_metrics(evals)
print("Final Aggregate Evaluation Metrics:")
for key, value in agg_metrics.items():
    print(f"{key}: {value}")


Final Aggregate Evaluation Metrics:
total_evaluations: 6
tool_use_correct_percentage: 83.33
hallucination_percentage: 0.0
redundant_tool_call_percentage: 0.0
out_of_scope_percentage: 16.67
missing_info_handling_breakdown: {'not_applicable': 6}
