In [213]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment
# so the API key does not have to be hard-coded in the notebook.
load_dotenv()

# API key handed to the OpenAI client below; None when OPENAI_API_KEY is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

In [239]:
from getpass import getpass
# NOTE(review): `model`, `base_url` and `getpass` are not referenced by any
# code visible in this file -- presumably left over from a local
# OpenAI-compatible server setup; confirm before removing.
model = "NousResearch/Meta-Llama-3-8B-Instruct"
base_url="http://localhost:8000/v1"

from openai import OpenAI
import re

# Shared client used by chat_completion_request_openai; authenticated with
# the key read from the environment above.
client = OpenAI(
    api_key=openai_api_key,
)

def chat_completion_request_openai(prompt):
    """Send ``prompt`` as a single user message to the ``o1`` model.

    Args:
        prompt: Full text of the user message.

    Returns:
        The message content of the first returned choice, or ``None`` when
        the response carries no choices.
    """
    response = client.chat.completions.create(
        model='o1',
        messages=[{"role": "user", "content": prompt}],
        temperature=1.0,
    )
    if not response.choices:
        return None
    return response.choices[0].message.content

# Failure-mode definitions; used as the default `definitions` argument of
# openai_evaluator. Read inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("taxonomy_definitions_examples/definitions.txt", "r") as definitions_file:
    definitions = definitions_file.read()

def openai_evaluator(trace, definitions=definitions, examples='', atrace=None):
    """Ask the judge model to label one multi-agent AIME trace with failure modes.

    Builds a single prompt from the fixed task description (agent role cards,
    the answer format and a worked example answer), the trace, optional extra
    context, the failure-mode definitions and the examples, then sends it
    through ``chat_completion_request_openai``.

    Args:
        trace: Text of the trace to evaluate.
        definitions: Failure-mode definitions text. Defaults to the
            module-level ``definitions`` value as it was when this function
            was defined.
        examples: Text with examples of the failure modes (may be empty).
        atrace: Optional additional context about the trace; its prompt
            section is omitted when this is falsy.

    Returns:
        Whatever ``chat_completion_request_openai`` returns for the prompt.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # must carry its own trailing separator; otherwise sentences and the
    # answer-format lines run into each other ("...<yes or no>1.2 ...").
    text_desc = (
    "Below I will provide a multiagent system trace. The multiagent system is designed to solve AIME math problems. \n"
    "The agents were provided the following descriptions as their roles: \n"
    """{
  "id": "C0001",
  "scratch": "C0001 is an error detection specialist focused on identifying calculation mistakes, logical inconsistencies, and reasoning flaws in mathematical solutions.",
  "objective": "Carefully examine mathematical solutions and discussions to identify potential errors, inconsistencies, and flaws in reasoning or calculations.",
  "message_format_desc": "You should analyze the provided solutions and discussions to detect any errors or inconsistencies.\nThen, provide your findings and corrections if any errors are found.",
  "message_format_field": "Error Analysis,Corrections"
}\n"""
    """{
  "id": "C0002",
  "scratch": "C0002 is a logical reasoning specialist specialized in mathematical proof construction, step-by-step deduction, and logical argument validation.",
  "objective": "Construct rigorous mathematical proofs and validate logical arguments through systematic step-by-step deduction and reasoning.",
  "message_format_desc": "You should provide logical reasoning and construct mathematical proofs with clear step-by-step deduction.\nThen, present your logical conclusion or proof structure.",
  "message_format_field": "Logical Reasoning,Proof Structure"
}\n"""
    """{
    "id": "C0003",
    "scratch": "C0003 is a context comprehension specialist responsible for understanding problem statements, extracting key information, and summarizing lengthy mathematical contexts.",
    "objective": "Analyze and interpret mathematical problem statements, extract essential information, and provide clear summaries of complex mathematical contexts.",
    "message_format_desc": "You should analyze the problem context and extract key information from the mathematical problem.\nThen, provide a clear summary of the essential elements and requirements.",
    "message_format_field": "Context Analysis,Key Information Summary"
  }\n"""
    """{
    "id": "C0004",
    "scratch": "C0004 is a computational specialist dedicated to performing accurate calculations, numerical analysis, and algebraic manipulations.",
    "objective": "Execute precise mathematical calculations, perform numerical analysis, and handle complex algebraic manipulations with high accuracy.",
    "message_format_desc": "You should perform the necessary calculations and algebraic manipulations step by step.\nThen, provide the computational results and numerical analysis.",
    "message_format_field": "Calculations,Numerical Results"
  }\n"""
    """{
    "id": "C0005",
    "scratch": "C0005 is a solution verification specialist focused on checking final answers, validating solution paths, and ensuring mathematical correctness.",
    "objective": "Verify the correctness of mathematical solutions by checking final answers, validating solution methodologies, and ensuring overall mathematical accuracy.",
    "message_format_desc": "You should verify the provided solutions and check the correctness of the final answers and solution paths.\nThen, provide your verification results and confirmation of correctness.",
    "message_format_field": "Solution Verification,Correctness Confirmation"
}\n"""
    "Besides these agents, there is the EventManager agent, which manages when these agents fire. \n"
    "Provide me an analysis of the failure modes and inefficiencies as I will say below. \n"
    "In the traces, analyze the system behaviour. \n"
    "There are several failure modes in multiagent systems I identified. I will provide them below. Tell me if you encounter any of them, as a binary yes or no. \n"
    "Also, give me a one sentence (be brief) summary of the problems with the inefficiencies or failure modes in the trace. Only mark a failure mode if you can provide an example of it in the trace, and specify that in your summary at the end. \n"
    "Also tell me whether the task is successfully completed or not, as a binary yes or no. \n"
    "At the very end, I provide you with the definitions of the failure modes and inefficiencies. After the definitions, I will provide you with examples of the failure modes and inefficiencies for you to understand them better. \n"
    "Tell me if you encounter any of them between the @@ symbols as I will say below, as a binary yes or no. \n"
    "Here are the things you should answer. Start after the @@ sign and end before the next @@ sign (do not include the @@ symbols in your answer): \n"
    "*** begin of things you should answer *** @@\n"
    "A. Freeform text summary of the problems with the inefficiencies or failure modes in the trace: <summary>\n"
    "B. Whether the task is successfully completed or not: <yes or no>\n"
    "C. Whether you encounter any of the failure modes or inefficiencies:\n"
    "1.1 Disobey Task Specification: <yes or no>\n"
    "1.2 Disobey Role Specification: <yes or no>\n"
    "1.3 Step Repetition: <yes or no>\n"
    "1.4 Loss of Conversation History: <yes or no>\n"
    "1.5 Unaware of Termination Conditions: <yes or no>\n"
    "2.1 Conversation Reset: <yes or no>\n"
    "2.2 Fail to Ask for Clarification: <yes or no>\n"
    "2.3 Task Derailment: <yes or no>\n"
    "2.4 Information Withholding: <yes or no>\n"
    "2.5 Ignored Other Agent's Input: <yes or no>\n"
    "2.6 Action-Reasoning Mismatch: <yes or no>\n"
    "3.1 Premature Termination: <yes or no>\n"
    "3.2 No or Incorrect Verification: <yes or no>\n"
    "3.3 Weak Verification: <yes or no>\n"
    "@@*** end of your answer ***\n"
    # The worked example must use exactly the 14 codes requested above (the
    # previous example also listed 1.6 and 2.7, which do not exist in this
    # taxonomy) and mark only the modes its own summary names: role
    # disobedience (1.2), derailment (2.3) and a weak verifier (3.3).
    "An example answer is: \n"
    "A. The task is not completed due to disobeying role specification as agents went rogue and started to chat with each other instead of completing the task. Agents derailed and verifier is not strong enough to detect it.\n"
    "B. no \n"
    "C. \n"
    "1.1 no \n"
    "1.2 yes \n"
    "1.3 no \n"
    "1.4 no \n"
    "1.5 no \n"
    "2.1 no \n"
    "2.2 no \n"
    "2.3 yes \n"
    "2.4 no \n"
    "2.5 no \n"
    "2.6 no \n"
    "3.1 no \n"
    "3.2 no \n"
    "3.3 yes \n"
    )

    # Assemble the prompt once; the optional context section is the only
    # part that differs between calls with and without `atrace`.
    sections = [
        text_desc,
        "Here is the trace: \n",
        f"{trace}\n",
    ]
    if atrace:
        sections += [
            "Here is some additional context about the trace: \n",
            f"{atrace}\n",
        ]
    sections += [
        "Also, here are the explanations (definitions) of the failure modes and inefficiencies: \n",
        f"{definitions} \n",
        "Here are some examples of the failure modes and inefficiencies: \n",
        f"{examples}",
    ]
    prompt = "".join(sections)
    return chat_completion_request_openai(prompt)

In [215]:
# Failure-mode examples; passed as the `examples` argument of
# openai_evaluator by the evaluation loop. Read inside a `with` block so the
# file handle is closed deterministically.
with open("taxonomy_definitions_examples/examples.txt", "r") as examples_file:
    examples = examples_file.read()

## LLM-as-a-Judge Experiments

In [216]:
import pandas as pd

In [217]:
def matches_pattern(s):
    """Return True when ``s`` begins with an agent broadcast header.

    The header is ``C`` followed by exactly four digits and the literal
    text `` say to All:``, e.g. ``C0003 say to All:``.
    """
    header = re.compile(r"^C\d{4} say to All:")
    return header.match(s) is not None

In [228]:
import json

# Convert every raw AIME trace (one JSON object per line) into two text
# files: a readable transcript of the agent broadcasts and a short note on
# the final outcome.
# For MBPP traces use the agentgroupchat_mbpp_trace/..._MBPP_{fon} paths instead.
for fon in range(7, 29+1):
    print(fon)
    file_path = f"agentgroupchat_aime_trace/gpt_task_AIME_{fon}.json"

    # Read the raw log in one go.
    with open(file_path, "r") as f:
        trace1 = f.read()

    n_file_path = f"agentgroupchat_aime_trace/updated_trace_AIME_{fon}.txt"
    err = False
    with open(n_file_path, "w") as f:
        data = []
        for line in trace1.splitlines():
            line = line.strip()
            if not line:
                # Blank lines carry no record.
                continue

            record = json.loads(line)
            data.append(record)
            args = record['args']

            if args == 'Prompt INFO':
                # Prompt bookkeeping entries are not part of the transcript.
                continue

            if len(args) > len('### ERROR:') and args[:len('### ERROR:')] == '### ERROR:':
                # An error entry marks the whole run as failed; keep its payload.
                err = True
                f.write(str(json.loads(record['kwargs'])))
                f.write('\n')
                continue

            # Anything else: keep only messages an agent broadcast to everyone.
            message = json.loads(record['kwargs'])['message']
            if matches_pattern(message):
                f.write(message)
                f.write("\n----END MESSAGE----\n\n")

    a_file_path = f"agentgroupchat_aime_trace/updated_atrace_AIME_{fon}.txt"
    with open(a_file_path, "w") as f:
        if err:
            f.write("The agent was not successful in its answer.\n")
        else:
            # The last record of a run holds the final result.
            last_entry = data[-1]
            outcome = json.loads(last_entry['kwargs'])
            scs = outcome['success']
            f.write(
                "The agent was successful in its answer.\n"
                if scs
                else "The agent was not successful in its answer.\n"
            )
            f.write("The agent obtained as an answer: ")
            f.write(outcome['answer'])
            f.write("\nThe real answer was answer: ")
            f.write(outcome['ground_truth'])

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [None]:
def parse_responses(responses):
    """
    Parse the LLM responses to extract yes/no answers for each failure mode.

    For every failure-mode code the parser finds the code in the response,
    takes the text up to the next code-like token (``<digit>.<digit>``) and
    reads the verdict from that segment. ``yes``/``no`` are matched as whole
    words only, and when the line contains a colon the verdict is taken from
    the text after it, so a mode name echoed by the model (for example
    ``2.5 Ignored Other Agent's Input: yes`` or
    ``3.2 No or Incorrect Verification: yes``) is not mistaken for the answer.

    Args:
        responses: List of LLM responses evaluating traces. Entries that are
            not strings (for example None from a failed call) are reported
            and counted as 'no' for every mode.

    Returns:
        Dictionary mapping failure mode codes to lists of binary values
        (0 for no, 1 for yes), one entry per response. A mode that cannot be
        found in a response is reported with a warning and recorded as 0.
    """
    import re

    # One result list per failure mode, filled in response order.
    failure_modes = {
        '1.1': [], '1.2': [], '1.3': [], '1.4': [], '1.5': [],
        '2.1': [], '2.2': [], '2.3': [], '2.4': [], '2.5': [], '2.6': [],
        '3.1': [], '3.2': [], '3.3': []
    }

    yes_no = re.compile(r"\b(yes|no)\b", re.IGNORECASE)
    # Anything shaped like a mode code ends the segment of the previous code.
    # The look-arounds keep "1.1" from matching inside "11.1", "2.1.1" or "1.15"
    # while still accepting prefixes such as "C.1.1" or "C1.1".
    any_code = re.compile(r"(?<!\d)(?<!\d\.)\d\.\d(?!\.?\d)")

    def verdict_in(segment):
        """Return 'yes'/'no' read from the text following a mode code, or None."""
        for line in segment.splitlines():
            _, colon, after_colon = line.partition(":")
            if colon:
                # Words before the colon belong to the mode name, not the answer.
                hits = yes_no.findall(after_colon)
                if hits:
                    return hits[0].lower()
            else:
                # No colon: the answer is the last yes/no word on the line
                # ("3.2 No or Incorrect Verification yes").
                hits = yes_no.findall(line)
                if hits:
                    return hits[-1].lower()
        # Last resort, e.g. "yes: agents repeated the same step".
        hits = yes_no.findall(segment)
        return hits[0].lower() if hits else None

    def find_verdict(text, mode):
        """Return the verdict for `mode` in `text`, or None when absent."""
        this_code = re.compile(rf"(?<!\d)(?<!\d\.){re.escape(mode)}(?!\.?\d)")
        for hit in this_code.finditer(text):
            following = any_code.search(text, hit.end())
            stop = following.start() if following else len(text)
            answer = verdict_in(text[hit.end():stop])
            if answer is not None:
                return answer
        return None

    for i, response in enumerate(responses):
        if not isinstance(response, str):
            print(f"Error processing response {i}: expected a string, got {type(response).__name__}")
            for mode in failure_modes:
                failure_modes[mode].append(0)
            continue

        # Clean up the response - remove @@ markers if present
        cleaned_response = response.strip()
        if cleaned_response.startswith('@@'):
            cleaned_response = cleaned_response[2:]
        if cleaned_response.endswith('@@'):
            cleaned_response = cleaned_response[:-2]

        for mode in failure_modes:
            answer = find_verdict(cleaned_response, mode)
            if answer is None:
                # Keep the lists aligned with the responses: default to 'no'.
                print(f"Warning: Could not find mode {mode} in response {i}")
                failure_modes[mode].append(0)
            else:
                failure_modes[mode].append(1 if answer == 'yes' else 0)

    return failure_modes


In [None]:
def extract_summaries(responses):
    """
    Pull the free-form 'A.' summary out of every model response.

    The summary is the text between the first 'A.' and the following 'B.'
    (matched case-insensitively, across lines).

    Args:
        responses (list[str]): Raw model responses; empty or None entries
            yield an empty summary without a warning.

    Returns:
        list[str]: One summary per response, '' where none could be found.
    """
    summary_re = re.compile(r"A\.\s*(.*?)\s*B\.", re.DOTALL | re.IGNORECASE)

    def summary_of(index, text):
        # Empty/None responses are silently mapped to an empty summary.
        if not text:
            return ""
        found = summary_re.search(text)
        if found is None:
            print(f"Warning: Could not find A. summary in response {index}")
            return ""
        return found.group(1).strip()

    return [summary_of(index, text) for index, text in enumerate(responses)]

In [240]:
# Judge every preprocessed AIME trace with the LLM and print, per trace, the
# detected failure modes and the model's one-sentence summary.
for fileon in range(7, 29+1):
    print(f"File on: {fileon}")
    # Use context managers so the trace files are closed right after reading.
    with open(f"agentgroupchat_aime_trace/updated_trace_AIME_{fileon}.txt", "r") as trace_file:
        trace1 = trace_file.read()
    with open(f"agentgroupchat_aime_trace/updated_atrace_AIME_{fileon}.txt", "r") as atrace_file:
        atrace1 = atrace_file.read()
    openai_results = []

    try:
        openai_evaluation = openai_evaluator(trace1, examples=examples, atrace=atrace1)
        openai_results.append(openai_evaluation)
    except Exception as e:
        print(f"Error on evaluation of trace: {str(e)}")
        # Without a judge response there is nothing to score; skipping avoids
        # reporting a misleading 0% for every failure mode of this trace.
        continue
    o1_results = openai_results

    # Turn the judge response into 0/1 flags per failure mode.
    failure_mode_results_o1 = parse_responses(o1_results)

    # Number of responses contributed by each source; each iteration scores
    # a single trace, so there is one source of size 1.
    source_sizes = [1]

    # Starting index of each source inside the per-mode result lists.
    source_indices = [0]
    for size in source_sizes[:-1]:
        source_indices.append(source_indices[-1] + size)

    # Average score per failure mode, one value per source.
    average_scores_by_source = {}
    last_source = len(source_sizes) - 1

    for mode, values in failure_mode_results_o1.items():
        source_averages = []

        for i, (start_idx, size) in enumerate(zip(source_indices, source_sizes)):
            # The last source absorbs whatever remains, however many sources
            # are configured (previously hard-coded as index 4).
            end_idx = len(values) if i == last_source else start_idx + size
            source_values = values[start_idx:end_idx]

            # Guard against an empty slice to avoid division by zero.
            if source_values:
                avg_score = sum(source_values) / len(source_values)
            else:
                print(f"Warning: Source {i} has no values for mode {mode}")
                avg_score = 0

            source_averages.append(avg_score)

        average_scores_by_source[mode] = source_averages

        # Print the average scores for each source
        print(f"{mode}: {[round(score * 100, 2) for score in source_averages]}%")

    # Print a sample of the dictionary structure
    print("\nSample of average_scores_by_source dictionary:")
    sample_key = list(average_scores_by_source.keys())[0]
    print(f"{sample_key}: {average_scores_by_source[sample_key]}")

    summaries = extract_summaries(o1_results)

    for i, summary in enumerate(summaries):
        print(f"Trace {i+1} summary: {summary}")


File on: 7
1.1: [0.0]%
1.2: [0.0]%
1.3: [0.0]%
1.4: [0.0]%
1.5: [0.0]%
2.1: [0.0]%
2.2: [0.0]%
2.3: [0.0]%
2.4: [0.0]%
2.5: [0.0]%
2.6: [0.0]%
3.1: [0.0]%
3.2: [0.0]%
3.3: [0.0]%

Sample of average_scores_by_source dictionary:
1.1: [0.0]
Trace 1 summary: All agents adhered to their roles without evident inefficiencies or errors, producing a correct final answer smoothly.
File on: 8
1.1: [0.0]%
1.2: [0.0]%
1.3: [0.0]%
1.4: [0.0]%
1.5: [0.0]%
2.1: [0.0]%
2.2: [0.0]%
2.3: [0.0]%
2.4: [0.0]%
2.5: [0.0]%
2.6: [0.0]%
3.1: [0.0]%
3.2: [0.0]%
3.3: [0.0]%

Sample of average_scores_by_source dictionary:
1.1: [0.0]
Trace 1 summary: No observable inefficiencies or failure modes; the agents fulfilled their roles correctly and reached the correct solution without issues.
File on: 9
1.1: [0.0]%
1.2: [0.0]%
1.3: [100.0]%
1.4: [0.0]%
1.5: [0.0]%
2.1: [0.0]%
2.2: [0.0]%
2.3: [0.0]%
2.4: [0.0]%
2.5: [0.0]%
2.6: [0.0]%
3.1: [0.0]%
3.2: [0.0]%
3.3: [0.0]%

Sample of average_scores_by_source dictionary:
1.1