In [None]:
from getpass import getpass
# NOTE(review): `model` and `base_url` are not referenced by any code visible in
# this section of the notebook (the request helper hard-codes model='o1' and the
# client is constructed without a base_url). Presumably left over from a locally
# served model setup - confirm before removing.
model = "NousResearch/Meta-Llama-3-8B-Instruct"
base_url="http://localhost:8000/v1"

from openai import OpenAI
import re

import os

# Never keep the key in the notebook source: read it from the environment and
# fall back to an interactive, non-echoing prompt when the variable is unset.
# (Previously the key was a hard-coded empty string, which can never authenticate.)
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Shared client used by chat_completion_request_openai.
client = OpenAI(
    api_key=openai_api_key,
)

def chat_completion_request_openai(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the module-level ``client`` with model ``'o1'``
    and ``temperature=1.0``.

    Args:
        prompt: Text placed in the ``content`` of the one user message.

    Returns:
        The ``message.content`` of the first choice, or ``None`` when the
        response carries no choices.
    """
    response = client.chat.completions.create(
        model='o1',
        messages=[{"role": "user", "content": prompt}],
        temperature=1.0,
    )
    # Guard clause: an empty/missing choices list means there is nothing to return.
    if not response.choices:
        return None
    return response.choices[0].message.content

# Failure-mode definitions inserted into every judge prompt (used as the default
# `definitions` argument of openai_evaluator). Read inside a `with` block so the
# file handle is closed deterministically instead of being leaked.
with open("taxonomy_definitions_examples/definitions.txt", "r") as definitions_file:
    definitions = definitions_file.read()

def openai_evaluator(trace, definitions=definitions, examples='', atrace='' ):
    """Build the LLM-as-a-judge prompt for one trace and return the model's reply.

    The prompt is one string made of, in order: a description of the
    multiagent system, the instructions and requested answer format (summary A,
    task success B, and yes/no for the 14 failure modes under C), an example
    answer, then the trace, the additional information, the failure-mode
    definitions and the examples.

    Args:
        trace: Text of the multiagent system trace to evaluate.
        definitions: Failure-mode definitions text. The default is the value the
            module-level ``definitions`` had when this function was defined.
        examples: Failure-mode examples text appended at the end of the prompt.
        atrace: Additional information about the MAS output versus the true
            output and solution, inserted right after the trace.

    Returns:
        Whatever ``chat_completion_request_openai(prompt)`` returns.

    Side effects:
        Prints the complete prompt to stdout before sending it.
    """
    # Adjacent string literals are concatenated as-is: no separator is inserted
    # between them unless the literal itself ends with a space or "\n".
    prompt = (
    # --- Description of the system being judged ---
    "Below I will provide a multiagent system trace. The multiagent system is designed to solve olympiad math problems. \n"
    "The multiagent system has the following format: a translator agent that converts the text of the math problem into an easy to understand format, removing all the fluff, "
    "a computation agent that calculates mathematical expressions, "
    "a reasoning agent that thinks through the problem and works with the computation agent, but does no computation itself, "
    "and a verifier agent that checks if the provided solution is correct, and if not, sends it back. \n"
    "The structure is the translator agent receives the plaintext problem and sends the translated problem to the reasoning agent. \n"
    "The reasoning agent works with the computation agent to come up with a solution, communicating back and forth. \n"
    # NOTE(review): the next sentence says the verifier sends the solution back
    # "to the verifier agent"; presumably "reasoning agent" was intended - confirm.
    "The verifier agent receives the potential solution from the reasoning agent, and if it seems correct, terminates. If not, it sends it back to the verifier agent with feedback. \n"
    # --- Instructions ---
    "Provide me an analysis of the failure modes and inefficiencies as I will say below. \n"
    # NOTE(review): several of the following literals end without a space or
    # newline, so their sentences run together in the final prompt.
    "In the traces, analyze the system behaviour."
    "There are several failure modes in multiagent systems I identified. I will provide them below. Tell me if you encounter any of them, as a binary yes or no. \n"
    "Also, give me a one sentence (be brief) summary of the problems with the inefficiencies or failure modes in the trace. Only mark a failure mode if you can provide an example of it in the trace, and specify that in your summary at the end"
    "Also tell me whether the task is successfully completed or not, as a binary yes or no."
    "At the very end, I provide you with the definitions of the failure modes and inefficiencies. After the definitions, I will provide you with examples of the failure modes and inefficiencies for you to understand them better."
    "Tell me if you encounter any of them between the @@ symbols as I will say below, as a binary yes or no."
    "Here are the things you should answer. Start after the @@ sign and end before the next @@ sign (do not include the @@ symbols in your answer):"
    # --- Requested answer format (the 14 codes that parse_responses looks for) ---
    "*** begin of things you should answer *** @@"
    "A. Freeform text summary of the problems with the inefficiencies or failure modes in the trace: <summary>"
    "B. Whether the task is successfully completed or not: <yes or no>"
    "C. Whether you encounter any of the failure modes or inefficiencies:"
    "1.1 Disobey Task Specification: <yes or no>"
    "1.2 Disobey Role Specification: <yes or no>"
    "1.3 Step Repetition: <yes or no>"
    "1.4 Loss of Conversation History: <yes or no>"
    "1.5 Unaware of Termination Conditions: <yes or no>"
    "2.1 Conversation Reset: <yes or no>"
    "2.2 Fail to Ask for Clarification: <yes or no>"
    "2.3 Task Derailment: <yes or no>"
    "2.4 Information Withholding: <yes or no>"
    "2.5 Ignored Other Agent's Input: <yes or no>"
    "2.6 Action-Reasoning Mismatch: <yes or no>"
    "3.1 Premature Termination: <yes or no>"
    "3.2 No or Incorrect Verification: <yes or no>"
    "3.3 Weak Verification: <yes or no>"
    "@@*** end of your answer ***"
    # --- Example answer ---
    # NOTE(review): the example lists codes 1.6 and 2.7, which do not appear in
    # the requested list above; presumably a leftover from an earlier version of
    # the taxonomy - confirm whether the example should be updated.
    "An example answer is: \n"
    "A. The task is not completed due to disobeying role specification as agents went rogue and started to chat with each other instead of completing the task. Agents derailed and verifier is not strong enough to detect it.\n"
    "B. no \n"
    "C. \n"
    "1.1 no \n"
    "1.2 no \n"
    "1.3 no \n"
    "1.4 no \n"
    "1.5 no \n"
    "1.6 yes \n"
    "2.1 no \n"
    "2.2 no \n"
    "2.3 yes \n"
    "2.4 no \n"
    "2.5 no \n"
    "2.6 yes \n"
    "2.7 no \n"
    "3.1 no \n"
    "3.2 yes \n"
    "3.3 no \n"   
    # --- Per-call payload: trace, extra information, definitions, examples ---
    "Here is the trace: \n"
    f"{trace}"
    "Here is some additional information about the MAS output versus the true output and solution: \n"
    f"{atrace}"
    "Also, here are the explanations (definitions) of the failure modes and inefficiencies: \n"
    f"{definitions} \n"
    "Here are some examples of the failure modes and inefficiencies: \n"
    f"{examples}"
)
    # Echo the full prompt (trace, definitions and examples included) to the cell output.
    print(prompt)
    return chat_completion_request_openai(prompt)

In [96]:
# Failure-mode examples appended to the judge prompt. Read inside a `with`
# block so the file handle is closed instead of being leaked.
with open("taxonomy_definitions_examples/examples.txt", "r") as examples_file:
    examples = examples_file.read()

## LLM As a Judge Experiments

In [97]:
import pandas as pd

In [98]:
# Load the trace to judge and its accompanying additional-information file.
# `with` closes each handle deterministically (the bare open(...).read() form
# left both files open until garbage collection).
with open("trace_math/trace_3.txt", "r") as trace_file:
    trace1 = trace_file.read()
with open("trace_math/trace_add_3.txt", "r") as atrace_file:
    atrace1 = atrace_file.read()

# Traces to evaluate, in order; results are stored at the same index.
full_trace_list = [trace1]

In [99]:
# Accumulator for the judge's raw replies; the evaluation loop appends to it.
# Re-run this cell before re-running the loop, otherwise results accumulate.
openai_results = []

In [100]:
import pickle
import os
import tiktoken

dirname = 'saved_results'
os.makedirs(dirname, exist_ok=True)

# Combined character budget for one trace plus the examples text.
# NOTE(review): presumably chosen to stay under an API request-size limit; the
# definitions, `atrace` and the fixed instructions are not counted - confirm.
MAX_TRACE_PLUS_EXAMPLES_CHARS = 1048570
checkpoint_path = f'{dirname}/o1_results_checkpoint.pkl'

for i in range(len(full_trace_list)):

    # Truncate over-long traces in place. max(0, ...) keeps the slice bound
    # non-negative when `examples` alone exceeds the budget (a negative bound
    # would silently cut from the end of the trace instead).
    if len(full_trace_list[i]) + len(examples) > MAX_TRACE_PLUS_EXAMPLES_CHARS:
        keep_chars = max(0, MAX_TRACE_PLUS_EXAMPLES_CHARS - len(examples))
        full_trace_list[i] = full_trace_list[i][:keep_chars]

    result_recorded = False
    try:
        # NOTE(review): `atrace1` is passed for every trace; this is only correct
        # while full_trace_list holds the single matching trace - confirm before
        # adding more traces.
        openai_evaluation = openai_evaluator(full_trace_list[i], examples=examples, atrace=atrace1)
        openai_results.append(openai_evaluation)
        result_recorded = True

        # Save the current results after each evaluation
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(openai_results, f)

        # Optional: Save a backup copy every 10 evaluations
        if (i + 1) % 10 == 0:
            with open(f'{dirname}/o1_results_backup_{i+1}.pkl', 'wb') as f:
                pickle.dump(openai_results, f)

        print(f"Completed and saved evaluation {i+1}/{len(full_trace_list)}")
    except Exception as e:
        print(f"Error on evaluation {i+1}: {str(e)}")
        # Keep results index-aligned with full_trace_list: a failed evaluation is
        # recorded as None instead of being dropped, so later per-trace and
        # per-source statistics are not shifted by one.
        if not result_recorded:
            openai_results.append(None)
        # Save results even if there's an error
        with open(checkpoint_path, 'wb') as f:
            pickle.dump(openai_results, f)

Below I will provide a multiagent system trace. The multiagent system is designed to solve olympiad math problems. 
The multiagent system has the following format: a translator agent that converts the text of the math problem into an easy to understand format, removing all the fluff, a computation agent that calculates mathematical expressions, a reasoning agent that thinks through the problem and works with the computation agent, but does no computation itself, and a verifier agent that checks if the provided solution is correct, and if not, sends it back. 
The structure is the translator agent receives the plaintext problem and sends the translated problem to the reasoning agent. 
The reasoning agent works with the computation agent to come up with a solution, communicating back and forth. 
The verifier agent receives the potential solution from the reasoning agent, and if it seems correct, terminates. If not, it sends it back to the verifier agent with feedback. 
Provide me an analy

In [101]:
# Alias, not a copy: both names refer to the same list object.
o1_results = openai_results

In [102]:
# Failure-mode labels keyed by taxonomy code (same wording as the judge prompt).
# A label is stripped from an answer line before looking for yes/no, so that a
# label such as "No or Incorrect Verification" is never mistaken for the answer.
_FAILURE_MODE_NAMES = {
    '1.1': 'Disobey Task Specification',
    '1.2': 'Disobey Role Specification',
    '1.3': 'Step Repetition',
    '1.4': 'Loss of Conversation History',
    '1.5': 'Unaware of Termination Conditions',
    '2.1': 'Conversation Reset',
    '2.2': 'Fail to Ask for Clarification',
    '2.3': 'Task Derailment',
    '2.4': 'Information Withholding',
    '2.5': "Ignored Other Agent's Input",
    '2.6': 'Action-Reasoning Mismatch',
    '3.1': 'Premature Termination',
    '3.2': 'No or Incorrect Verification',
    '3.3': 'Weak Verification',
}


def _find_mode_answer(text, mode):
    """Return 1 (yes), 0 (no) or None (no answer found) for one mode code in ``text``."""
    import re

    # Literal code (the dot is escaped), optionally prefixed by "C" / "C.", and
    # not embedded in a longer number such as "11.1" or "1.15".
    code_re = re.compile(
        rf"(?:(?<![\w.])|(?<=C)|(?<=C\.)){re.escape(mode)}(?!\d)",
        re.IGNORECASE,
    )
    # Anything shaped like another code ends the text belonging to this mode.
    next_code_re = re.compile(r"(?<!\d)\d\.\d(?!\d)")
    label = _FAILURE_MODE_NAMES.get(mode)

    for hit in code_re.finditer(text):
        tail = text[hit.end():]
        following = next_code_re.search(tail)
        if following:
            tail = tail[:following.start()]
        if label:
            tail = re.sub(re.escape(label), ' ', tail, count=1, flags=re.IGNORECASE)

        # The answer is expected on the first line that still has content; this
        # covers both "1.1 no" and the code on one line with the answer below it.
        answer_line = next((ln for ln in tail.splitlines() if re.search(r"\w", ln)), None)
        if answer_line is None:
            continue

        # Prefer the token right after a colon ("...: yes"), otherwise the first
        # standalone yes/no. \b stops matches inside words such as "Ignored".
        found = (
            re.search(r":\W*(yes|no)\b", answer_line, re.IGNORECASE)
            or re.search(r"\b(yes|no)\b", answer_line, re.IGNORECASE)
        )
        if found:
            return 1 if found.group(1).lower() == 'yes' else 0
    return None


def parse_responses(responses):
    """
    Parse the LLM responses to extract yes/no answers for each failure mode.

    Each response is searched for every failure-mode code. The "C." section of
    the answer is searched first (so codes mentioned in the free-text summary do
    not win), then the whole response. Answers are matched as whole words, and
    the mode's own label is ignored, so labels containing "no" (for example
    "No or Incorrect Verification" or "Ignored ...") cannot be read as "no".

    Args:
        responses: List of LLM responses evaluating traces. Entries that are not
            strings (e.g. None for a failed evaluation) are reported and counted
            as 'no' for every mode.

    Returns:
        Dictionary mapping failure mode codes to lists of binary values (0 for no, 1 for yes).
        Every list has exactly one entry per response; a mode that cannot be
        found defaults to 0 and a warning is printed.
    """
    import re

    failure_modes = {mode: [] for mode in _FAILURE_MODE_NAMES}

    for i, response in enumerate(responses):
        answers = {}
        try:
            # Clean up the response - remove @@ markers if present
            cleaned_response = response.strip()
            if cleaned_response.startswith('@@'):
                cleaned_response = cleaned_response[2:]
            if cleaned_response.endswith('@@'):
                cleaned_response = cleaned_response[:-2]

            # Search scopes in priority order: text after the "C." marker, then everything.
            section = re.search(r"(?<!\w)C[.:)]", cleaned_response)
            scopes = [cleaned_response[section.end():]] if section else []
            scopes.append(cleaned_response)

            for mode in failure_modes:
                answer = None
                for scope in scopes:
                    answer = _find_mode_answer(scope, mode)
                    if answer is not None:
                        break
                if answer is None:
                    # If all attempts fail, default to 'no'
                    print(f"Warning: Could not find mode {mode} in response {i}")
                    answer = 0
                answers[mode] = answer
        except Exception as e:
            print(f"Error processing response {i}: {e}")
            # If there's an error, default to 'no' for all modes for this response
            answers = {}

        # Exactly one value per mode per response keeps all lists the same length.
        for mode in failure_modes:
            failure_modes[mode].append(answers.get(mode, 0))

    return failure_modes


In [103]:
# Import regex for pattern matching
import re

# Process the OpenAI evaluation results
failure_mode_results_o1 = parse_responses(o1_results)

# Print the first few entries of each failure mode to verify
for mode, values in failure_mode_results_o1.items():
    yes_count = sum(values)
    # Guard the percentage: with no responses every list is empty and the
    # division would raise ZeroDivisionError.
    yes_pct = round(yes_count / len(values) * 100, 2) if values else 0.0
    print(f"{mode}: {values[:5]} (total yes: {yes_count}/{len(values)}, {yes_pct}%)")

1.1: [0] (total yes: 0/1, 0.0%)
1.2: [1] (total yes: 1/1, 100.0%)
1.3: [0] (total yes: 0/1, 0.0%)
1.4: [0] (total yes: 0/1, 0.0%)
1.5: [0] (total yes: 0/1, 0.0%)
2.1: [0] (total yes: 0/1, 0.0%)
2.2: [0] (total yes: 0/1, 0.0%)
2.3: [0] (total yes: 0/1, 0.0%)
2.4: [0] (total yes: 0/1, 0.0%)
2.5: [0] (total yes: 0/1, 0.0%)
2.6: [0] (total yes: 0/1, 0.0%)
3.1: [0] (total yes: 0/1, 0.0%)
3.2: [0] (total yes: 0/1, 0.0%)
3.3: [1] (total yes: 1/1, 100.0%)


In [104]:
# Define the sizes of each source (number of consecutive responses per source)
source_sizes = [1]

# Calculate the starting index for each source (running sum of the sizes)
source_indices = [0]
for size in source_sizes[:-1]:
    source_indices.append(source_indices[-1] + size)

# Dictionary to store average scores for each failure mode across sources
average_scores_by_source = {}

# The last source absorbs any responses beyond the declared sizes. This was
# hard-coded as `i == 4`, which is only the last source when there are five.
last_source = len(source_sizes) - 1

# Calculate average scores for each failure mode across the sources
for mode, values in failure_mode_results_o1.items():
    source_averages = []

    # Calculate average for each source
    for i, (start_idx, size) in enumerate(zip(source_indices, source_sizes)):
        end_idx = len(values) if i == last_source else start_idx + size
        source_values = values[start_idx:end_idx]

        # Check if source_values is not empty to avoid division by zero
        if len(source_values) > 0:
            avg_score = sum(source_values) / len(source_values)
        else:
            print(f"Warning: Source {i} has no values for mode {mode}")
            avg_score = 0

        source_averages.append(avg_score)

    average_scores_by_source[mode] = source_averages

    # Print the average scores for each source
    print(f"{mode}: {[round(score * 100, 2) for score in source_averages]}%")

# Print a sample of the dictionary structure
print("\nSample of average_scores_by_source dictionary:")
if average_scores_by_source:
    sample_key = next(iter(average_scores_by_source))
    print(f"{sample_key}: {average_scores_by_source[sample_key]}")


1.1: [0.0]%
1.2: [100.0]%
1.3: [0.0]%
1.4: [0.0]%
1.5: [0.0]%
2.1: [0.0]%
2.2: [0.0]%
2.3: [0.0]%
2.4: [0.0]%
2.5: [0.0]%
2.6: [0.0]%
3.1: [0.0]%
3.2: [0.0]%
3.3: [100.0]%

Sample of average_scores_by_source dictionary:
1.1: [0.0]


In [105]:
def extract_summaries(responses):
    """
    Extract the 'A.' summary lines from each model response.

    The summary is the text between the "A." label and the following "B."
    label. Both labels must not be preceded by a letter, digit or underscore,
    so a sentence ending in a word like "job." is not mistaken for the "B."
    label (which used to truncate the summary).

    Args:
        responses (list[str]): Raw model responses. Empty or None entries
            produce an empty summary.

    Returns:
        list[str]: A list of summaries (one per response); "" when no
        "A. ... B." span is found (a warning is printed in that case).
    """
    # Look for "A. <text until B.>"; (?<!\w) rejects labels glued to a word.
    summary_re = re.compile(
        r"(?<!\w)A\.\s*(.*?)\s*(?<!\w)B\.",
        re.DOTALL | re.IGNORECASE,
    )

    summaries = []
    for i, response in enumerate(responses):
        if not response:
            summaries.append("")
            continue

        match = summary_re.search(response)
        if match:
            summaries.append(match.group(1).strip())
        else:
            print(f"Warning: Could not find A. summary in response {i}")
            summaries.append("")

    return summaries


# One free-text summary per judged trace, in the same order as o1_results.
summaries = extract_summaries(o1_results)

# Report the summaries with 1-based trace numbers.
for trace_number, summary in enumerate(summaries, start=1):
    print(f"Trace {trace_number} summary: {summary}")


Trace 1 summary: They dismissed the constant solution and declared the identity function correct, with the verifier incorrectly accepting this wrong answer.
