In [None]:
import pandas as pd 
import json
from collections import defaultdict
import os
import matplotlib.pyplot as plt

In [None]:
# Utility functions to calculate metrics

def read_results(main_folder="results",model=None):
    """
    Reads the JSONL logs stored under ``main_folder/model`` into a flat list of records.

    The folder layout walked by this function is
    ``main_folder/model/agent/section/status/lab/<log file>``.
    For every lab folder the first file (in sorted order) is opened and each
    non-blank line is parsed as one JSON document.

    Args:
        main_folder (str): The path to the main folder containing the results.
        model (str): Name of the model sub-folder inside ``main_folder``. Required.

    Returns:
        list: one dictionary per lab with the keys 'agent', 'section', 'model',
            'lab title', 'status' and 'logs' (the list of parsed log entries).
            Lab folders that contain no file are skipped.

    Raises:
        TypeError: if ``model`` is None.
    """
    if model is None:
        raise TypeError("read_results() requires a model name, e.g. read_results(model='my-model')")

    def _entries(path, keep):
        # Sorted so the result is deterministic; os.listdir order is arbitrary.
        # `keep` filters out stray entries (e.g. files where folders are expected).
        return sorted(name for name in os.listdir(path) if keep(os.path.join(path, name)))

    list_data = []
    model_dir = os.path.join(main_folder, model)
    for agent in _entries(model_dir, os.path.isdir):
        agent_dir = os.path.join(model_dir, agent)
        for section in _entries(agent_dir, os.path.isdir):
            section_dir = os.path.join(agent_dir, section)
            for status in _entries(section_dir, os.path.isdir):
                status_dir = os.path.join(section_dir, status)
                for lab in _entries(status_dir, os.path.isdir):
                    lab_dir = os.path.join(status_dir, lab)
                    log_files = _entries(lab_dir, os.path.isfile)
                    if not log_files:
                        # Nothing to read: skip instead of reusing the file name
                        # left over from the previous lab.
                        print(f"Skipping lab without log file: {lab_dir}")
                        continue

                    with open(os.path.join(lab_dir, log_files[0]), encoding="utf-8") as f:
                        # Ignore blank lines (e.g. a trailing newline) in the JSONL file
                        logs = [json.loads(line) for line in f if line.strip()]

                    list_data.append({
                        'agent':agent,
                        'section':section,
                        'model':model,
                        'lab title':lab,
                        'status':status,
                        'logs':logs
                    })
    return list_data


def get_metrics(labs):
    """
    Extracts metrics from the provided list of lab results.

    Only two kinds of log entries are used: entries whose 'object' is
    'chat.completion' (assistant outputs, timing, token usage, cost) and
    entries whose 'event' is 'user_message' (turn count).

    Args:
        labs (list): A list of dictionaries containing lab results, where each
            dictionary has the keys 'agent', 'section', 'model', 'lab title',
            'status' and 'logs' (as produced by read_results).

    Returns:
        list: A list of dictionaries containing calculated metrics for each lab.
            'assistant_outputs' is a JSON string holding a list of
            {"message", "finish_reason", "tool"} records: one record per tool
            call, or a single record with "tool" set to null for a choice that
            made no tool call.

    Raises:
        KeyError: if a 'chat.completion' entry lacks 'choices', 'timing',
            'usage' or 'cost' (or their expected sub-keys).
    """
    results = []
    for lab in labs:
        logs = lab['logs']

        #------- DATA EXTRACTION --------
        completions = [log for log in logs if log.get('object') == 'chat.completion']
        user_messages = [log for log in logs if log.get("event") == "user_message"]

        assistant_contents = []      # one entry per choice (may be None)
        assistant_tools_calls = []   # flattened: one entry per tool call
        assistant_outputs = []       # message / finish reason / tool, kept aligned per choice
        for completion in completions:
            for choice in completion['choices']:
                message = choice['message']
                content = message.get('content')
                finish_reason = choice.get('finish_reason')
                # 'tool_calls' can be absent or null when the assistant only answered with text
                functions = [tool['function'] for tool in (message.get('tool_calls') or [])]

                assistant_contents.append(content)
                assistant_tools_calls.extend(functions)
                # Build the records here rather than zipping the flattened lists:
                # zipping misaligns as soon as a choice has zero or several tool calls.
                for function in functions or [None]:
                    assistant_outputs.append({
                        "message": content,
                        "finish_reason": finish_reason,
                        "tool": function,
                    })

        #------- METRICS CALCULATION --------
        #turns
        total_turns = len(user_messages)

        #time
        total_active_seconds = sum(co['timing']['active_seconds'] for co in completions)
        total_idle_seconds = sum(co['timing']['idle_seconds'] for co in completions)
        total_seconds = total_active_seconds + total_idle_seconds

        #tokens
        total_prompt_tokens = sum(co['usage']['prompt_tokens'] for co in completions)
        total_completion_tokens = sum(co['usage']['completion_tokens'] for co in completions)
        total_tokens = total_prompt_tokens + total_completion_tokens

        #costs
        total_interaction_costs = sum(co['cost']['interaction_cost'] for co in completions)

        #assistant outputs: only choices that actually carry text
        total_assistant_messages = sum(1 for content in assistant_contents if content is not None)

        #assistant tools
        total_assistant_tools = len(assistant_tools_calls)

        results.append({
            "agent": lab['agent'],
            "section": lab['section'],
            "model": lab['model'],
            "lab_title": lab['lab title'],
            "status": lab['status'],
            "turns": total_turns,
            "active_seconds": total_active_seconds,
            "idle_seconds": total_idle_seconds,
            "total_seconds": total_seconds,
            "prompt_tokens": total_prompt_tokens,
            "completion_tokens": total_completion_tokens,
            "total_tokens": total_tokens,
            "interaction_costs": total_interaction_costs,
            "total_assistant_messages": total_assistant_messages,
            "total_assistant_tools": total_assistant_tools,
            "assistant_outputs": json.dumps(assistant_outputs)
        })
    return results

# Define the model

In [None]:
from dotenv import load_dotenv

# Load environment variables (including CAI_MODEL) from the local .env file
load_dotenv('.env')

_cai_model = os.getenv("CAI_MODEL")
if not _cai_model:
    # os.getenv returns None for an unset variable; fail with an actionable
    # message instead of an AttributeError on None.replace
    raise RuntimeError("CAI_MODEL is not set; define it in .env or in the environment before running this notebook")

# '/' is replaced so the model name can be used as a single folder / file name component
MODEL = _cai_model.replace('/','-')
MODEL

<h1>1. Read results and generate metrics tables</h1>

In [None]:
GROUP_KEYS = ['agent', 'section', 'model']
STATUS_COLUMNS = ['interrupted', 'not-solved', 'solved']
OUTPUT_DIR = 'metrics_experiment'

results = read_results(model=MODEL)
df_metrics = pd.DataFrame(get_metrics(results))
if df_metrics.empty:
    # Without rows the frame has no columns and the steps below fail with an obscure KeyError
    raise ValueError(f"No lab results found for model '{MODEL}'")

# Calculate the mean of the numeric metrics per agent / section / model
mean_metrics = (df_metrics.drop(columns=['status',
                                         'lab_title',
                                         'assistant_outputs'])
                          .groupby(GROUP_KEYS)
                          .mean()
                          .reset_index())

# One-hot encode the status so that summing the columns counts labs per status
df_metrics = pd.get_dummies(df_metrics, columns=['status'], prefix='', prefix_sep='')
for status_column in STATUS_COLUMNS:
    # A status that never occurred produces no dummy column; add it as all-False
    if status_column not in df_metrics.columns:
        df_metrics[status_column] = False
df_metrics[STATUS_COLUMNS] = df_metrics[STATUS_COLUMNS].astype(int)

status_metrics = (df_metrics.groupby(GROUP_KEYS)[STATUS_COLUMNS]
                            .sum()
                            .reset_index())

df_calculated_metrics = pd.merge(mean_metrics, status_metrics, on=GROUP_KEYS)
df_calculated_metrics = df_calculated_metrics.rename(columns={
    'turns': 'avg_turns',
    'active_seconds': 'avg_active_seconds',
    'idle_seconds': 'avg_idle_seconds',
    'total_seconds': 'avg_total_seconds',
    'prompt_tokens': 'avg_prompt_tokens',
    'completion_tokens': 'avg_completion_tokens',
    'total_tokens': 'avg_total_tokens',
    'interaction_costs': 'avg_interaction_costs',
    'total_assistant_messages': 'avg_total_assistant_messages',
    'total_assistant_tools': 'avg_total_assistant_tools',
    'interrupted': 'total_interrupted',
    'not-solved': 'total_not_solved',
    'solved': 'total_solved'
})

# Save both dataframes to Excel files; create the output folder on first run
os.makedirs(OUTPUT_DIR, exist_ok=True)
df_metrics.to_excel(f'{OUTPUT_DIR}/evaluation_metrics_{MODEL}.xlsx', index=False)
df_calculated_metrics.to_excel(f'{OUTPUT_DIR}/calculated_evaluation_metrics_{MODEL}.xlsx', index=False)

<h1>2. Graph Assistant Messages and Tools by Agent</h1>

In [None]:
# Per-agent averages of assistant messages and tool calls, rounded to one decimal
df = (
    df_calculated_metrics[['agent','avg_total_assistant_messages','avg_total_assistant_tools']]
    .groupby('agent')
    .mean()
    .round(1)
    .reset_index()
)

# Grouped bar chart: two bars per agent, placed either side of the agent's tick
width = 0.35
x = range(len(df))
left_positions = [pos - width/2 for pos in x]
right_positions = [pos + width/2 for pos in x]

fig, ax = plt.subplots()
bars1 = ax.bar(
    left_positions,
    df['avg_total_assistant_messages'],
    width,
    label='Avg Assistant Messages',
    color='gray',
)
bars2 = ax.bar(
    right_positions,
    df['avg_total_assistant_tools'],
    width,
    label='Avg Assistant Tools',
    color='white',
    edgecolor='black',
)

# Title, axis labels, one tick per agent, and legend
ax.set_title('Assistant Messages and Tools by Agent Type')
ax.set_xlabel('Agent Type')
ax.set_ylabel('Average Count')
ax.set_xticks(x)
ax.set_xticklabels(df['agent'])
ax.legend()

plt.tight_layout()
plt.show()

<h1>2. Graph Lab Status by Agent Type and Lab Type</h1>

In [None]:
# Short display names for the known sections
section_labels = {'cross-site-request-forgery-csrf':'CSRF','cross-site-scripting':'XSS','sql-injection':'SQLI'}
status_columns = ['total_interrupted','total_not_solved','total_solved']

df = df_calculated_metrics[['agent','section'] + status_columns].groupby(['agent','section']).sum().reset_index()
# Series.map(dict) would turn every section missing from the dict into NaN;
# fall back to the raw section name instead.
df['section'] = df['section'].map(lambda name: section_labels.get(name, name))

# Setup
prompts = df['agent'].unique()
sections = df['section'].unique()

width = 0.25
x = range(len(sections))

for prompt in prompts:
    # Align this agent's counts to the full section list so the bar heights
    # always match the x positions; sections without results are drawn as 0.
    df_prompt = (
        df[df['agent'] == prompt]
        .set_index('section')[status_columns]
        .reindex(sections, fill_value=0)
    )

    fig, ax = plt.subplots(figsize=(7, 4))

    ax.bar([i - width for i in x], df_prompt['total_interrupted'], width, label='Interrupted', color='gray')
    ax.bar(x, df_prompt['total_not_solved'], width, label='Not Solved', color='white', edgecolor='black')
    ax.bar([i + width for i in x], df_prompt['total_solved'], width, label='Solved', color='lightgray')

    ax.set_title(prompt)
    ax.set_ylabel('Total Count')
    ax.set_xticks(x)
    ax.set_xticklabels(sections, rotation=30, ha='right')

    # Move legend outside
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=True)

    plt.tight_layout()
    plt.show()

<h1>3. Seconds by Agent Type</h1>

In [None]:
# Per-agent averages of active and idle seconds, rounded to one decimal
df = df_calculated_metrics[['agent','avg_active_seconds','avg_idle_seconds']].groupby('agent').mean().round(1).reset_index()

# Plotting: two bars per agent, placed either side of the agent's tick
x = range(len(df))
width = 0.35

fig, ax = plt.subplots()
bars1 = ax.bar([i - width/2 for i in x], df['avg_active_seconds'], width,
               label='Avg Active Seconds', color='gray')
bars2 = ax.bar([i + width/2 for i in x], df['avg_idle_seconds'], width,
               label='Avg Idle Seconds', color='white', edgecolor='black')

# Labels and legend (the plotted values are seconds, not counts)
ax.set_xlabel('Agent Type')
ax.set_ylabel('Average Seconds')
ax.set_title('Active and Idle Seconds by Agent Type')
ax.set_xticks(x)
ax.set_xticklabels(df['agent'])
ax.legend()

plt.tight_layout()
plt.show()

<h1>4. Tokens by Agent Type</h1>

In [None]:
# Per-agent averages of prompt and completion tokens, rounded to one decimal
df = df_calculated_metrics[['agent','avg_prompt_tokens','avg_completion_tokens']].groupby('agent').mean().round(1).reset_index()

# Plotting: two bars per agent, placed either side of the agent's tick
x = range(len(df))
width = 0.35

fig, ax = plt.subplots()
bars1 = ax.bar([i - width/2 for i in x], df['avg_prompt_tokens'], width,
               label='Avg Prompt Tokens', color='gray')
# This series is completion tokens; the legend label must say so
bars2 = ax.bar([i + width/2 for i in x], df['avg_completion_tokens'], width,
               label='Avg Completion Tokens', color='white', edgecolor='black')

# Labels and legend
ax.set_xlabel('Agent Type')
ax.set_ylabel('Average Count')
ax.set_title('Prompt and Completion Tokens by Agent Type')
ax.set_xticks(x)
ax.set_xticklabels(df['agent'])
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Short display names for the known sections
section_labels = {'cross-site-request-forgery-csrf':'CSRF','cross-site-scripting':'XSS','sql-injection':'SQLI'}

df = df_calculated_metrics[['agent','section','avg_turns']].groupby(['agent','section']).mean().reset_index()
# Series.map(dict) would turn every section missing from the dict into NaN;
# fall back to the raw section name instead.
df['section'] = df['section'].map(lambda name: section_labels.get(name, name))

# Unique agents
agents = df['agent'].unique()

# Same y-limit on every chart so the per-agent figures are directly comparable
y_max = df['avg_turns'].max() + 1

for agent in agents:
    df_agent = df[df['agent'] == agent]

    fig, ax = plt.subplots(figsize=(6, 4))

    bars = ax.bar(df_agent['section'], df_agent['avg_turns'],
                  color='gray', edgecolor='black', label='Avg Turns')

    # Add value labels
    for bar in bars:
        yval = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, yval + 0.1, f'{yval:.1f}', ha='center', va='bottom', fontsize=9)

    ax.set_title(agent)
    ax.set_ylabel('Average Turns')
    ax.set_xlabel('Section')
    ax.set_ylim(0, y_max)
    # The categorical bar plot already placed one tick per section; restyle those
    # labels rather than calling set_xticklabels without fixed ticks.
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')

    # Show legend
    ax.legend(loc='upper left')

    plt.tight_layout()
    plt.show()