In [132]:
import igibson.object_states as object_states
from igibson.tasks.behavior_task import BehaviorTask
from igibson.utils.ig_logging import IGLogReader
from igibson.utils.utils import parse_config
import os
import igibson
from igibson.envs.igibson_env import iGibsonEnv
from igibson.transition_model_v3.eval_env import EvalEnv
from igibson.transition_model_v3.eval_env import EvalActions
from tqdm import tqdm
import json




demo_to_conds_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/assets/all_conditions.json"
demo_to_objs_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/assets/all_objects.json"
demo_names_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/assets/100_selected_demos.txt"
task_to_instructions_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/assets/instructions_by_activity_name.json"
prompt_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/prompts/behavior_goal_interpretation.txt"
task_to_demo_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/assets/task_to_demo.json"
demo_to_prompt_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/assets/llm_prompts.json"


with open(demo_to_conds_path, 'r') as json_file:
    demo_to_conds = json.load(json_file)

with open(demo_to_objs_path, 'r') as json_file:
    demo_to_objs = json.load(json_file)

with open(demo_to_prompt_path, 'r') as json_file:
    demo_to_prompt = json.load(json_file)

with open(task_to_instructions_path, 'r') as json_file:
    task_to_instructions = json.load(json_file)
    
with open(task_to_demo_path, 'r') as json_file:
    task_to_demos = json.load(json_file)

with open(demo_names_path, 'r') as file:
    demo_names = file.read().splitlines()

In [141]:
object_states = {
    "node_states": [
        "Cooked",
        "Dusty",
        "Frozen",
        "Open",  
        "Sliced",
        "Soaked",
        "Stained",
        "Toggled_On"
    ],
    "edge_states": [
        "Inside",
        "NextTo",
        "OnFloor",
        "OnTop",
        "Touching",
        "Under"
    ]
}

In [203]:
gpt35_results_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/results/trial1/gpt35_goal_interpretation.json"
gpt4_results_path = "/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/results/trial3/gpt4_goal_interpretation.json"

with open(gpt35_results_path, 'r') as json_file:
    gpt35_results = json.load(json_file)

with open(gpt4_results_path, 'r') as json_file:
    gpt4_results = json.load(json_file)
    

model_results = gpt35_results
MODEL_NAME = "gpt35"
save_path = f"/Users/bryan/Desktop/wkdir/behavior-vllm-eval/igibson/evaluation/goal_interpretation/results/trial3/{MODEL_NAME}_goal_interpretation_evaluated.json"

In [199]:
def flatten_goals(goal_data):
    """Flatten goal data into a single list of conditions."""
    return [condition for goal_type in goal_data.values() for condition in goal_type]

def check_satisfaction(predicted_conditions, ground_truth_conditions):
    """check which of the conditions in the ground truth are satisfied by the predicted conditions."""
    satisfied_conditions = []
    unsatisfied_conditions = []
    for condition in ground_truth_conditions:
        if condition in predicted_conditions:
            satisfied_conditions.append(condition)
        else:
            unsatisfied_conditions.append(condition)
    return satisfied_conditions, unsatisfied_conditions


def is_node_condition(condition):
    """check of a condition is a node condition or an edge condition."""
    if condition[0] == "not":
        if len(condition[1]) == 2:
            return True
        elif len(condition[1]) == 3:
            return False
        else:
            raise ValueError("Invalid condition")
    else:
        if len(condition) == 2:
            return True
        elif len(condition) == 3:
            return False
        else:
            raise ValueError("Invalid condition")

def is_state_condition(state, condition):
    """check if the given condition is about the given state."""
    if condition[0] == 'not':
        return condition[1][0] == state.lower()
    else:
        return condition[0] == state.lower()
    

def compute_metrics(all_satisfied_conditions, all_unsatisfied_conditions, predicted_conditions, keep_conditions=True):
    
    # Compute evaluation metrics
    true_positives = len(all_satisfied_conditions)
    false_positives = len(predicted_conditions) - true_positives
    false_negatives = len(all_unsatisfied_conditions)
    accuracy = true_positives / len(predicted_conditions) if predicted_conditions else 0
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    
    if keep_conditions:
        return {
            # 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'all_satisfied_conditions': all_satisfied_conditions,
            'all_unsatisfied_conditions': all_unsatisfied_conditions,
            'predicted_conditions': predicted_conditions
        }
    else:
        return {
            # 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score
        }

def compute_breakdown_metrics(all_satisfied_conditions, all_unsatisfied_conditions, predicted_conditions, keep_conditions=True):
    """Compute metrics for node and edge conditions separately."""
    node_predicted_conditions = [condition for condition in predicted_conditions if is_node_condition(condition)]
    node_satisfied_conditions = [condition for condition in all_satisfied_conditions if is_node_condition(condition)]
    node_unsatisfied_conditions = [condition for condition in all_unsatisfied_conditions if is_node_condition(condition)]
    edge_predicted_conditions = [condition for condition in predicted_conditions if not is_node_condition(condition)]
    edge_satisfied_conditions = [condition for condition in all_satisfied_conditions if not is_node_condition(condition)]
    edge_unsatisfied_conditions = [condition for condition in all_unsatisfied_conditions if not is_node_condition(condition)]
    
    
    complete_metrics = compute_metrics(all_satisfied_conditions, all_unsatisfied_conditions, predicted_conditions, keep_conditions)
    node_metrics = compute_metrics(node_satisfied_conditions, node_unsatisfied_conditions, node_predicted_conditions, keep_conditions)
    edge_metrics = compute_metrics(edge_satisfied_conditions, edge_unsatisfied_conditions, edge_predicted_conditions, keep_conditions)
    
    return {
        'complete_metrics': complete_metrics,
        'node_metrics': node_metrics,
        'edge_metrics': edge_metrics
    }


def compute_metrics_by_state(all_satisfied_conditions, all_unsatisfied_conditions, predicted_conditions):
    """Compute metrics for each state separately."""
    node_state_results = {}
    edge_state_results = {}
    
    for state in object_states["node_states"]:
        node_state_results[state] = compute_metrics(
            [condition for condition in all_satisfied_conditions if is_state_condition(state, condition)],
            [condition for condition in all_unsatisfied_conditions if is_state_condition(state, condition)],
            [condition for condition in predicted_conditions if is_state_condition(state, condition)],
            keep_conditions=False
        )
    
    for state in object_states["edge_states"]:
        edge_state_results[state] = compute_metrics(
            [condition for condition in all_satisfied_conditions if is_state_condition(state, condition)],
            [condition for condition in all_unsatisfied_conditions if is_state_condition(state, condition)],
            [condition for condition in predicted_conditions if is_state_condition(state, condition)],
            keep_conditions=False
        )
    
    return node_state_results, edge_state_results


def check_conditions_correctly_mapped(all_satisfied_conditions, all_unsatisfied_conditions, predicted_conditions):
    
    
    node_predicted_conditions = [condition for condition in predicted_conditions if is_node_condition(condition)]
    node_conditions = [condition for condition in all_satisfied_conditions if is_node_condition(condition)] + [condition for condition in all_unsatisfied_conditions if is_node_condition(condition)]
    edge_predicted_conditions = [condition for condition in predicted_conditions if not is_node_condition(condition)]
    edge_conditions = [condition for condition in all_satisfied_conditions if not is_node_condition(condition)] + [condition for condition in all_unsatisfied_conditions if not is_node_condition(condition)]
    
    for condition in node_conditions:
        found = False
        for state in object_states["node_states"]:
            if is_state_condition(state, condition):
                found = True
        if found == False:       
            print("condition at hand: ", condition)
            raise ValueError("GT Node condition not found in object states")
    
    for condition in edge_conditions:
        found = False
        for state in object_states["edge_states"]:
            if is_state_condition(state, condition):
                found = True
        if found == False:
            print("condition at hand: ", condition)
            raise ValueError("GT Edge condition not found in object states")
    
    for condition in node_predicted_conditions:
        found = False
        for state in object_states["node_states"]:
            if is_state_condition(state, condition):
                found = True
        if found == False:
            print("condition at hand: ", condition)
            # raise ValueError("Pred Node condition not found in object states")

    for condition in edge_predicted_conditions:
        found = False
        for state in object_states["edge_states"]:
            if is_state_condition(state, condition):
                found = True
        if found == False:
            print("condition at hand: ", condition)
            # raise ValueError("Pred Edge condition not found in object states")

In [200]:
def evaluate_goals(predicted_goals, ground_truth_goals):
    """Evaluate predicted goals against ground truth goals."""
    # Flatten the predicted goals
    predicted_conditions = flatten_goals(predicted_goals)
    
    
    all_satisfied_conditions = []
    all_unsatisfied_conditions = []
    
    # check each goal in ground_truth_goals
    for key, value in ground_truth_goals.items():
        # if there is only one way to satisfy the goal
        if len(value) == 1:
            satisfied_conditions, unsatisfied_conditions = check_satisfaction(predicted_conditions, value[0])
        # if there are multiple ways to satisfy the goal, choose the one that satisfies the most number of conditions
        else:
            satisfied_nums = [len([cond for cond in option if cond in predicted_conditions]) for option in value]
            max_satisfied_option = value[satisfied_nums.index(max(satisfied_nums))]
            satisfied_conditions, unsatisfied_conditions = check_satisfaction(predicted_conditions, max_satisfied_option)
        
        all_satisfied_conditions.extend(satisfied_conditions)
        all_unsatisfied_conditions.extend(unsatisfied_conditions) 
    
    return compute_breakdown_metrics(all_satisfied_conditions, all_unsatisfied_conditions, predicted_conditions)
    

In [204]:
all_satisfied_conditions = []
all_unsatisfied_conditions = []
all_predicted_conditions = []

model_results_evaluated = {}


for demo in demo_names:
    goal_conds = demo_to_conds[demo]['goal_conditions']
    model_pred = model_results[demo]
    eval_result = evaluate_goals(model_pred, goal_conds)
    
    all_satisfied_conditions.extend(eval_result['complete_metrics']['all_satisfied_conditions'])
    all_unsatisfied_conditions.extend(eval_result['complete_metrics']['all_unsatisfied_conditions'])
    all_predicted_conditions.extend(flatten_goals(model_pred))
    model_results_evaluated[demo] = eval_result

sorted_model_results_evaluated  = {key: model_results_evaluated [key] for key in sorted(model_results_evaluated)}

dataset_results_evaluated = compute_breakdown_metrics(all_satisfied_conditions, all_unsatisfied_conditions, all_predicted_conditions, keep_conditions=False)
node_state_results, edge_state_results = compute_metrics_by_state(all_satisfied_conditions, all_unsatisfied_conditions, all_predicted_conditions)


# check_conditions_correctly_mapped(all_satisfied_conditions, all_unsatisfied_conditions, all_predicted_conditions)

# node_state_results
# edge_state_results
dataset_results_evaluated

# with open(save_path, 'w') as json_file:
#     json.dump(sorted_model_results_evaluated, json_file, indent=4)

{'complete_metrics': {'precision': 0.6068249258160238,
  'recall': 0.6077265973254086,
  'f1_score': 0.6072754268745361},
 'node_metrics': {'precision': 0.396875,
  'recall': 0.8300653594771242,
  'f1_score': 0.5369978858350951},
 'edge_metrics': {'precision': 0.7966101694915254,
  'recall': 0.5423076923076923,
  'f1_score': 0.6453089244851258}}

In [190]:
import matplotlib.pyplot as plt
import numpy as np


# Sort the data by F-1 score in descending order
sorted_data = dataset_results_evaluated
# sorted_data = sorted(edge_state_results.items(), key=lambda x: x[1]['f1_score'], reverse=True)
# sorted_data = sorted(node_state_results.items(), key=lambda x: x[1]['f1_score'], reverse=True)

# Extract the sorted node states and their scores
node_states = [item[0] for item in sorted_data]
precision = [item[1]['precision'] for item in sorted_data]
recall = [item[1]['recall'] for item in sorted_data]
f1_scores = [item[1]['f1_score'] for item in sorted_data]

# Set up the bar chart
x = np.arange(len(node_states))  # the label locations
width = 0.28  # the width of the bars, increased to make bars thicker

fig, ax = plt.subplots(figsize=(12, 6))  # Increase figure size for better display
rects1 = ax.bar(x - width, precision, width, label='Precision', color='#AED6F1')  # Classic blue
rects2 = ax.bar(x, recall, width, label='Recall', color='#5DADE2')  # Medium blue
rects3 = ax.bar(x + width, f1_scores, width, label='F1 Score', color='#3498db')  # Lighter cyan-blue

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_xlabel('All Edge States', fontsize=14)
ax.set_ylabel('Precision, Recall and F-1 Scores', fontsize=14)
ax.set_title('GPT4 Performance Breakdown by Edge State', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(node_states, fontsize=12)
ax.set_ylim(0, 1.075)  # Increase the limit to prevent label overlap

# Add a legend to explain the color coding
fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3)

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(round(height, 2)),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 5),  # 5 points vertical offset to improve space
                    textcoords="offset points",
                    ha='center', va='bottom')

# Call the autolabel function to label the heights of the bars
autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

fig.tight_layout()

# Show the plot
plt.show()


TypeError: string indices must be integers

In [15]:
print("all_satisfied_conditions: ", len(all_satisfied_conditions))
print("all_unsatisfied_conditions: ", len(all_unsatisfied_conditions))
print("all_predicted_conditions: ", len(all_predicted_conditions))
# Compute evaluation metrics
true_positives = len(all_satisfied_conditions)
false_positives = len(all_predicted_conditions) - true_positives
false_negatives = len(all_unsatisfied_conditions)
precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


result = {
        'accuracy': true_positives / len(all_predicted_conditions) if all_predicted_conditions else 0,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'all_satisfied_conditions': all_satisfied_conditions,
        'all_unsatisfied_conditions': all_unsatisfied_conditions,
        'all_predicted_conditions': all_predicted_conditions
    }

result


all_satisfied_conditions:  516
all_unsatisfied_conditions:  157
all_predicted_conditions:  688


{'accuracy': 0.75,
 'precision': 0.75,
 'recall': 0.7667161961367014,
 'f1_score': 0.7582659808963998,
 'all_satisfied_conditions': [['not', ['stained', 'bathtub.n.01_1']],
  ['inside', 'plate.n.04_1', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_2', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_3', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_4', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_5', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_6', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_7', 'cabinet.n.01_1'],
  ['inside', 'plate.n.04_8', 'cabinet.n.01_1'],
  ['not', ['stained', 'pan.n.01_1']],
  ['inside', 'pan.n.01_1', 'cabinet.n.01_1'],
  ['not', ['stained', 'pan.n.01_2']],
  ['not', ['stained', 'pan.n.01_3']],
  ['not', ['stained', 'kettle.n.01_1']],
  ['inside', 'kettle.n.01_1', 'cabinet.n.01_2'],
  ['not', ['stained', 'teapot.n.01_1']],
  ['inside', 'teapot.n.01_1', 'cabinet.n.01_1'],
  ['not', ['dusty', 'table.n.02_1']],
  ['not', ['dusty', 'shelf.n.01_1']],
  ['under', 'rag.n.01_1', '