In [2]:
# Shell escape: run run_mia2.py with whichever `python` is first on PATH.
# NOTE(review): presumably this run produces the results JSON analysed in the
# cell below -- confirm against run_mia2.py.
!python run_mia2.py

nohup: ignoring input and redirecting stderr to stdout


# Analysis

In [6]:
import json
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Path of the results JSON analysed in this cell.
output_dir = 'results/target_docs'
output_file = f'{output_dir}/direct_query-nfcorpus-llama3-Top15-M250-N15.json'

# Read the file's full text, then parse it into `data`.
with open(output_file, 'r') as f:
    data = json.loads(f.read())

def extract_yes_no(answer):
    """Return the first standalone yes/no word in ``answer`` as 'Yes' or 'No'.

    Matching is case-insensitive and whole-word only, so words such as
    "nope" or "yesterday" do not count. If no such word is present the
    sentinel string "Invalid" is returned.
    """
    found = re.search(r'\b(Yes|No)\b', answer, re.IGNORECASE)
    if found is None:
        return "Invalid"
    # Normalise casing so e.g. "YES" / "no" map onto the canonical labels.
    return found.group(1).capitalize()

def load_and_calculate_metrics(data):
    """Score the LLM's yes/no answers against the ground-truth ``mem`` labels.

    Args:
        data: Mapping whose values are per-document records (the keys are not
            used). Each record must provide:
              * ``"mem"`` - label string; "yes" in any casing is the positive
                class, every other value is treated as "No".
              * ``"llm_responses"`` - sequence of response strings; only the
                first entry is scored.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Accuracy is scaled to a
        percentage (0-100); precision, recall and F1 are left as returned by
        scikit-learn, computed with "Yes" as the positive label and reported
        as 0 when undefined (``zero_division=0``).

    Records are excluded from every metric when their first response contains
    no standalone yes/no word, or when they have no responses at all.
    """
    expected_answers = []
    predicted_answers = []

    for doc_data in data.values():
        expected_answer = "Yes" if doc_data["mem"].lower() == "yes" else "No"

        responses = doc_data["llm_responses"]
        # An empty response list carries no answer: handle it like an
        # unparseable response instead of raising IndexError on [0].
        llm_response = extract_yes_no(responses[0]) if responses else "Invalid"

        # Skip invalid responses
        if llm_response == "Invalid":
            continue

        expected_answers.append(expected_answer)
        predicted_answers.append(llm_response)

    # Calculate metrics
    accuracy = accuracy_score(expected_answers, predicted_answers) * 100
    precision = precision_score(expected_answers, predicted_answers, pos_label="Yes", zero_division=0)
    recall = recall_score(expected_answers, predicted_answers, pos_label="Yes", zero_division=0)
    f1 = f1_score(expected_answers, predicted_answers, pos_label="Yes", zero_division=0)

    return accuracy, precision, recall, f1

# Score the loaded results, then report the four metrics with a single print
# call, one metric per line.
accuracy, precision, recall, f1 = load_and_calculate_metrics(data)
print(
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)


Accuracy: 75.00%
Precision: 0.67
Recall: 0.98
F1 Score: 0.80
