### Build Prompt Template

In [31]:
def generate_mbti_prompt(scale_name, dimension_name, dimension_description, experimenter_name,
                         character_name, language, type1, type2):
    prompt = f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
My name is {experimenter_name}. I’ve invited a participant, and we had many conversations in {language}. 
I will input the conversations.

Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the percentage of each category, which sums to 100%, e.g., 30% {type1} and 70% {type2}. 
Please output in the following json format: 
=== 
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": {{ 
    "{type1}": <percentage 1>, 
    "{type2}": <percentage 2> 
  }} 
}} 
(The sum of percentage 1 and percentage 2 should be 100%. Output with percent sign.)
"""
    return prompt.strip()


In [32]:
# prompt = generate_mbti_prompt(
#     scale_name="MBTI",
#     dimension_name="Introversion vs. Extraversion",
#     dimension_description="This dimension reflects how individuals derive their energy: from solitude and reflection (Introversion) or from social interaction (Extraversion).",
#     experimenter_name="Dr. Lee",
#     character_name="Alex",
#     language="English",
#     type1="Introversion",
#     type2="Extraversion"
# )

# print(prompt)


In [33]:
def generate_bigfive_prompt(scale_name, dimension_name, dimension_description,
                            experimenter_name, character_name, language,
                            lowest_score, middle_score, highest_score):
    prompt = f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
My name is {experimenter_name}. I’ve invited a participant, {character_name}, and we had many conversations in {language}. 
I will input the conversations.

Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the score of {character_name} in terms of {dimension_name}, which is a number between {lowest_score} and {highest_score}. 
{lowest_score} denotes ‘not {dimension_name} at all’, {middle_score} denotes ‘neutral’, and {highest_score} denotes ‘strongly {dimension_name}’. 
Other numbers in this range represent different degrees of ‘{dimension_name}’. 
Please output in the following json format: 
=== 
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": <your score> 
}}
"""
    return prompt.strip()


In [34]:
# prompt = generate_bigfive_prompt(
#     scale_name="Big Five Personality Traits",
#     dimension_name="Openness",
#     dimension_description="Openness involves active imagination, aesthetic sensitivity, attentiveness to inner feelings, preference for variety, and intellectual curiosity.",
#     experimenter_name="Dr. Jane",
#     character_name="Jordan",
#     language="English",
#     lowest_score=1,
#     middle_score=3,
#     highest_score=5
# )

# print(prompt)


### Prepare OpenAI API

In [35]:
import os
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import InferenceClient
import time
import random


In [36]:
# === OpenAI API Setup ===
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key, timeout=60)


class OpenAICallFailed(Exception):
    pass

def openai_generator(prompt: str, system_prompt: str = "", model: str = "gpt-4.1-nano",
                     temperature: float = 0.7, top_p=1.0, max_tokens: int = 1000,
                     max_retries: int = 3):
    for attempt in range(max_retries):
        try:
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": prompt})

            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p
            )
            # ⬅️ wrap string inside list of dicts
            return [{"generated_text": response.choices[0].message.content.strip()}]
        except Exception as e:
            wait = 2 ** attempt + random.uniform(0, 1)
            print(f"OpenAI call failed (attempt {attempt + 1}/{max_retries}): {e}")
            time.sleep(wait)

    raise OpenAICallFailed("OpenAI call failed after multiple retries.")

In [37]:
import json
import os
from collections import defaultdict

def split_list(input_list, n=4):
    if len(input_list) < 2 * (n-1):
        return [input_list]

    result = [input_list[i:i+n] for i in range(0, len(input_list), n)]
    
    # If last list is too short, balance it out
    num_to_pop = n - 1 - len(result[-1])
    for i in range(num_to_pop):
        result[-1].append(result[i].pop())
        
    return result

def group_responses_by_dimension(response_path):
    with open(response_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    responses_by_dim = defaultdict(list)

    for item in data["responses"]:
        dimension = item["dimension"]
        question = item["question"].strip()
        response = item["response"].strip()
        responses_by_dim[dimension].append((question, response))

    return responses_by_dim

def prepare_batches_per_dimension(response_path, batch_size=4):
    grouped = group_responses_by_dimension(response_path)
    batched = {}

    for dim, qr_pairs in grouped.items():
        batched[dim] = split_list(qr_pairs, n=batch_size)

    return batched
    

In [38]:
# # Example file path
# test_path = "character_response/MBTI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_1_16p_responses.json"

# batches = prepare_batches_per_dimension(test_path, batch_size=4)

# # Check structure
# for dim, sets in batches.items():
#     print(f"{dim} has {len(sets)} batches")
#     for batch in sets:
#         for q, r in batch:
#             print(f"Q: {q}\nR: {r}\n")
#         print("-" * 50)


In [39]:
# Load and cache the 16Personalities schema once
with open("16Personalities.json", "r", encoding="utf-8") as f:
    mbti_schema = json.load(f)

dim_desc = mbti_schema["prompts"]["dim_desc"]
dim_types = {
    "E/I": ("E", "I"),
    "S/N": ("S", "N"),
    "T/F": ("T", "F"),
    "P/J": ("P", "J")
}

In [40]:
def generate_er_batch_prompt_auto(
    dimension_name,
    qr_batch,
    language="English",
    experimenter_name="Interviewer",
    character_name="the participant"
):
    scale_name = mbti_schema["name"]
    dimension_description = dim_desc[dimension_name]
    type1, type2 = dim_types[dimension_name]

    qr_text = "\n".join(
        [f"Q: {q}\nA: {r}" for q, r in qr_batch]
    )

    prompt = f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
I am an {experimenter_name}. I’ve invited a participant, and we had many conversations in {language}. 
Below is a batch of question-and-response pairs from our interview:
===
{qr_text}
===
Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the percentage of each category, which sums to 100%, e.g., 30% {type1} and 70% {type2}. 
Please output in the following json format:
===
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": {{ 
    "{type1}": <percentage 1>, 
    "{type2}": <percentage 2> 
  }} 
}}
(The sum of percentage 1 and percentage 2 should be 100%. Output with percent sign.)
"""
    return prompt.strip()


### Evaluate Function

In [75]:
import os
import json
from pathlib import Path
from statistics import mean
import time

def safe_parse_llm_output(output):
    try:
        return json.loads(output)
    except json.JSONDecodeError as e:
        print("JSON parsing failed. Raw output:")
        print(output[:300] + "...")
        raise e

def evaluate_character_er_batch(character_backend_path, model="gpt-4.1", max_retries=3):
    character_name = Path(character_backend_path).parts[-2]
    backend_name = Path(character_backend_path).parts[-1]
    all_dim_scores = {}

    response_files = [
        f for f in os.listdir(character_backend_path)
        if f.endswith(".json") and not (
            f.endswith("_summary.json")
            or f.endswith("_eval.json")
            or f.endswith("_logs.json")
        )
    ]

    if len(response_files) != 3:
        raise RuntimeError(f"{character_backend_path} contains {len(response_files)} response files (expected 3)")

    for response_file in response_files:
        full_path = os.path.join(character_backend_path, response_file)
        out_resp_path = os.path.join(character_backend_path, response_file.replace(".json", "_er_eval.json"))

        # NEW: If already evaluated, load and extract dimension scores
        if os.path.exists(out_resp_path):
            print(f"Skipping {response_file} (already evaluated)")
            try:
                with open(out_resp_path, "r", encoding="utf-8") as f:
                    existing_eval = json.load(f)
                    for dim, dim_data in existing_eval.get("dimensions", {}).items():
                        type1, type2 = dim_types[dim]
                        avg = dim_data.get("average", {})
                        if type1 in avg and type2 in avg:
                            val1 = int(avg[type1].strip('%'))
                            val2 = int(avg[type2].strip('%'))
                            if dim not in all_dim_scores:
                                all_dim_scores[dim] = []
                            all_dim_scores[dim].append({type1: val1, type2: val2})
            except Exception as e:
                print(f"Warning: Failed to load existing eval for {response_file}: {e}")
            continue

        # Normal evaluation flow
        batches_by_dim = prepare_batches_per_dimension(full_path)
        per_response_output = {
            "character": character_name,
            "backend": backend_name,
            "response_file": response_file,
            "dimensions": {}
        }

        try:
            for dim, batches in batches_by_dim.items():
                dim_scores = []

                for batch in batches:
                    prompt = generate_er_batch_prompt_auto(dim, batch)

                    for attempt in range(max_retries + 1):
                        try:
                            output = openai_generator(prompt, system_prompt="", model=model)[0]["generated_text"]
                            parsed = safe_parse_llm_output(output)
                            dim_scores.append(parsed["result"])
                            break
                        except Exception as e:
                            print(f"Attempt {attempt+1}/{max_retries+1} failed for {response_file}, {dim}: {e}")
                            time.sleep(2)
                            if attempt == max_retries:
                                raise RuntimeError(f"All retries failed for {response_file}, {dim}")

                type1, type2 = dim_types[dim]
                avg_type1 = mean([int(d[type1].strip('%')) for d in dim_scores])
                avg_type2 = mean([int(d[type2].strip('%')) for d in dim_scores])

                per_response_output["dimensions"][dim] = {
                    "batches": dim_scores,
                    "average": {
                        type1: f"{round(avg_type1)}%",
                        type2: f"{round(avg_type2)}%"
                    }
                }

                if dim not in all_dim_scores:
                    all_dim_scores[dim] = []
                all_dim_scores[dim].append({type1: avg_type1, type2: avg_type2})

            # Ensure all dimensions are present
            required_dims = {"E/I", "S/N", "T/F", "P/J"}
            missing_dims = required_dims - set(per_response_output["dimensions"].keys())
            if missing_dims:
                raise RuntimeError(f"Missing dimensions in {response_file}: {missing_dims}")

            with open(out_resp_path, "w", encoding="utf-8") as f:
                json.dump(per_response_output, f, indent=2)
            print(f"Saved: {out_resp_path}")

        except Exception as e:
            if os.path.exists(out_resp_path):
                os.remove(out_resp_path)
            print(f"Evaluation aborted for {response_file}. Reason: {e}")
            raise e

    # Final summary with collected average scores
    summary = {
        "character": character_name,
        "backend": backend_name,
        "averages": {}
    }

    for dim, scores in all_dim_scores.items():
        type1, type2 = dim_types[dim]
        avg_type1 = mean([s[type1] for s in scores])
        avg_type2 = mean([s[type2] for s in scores])
        summary["averages"][dim] = {
            type1: f"{round(avg_type1)}%",
            type2: f"{round(avg_type2)}%"
        }

    out_summary_path = os.path.join(character_backend_path, f"{character_name}_{backend_name}_er_eval_summary.json")
    with open(out_summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"Final summary written: {out_summary_path}")


In [42]:
# test_path = "character_response/MBTI/myRAG/albus_dumbledore/llama3"
# evaluate_character_er_batch(test_path)


In [76]:
def validate_mbti_structure(base_dir="character_response/MBTI"):
    rag_types = sorted(os.listdir(base_dir))
    invalid_paths = []

    for rag in rag_types:
        rag_path = os.path.join(base_dir, rag)
        if not os.path.isdir(rag_path):
            continue

        characters = sorted(os.listdir(rag_path))
        for character in characters:
            character_path = os.path.join(rag_path, character)
            if not os.path.isdir(character_path):
                continue

            backends = sorted(os.listdir(character_path))
            for backend in backends:
                if backend.lower() == "mistral":
                    continue

                backend_path = os.path.join(character_path, backend)
                if not os.path.isdir(backend_path):
                    continue

                # Collect response files excluding *_summary.json, *_eval.json, *_logs.json
                response_files = [
                    f for f in os.listdir(backend_path)
                    if f.endswith(".json") and not (
                        f.endswith("_summary.json")
                        or f.endswith("_eval.json")
                        or f.endswith("_logs.json")
                    )
                ]

                if len(response_files) != 3:
                    invalid_paths.append((backend_path, len(response_files)))

    if invalid_paths:
        for path, count in invalid_paths:
            print(f"Invalid path: {path} — contains {count} response files (expected 3)")
        raise RuntimeError(f"{len(invalid_paths)} backend paths failed validation. Aborting.")

    print("All character/backend paths passed validation.")


In [77]:
def run_mbti_evaluation_all(base_dir="character_response/MBTI"):
    rag_types = sorted(os.listdir(base_dir))

    for rag in rag_types:
        rag_path = os.path.join(base_dir, rag)
        if not os.path.isdir(rag_path):
            continue

        characters = sorted(os.listdir(rag_path))
        for character in characters:
            character_path = os.path.join(rag_path, character)
            if not os.path.isdir(character_path):
                continue

            backends = sorted(os.listdir(character_path))
            for backend in backends:
                if backend.lower() == "mistral":
                    continue

                backend_path = os.path.join(character_path, backend)
                if not os.path.isdir(backend_path):
                    continue

                print(f"\n=== Evaluating: {rag}/{character}/{backend} ===")
                evaluate_character_er_batch(backend_path)


In [78]:
# First: strict validation
validate_mbti_structure()

# If it passes, run all MBTI evaluations
run_mbti_evaluation_all()


All character/backend paths passed validation.

=== Evaluating: EmotionRAG/albus_dumbledore/deepseek ===
Skipping albus_dumbledore_3_MBTI.json (already evaluated)
Skipping albus_dumbledore_1_MBTI.json (already evaluated)
Skipping albus_dumbledore_2_MBTI.json (already evaluated)
Final summary written: character_response/MBTI/EmotionRAG/albus_dumbledore/deepseek/albus_dumbledore_deepseek_er_eval_summary.json

=== Evaluating: EmotionRAG/albus_dumbledore/llama3 ===
Skipping albus_dumbledore_3_MBTI.json (already evaluated)
Skipping albus_dumbledore_1_MBTI.json (already evaluated)
Skipping albus_dumbledore_2_MBTI.json (already evaluated)
Final summary written: character_response/MBTI/EmotionRAG/albus_dumbledore/llama3/albus_dumbledore_llama3_er_eval_summary.json

=== Evaluating: EmotionRAG/albus_dumbledore/qwen ===
Skipping albus_dumbledore_3_MBTI.json (already evaluated)
Skipping albus_dumbledore_1_MBTI.json (already evaluated)
Skipping albus_dumbledore_2_MBTI.json (already evaluated)
Final

BFI

In [46]:
# Load BFI schema once
with open("BFI.json", "r", encoding="utf-8") as f:
    bfi_schema = json.load(f)

bfi_dim_desc = bfi_schema["prompts"]["dim_desc"]
bfi_score_range = bfi_schema["range"]  # Expected to be [1, 3, 5] or similar

def generate_bfi_prompt_auto(
    dimension_name,
    qr_batch,
    language="English",
    experimenter_name="Interviewer",
    character_name="the participant"
):
    scale_name = bfi_schema["name"]
    dimension_description = bfi_dim_desc[dimension_name]
    lowest_score, highest_score = bfi_score_range
    middle_score = (lowest_score + highest_score) / 2

    qr_text = "\n".join([f"Q: {q}\nA: {r}" for q, r in qr_batch])

    prompt = f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
My name is {experimenter_name}. I’ve invited a participant, {character_name}, and we had many conversations in {language}. 
Below is a batch of question-and-response pairs from our interview:
===
{qr_text}
===
Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the score of {character_name} in terms of {dimension_name}, which is a number between {lowest_score} and {highest_score}. 
{lowest_score} denotes ‘not {dimension_name} at all’, {middle_score} denotes ‘neutral’, and {highest_score} denotes ‘strongly {dimension_name}’. 
Other numbers in this range represent different degrees of ‘{dimension_name}’. 
Please output in the following json format:
=== 
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": <your score> 
}}
"""
    return prompt.strip()


In [47]:
# bfi_test_path = "character_response/BFI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_1_BFI.json"
# batches = prepare_batches_per_dimension(bfi_test_path)
# example_batch = batches["Openness"][0]

# prompt = generate_bfi_prompt_auto("Openness", example_batch)
# print(prompt)


In [69]:
import os
import json
from pathlib import Path
from statistics import mean
import time

def safe_parse_bfi_output(output):
    try:
        parsed = json.loads(output)
        score = float(parsed["result"])
        return parsed["analysis"], score
    except Exception as e:
        print("Failed to parse BFI LLM output. Raw output:")
        print(output[:300] + "...")
        raise e

def evaluate_character_bfi_er_batch(character_backend_path, model="gpt-4.1", max_retries=3):
    character_name = Path(character_backend_path).parts[-2]
    backend_name = Path(character_backend_path).parts[-1]
    all_dim_scores = {}

    response_files = [
        f for f in os.listdir(character_backend_path)
        if f.endswith(".json") and not (
            f.endswith("_summary.json")
            or f.endswith("_eval.json")
            or f.endswith("_logs.json")
        )
    ]

    if len(response_files) != 3:
        raise RuntimeError(f"{character_backend_path} contains {len(response_files)} response files (expected 3)")

    for response_file in response_files:
        full_path = os.path.join(character_backend_path, response_file)
        out_resp_path = os.path.join(character_backend_path, response_file.replace(".json", "_er_eval.json"))

        # ✅ NEW: Load existing eval file if already evaluated
        if os.path.exists(out_resp_path):
            print(f"Skipping {response_file} (already evaluated)")
            try:
                with open(out_resp_path, "r", encoding="utf-8") as f:
                    existing_eval = json.load(f)
                    for dim, dim_data in existing_eval.get("dimensions", {}).items():
                        avg_score = dim_data.get("average_score")
                        if avg_score is not None:
                            if dim not in all_dim_scores:
                                all_dim_scores[dim] = []
                            all_dim_scores[dim].append(avg_score)
            except Exception as e:
                print(f"Warning: Failed to load existing eval for {response_file}: {e}")
            continue

        batches_by_dim = prepare_batches_per_dimension(full_path)
        per_response_output = {
            "character": character_name,
            "backend": backend_name,
            "response_file": response_file,
            "dimensions": {}
        }

        try:
            for dim, batches in batches_by_dim.items():
                batch_scores = []

                for batch in batches:
                    prompt = generate_bfi_prompt_auto(
                        dimension_name=dim,
                        qr_batch=batch,
                        character_name="the participant",
                        experimenter_name="Interviewer"
                    )

                    for attempt in range(max_retries + 1):
                        try:
                            output = openai_generator(prompt, model=model)[0]["generated_text"]
                            analysis, score = safe_parse_bfi_output(output)
                            batch_scores.append(score)
                            break
                        except Exception as e:
                            print(f"Attempt {attempt+1}/{max_retries+1} failed for {response_file}, {dim}: {e}")
                            time.sleep(2)
                            if attempt == max_retries:
                                raise RuntimeError(f"All retries failed for {response_file}, {dim}")

                avg_score = round(mean(batch_scores), 2)
                per_response_output["dimensions"][dim] = {
                    "batch_scores": batch_scores,
                    "average_score": avg_score
                }

                if dim not in all_dim_scores:
                    all_dim_scores[dim] = []
                all_dim_scores[dim].append(avg_score)

            required_dims = set(bfi_schema["prompts"]["dim_desc"].keys())
            missing_dims = required_dims - set(per_response_output["dimensions"].keys())
            if missing_dims:
                raise RuntimeError(f"Incomplete evaluation for {response_file}. Missing dimensions: {missing_dims}")

            with open(out_resp_path, "w", encoding="utf-8") as f:
                json.dump(per_response_output, f, indent=2)
            print(f"Saved: {out_resp_path}")

        except Exception as e:
            if os.path.exists(out_resp_path):
                os.remove(out_resp_path)
            print(f"Evaluation failed for {response_file}: {e}")
            raise e

    # ✅ Final summary always reflects all available evaluations
    summary = {
        "character": character_name,
        "backend": backend_name,
        "averages": {}
    }

    for dim, scores in all_dim_scores.items():
        avg_score = round(mean(scores), 2)
        summary["averages"][dim] = avg_score

    out_summary_path = os.path.join(character_backend_path, f"{character_name}_{backend_name}_bfi_er_eval_summary.json")
    with open(out_summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"Final summary written: {out_summary_path}")


In [49]:
# test_path = "character_response/BFI/myRAG/albus_dumbledore/deepseek"
# evaluate_character_bfi_er_batch(test_path)


### Complete Evaluation on BFI

In [70]:
def validate_bfi_structure(base_dir="character_response/BFI"):
    rag_types = sorted(os.listdir(base_dir))
    invalid_paths = []

    for rag in rag_types:
        rag_path = os.path.join(base_dir, rag)
        if not os.path.isdir(rag_path):
            continue

        characters = sorted(os.listdir(rag_path))
        for character in characters:
            character_path = os.path.join(rag_path, character)
            if not os.path.isdir(character_path):
                continue

            backends = sorted(os.listdir(character_path))
            for backend in backends:
                if backend.lower() == "mistral":
                    continue

                backend_path = os.path.join(character_path, backend)
                if not os.path.isdir(backend_path):
                    continue

                # Use refined logic to count only true response files
                response_files = [
                    f for f in os.listdir(backend_path)
                    if f.endswith(".json") and not (
                        f.endswith("_summary.json")
                        or f.endswith("_eval.json")
                        or f.endswith("_logs.json")
                    )
                ]

                if len(response_files) != 3:
                    invalid_paths.append((backend_path, len(response_files)))

    if invalid_paths:
        for path, count in invalid_paths:
            print(f"Invalid path: {path} — contains {count} response files (expected 3)")
        raise RuntimeError(f"{len(invalid_paths)} backend paths failed validation. Aborting.")

    print("All character/backend paths passed validation.")

In [71]:
def run_bfi_evaluation_all(base_dir="character_response/BFI"):
    rag_types = sorted(os.listdir(base_dir))

    for rag in rag_types:
        rag_path = os.path.join(base_dir, rag)
        if not os.path.isdir(rag_path):
            continue

        characters = sorted(os.listdir(rag_path))
        for character in characters:
            character_path = os.path.join(rag_path, character)
            if not os.path.isdir(character_path):
                continue

            backends = sorted(os.listdir(character_path))
            for backend in backends:
                if backend.lower() == "mistral":
                    continue

                backend_path = os.path.join(character_path, backend)
                if not os.path.isdir(backend_path):
                    continue

                print(f"\n=== Evaluating: {rag}/{character}/{backend} ===")
                evaluate_character_bfi_er_batch(backend_path)

In [52]:
# First validate
validate_bfi_structure()


All character/backend paths passed validation.


In [72]:
# Then run evaluations only if validation passed
run_bfi_evaluation_all()


=== Evaluating: EmotionRAG/albus_dumbledore/deepseek ===
Skipping albus_dumbledore_3_BFI.json (already evaluated)
Skipping albus_dumbledore_1_BFI.json (already evaluated)
Skipping albus_dumbledore_2_BFI.json (already evaluated)
Final summary written: character_response/BFI/EmotionRAG/albus_dumbledore/deepseek/albus_dumbledore_deepseek_bfi_er_eval_summary.json

=== Evaluating: EmotionRAG/albus_dumbledore/llama3 ===
Skipping albus_dumbledore_3_BFI.json (already evaluated)
Skipping albus_dumbledore_1_BFI.json (already evaluated)
Skipping albus_dumbledore_2_BFI.json (already evaluated)
Final summary written: character_response/BFI/EmotionRAG/albus_dumbledore/llama3/albus_dumbledore_llama3_bfi_er_eval_summary.json

=== Evaluating: EmotionRAG/albus_dumbledore/qwen ===
Skipping albus_dumbledore_3_BFI.json (already evaluated)
Skipping albus_dumbledore_1_BFI.json (already evaluated)
Skipping albus_dumbledore_2_BFI.json (already evaluated)
Final summary written: character_response/BFI/EmotionRA

In [None]:
# import glob
# import json

# # Collect all summary files
# summaries = glob.glob("character_response/BFI/**/**/**/*_bfi_er_eval_summary.json", recursive=True)

# # Track empty summaries
# empty_summaries = []

# # Check each summary
# for summary_path in summaries:
#     with open(summary_path, "r", encoding="utf-8") as f:
#         data = json.load(f)
#     averages = data.get("averages", {})

#     if not averages:  # Check if dictionary is empty
#         empty_summaries.append(summary_path)
#         print(f"⚠️ EMPTY: {summary_path}")
#     else:
#         print(f"✅ OK: {summary_path} — {averages}")

# # Report total empty cases at the end
# if empty_summaries:
#     print(f"\n🚨 Found {len(empty_summaries)} empty summary files:")
#     for path in empty_summaries:
#         print(f"  - {path}")
# else:
#     print("\n✅ All summaries have valid average scores.")


✅ OK: character_response/BFI/EmotionRAG/minerva_mcgonagall/qwen/minerva_mcgonagall_qwen_bfi_er_eval_summary.json — {'Extraversion': 2.1, 'Agreeableness': 4.04, 'Conscientiousness': 4.93, 'Neuroticism': 1.92, 'Openness': 3.74}
✅ OK: character_response/BFI/EmotionRAG/minerva_mcgonagall/deepseek/minerva_mcgonagall_deepseek_bfi_er_eval_summary.json — {'Extraversion': 2.2, 'Agreeableness': 4.09, 'Conscientiousness': 5.0, 'Neuroticism': 1.58, 'Openness': 4.47}
✅ OK: character_response/BFI/EmotionRAG/minerva_mcgonagall/llama3/minerva_mcgonagall_llama3_bfi_er_eval_summary.json — {'Extraversion': 1.83, 'Agreeableness': 4.39, 'Conscientiousness': 4.93, 'Neuroticism': 1.8, 'Openness': 4.58}
✅ OK: character_response/BFI/EmotionRAG/harry_potter/qwen/harry_potter_qwen_bfi_er_eval_summary.json — {'Extraversion': 2.82, 'Agreeableness': 4.59, 'Conscientiousness': 4.12, 'Neuroticism': 3.8, 'Openness': 3.72}
✅ OK: character_response/BFI/EmotionRAG/harry_potter/deepseek/harry_potter_deepseek_bfi_er_eval_s

### Result Analysis

In [100]:
import json
from pathlib import Path
from statistics import mean
import os
import glob
from statistics import mean

CHARACTER_NAME_MAP = {
    "harry_potter": "Harry-en",
    "hermione_granger": "Hermione-en",
    "ron_weasley": "Ron-en",
    "luna_lovegood": "Luna-en",
    "draco_malfoy": "Malfoy-en",
    "severus_snape": "Snape-en",
    "albus_dumbledore": "Dumbledore-en",
    "minerva_mcgonagall": "McGonagall-en"
}

BFI_DIMENSIONS = ["Extraversion", "Agreeableness", "Conscientiousness", "Neuroticism", "Openness"]
MBTI_DIMENSIONS = ["E/I", "S/N", "T/F", "P/J"]

# For scaling MAE (BFI is 1–5 scale, MBTI is 0–100%)
BFI_SCORE_RANGE = 4.0  # 5.0 - 1.0
MBTI_SCORE_RANGE = 100.0

In [None]:
# Load ground truth
with open("characters_labels.json", "r", encoding="utf-8") as f:
    character_labels = json.load(f)["annotation"]

In [105]:
def evaluate_bfi_summary(path, character_labels):
    with open(path, "r", encoding="utf-8") as f:
        summary = json.load(f)

    raw_character = summary["character"]
    backend = summary["backend"]
    rag_framework = Path(path).parts[-4]  # ✅ Correctly extract RAG framework

    if raw_character not in CHARACTER_NAME_MAP:
        return None

    gt_character = CHARACTER_NAME_MAP[raw_character]
    if gt_character not in character_labels or "BFI" not in character_labels[gt_character]:
        return None

    predicted = summary["averages"]
    gt = character_labels[gt_character]["BFI"]

    total_mae = 0
    acc_dim = 0
    total_dims = 0
    full_correct = True

    for dim in BFI_DIMENSIONS:
        if dim not in predicted or dim not in gt:
            continue

        gt_entry = gt[dim]
        if gt_entry["type"] == "X":
            continue

        pred_score = predicted[dim]
        true_score = gt_entry["score"]
        true_type = gt_entry["type"]
        pred_type = "H" if pred_score > 3.0 else "L"

        mae = abs(pred_score - true_score)
        total_mae += mae
        total_dims += 1

        if pred_type == true_type:
            acc_dim += 1
        else:
            full_correct = False

    if total_dims == 0:
        return None

    return {
        "backend": backend,
        "rag_framework": rag_framework,
        "MAE_rescaled": total_mae / (total_dims * BFI_SCORE_RANGE),
        "AccDim": acc_dim / total_dims,
        "AccFull": 1.0 if full_correct else 0.0
    }

In [106]:
def aggregate_evaluations(base_dir="character_response/BFI"):
    results_by_backend = defaultdict(lambda: defaultdict(list))

    summary_files = glob.glob(f"{base_dir}/**/*_bfi_er_eval_summary.json", recursive=True)

    for path in summary_files:
        result = evaluate_bfi_summary(path, character_labels)
        if result:
            backend = result["backend"]
            rag = result["rag_framework"]
            results_by_backend[backend][rag].append(result)

    # Aggregate per backend+rag
    final = {}
    for backend, rag_dict in results_by_backend.items():
        final[backend] = {}
        for rag, entries in rag_dict.items():
            mae = mean(r["MAE_rescaled"] for r in entries)
            acc_dim = mean(r["AccDim"] for r in entries)
            acc_full = mean(r["AccFull"] for r in entries)
            final[backend][rag] = {
                "MAE_rescaled": round(mae, 4),
                "AccDim": round(acc_dim, 4),
                "AccFull": round(acc_full, 4),
                "characters": len(entries)
            }
    return final

In [107]:
final_summary = aggregate_evaluations()

# Print nicely
import pprint
pprint.pprint(final_summary)

# Save to JSON
with open("bfi_measured_alignment_summary.json", "w", encoding="utf-8") as f:
    json.dump(final_summary, f, indent=2)


{'deepseek': {'EmotionRAG': {'AccDim': 0.8458,
                             'AccFull': 0.375,
                             'MAE_rescaled': 0.1412,
                             'characters': 8},
              'RAG': {'AccDim': 0.8146,
                      'AccFull': 0.375,
                      'MAE_rescaled': 0.1418,
                      'characters': 8},
              'myRAG': {'AccDim': 0.7729,
                        'AccFull': 0.25,
                        'MAE_rescaled': 0.1535,
                        'characters': 8}},
 'llama3': {'EmotionRAG': {'AccDim': 0.8229,
                           'AccFull': 0.625,
                           'MAE_rescaled': 0.1686,
                           'characters': 8},
            'RAG': {'AccDim': 0.8292,
                    'AccFull': 0.5,
                    'MAE_rescaled': 0.1678,
                    'characters': 8},
            'myRAG': {'AccDim': 0.8396,
                      'AccFull': 0.5,
                      'MAE_rescaled': 0.1377,


### MBTI

In [108]:
with open("characters_labels.json", "r", encoding="utf-8") as f:
    character_labels = json.load(f)["annotation"]


In [109]:
def evaluate_mbti_summary(path, character_labels):
    with open(path, "r", encoding="utf-8") as f:
        summary = json.load(f)

    raw_character = summary["character"]
    backend = summary["backend"]
    rag_framework = Path(path).parts[-4]  # extract like BFI

    if raw_character not in CHARACTER_NAME_MAP:
        return None

    gt_character = CHARACTER_NAME_MAP[raw_character]
    if gt_character not in character_labels or "16Personalities" not in character_labels[gt_character]:
        return None

    predicted = summary["averages"]
    gt = character_labels[gt_character]["16Personalities"]

    total_mae = 0
    acc_dim = 0
    total_dims = 0
    full_correct = True

    for dim in MBTI_DIMENSIONS:
        if dim not in predicted or dim not in gt:
            continue

        gt_entry = gt[dim]
        if gt_entry["type"] == "X":
            continue

        # Get two opposing letters and percentages
        type1, type2 = list(predicted[dim].keys())
        pct1 = int(predicted[dim][type1].strip('%'))
        pct2 = int(predicted[dim][type2].strip('%'))

        # Predicted type is the one with the higher percent
        pred_type_letter = type1 if pct1 > pct2 else type2
        pred_score = max(pct1, pct2)
        true_score = gt_entry["score"]
        true_type = gt_entry["type"]

        # H means toward high side (i.e., pred_type_letter), L means other
        pred_type = "H" if pred_type_letter == type1 else "L"

        total_mae += abs(pred_score - true_score)
        total_dims += 1

        if pred_type == true_type:
            acc_dim += 1
        else:
            full_correct = False

    if total_dims == 0:
        return None

    return {
        "backend": backend,
        "rag_framework": rag_framework,
        "MAE_rescaled": total_mae / (total_dims * MBTI_SCORE_RANGE),
        "AccDim": acc_dim / total_dims,
        "AccFull": 1.0 if full_correct else 0.0
    }


In [110]:
def aggregate_mbti_evaluations(base_dir="character_response/MBTI"):
    results_by_backend = defaultdict(lambda: defaultdict(list))

    summary_files = glob.glob(f"{base_dir}/**/*_er_eval_summary.json", recursive=True)

    for path in summary_files:
        result = evaluate_mbti_summary(path, character_labels)
        if result:
            backend = result["backend"]
            rag = result["rag_framework"]
            results_by_backend[backend][rag].append(result)

    final = {}
    for backend, rag_dict in results_by_backend.items():
        final[backend] = {}
        for rag, entries in rag_dict.items():
            mae = mean(r["MAE_rescaled"] for r in entries)
            acc_dim = mean(r["AccDim"] for r in entries)
            acc_full = mean(r["AccFull"] for r in entries)
            final[backend][rag] = {
                "MAE_rescaled": round(mae, 4),
                "AccDim": round(acc_dim, 4),
                "AccFull": round(acc_full, 4),
                "characters": len(entries)
            }
    return final




In [111]:
final_mbti_summary = aggregate_mbti_evaluations()

# Print
import pprint
pprint.pprint(final_mbti_summary)

# Save
with open("mbti_measured_alignment_summary.json", "w", encoding="utf-8") as f:
    json.dump(final_mbti_summary, f, indent=2)


{'deepseek': {'EmotionRAG': {'AccDim': 0.5833,
                             'AccFull': 0.25,
                             'MAE_rescaled': 0.375,
                             'characters': 8},
              'RAG': {'AccDim': 0.6458,
                      'AccFull': 0.375,
                      'MAE_rescaled': 0.3783,
                      'characters': 8},
              'myRAG': {'AccDim': 0.75,
                        'AccFull': 0.5,
                        'MAE_rescaled': 0.366,
                        'characters': 8}},
 'llama3': {'EmotionRAG': {'AccDim': 0.5729,
                           'AccFull': 0.375,
                           'MAE_rescaled': 0.3546,
                           'characters': 8},
            'RAG': {'AccDim': 0.6458,
                    'AccFull': 0.375,
                    'MAE_rescaled': 0.3538,
                    'characters': 8},
            'myRAG': {'AccDim': 0.6458,
                      'AccFull': 0.375,
                      'MAE_rescaled': 0.3489,
  

In [112]:
def results_to_table(results_dict):
    rows = []
    for backend, rag_data in results_dict.items():
        for rag, metrics in rag_data.items():
            row = {
                "Backend": backend,
                "RAG": rag,
                "MAE_rescaled": metrics["MAE_rescaled"],
                "AccDim": metrics["AccDim"],
                "AccFull": metrics["AccFull"],
                "Characters": metrics["characters"]
            }
            rows.append(row)
    return rows

    # Assuming you already have these from previous code blocks
bfi_table = results_to_table(final_summary)            # from aggregate_evaluations()
mbti_table = results_to_table(final_mbti_summary)      # from aggregate_mbti_evaluations()



In [113]:
import pandas as pd

bfi_df = pd.DataFrame(bfi_table)
mbti_df = pd.DataFrame(mbti_table)

print("📊 BFI Evaluation Results:")
display(bfi_df.sort_values(by=["Backend", "RAG"]).reset_index(drop=True))

print("\n📊 MBTI Evaluation Results:")
display(mbti_df.sort_values(by=["Backend", "RAG"]).reset_index(drop=True))


📊 BFI Evaluation Results:


Unnamed: 0,Backend,RAG,MAE_rescaled,AccDim,AccFull,Characters
0,deepseek,EmotionRAG,0.1412,0.8458,0.375,8
1,deepseek,RAG,0.1418,0.8146,0.375,8
2,deepseek,myRAG,0.1535,0.7729,0.25,8
3,llama3,EmotionRAG,0.1686,0.8229,0.625,8
4,llama3,RAG,0.1678,0.8292,0.5,8
5,llama3,myRAG,0.1377,0.8396,0.5,8
6,qwen,EmotionRAG,0.1587,0.8708,0.625,8
7,qwen,RAG,0.1577,0.8292,0.5,8
8,qwen,myRAG,0.1488,0.7729,0.25,8



📊 MBTI Evaluation Results:


Unnamed: 0,Backend,RAG,MAE_rescaled,AccDim,AccFull,Characters
0,deepseek,EmotionRAG,0.375,0.5833,0.25,8
1,deepseek,RAG,0.3783,0.6458,0.375,8
2,deepseek,myRAG,0.366,0.75,0.5,8
3,llama3,EmotionRAG,0.3546,0.5729,0.375,8
4,llama3,RAG,0.3538,0.6458,0.375,8
5,llama3,myRAG,0.3489,0.6458,0.375,8
6,qwen,EmotionRAG,0.3664,0.5833,0.25,8
7,qwen,RAG,0.3564,0.6146,0.25,8
8,qwen,myRAG,0.3663,0.6146,0.25,8
