### Build Prompt Template

In [1]:
def generate_mbti_prompt(scale_name, dimension_name, dimension_description, experimenter_name,
                         character_name, language, type1, type2):
    """Render the rater prompt for one dichotomous (two-pole) dimension.

    The prompt introduces the scale and dimension, embeds the dimension
    description between ``===`` markers, and asks for a JSON reply holding an
    ``analysis`` plus a ``result`` object with one percentage per pole.

    Args:
        scale_name: Name of the questionnaire, interpolated into the text.
        dimension_name: Name of the dimension being assessed.
        dimension_description: Background text for the dimension.
        experimenter_name: Name the prompt author introduces themselves with.
        character_name: Name of the participant whose score is requested.
        language: Language the conversations were held in.
        type1: Label of the first pole (first key of ``result``).
        type2: Label of the second pole (second key of ``result``).

    Returns:
        The rendered prompt with leading and trailing whitespace removed.
    """
    # The template starts and ends with a newline for readability; .strip()
    # drops both so the prompt begins directly with "You are an expert".
    return f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
My name is {experimenter_name}. I’ve invited a participant, and we had many conversations in {language}. 
I will input the conversations.

Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the percentage of each category, which sums to 100%, e.g., 30% {type1} and 70% {type2}. 
Please output in the following json format: 
=== 
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": {{ 
    "{type1}": <percentage 1>, 
    "{type2}": <percentage 2> 
  }} 
}} 
(The sum of percentage 1 and percentage 2 should be 100%. Output with percent sign.)
""".strip()


In [2]:
# prompt = generate_mbti_prompt(
#     scale_name="MBTI",
#     dimension_name="Introversion vs. Extraversion",
#     dimension_description="This dimension reflects how individuals derive their energy: from solitude and reflection (Introversion) or from social interaction (Extraversion).",
#     experimenter_name="Dr. Lee",
#     character_name="Alex",
#     language="English",
#     type1="Introversion",
#     type2="Extraversion"
# )

# print(prompt)


You are an expert in Psychometrics, especially MBTI. I am conducting the MBTI test on someone. 
I am gauging his/her position on the Introversion vs. Extraversion dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
This dimension reflects how individuals derive their energy: from solitude and reflection (Introversion) or from social interaction (Extraversion).
===
My name is Dr. Lee. I’ve invited a participant, and we had many conversations in English. 
I will input the conversations.

Please help me assess Alex’s score within the Introversion vs. Extraversion dimension of MBTI.
You should provide the percentage of each category, which sums to 100%, e.g., 30% Introversion and 70% Extraversion. 
Please output in the following json format: 
=== 
{ 
  "analysis": <your analysis based on the conversations>, 
  "result": { 
    "Introversion": <percentage 1>, 
    "Extraversion": <percentage 2> 
  } 
} 
(The sum of perce

In [2]:
def generate_bigfive_prompt(scale_name, dimension_name, dimension_description,
                            experimenter_name, character_name, language,
                            lowest_score, middle_score, highest_score):
    """Render the rater prompt for one dimension scored on a numeric scale.

    The prompt introduces the scale and dimension, embeds the dimension
    description between ``===`` markers, explains what the lowest, middle and
    highest scores mean, and asks for a JSON reply with ``analysis`` and a
    single numeric ``result``.

    Args:
        scale_name: Name of the questionnaire, interpolated into the text.
        dimension_name: Name of the dimension being assessed.
        dimension_description: Background text for the dimension.
        experimenter_name: Name the prompt author introduces themselves with.
        character_name: Name of the participant whose score is requested.
        language: Language the conversations were held in.
        lowest_score: Value described as "not <dimension> at all".
        middle_score: Value described as "neutral".
        highest_score: Value described as "strongly <dimension>".

    Returns:
        The rendered prompt with leading and trailing whitespace removed.
    """
    # The template starts and ends with a newline for readability; .strip()
    # removes both before the prompt is handed back.
    return f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
My name is {experimenter_name}. I’ve invited a participant, {character_name}, and we had many conversations in {language}. 
I will input the conversations.

Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the score of {character_name} in terms of {dimension_name}, which is a number between {lowest_score} and {highest_score}. 
{lowest_score} denotes ‘not {dimension_name} at all’, {middle_score} denotes ‘neutral’, and {highest_score} denotes ‘strongly {dimension_name}’. 
Other numbers in this range represent different degrees of ‘{dimension_name}’. 
Please output in the following json format: 
=== 
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": <your score> 
}}
""".strip()


In [7]:
# prompt = generate_bigfive_prompt(
#     scale_name="Big Five Personality Traits",
#     dimension_name="Openness",
#     dimension_description="Openness involves active imagination, aesthetic sensitivity, attentiveness to inner feelings, preference for variety, and intellectual curiosity.",
#     experimenter_name="Dr. Jane",
#     character_name="Jordan",
#     language="English",
#     lowest_score=1,
#     middle_score=3,
#     highest_score=5
# )

# print(prompt)


You are an expert in Psychometrics, especially Big Five Personality Traits. I am conducting the Big Five Personality Traits test on someone. 
I am gauging his/her position on the Openness dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
Openness involves active imagination, aesthetic sensitivity, attentiveness to inner feelings, preference for variety, and intellectual curiosity.
===
My name is Dr. Jane. I’ve invited a participant, Jordan, and we had many conversations in English. 
I will input the conversations.

Please help me assess Jordan’s score within the Openness dimension of Big Five Personality Traits.
You should provide the score of Jordan in terms of Openness, which is a number between 1 and 5. 
1 denotes ‘not Openness at all’, 3 denotes ‘neutral’, and 5 denotes ‘strongly Openness’. 
Other numbers in this range represent different degrees of ‘Openness’. 
Please output in the following json format: 
===

### Prepare OpenAI API

In [3]:
import os
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
from huggingface_hub import InferenceClient
import time
import random


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# === OpenAI API Setup ===
# load_dotenv() runs first so that OPENAI_API_KEY can be supplied through a
# .env file; the key is then read from the process environment (None if unset).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# One module-level client, created with timeout=60, is shared by every
# openai_generator() call in this notebook.
client = OpenAI(api_key=api_key, timeout=60)


class OpenAICallFailed(Exception):
    """Raised by openai_generator when every attempt to call the API failed."""


def openai_generator(prompt: str, system_prompt: str = "", model: str = "gpt-4.1-nano",
                     temperature: float = 0.7, top_p=1.0, max_tokens: int = 1000,
                     max_retries: int = 3):
    """Send one chat-completion request, retrying with exponential back-off.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Optional system message; omitted when empty.
        model: Model name passed to the chat-completions endpoint.
        temperature: Sampling temperature passed through to the API.
        top_p: Nucleus-sampling value passed through to the API.
        max_tokens: Completion length limit passed through to the API.
        max_retries: Total number of attempts before giving up.

    Returns:
        A one-element list ``[{"generated_text": <stripped reply text>}]``.

    Raises:
        OpenAICallFailed: If all ``max_retries`` attempts raised. The last
            underlying exception is attached as ``__cause__``.
    """
    # The message list does not depend on the attempt, so build it once.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p
            )
            # Wrap the reply in a list of dicts; callers index [0]["generated_text"].
            return [{"generated_text": response.choices[0].message.content.strip()}]
        except Exception as e:
            last_error = e
            print(f"OpenAI call failed (attempt {attempt + 1}/{max_retries}): {e}")
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error.
            if attempt + 1 < max_retries:
                time.sleep(2 ** attempt + random.uniform(0, 1))

    raise OpenAICallFailed("OpenAI call failed after multiple retries.") from last_error

In [5]:
import json
import os
from collections import defaultdict

def split_list(input_list, n=4):
    """Split a list into chunks of ``n`` items, topping up a short last chunk.

    Inputs shorter than ``2 * (n - 1)`` are not split: the result is a
    one-element list holding ``input_list`` itself (the same object, not a
    copy). Otherwise the list is cut into consecutive slices of ``n`` items.
    If the last slice has fewer than ``n - 1`` items, trailing items are moved
    from the earlier slices (one at a time, cycling through them in order)
    onto the end of the last slice until it holds ``n - 1`` items.

    Args:
        input_list: The list to split.
        n: Target chunk size; must be at least 1.

    Returns:
        A list of chunks. Moved items are appended to the last chunk, so their
        original order is not preserved there.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    if len(input_list) < 2 * (n - 1):
        return [input_list]

    result = [input_list[i:i + n] for i in range(0, len(input_list), n)]
    if not result:
        # Only reachable for an empty input with n == 1; mirror the
        # "too short to split" return shape instead of indexing result[-1].
        return [input_list]

    # Only the chunks before the last one may donate items. Cycling over them
    # keeps the index in range when more items are needed than there are
    # donors, and never pops from the last chunk itself.
    donors = result[:-1]
    num_to_pop = n - 1 - len(result[-1])
    for i in range(num_to_pop):
        if not donors:
            break
        result[-1].append(donors[i % len(donors)].pop())

    return result

def group_responses_by_dimension(response_path):
    """Load a response file and bucket its question/answer pairs by dimension.

    The file is read as UTF-8 JSON and must contain a ``"responses"`` list
    whose items each have ``"dimension"``, ``"question"`` and ``"response"``
    keys.

    Args:
        response_path: Path of the JSON response file.

    Returns:
        A ``defaultdict(list)`` mapping each dimension to a list of
        ``(question, response)`` tuples, both whitespace-stripped, in file order.
    """
    with open(response_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    grouped = defaultdict(list)
    for entry in payload["responses"]:
        dim_key = entry["dimension"]
        qa_pair = (entry["question"].strip(), entry["response"].strip())
        grouped[dim_key].append(qa_pair)

    return grouped

def prepare_batches_per_dimension(response_path, batch_size=4):
    """Group a response file by dimension and split each group into batches.

    Args:
        response_path: Path of the JSON response file, passed to
            ``group_responses_by_dimension``.
        batch_size: Chunk size handed to ``split_list`` as ``n``.

    Returns:
        A plain dict mapping each dimension to the list of batches that
        ``split_list`` produced for its ``(question, response)`` pairs.
    """
    pairs_by_dim = group_responses_by_dimension(response_path)
    return {
        dim: split_list(qr_pairs, n=batch_size)
        for dim, qr_pairs in pairs_by_dim.items()
    }
    

In [38]:
# # Example file path
# test_path = "character_response/MBTI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_1_16p_responses.json"

# batches = prepare_batches_per_dimension(test_path, batch_size=4)

# # Check structure
# for dim, sets in batches.items():
#     print(f"{dim} has {len(sets)} batches")
#     for batch in sets:
#         for q, r in batch:
#             print(f"Q: {q}\nR: {r}\n")
#         print("-" * 50)


E/I has 4 batches
Q: Do you regularly make new friends?
R: Ah, friendship—such a curious and wondrous thing, is it not? I have been fortunate to form bonds that have shaped me in ways both luminous and shadowed. From the fiery intensity of youth to the quieter, steadfast connections of later years, each friendship has left its mark upon my heart. Though time and circumstance may weave their complexities, I find that the capacity for new kinship never truly fades—so long as one remains open to the unexpected magic of another’s soul.

Q: Do you rarely try to introduce yourself to new people and mostly talk to the ones you already know at social events?
R: Ah, social gatherings—such curious things, are they not? I have found that while new faces can hold great promise, the bonds we nurture with those we already know often bear the sweetest fruit. Yet, one must never close the door entirely to the unknown, for even the briefest encounter may alter the course of a life. That said... I confe

In [12]:
# Read the 16Personalities schema a single time and keep it at notebook scope.
with open("16Personalities.json", "r", encoding="utf-8") as schema_file:
    mbti_schema = json.load(schema_file)

# Per-dimension background text interpolated into the rater prompt.
dim_desc = mbti_schema["prompts"]["dim_desc"]

# Each "X/Y" dimension key maps to its pair of pole letters, e.g. "E/I" -> ("E", "I").
dim_types = {
    dimension: tuple(dimension.split("/"))
    for dimension in ("E/I", "S/N", "T/F", "P/J")
}

In [6]:
def generate_er_batch_prompt_auto(
    dimension_name,
    qr_batch,
    language="English",
    experimenter_name="Interviewer",
    character_name="the participant"
):
    """Render the MBTI rater prompt for one batch of question/answer pairs.

    The scale name, the dimension description and the two pole labels are
    looked up in the notebook-level ``mbti_schema``, ``dim_desc`` and
    ``dim_types`` objects.

    Args:
        dimension_name: Dimension key, used to index ``dim_desc`` and ``dim_types``.
        qr_batch: Iterable of ``(question, response)`` pairs embedded in the prompt.
        language: Language the conversations were held in.
        experimenter_name: Role the prompt author introduces themselves with.
        character_name: How the participant is referred to in the request.

    Returns:
        The rendered prompt with leading and trailing whitespace removed.

    Raises:
        KeyError: If ``dimension_name`` is missing from ``dim_desc`` or ``dim_types``.
    """
    scale_name = mbti_schema["name"]
    dimension_description = dim_desc[dimension_name]
    type1, type2 = dim_types[dimension_name]

    # One "Q: ...\nA: ..." entry per pair, joined with single newlines.
    qa_entries = []
    for q, r in qr_batch:
        qa_entries.append(f"Q: {q}\nA: {r}")
    qr_text = "\n".join(qa_entries)

    return f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
I am an {experimenter_name}. I’ve invited a participant, and we had many conversations in {language}. 
Below is a batch of question-and-response pairs from our interview:
===
{qr_text}
===
Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the percentage of each category, which sums to 100%, e.g., 30% {type1} and 70% {type2}. 
Please output in the following json format:
===
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": {{ 
    "{type1}": <percentage 1>, 
    "{type2}": <percentage 2> 
  }} 
}}
(The sum of percentage 1 and percentage 2 should be 100%. Output with percent sign.)
""".strip()


### Evaluate Function

In [8]:
import os
import json
from pathlib import Path
from statistics import mean
import time

def safe_parse_llm_output(output):
    """Parse an LLM reply as JSON, tolerating text wrapped around the object.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is tried, which covers replies wrapped in markdown
    code fences or surrounded by a sentence of prose.

    Args:
        output: Raw reply text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: The error from parsing the full reply, if no
            JSON could be recovered. The first 300 characters of the reply
            are printed before re-raising.
    """
    try:
        return json.loads(output)
    except json.JSONDecodeError:
        start, end = output.find("{"), output.rfind("}")
        if 0 <= start < end:
            try:
                return json.loads(output[start:end + 1])
            except json.JSONDecodeError:
                # Fall through and report the original failure.
                pass
        print("JSON parsing failed. Raw output:")
        print(output[:300] + "...")
        raise

def _parse_percentage(value):
    """Convert a reported percentage such as "30%", "30.5 %" or 30 into a float."""
    if isinstance(value, bool):
        raise ValueError(f"Unsupported percentage value: {value!r}")
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        return float(value.strip().rstrip("%").strip())
    raise ValueError(f"Unsupported percentage value: {value!r}")


def _load_mbti_eval_averages(eval_path):
    """Recompute per-dimension average percentages from a saved *_er_eval.json file."""
    with open(eval_path, "r", encoding="utf-8") as f:
        saved = json.load(f)

    averages = {}
    for dim, entry in saved["dimensions"].items():
        batches = entry["batches"]
        # The saved "average" object carries the two pole labels of the dimension.
        averages[dim] = {
            label: mean([_parse_percentage(batch[label]) for batch in batches])
            for label in entry["average"]
        }
    return averages


def evaluate_character_er_batch(character_backend_path, model="gpt-4.1", max_retries=3):
    """Score every MBTI response file of one character/backend directory.

    For each response file the question/answer pairs are batched per
    dimension, each batch is rated through ``openai_generator``, and the
    per-batch results plus their average are written next to the response
    file as ``<name>_er_eval.json``. A summary averaging all files is written
    as ``<character>_<backend>_er_eval_summary.json``.

    Files that already have an ``_er_eval.json`` are not re-rated; their saved
    batch results are loaded so that the summary still covers every file. If
    a saved evaluation cannot be read, the file is rated again.

    Relies on ``prepare_batches_per_dimension``, ``generate_er_batch_prompt_auto``,
    ``openai_generator``, ``safe_parse_llm_output`` and ``dim_types`` from the
    notebook scope.

    Args:
        character_backend_path: Directory ``.../<character>/<backend>`` holding
            exactly three response files.
        model: Model name passed to ``openai_generator``.
        max_retries: Extra attempts per batch after the first one fails.

    Raises:
        RuntimeError: If the directory does not hold exactly three response
            files, if a batch still fails after all attempts, or if a response
            file lacks one of the four MBTI dimensions.
    """
    character_name = Path(character_backend_path).parts[-2]
    backend_name = Path(character_backend_path).parts[-1]
    all_dim_scores = {}

    # Sorted so files are processed in a stable order across runs.
    response_files = sorted(
        f for f in os.listdir(character_backend_path)
        if f.endswith(".json")
        and not f.endswith(("_summary.json", "_eval.json", "_logs.json"))
    )

    if len(response_files) != 3:
        raise RuntimeError(f"{character_backend_path} contains {len(response_files)} response files (expected 3)")

    for response_file in response_files:
        full_path = os.path.join(character_backend_path, response_file)
        out_resp_path = os.path.join(character_backend_path, response_file.replace(".json", "_er_eval.json"))

        if os.path.exists(out_resp_path):
            try:
                cached = _load_mbti_eval_averages(out_resp_path)
            except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
                print(f"Existing evaluation for {response_file} is unreadable ({e}); re-evaluating")
            else:
                print(f"Skipping {response_file} (already evaluated)")
                # Feed the saved scores into the summary; otherwise a re-run
                # would overwrite the summary with partial or empty averages.
                for dim, averages in cached.items():
                    all_dim_scores.setdefault(dim, []).append(averages)
                continue

        batches_by_dim = prepare_batches_per_dimension(full_path)
        per_response_output = {
            "character": character_name,
            "backend": backend_name,
            "response_file": response_file,
            "dimensions": {}
        }

        try:
            for dim, batches in batches_by_dim.items():
                type1, type2 = dim_types[dim]
                dim_scores = []      # raw "result" objects, saved as returned
                numeric_scores = []  # the same results converted to floats

                for batch in batches:
                    prompt = generate_er_batch_prompt_auto(dim, batch)

                    for attempt in range(max_retries + 1):
                        try:
                            output = openai_generator(prompt, system_prompt="", model=model)[0]["generated_text"]
                            result = safe_parse_llm_output(output)["result"]
                            # Convert inside the retry loop so a malformed
                            # percentage triggers another attempt instead of
                            # aborting the whole evaluation later.
                            numeric = {
                                type1: _parse_percentage(result[type1]),
                                type2: _parse_percentage(result[type2]),
                            }
                        except Exception as e:
                            print(f"Attempt {attempt+1}/{max_retries+1} failed for {response_file}, {dim}: {e}")
                            if attempt == max_retries:
                                raise RuntimeError(f"All retries failed for {response_file}, {dim}") from e
                            time.sleep(2)
                        else:
                            dim_scores.append(result)
                            numeric_scores.append(numeric)
                            break

                avg_type1 = mean([s[type1] for s in numeric_scores])
                avg_type2 = mean([s[type2] for s in numeric_scores])

                per_response_output["dimensions"][dim] = {
                    "batches": dim_scores,
                    "average": {
                        type1: f"{round(avg_type1)}%",
                        type2: f"{round(avg_type2)}%"
                    }
                }

                all_dim_scores.setdefault(dim, []).append({type1: avg_type1, type2: avg_type2})

            # Ensure all dimensions are present
            required_dims = {"E/I", "S/N", "T/F", "P/J"}
            missing_dims = required_dims - set(per_response_output["dimensions"].keys())
            if missing_dims:
                raise RuntimeError(f"Missing dimensions in {response_file}: {missing_dims}")

            with open(out_resp_path, "w", encoding="utf-8") as f:
                json.dump(per_response_output, f, indent=2)
            print(f"Saved: {out_resp_path}")

        except Exception as e:
            # Drop a partially written output so the next run re-evaluates it.
            if os.path.exists(out_resp_path):
                os.remove(out_resp_path)
            print(f"Evaluation aborted for {response_file}. Reason: {e}")
            raise

    summary = {
        "character": character_name,
        "backend": backend_name,
        "averages": {}
    }

    for dim, scores in all_dim_scores.items():
        # Every per-file entry of a dimension carries the same two pole labels.
        summary["averages"][dim] = {
            label: f"{round(mean([s[label] for s in scores]))}%"
            for label in scores[0]
        }

    out_summary_path = os.path.join(character_backend_path, f"{character_name}_{backend_name}_er_eval_summary.json")
    with open(out_summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"Final summary written: {out_summary_path}")


In [54]:
# test_path = "character_response/MBTI/myRAG/albus_dumbledore/llama3"
# evaluate_character_er_batch(test_path)


Saved: character_response/MBTI/myRAG/albus_dumbledore/llama3/albus_dumbledore_1_16p_responses_er_eval.json
Saved: character_response/MBTI/myRAG/albus_dumbledore/llama3/albus_dumbledore_2_16p_responses_er_eval.json
Saved: character_response/MBTI/myRAG/albus_dumbledore/llama3/albus_dumbledore_3_16p_responses_er_eval.json
Final summary written: character_response/MBTI/myRAG/albus_dumbledore/llama3/albus_dumbledore_llama3_er_eval_summary.json


In [9]:
def validate_mbti_structure(base_dir="character_response/MBTI"):
    """Check that every backend directory holds exactly three response files.

    Walks ``base_dir/<rag>/<character>/<backend>`` in sorted order, ignoring
    non-directories and any backend named "mistral" (case-insensitive). A
    response file is a ``.json`` file whose name does not end in
    ``_summary.json``, ``_eval.json`` or ``_logs.json``.

    Args:
        base_dir: Root directory of the MBTI responses.

    Raises:
        RuntimeError: If at least one backend directory has a response-file
            count other than three; each offending path is printed first.
    """
    non_response_suffixes = ("_summary.json", "_eval.json", "_logs.json")
    invalid_paths = []

    for rag_name in sorted(os.listdir(base_dir)):
        rag_dir = os.path.join(base_dir, rag_name)
        if not os.path.isdir(rag_dir):
            continue

        for character_name in sorted(os.listdir(rag_dir)):
            character_dir = os.path.join(rag_dir, character_name)
            if not os.path.isdir(character_dir):
                continue

            for backend_name in sorted(os.listdir(character_dir)):
                backend_dir = os.path.join(character_dir, backend_name)
                if backend_name.lower() == "mistral" or not os.path.isdir(backend_dir):
                    continue

                # Count only raw response files; evaluation outputs and logs
                # living in the same directory are ignored.
                n_responses = sum(
                    1 for entry in os.listdir(backend_dir)
                    if entry.endswith(".json") and not entry.endswith(non_response_suffixes)
                )
                if n_responses != 3:
                    invalid_paths.append((backend_dir, n_responses))

    for path, count in invalid_paths:
        print(f"Invalid path: {path} — contains {count} response files (expected 3)")
    if invalid_paths:
        raise RuntimeError(f"{len(invalid_paths)} backend paths failed validation. Aborting.")

    print("All character/backend paths passed validation.")


In [10]:
def run_mbti_evaluation_all(base_dir="character_response/MBTI"):
    """Run ``evaluate_character_er_batch`` on every backend directory.

    Walks ``base_dir/<rag>/<character>/<backend>`` in sorted order. Entries
    that are not directories are ignored, as is any backend named "mistral"
    (case-insensitive). A header line is printed before each evaluation.

    Args:
        base_dir: Root directory of the MBTI responses.
    """
    for rag in sorted(os.listdir(base_dir)):
        rag_dir = os.path.join(base_dir, rag)
        if os.path.isdir(rag_dir):
            for character in sorted(os.listdir(rag_dir)):
                character_dir = os.path.join(rag_dir, character)
                if os.path.isdir(character_dir):
                    for backend in sorted(os.listdir(character_dir)):
                        backend_dir = os.path.join(character_dir, backend)
                        if backend.lower() != "mistral" and os.path.isdir(backend_dir):
                            print(f"\n=== Evaluating: {rag}/{character}/{backend} ===")
                            evaluate_character_er_batch(backend_dir)


In [13]:
# First: strict validation. validate_mbti_structure() raises RuntimeError when any
# backend directory does not hold exactly 3 response files, which stops this cell
# before any evaluation is started.
validate_mbti_structure()

# If it passes, run all MBTI evaluations (one evaluate_character_er_batch call
# per non-"mistral" backend directory).
run_mbti_evaluation_all()


All character/backend paths passed validation.

=== Evaluating: EmotionRAG/albus_dumbledore/deepseek ===


Saved: character_response/MBTI/EmotionRAG/albus_dumbledore/deepseek/albus_dumbledore_3_MBTI_er_eval.json
Saved: character_response/MBTI/EmotionRAG/albus_dumbledore/deepseek/albus_dumbledore_1_MBTI_er_eval.json
Saved: character_response/MBTI/EmotionRAG/albus_dumbledore/deepseek/albus_dumbledore_2_MBTI_er_eval.json
Final summary written: character_response/MBTI/EmotionRAG/albus_dumbledore/deepseek/albus_dumbledore_deepseek_er_eval_summary.json

=== Evaluating: EmotionRAG/albus_dumbledore/llama3 ===
Saved: character_response/MBTI/EmotionRAG/albus_dumbledore/llama3/albus_dumbledore_3_MBTI_er_eval.json
Saved: character_response/MBTI/EmotionRAG/albus_dumbledore/llama3/albus_dumbledore_1_MBTI_er_eval.json
Saved: character_response/MBTI/EmotionRAG/albus_dumbledore/llama3/albus_dumbledore_2_MBTI_er_eval.json
Final summary written: character_response/MBTI/EmotionRAG/albus_dumbledore/llama3/albus_dumbledore_llama3_er_eval_summary.json

=== Evaluating: EmotionRAG/albus_dumbledore/qwen ===
Saved: c

### BFI Evaluation

In [47]:
# Read the BFI schema a single time and keep it at notebook scope.
with open("BFI.json", "r", encoding="utf-8") as schema_file:
    bfi_schema = json.load(schema_file)

# Per-dimension background text interpolated into the rater prompt.
bfi_dim_desc = bfi_schema["prompts"]["dim_desc"]
# Two values, (lowest, highest): generate_bfi_prompt_auto unpacks exactly two
# items and derives the middle score itself.
bfi_score_range = bfi_schema["range"]

def generate_bfi_prompt_auto(
    dimension_name,
    qr_batch,
    language="English",
    experimenter_name="Interviewer",
    character_name="the participant"
):
    """Render the BFI rater prompt for one batch of question/answer pairs.

    The scale name, the dimension description and the score range are looked
    up in the notebook-level ``bfi_schema``, ``bfi_dim_desc`` and
    ``bfi_score_range`` objects.

    Args:
        dimension_name: Dimension key, used to index ``bfi_dim_desc``.
        qr_batch: Iterable of ``(question, response)`` pairs embedded in the prompt.
        language: Language the conversations were held in.
        experimenter_name: Name the prompt author introduces themselves with.
        character_name: How the participant is referred to in the prompt.

    Returns:
        The rendered prompt with leading and trailing whitespace removed.

    Raises:
        KeyError: If ``dimension_name`` is missing from ``bfi_dim_desc``.
        ValueError: If ``bfi_score_range`` does not unpack into two values.
    """
    scale_name = bfi_schema["name"]
    dimension_description = bfi_dim_desc[dimension_name]
    lowest_score, highest_score = bfi_score_range
    # True division: the midpoint is a float and is rendered as such in the prompt.
    middle_score = (lowest_score + highest_score) / 2

    # One "Q: ...\nA: ..." entry per pair, joined with single newlines.
    qa_entries = []
    for q, r in qr_batch:
        qa_entries.append(f"Q: {q}\nA: {r}")
    qr_text = "\n".join(qa_entries)

    return f"""
You are an expert in Psychometrics, especially {scale_name}. I am conducting the {scale_name} test on someone. 
I am gauging his/her position on the {dimension_name} dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
{dimension_description}
===
My name is {experimenter_name}. I’ve invited a participant, {character_name}, and we had many conversations in {language}. 
Below is a batch of question-and-response pairs from our interview:
===
{qr_text}
===
Please help me assess {character_name}’s score within the {dimension_name} dimension of {scale_name}.
You should provide the score of {character_name} in terms of {dimension_name}, which is a number between {lowest_score} and {highest_score}. 
{lowest_score} denotes ‘not {dimension_name} at all’, {middle_score} denotes ‘neutral’, and {highest_score} denotes ‘strongly {dimension_name}’. 
Other numbers in this range represent different degrees of ‘{dimension_name}’. 
Please output in the following json format:
=== 
{{ 
  "analysis": <your analysis based on the conversations>, 
  "result": <your score> 
}}
""".strip()


In [48]:
# bfi_test_path = "character_response/BFI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_1_BFI.json"
# batches = prepare_batches_per_dimension(bfi_test_path)
# example_batch = batches["Openness"][0]

# prompt = generate_bfi_prompt_auto("Openness", example_batch)
# print(prompt)


You are an expert in Psychometrics, especially BFI. I am conducting the BFI test on someone. 
I am gauging his/her position on the Openness dimension through a series of open-ended questions. 
For clarity, here’s some background on this particular dimension:
===
Openness in the Big Five Inventory relates to a cognitive style that values exploration and appreciation of new experiences. It differentiates intellectually curious, creative individuals from those who are traditional and closed-minded. Openness involves a preference for abstract over concrete thinking and a tendency towards novelty rather than convention.
The six facets of openness are:
1. Fantasy: Active imagination and vivid fantasy life.
2. Aesthetics: Deep appreciation for art and beauty.
3. Feelings: Sensitivity to, recognition, and valuing of one's own emotions.
4. Actions: Willingness to try new experiences and embrace change.
5. Ideas: Intellectual curiosity and openness to unconventional ideas.
6. Values: Reexaminati

In [None]:
import os
import json
from pathlib import Path
from statistics import mean
import time

def safe_parse_bfi_output(output):
    """Parse a BFI rater reply into ``(analysis, score)``.

    The reply is first parsed as JSON as-is. If that fails, the span from the
    first ``{`` to the last ``}`` is tried, which covers replies wrapped in
    markdown code fences or surrounded by a sentence of prose.

    Args:
        output: Raw reply text.

    Returns:
        A tuple ``(analysis, score)`` where ``score`` is ``float(result)``.

    Raises:
        Exception: Whatever parsing, key lookup or float conversion raised;
            the first 300 characters of the reply are printed before
            re-raising.
    """
    try:
        try:
            parsed = json.loads(output)
        except json.JSONDecodeError:
            start, end = output.find("{"), output.rfind("}")
            if not 0 <= start < end:
                raise
            parsed = json.loads(output[start:end + 1])
        score = float(parsed["result"])
        return parsed["analysis"], score
    except Exception:
        print("Failed to parse BFI LLM output. Raw output:")
        print(output[:300] + "...")
        raise

def _load_bfi_eval_averages(eval_path):
    """Read the per-dimension ``average_score`` values from a saved *_er_eval.json file."""
    with open(eval_path, "r", encoding="utf-8") as f:
        saved = json.load(f)
    return {
        dim: float(entry["average_score"])
        for dim, entry in saved["dimensions"].items()
    }


def evaluate_character_bfi_er_batch(character_backend_path, model="gpt-4.1", max_retries=3):
    """Score every BFI response file of one character/backend directory.

    For each response file the question/answer pairs are batched per
    dimension, each batch is rated through ``openai_generator``, and the
    per-batch scores plus their average are written next to the response file
    as ``<name>_er_eval.json``. A summary averaging all files is written as
    ``<character>_<backend>_bfi_er_eval_summary.json``.

    Files that already have an ``_er_eval.json`` are not re-rated; their saved
    averages are loaded so that the summary still covers every file. If a
    saved evaluation cannot be read, the file is rated again.

    Relies on ``prepare_batches_per_dimension``, ``generate_bfi_prompt_auto``,
    ``openai_generator``, ``safe_parse_bfi_output`` and ``bfi_schema`` from the
    notebook scope.

    Args:
        character_backend_path: Directory ``.../<character>/<backend>`` holding
            exactly three response files.
        model: Model name passed to ``openai_generator``.
        max_retries: Extra attempts per batch after the first one fails.

    Raises:
        RuntimeError: If the directory does not hold exactly three response
            files, if a batch still fails after all attempts, or if a response
            file lacks one of the dimensions listed in the schema.
    """
    character_name = Path(character_backend_path).parts[-2]
    backend_name = Path(character_backend_path).parts[-1]
    all_dim_scores = {}

    # Sorted so files are processed in a stable order across runs.
    response_files = sorted(
        f for f in os.listdir(character_backend_path)
        if f.endswith(".json")
        and not f.endswith(("_summary.json", "_eval.json", "_logs.json"))
    )

    if len(response_files) != 3:
        raise RuntimeError(f"{character_backend_path} contains {len(response_files)} response files (expected 3)")

    for response_file in response_files:
        full_path = os.path.join(character_backend_path, response_file)
        out_resp_path = os.path.join(character_backend_path, response_file.replace(".json", "_er_eval.json"))

        if os.path.exists(out_resp_path):
            try:
                cached = _load_bfi_eval_averages(out_resp_path)
            except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
                print(f"Existing evaluation for {response_file} is unreadable ({e}); re-evaluating")
            else:
                print(f"Skipping {response_file} (already evaluated)")
                # Feed the saved averages into the summary; otherwise a re-run
                # would overwrite the summary with partial or empty averages.
                for dim, avg_score in cached.items():
                    all_dim_scores.setdefault(dim, []).append(avg_score)
                continue

        batches_by_dim = prepare_batches_per_dimension(full_path)
        per_response_output = {
            "character": character_name,
            "backend": backend_name,
            "response_file": response_file,
            "dimensions": {}
        }

        try:
            for dim, batches in batches_by_dim.items():
                batch_scores = []

                for batch in batches:
                    prompt = generate_bfi_prompt_auto(
                        dimension_name=dim,
                        qr_batch=batch,
                        character_name="the participant",
                        experimenter_name="Interviewer"
                    )

                    for attempt in range(max_retries + 1):
                        try:
                            output = openai_generator(prompt, model=model)[0]["generated_text"]
                            analysis, score = safe_parse_bfi_output(output)
                        except Exception as e:
                            print(f"Attempt {attempt+1}/{max_retries+1} failed for {response_file}, {dim}: {e}")
                            if attempt == max_retries:
                                raise RuntimeError(f"All retries failed for {response_file}, {dim}") from e
                            # Pause only when another attempt follows.
                            time.sleep(2)
                        else:
                            batch_scores.append(score)
                            break

                avg_score = round(mean(batch_scores), 2)
                per_response_output["dimensions"][dim] = {
                    "batch_scores": batch_scores,
                    "average_score": avg_score
                }

                all_dim_scores.setdefault(dim, []).append(avg_score)

            required_dims = set(bfi_schema["prompts"]["dim_desc"].keys())
            missing_dims = required_dims - set(per_response_output["dimensions"].keys())
            if missing_dims:
                raise RuntimeError(f"Incomplete evaluation for {response_file}. Missing dimensions: {missing_dims}")

            with open(out_resp_path, "w", encoding="utf-8") as f:
                json.dump(per_response_output, f, indent=2)
            print(f"Saved: {out_resp_path}")

        except Exception as e:
            # Drop a partially written output so the next run re-evaluates it.
            if os.path.exists(out_resp_path):
                os.remove(out_resp_path)
            print(f"Evaluation failed for {response_file}: {e}")
            raise

    # Final summary
    summary = {
        "character": character_name,
        "backend": backend_name,
        "averages": {}
    }

    for dim, scores in all_dim_scores.items():
        summary["averages"][dim] = round(mean(scores), 2)

    out_summary_path = os.path.join(character_backend_path, f"{character_name}_{backend_name}_bfi_er_eval_summary.json")
    with open(out_summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"Final summary written: {out_summary_path}")


In [50]:
# test_path = "character_response/BFI/myRAG/albus_dumbledore/deepseek"
# evaluate_character_bfi_er_batch(test_path)


Saved: character_response/BFI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_1_BFI_er_eval.json
Saved: character_response/BFI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_2_BFI_er_eval.json
Saved: character_response/BFI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_3_BFI_er_eval.json
Final summary written: character_response/BFI/myRAG/albus_dumbledore/deepseek/albus_dumbledore_deepseek_bfi_er_eval_summary.json


### Complete Evaluation on BFI

In [None]:
def validate_bfi_structure(base_dir="evaluation/character_response/BFI"):
    """Check that every backend directory holds exactly three response files.

    Walks ``base_dir/<rag>/<character>/<backend>`` in sorted order, ignoring
    non-directories and any backend named "mistral" (case-insensitive). A
    response file is a ``.json`` file whose name does not end in
    ``_summary.json``, ``_eval.json`` or ``_logs.json`` — the same rule
    ``evaluate_character_bfi_er_batch`` applies, so directories that already
    contain evaluation outputs still validate.

    Args:
        base_dir: Root directory of the BFI responses.

    Raises:
        RuntimeError: If at least one backend directory has a response-file
            count other than three; each offending path is printed first.
    """
    non_response_suffixes = ("_summary.json", "_eval.json", "_logs.json")
    invalid_paths = []

    for rag in sorted(os.listdir(base_dir)):
        rag_path = os.path.join(base_dir, rag)
        if not os.path.isdir(rag_path):
            continue

        for character in sorted(os.listdir(rag_path)):
            character_path = os.path.join(rag_path, character)
            if not os.path.isdir(character_path):
                continue

            for backend in sorted(os.listdir(character_path)):
                if backend.lower() == "mistral":
                    continue

                backend_path = os.path.join(character_path, backend)
                if not os.path.isdir(backend_path):
                    continue

                # Evaluation outputs, summaries and logs share the directory
                # with the raw responses and must not be counted.
                response_files = [
                    f for f in os.listdir(backend_path)
                    if f.endswith(".json") and not f.endswith(non_response_suffixes)
                ]

                if len(response_files) != 3:
                    invalid_paths.append((backend_path, len(response_files)))

    if invalid_paths:
        for path, count in invalid_paths:
            print(f"Invalid path: {path} — contains {count} response files (expected 3)")
        raise RuntimeError(f"{len(invalid_paths)} backend paths failed validation. Aborting.")

    print("All character/backend paths passed validation.")


In [None]:
def run_bfi_evaluation_all(base_dir="evaluation/character_response/BFI"):
    """Run ``evaluate_character_bfi_er_batch`` on every backend directory.

    Walks ``base_dir/<rag>/<character>/<backend>`` in sorted order. Entries
    that are not directories are ignored, as is any backend named "mistral"
    (case-insensitive). A header line is printed before each evaluation.

    Args:
        base_dir: Root directory of the BFI responses.
    """
    for rag in sorted(os.listdir(base_dir)):
        rag_dir = os.path.join(base_dir, rag)
        if os.path.isdir(rag_dir):
            for character in sorted(os.listdir(rag_dir)):
                character_dir = os.path.join(rag_dir, character)
                if os.path.isdir(character_dir):
                    for backend in sorted(os.listdir(character_dir)):
                        backend_dir = os.path.join(character_dir, backend)
                        if backend.lower() != "mistral" and os.path.isdir(backend_dir):
                            print(f"\n=== Evaluating: {rag}/{character}/{backend} ===")
                            evaluate_character_bfi_er_batch(backend_dir)


In [None]:
# First validate: raises RuntimeError if any backend directory's response-file
# count is not 3.
validate_bfi_structure()


In [None]:
# Then run evaluations only if validation passed.
# NOTE: this cell does not check the validation result itself; run it only
# after the validation cell completed without raising.
run_bfi_evaluation_all()