In [24]:
from datasets import load_dataset
from collections import Counter
import pandas as pd
import math
import json 
import os
from dotenv import load_dotenv
import time
import openai
from random import randint
from typing import List
import tqdm
from openai import OpenAI

In [9]:
BASE_DIR = "../../database"

# List all character subfolders
character_names = [name for name in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, name))]

# Load all raw_paragraphs.json files
all_raw_paragraphs = {}
for character in character_names:
    path = os.path.join(BASE_DIR, character, "raw_paragraphs.json")
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            all_raw_paragraphs[character] = json.load(f)

print(f"Loaded {len(all_raw_paragraphs)} characters.")



Loaded 8 characters.


In [11]:
from transformers import AutoTokenizer

tokenizer_path = "../../models/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

def count_tokens(text):
    return len(tokenizer.encode(text, truncation=False))


In [62]:
import openai

# Load API key from .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Instantiate the OpenAI client properly
client = OpenAI(api_key=api_key)

def generate_memories_gpt(paragraph_text, paragraph_index, character_name, model="gpt-4.1-nano", temperature=0.7):
    """
    Calls the OpenAI API to extract emotionally rich memory events from a paragraph.

    Args:
        paragraph_text (str): A paragraph from the Fandom biography.
        paragraph_index (int): Index of the paragraph in source file.
        model (str): Model name to use.
        temperature (float): Sampling temperature.

    Returns:
        list[dict]: List of memory objects with 'text' and 'source_paragraph_index'.
    """
    prompt = f"""
You are converting a paragraph from {character_name}'s biography into a more compact and emotional stoty.

Instructions:
- convert to 1-2 short story.
- Always use the character's name in each story - avoid using 'he' or 'she'.
- Each must be self-contained and emotionally vivid.
- Write in third person, and keep each under 64 words.
- Return a JSON list of: {{"text": ..., "source_paragraph_index": {paragraph_index}}}

Paragraph:
\"\"\"{paragraph_text}\"\"\"
"""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=1000
    )

    # Parse and return list of memory entries
    try:
        return json.loads(response.choices[0].message.content.strip())
    except json.JSONDecodeError:
        print("‚ö†Ô∏è GPT returned invalid JSON for paragraph index", paragraph_index)
        return []


### test

In [57]:
# Load Snape's raw paragraphs
with open("../../database/severus_snape/raw_paragraphs.json", "r", encoding="utf-8") as f:
    snape_paragraphs = json.load(f)

# Preview top 3 paragraphs
for i in range(3):
    print(f"--- Paragraph {i} ---\n{snape_paragraphs[i]['raw_text']}\n")


--- Paragraph 0 ---
Severus Snape was born 9 January 1960 to Tobias Snape, an abusive Muggle, and Eileen Prince, a neglectful pure-blood witch.[23] He began to identify with his mother's family and created a secret nickname from his mother's maiden name, calling himself the "Half-Blood Prince". It is implied that Severus was friendless and uncared for by his parents. This lack of care largely shaped Severus's bitter disposition and cruel behaviour later in his life.

--- Paragraph 1 ---
Severus grew up at Spinner's End, a shabby suburb of Cokeworth.[24] This area of town was near a dirty river and full of dilapidated houses, disused factories and broken down street lamps. Through the rest of his life, Severus continued to return there when he was not at school. The young Severus is depicted as being unwashed and wearing ill-fitting clothes "that were so mismatched that it looked deliberate". As a child, Severus was neglected and his parents often fought with one another. He could not w

In [63]:
# Run memory generation on top 3
test_memories = []
for i in range(3):
    para = snape_paragraphs[i]
    output = generate_memories_gpt(para["raw_text"], para["paragraph_index"], "Severus Snape", model="gpt-4.1-nano", temperature=0.7)
    test_memories.extend(output)

# Print results
for mem in test_memories:
    print(f"üîπ ({count_tokens(mem['text'])} tokens): {mem['text']}\n")


üîπ (71 tokens): Severus Snape, born to an abusive Muggle father and a neglectful witch mother, grew up unloved and alone. Embracing his mother's bloodline, he secretly called himself the 'Half-Blood Prince,' his bitterness deepening with every moment of neglect and pain. His lonely childhood forged a heart hardened by pain and cruelty.

üîπ (76 tokens): Severus Snape grew up in Spinner's End, a bleak neighborhood haunted by neglect. Surrounded by broken homes and a dirty river, he longed for escape. The fights at home made him yearn for Hogwarts, where he found solace. Yet, even as an adult, the taste of Holiday Blancmange reminded him of childhood's fleeting comfort.

üîπ (68 tokens): Severus Snape watched Lily Evans with admiration, sensing her hidden magic. Their bond blossomed quickly, filling him with hope and affection. Yet Lily saw him only as a friend, while he harbored feelings that ran deep, longing for a connection that seemed just out of reach. His heart ached with unsp

### build memory

In [65]:
import os
import json
from tqdm import tqdm

BASE_DIR = "../../database"
character_dirs = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))]

def count_tokens(text):
    return len(tokenizer.encode(text, truncation=False))

MAX_RETRIES = 3
SLEEP_BETWEEN_RETRIES = 2  # seconds

for char_dir in character_dirs:
    char_name = char_dir.replace("_", " ").title()
    raw_path = os.path.join(BASE_DIR, char_dir, "raw_paragraphs.json")
    memory_path = os.path.join(BASE_DIR, char_dir, "memory.json")

    if not os.path.exists(raw_path):
        continue

    # Load paragraph source
    with open(raw_path, "r", encoding="utf-8") as f:
        paragraphs = json.load(f)

    # Load existing memory if resuming
    if os.path.exists(memory_path):
        with open(memory_path, "r", encoding="utf-8") as f:
            all_memories = json.load(f)
        existing_indices = {m["source_paragraph_index"] for m in all_memories}
    else:
        all_memories = []
        existing_indices = set()

    for entry in tqdm(paragraphs, desc=f"Processing {char_name}"):
        para_idx = entry["paragraph_index"]
        para_text = entry["raw_text"]

        if para_idx in existing_indices:
            continue  # Skip already processed

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                mems = generate_memories_gpt(para_text, para_idx, character_name=char_name)
                # Check token limits
                mems = [m for m in mems if 50 <= count_tokens(m["text"]) <= 128]
                all_memories.extend(mems)

                # Save after each success
                with open(memory_path, "w", encoding="utf-8") as f:
                    json.dump(all_memories, f, indent=2, ensure_ascii=False)
                break  # Success ‚Üí break retry loop

            except Exception as e:
                print(f"‚ö†Ô∏è Error on paragraph {para_idx} (attempt {attempt}): {e}")
                time.sleep(SLEEP_BETWEEN_RETRIES)

                if attempt == MAX_RETRIES:
                    print(f"‚ùå Failed after {MAX_RETRIES} attempts on paragraph {para_idx}")



Processing Minerva Mcgonagall: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 148/148 [03:11<00:00,  1.30s/it]
Processing Harry Potter:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 386/391 [09:13<00:06,  1.27s/it]

‚ö†Ô∏è GPT returned invalid JSON for paragraph index 385


Processing Harry Potter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 391/391 [09:20<00:00,  1.43s/it]
Processing Ron Weasley:  11%|‚ñà         | 14/133 [00:15<01:59,  1.00s/it]

‚ö†Ô∏è Error on paragraph 14 (attempt 1): string indices must be integers


Processing Ron Weasley:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 102/133 [02:21<00:41,  1.35s/it]

‚ö†Ô∏è Error on paragraph 102 (attempt 1): string indices must be integers
‚ö†Ô∏è Error on paragraph 102 (attempt 2): string indices must be integers
‚ö†Ô∏è Error on paragraph 102 (attempt 3): string indices must be integers


Processing Ron Weasley:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 103/133 [02:31<01:53,  3.79s/it]

‚ùå Failed after 3 attempts on paragraph 102


Processing Ron Weasley: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 133/133 [03:14<00:00,  1.46s/it]
Processing Luna Lovegood: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 55/55 [01:08<00:00,  1.25s/it]
Processing Albus Dumbledore:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 89/164 [01:56<01:44,  1.40s/it]

‚ö†Ô∏è GPT returned invalid JSON for paragraph index 88


Processing Albus Dumbledore:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 101/164 [02:16<02:00,  1.92s/it]

‚ö†Ô∏è GPT returned invalid JSON for paragraph index 100


Processing Albus Dumbledore: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [03:44<00:00,  1.37s/it]
Processing Severus Snape: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139/139 [03:09<00:00,  1.36s/it]
Processing Hermione Granger:  15%|‚ñà‚ñç        | 21/142 [00:23<02:30,  1.24s/it]

‚ö†Ô∏è GPT returned invalid JSON for paragraph index 20


Processing Hermione Granger:  23%|‚ñà‚ñà‚ñé       | 33/142 [00:38<02:24,  1.33s/it]

‚ö†Ô∏è GPT returned invalid JSON for paragraph index 32


Processing Hermione Granger: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 142/142 [03:06<00:00,  1.31s/it]
Processing Draco Malfoy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 59/59 [01:21<00:00,  1.37s/it]


In [66]:

char = "ron_weasley"
paragraph_index = 102

# Load raw paragraph
with open(f"../../database/{char}/raw_paragraphs.json", "r", encoding="utf-8") as f:
    paragraphs = json.load(f)

target_paragraph = [p for p in paragraphs if p["paragraph_index"] == paragraph_index][0]

# Load existing memory
with open(f"../../database/{char}/memory.json", "r", encoding="utf-8") as f:
    memories = json.load(f)


In [68]:
# Rerun GPT
patched_mems = generate_memories_gpt(
    paragraph_text=target_paragraph["raw_text"],
    paragraph_index=paragraph_index,
    character_name="Ron Weasley"
)

# Optional: filter by token length
patched_mems = [m for m in patched_mems if 50 <= count_tokens(m["text"]) <= 128]


In [69]:
# Add to memory list
memories.extend(patched_mems)

# Sort by paragraph index (optional but tidy)
memories = sorted(memories, key=lambda x: x["source_paragraph_index"])

# Save updated memory
with open(f"../../database/{char}/memory.json", "w", encoding="utf-8") as f:
    json.dump(memories, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Patched paragraph {paragraph_index} for Ron Weasley.")


‚úÖ Patched paragraph 102 for Ron Weasley.


In [74]:
import os
import json
from tqdm import tqdm
import time

# Paths
BASE_DIR = "../../database"
character_dirs = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))]

# Thresholds
MAX_RETRIES = 3
TOKEN_MIN = 50
TOKEN_MAX = 128

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(path, obj):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def patch_memory(char):
    print(f"\nüõ† Patching: {char}")
    
    raw_path = os.path.join(BASE_DIR, char, "raw_paragraphs.json")
    mem_path = os.path.join(BASE_DIR, char, "memory.json")
    character_name = char.replace("_", " ").title()
    
    raw_paragraphs = load_json(raw_path)
    raw_index_map = {p["paragraph_index"]: p["raw_text"] for p in raw_paragraphs}
    raw_indices = set(raw_index_map.keys())

    if os.path.exists(mem_path):
        memories = load_json(mem_path)
        mem_indices = {m["source_paragraph_index"] for m in memories}
    else:
        memories = []
        mem_indices = set()

    missing_indices = sorted(raw_indices - mem_indices)
    print(f"üîç Missing: {missing_indices}")

    for idx in tqdm(missing_indices, desc=f"Patching {character_name}"):
        para_text = raw_index_map[idx]

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                mems = generate_memories_gpt(para_text, idx, character_name)
                filtered = [m for m in mems if TOKEN_MIN <= count_tokens(m["text"]) <= TOKEN_MAX]
                memories.extend(filtered)

                # Save after each successful addition
                save_json(mem_path, sorted(memories, key=lambda x: x["source_paragraph_index"]))
                break

            except Exception as e:
                print(f"‚ö†Ô∏è Error on {char} paragraph {idx} (attempt {attempt}): {e}")
                time.sleep(2)
                if attempt == MAX_RETRIES:
                    print(f"‚ùå Failed to patch {char} paragraph {idx} after {MAX_RETRIES} attempts.")

# Only patch characters with known missing data
incomplete_info = {
    "luna_lovegood": [32],
}

for char, missing in incomplete_info.items():
    patch_memory(char)



üõ† Patching: luna_lovegood
üîç Missing: [32]


Patching Luna Lovegood: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.65s/it]


In [76]:
import os
import json

BASE_DIR = "../../database"
character_dirs = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))]

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

incomplete_characters = {}

for char in character_dirs:
    raw_path = os.path.join(BASE_DIR, char, "raw_paragraphs.json")
    mem_path = os.path.join(BASE_DIR, char, "memory.json")

    if not os.path.exists(raw_path):
        print(f"‚ö†Ô∏è No raw_paragraphs.json for {char}")
        continue

    raw_paragraphs = load_json(raw_path)
    raw_indices = {p["paragraph_index"] for p in raw_paragraphs}

    if not os.path.exists(mem_path):
        print(f"No memory.json for {char} ‚Äî all {len(raw_indices)} missing")
        incomplete_characters[char] = sorted(list(raw_indices))
        continue

    memories = load_json(mem_path)
    mem_indices = {m["source_paragraph_index"] for m in memories}

    missing = raw_indices - mem_indices
    if missing:
        incomplete_characters[char] = sorted(list(missing))
        missing_sorted = sorted(missing)
        print(f"{char}: {len(missing_sorted)} missing of {len(raw_indices)} paragraphs ‚Üí {missing_sorted[:5]}{'...' if len(missing_sorted) > 5 else ''}")

    else:
        print(f"{char}: complete")

# Optional summary
print("\nüîç Summary:")
print(f"Checked {len(character_dirs)} characters.")
if incomplete_characters:
    print(f"{len(incomplete_characters)} incomplete:")
    for char, indices in incomplete_characters.items():
        print(f" - {char}: {len(indices)} missing ‚Üí {indices[:5]}{'...' if len(indices) > 5 else ''}")
else:
    print("All characters have complete memory coverage!")


minerva_mcgonagall: complete
harry_potter: complete
ron_weasley: complete
luna_lovegood: complete
albus_dumbledore: complete
severus_snape: complete
hermione_granger: complete
draco_malfoy: complete

üîç Summary:
Checked 8 characters.
All characters have complete memory coverage!
