In [24]:
from datasets import load_dataset
from collections import Counter
import pandas as pd
import math
import json 
import os
from dotenv import load_dotenv
import time
import openai
from random import randint
from typing import List
import tqdm
from openai import OpenAI

In [9]:
BASE_DIR = "../../database"

# Each sub-directory of BASE_DIR is taken to be one character's folder;
# plain files sitting directly in BASE_DIR are ignored.
character_names = [
    name
    for name in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, name))
]

# Collect every character's raw_paragraphs.json, keyed by folder name.
# Folders that do not contain the file are simply left out of the mapping.
all_raw_paragraphs = {}
for character in character_names:
    path = os.path.join(BASE_DIR, character, "raw_paragraphs.json")
    if not os.path.exists(path):
        continue
    with open(path, "r", encoding="utf-8") as f:
        all_raw_paragraphs[character] = json.load(f)

print(f"Loaded {len(all_raw_paragraphs)} characters.")



Loaded 8 characters.


In [11]:
from transformers import AutoTokenizer

# The tokenizer is loaded from a local model directory; in this notebook it is
# only used to measure the length of generated text.
tokenizer_path = "../../models/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)


def count_tokens(text):
    """Return the number of token ids `tokenizer.encode` produces for `text`.

    Truncation is disabled, so long inputs are counted in full.
    NOTE(review): the count is whatever `encode` returns by default, which
    presumably includes special tokens -- confirm if exact lengths matter.
    """
    token_ids = tokenizer.encode(text, truncation=False)
    return len(token_ids)


In [62]:
import openai

# Pull variables from a local .env file into the environment, then read the key.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Notebook-wide OpenAI client; generate_memories_gpt sends its requests through it.
client = OpenAI(api_key=api_key)

def _strip_code_fence(raw_text):
    """Remove a surrounding Markdown code fence (``` or ```json) from `raw_text`, if present."""
    text = raw_text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line, including an optional language tag such as "json".
    newline_at = text.find("\n")
    text = text[newline_at + 1:] if newline_at != -1 else text[3:]
    text = text.rstrip()
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _coerce_memories(parsed, paragraph_index):
    """Turn decoded JSON into a list of memory dicts tagged with `paragraph_index`.

    A single JSON object is treated as a one-element list. Items that are not
    objects with a string 'text' field are dropped. Returns None when `parsed`
    is neither a list nor an object, so the caller can report it as invalid.
    """
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return None
    return [
        # Overwrite whatever index the model echoed back: downstream resume
        # logic keys on this field, so it must match the paragraph we sent.
        {**item, "source_paragraph_index": paragraph_index}
        for item in parsed
        if isinstance(item, dict) and isinstance(item.get("text"), str)
    ]


def generate_memories_gpt(paragraph_text, paragraph_index, character_name, model="gpt-4.1-nano", temperature=0.7, api_client=None):
    """
    Calls the OpenAI API to extract emotionally rich memory events from a paragraph.

    Args:
        paragraph_text (str): A paragraph from the Fandom biography.
        paragraph_index (int): Index of the paragraph in source file.
        character_name (str): Name inserted into the prompt as the biography's subject.
        model (str): Model name to use.
        temperature (float): Sampling temperature.
        api_client: Optional object exposing `chat.completions.create(...)`.
            When None (the default) the notebook-level `client` is used.

    Returns:
        list[dict]: List of memory objects with 'text' and 'source_paragraph_index'.
        'source_paragraph_index' is always set to `paragraph_index`. An empty
        list is returned (after printing a warning) when the reply is empty or
        is not JSON describing an object or a list.

    Raises:
        Whatever the client's `create` call raises (network/API errors) is
        propagated unchanged so callers can retry.
    """
    # NOTE(review): the prompt text below contains typos ("stoty",
    # "1-2 short story"); it is left byte-identical here so model behaviour
    # does not shift as a side effect of this change.
    prompt = f"""
You are converting a paragraph from {character_name}'s biography into a more compact and emotional stoty.

Instructions:
- convert to 1-2 short story.
- Always use the character's name in each story - avoid using 'he' or 'she'.
- Each must be self-contained and emotionally vivid.
- Write in third person, and keep each under 64 words.
- Return a JSON list of: {{"text": ..., "source_paragraph_index": {paragraph_index}}}

Paragraph:
\"\"\"{paragraph_text}\"\"\"
"""

    # Fall back to the notebook-level client unless one was injected.
    active_client = client if api_client is None else api_client
    response = active_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=1000
    )

    # `content` can be None (no text returned); treat that like an unparsable reply.
    content = response.choices[0].message.content
    try:
        parsed = json.loads(_strip_code_fence(content or ""))
    except json.JSONDecodeError:
        parsed = None

    memories = _coerce_memories(parsed, paragraph_index)
    if memories is None:
        print("⚠️ GPT returned invalid JSON for paragraph index", paragraph_index)
        return []
    return memories


### Test: generate memories for the first three Snape paragraphs

In [57]:
# Load Snape's raw paragraphs
with open("../../database/severus_snape/raw_paragraphs.json", "r", encoding="utf-8") as f:
    snape_paragraphs = json.load(f)

# Preview up to the first 3 paragraphs. Slicing (instead of indexing 0..2)
# keeps this cell from raising IndexError when the file holds fewer entries.
for i, para in enumerate(snape_paragraphs[:3]):
    print(f"--- Paragraph {i} ---\n{para['raw_text']}\n")


--- Paragraph 0 ---
Severus Snape was born 9 January 1960 to Tobias Snape, an abusive Muggle, and Eileen Prince, a neglectful pure-blood witch.[23] He began to identify with his mother's family and created a secret nickname from his mother's maiden name, calling himself the "Half-Blood Prince". It is implied that Severus was friendless and uncared for by his parents. This lack of care largely shaped Severus's bitter disposition and cruel behaviour later in his life.

--- Paragraph 1 ---
Severus grew up at Spinner's End, a shabby suburb of Cokeworth.[24] This area of town was near a dirty river and full of dilapidated houses, disused factories and broken down street lamps. Through the rest of his life, Severus continued to return there when he was not at school. The young Severus is depicted as being unwashed and wearing ill-fitting clothes "that were so mismatched that it looked deliberate". As a child, Severus was neglected and his parents often fought with one another. He could not w

In [63]:
# Run memory generation on up to the first 3 paragraphs. Slicing avoids an
# IndexError when fewer than three paragraphs were loaded.
test_memories = []
for para in snape_paragraphs[:3]:
    output = generate_memories_gpt(
        para["raw_text"],
        para["paragraph_index"],
        "Severus Snape",
        model="gpt-4.1-nano",
        temperature=0.7,
    )
    test_memories.extend(output)

# Print each generated memory together with its token count
for mem in test_memories:
    print(f"🔹 ({count_tokens(mem['text'])} tokens): {mem['text']}\n")


🔹 (71 tokens): Severus Snape, born to an abusive Muggle father and a neglectful witch mother, grew up unloved and alone. Embracing his mother's bloodline, he secretly called himself the 'Half-Blood Prince,' his bitterness deepening with every moment of neglect and pain. His lonely childhood forged a heart hardened by pain and cruelty.

🔹 (76 tokens): Severus Snape grew up in Spinner's End, a bleak neighborhood haunted by neglect. Surrounded by broken homes and a dirty river, he longed for escape. The fights at home made him yearn for Hogwarts, where he found solace. Yet, even as an adult, the taste of Holiday Blancmange reminded him of childhood's fleeting comfort.

🔹 (68 tokens): Severus Snape watched Lily Evans with admiration, sensing her hidden magic. Their bond blossomed quickly, filling him with hope and affection. Yet Lily saw him only as a friend, while he harbored feelings that ran deep, longing for a connection that seemed just out of reach. His heart ached with unspoken love

### Build memory files for every character

In [None]:
import os
import json
from tqdm import tqdm

BASE_DIR = "../../database"

# One sub-directory per character; anything in BASE_DIR that is not a
# directory is skipped.
character_dirs = [
    entry
    for entry in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, entry))
]


# NOTE(review): same definition as in the tokenizer cell earlier in the
# notebook; kept here so this cell does not depend on that `def` having run.
def count_tokens(text):
    """Return the number of token ids `tokenizer.encode` yields for `text`, with truncation off."""
    token_ids = tokenizer.encode(text, truncation=False)
    return len(token_ids)

MAX_RETRIES = 3
SLEEP_BETWEEN_RETRIES = 2  # seconds

# Inclusive token-count window a generated memory must fall in to be kept.
MIN_MEMORY_TOKENS = 50
MAX_MEMORY_TOKENS = 128


def _write_json_atomic(path, payload):
    """Write `payload` as JSON to `path` without leaving a half-written file.

    The data goes to a sibling temp file first and is then swapped into place
    with os.replace, so an interrupt mid-write cannot truncate `path` and
    break the resume step that reads it back.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


def _generate_with_retries(para_text, para_idx, char_name):
    """Return the token-filtered memories for one paragraph, or None if every attempt failed.

    Only generation and filtering are retried. Saving is done by the caller,
    outside this loop, so a failed save can never trigger another API call or
    append the same memories twice.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            mems = generate_memories_gpt(para_text, para_idx, character_name=char_name)
            # Check token limits
            return [
                m for m in mems
                if MIN_MEMORY_TOKENS <= count_tokens(m["text"]) <= MAX_MEMORY_TOKENS
            ]
        except Exception as e:
            print(f"⚠️ Error on paragraph {para_idx} (attempt {attempt}): {e}")
            # Wait only when another attempt will follow.
            if attempt < MAX_RETRIES:
                time.sleep(SLEEP_BETWEEN_RETRIES)

    print(f"❌ Failed after {MAX_RETRIES} attempts on paragraph {para_idx}")
    return None


for char_dir in character_dirs:
    # NOTE(review): .title() lower-cases inner capitals (the progress bar shows
    # "Minerva Mcgonagall"); that spelling is what gets sent in the prompt.
    char_name = char_dir.replace("_", " ").title()
    raw_path = os.path.join(BASE_DIR, char_dir, "raw_paragraphs.json")
    memory_path = os.path.join(BASE_DIR, char_dir, "memory.json")

    if not os.path.exists(raw_path):
        continue

    # Load paragraph source
    with open(raw_path, "r", encoding="utf-8") as f:
        paragraphs = json.load(f)

    # Load existing memory if resuming
    if os.path.exists(memory_path):
        with open(memory_path, "r", encoding="utf-8") as f:
            all_memories = json.load(f)
        existing_indices = {m["source_paragraph_index"] for m in all_memories}
    else:
        all_memories = []
        existing_indices = set()

    for entry in tqdm(paragraphs, desc=f"Processing {char_name}"):
        para_idx = entry["paragraph_index"]
        para_text = entry["raw_text"]

        if para_idx in existing_indices:
            continue  # Skip already processed

        mems = _generate_with_retries(para_text, para_idx, char_name)
        if mems is None:
            continue  # All attempts failed; move on to the next paragraph

        all_memories.extend(mems)

        # Save after each success. A failure here is raised rather than
        # retried: continuing would keep spending API calls on results that
        # cannot be persisted.
        _write_json_atomic(memory_path, all_memories)



Processing Minerva Mcgonagall:   9%|▉         | 14/148 [00:15<02:24,  1.08s/it]

⚠️ GPT returned invalid JSON for paragraph index 13


Processing Minerva Mcgonagall:  11%|█▏        | 17/148 [00:21<02:42,  1.24s/it]


KeyboardInterrupt: 