In [87]:
import sys
import json
import random
import textwrap
import re
from copy import deepcopy
from typing import List, Literal, Dict, Any, Optional
import os
import collections
import itertools
import string

In [88]:
import numpy as np
from tenacity import retry, wait_exponential, stop_after_attempt

# Fixed seed so the random prompt components drawn below are repeatable on a
# fresh kernel; both the stdlib and the NumPy global generators are seeded.
SEED = 9001
random.seed(SEED)
np.random.seed(SEED)

# JSONL outputs (one JSON record per line); the generation loop appends to them.
OUT_BIOS = "../../data_save/bios.jsonl"
OUT_INT  = "../../data_save/interviews.jsonl"

def _rows_written(path: str) -> int:
    return sum(1 for _ in open(path)) if os.path.exists(path) else 0

In [89]:
# Add project root to path
sys.path.append("../../")

In [90]:
from src.utils.oracle_llms import ASK_ORACLE_MODEL

## Config

In [91]:
# Per-entity generation targets: the total is split between biographies and
# interviews, and the assert guards against the two halves drifting apart.
NUM_SAMPLES_PER_ENTITY = 1000
NUM_BIOS_PER_ENTITY = 500
NUM_INTERVIEWS_PER_ENTITY = 500
assert NUM_BIOS_PER_ENTITY + NUM_INTERVIEWS_PER_ENTITY == NUM_SAMPLES_PER_ENTITY

In [92]:
# JSON file holding the entity records (each with a "profile" and "docs") that
# seed the generation.
INPUT_ENTITY_FILE = "../../data_save/synthetic_entities_bio.json"
#OUTPUT_FILENAME = "../../data_save/synthetic_dataset_subset.json"

In [93]:
# Ensure the parent directory of each output file exists before anything is
# appended; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.dirname(OUT_BIOS), exist_ok=True)
os.makedirs(os.path.dirname(OUT_INT), exist_ok=True)

## Load Entity Data

In [94]:
# Load the entity records that drive the rest of the notebook. Every later cell
# needs `entity_data`, so a load failure is reported and then re-raised rather
# than swallowed (which would only postpone the crash to an unrelated NameError).
try:
    with open(INPUT_ENTITY_FILE, 'r', encoding='utf-8') as f:
        entity_data = json.load(f)
except FileNotFoundError:
    print(f"Error: Entity file not found at {INPUT_ENTITY_FILE}")
    raise
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {INPUT_ENTITY_FILE}")
    raise
else:
    print(f"Loaded {len(entity_data)} entities from {INPUT_ENTITY_FILE}")
    # Show the first record as a quick sanity check of the expected structure.
    print(json.dumps(entity_data[0], indent=2))

Loaded 12 entities from ../../data_save/synthetic_entities_bio.json
{
  "profile": {
    "name": "Elara Vance",
    "age": 29,
    "nationality": "Canadian",
    "occupation": "Data Scientist",
    "hobbies": [
      "Hiking",
      "Photography",
      "Reading"
    ],
    "worksAt": {
      "company": "Amazon",
      "position": "Senior Data Scientist",
      "yearsOfExperience": 5,
      "location": "San Francisco, CA"
    },
    "education": {
      "degree": "Master's in Data Science",
      "university": "University of Toronto",
      "graduationYear": 2016
    },
    "languages": [
      {
        "language": "English",
        "proficiency": "Fluent"
      },
      {
        "language": "French",
        "proficiency": "Intermediate"
      }
    ]
  },
  "docs": [
    "Elara Vance, a 29-year-old Canadian national, has established herself as a formidable presence in the data science field. Currently serving as a Senior Data Scientist at Amazon in San Francisco, California, Elara

## Prompt Construction

### Set Globals

In [95]:
# Pools of interchangeable prompt pieces. build_prompt() draws one entry from
# each relevant pool at random so that every generated prompt differs in tone,
# target style, framing sentences, and instruction wording.
COMPONENTS = {
    # Writing tone requested from the model.
    "tones": [
        "neutral", "formal", "casual",
        "beautiful", "academic", "inspirational"
    ],
    # Target document styles used when prompt_type == "biography".
    "bio_styles": [
        "LinkedIn 'About' section", "press release", "presentation intro",
        "encyclopedia entry", "Wikipedia bio"
    ],
    # Target document styles used for any other prompt_type (i.e. interviews).
    "interview_styles": [
        "podcast", "Reddit Ask‑Me‑Anything", "panel Q&A",
        "magazine interview", "conference fireside chat",
        "job interview",
    ],
    # Sentence that introduces the JSON profile in the prompt.
    "prompt_profile_intros": [
        "The following is a profile of a person.",
        "Here's a profile representing a character:",
        "This JSON contains specific info about an entity:",
        "Below is a dictionary that describes an individual.",
        "Consider the following data about a fictional human."
    ],
    # Sentence that introduces the source document in the prompt.
    "prompt_doc_intros": [
        "And here is a biography derived from that profile:",
        "Next, a personal narrative based on the data above:",
        "Now read the following document generated from the prior information:",
        "Consider this life story constructed from the attributes listed above:",
        "The following is a brief history of the entity described by the preceding attributes:"
    ],
    # Instruction templates with str.format placeholders: {tone}, {style},
    # {intended_audience}, {to_drop_attributes}. The non-empty lines of the
    # chosen template are shuffled by shuffle_instructions() before formatting,
    # so each line has to make sense on its own, in any order.
    "prompt_instructions": [
        """
        Remove all the information about the attributes `{to_drop_attributes}`. Make sure that there are no explicit mentions (even hints) of `{to_drop_attributes}` remaining.
        Rewrite the biography in the style of a {tone} {style} for an intended audience of {intended_audience}, while retaining all remaining information.
        Put your {style} answer within triple backticks (```). Make sure that there are no other triple backticks in your answer.
        Do not add any new substantive information to your answer. But make sure that the structure of your writing is significantly unique, while maintaining the same information!
        """,
        """
        Repurpose the document into a {tone} {style}.
        The {style} you will generate is intended to be read by {intended_audience}.
        You need to drop any and all mention of the following details: `{to_drop_attributes}`.
        Make sure you remove all reference to `{to_drop_attributes}` but retain all the other attributes!
        Since we want to control for content, you must not add new substantive information in your output. But the style of your response must be original, and clearly distinct from the source document!
        Importantly, we need your {style} output to have triple backticks (```) before and after. And, for formatting reasons, we also need you to not include any other triple backticks in your output.
        """,
        """
        Your {style} generation should be enclosed in triple backticks (```). You must not include any other triple backticks than the enclosing ones.
        You are to use the preceding biography to generate a new {tone} {style}.
        The intended audience of your generation will be {intended_audience}.
        Crucially, you must remove all reference to these attributes from the entity data: `{to_drop_attributes}`. Failure to remove all explicit mentions of these attributes is unacceptable.
        You absolutely must not add any new substantial details! But you are highly encouraged to make your generation unique and not obviously adapted from the initial text!
        """
    ]
}

In [96]:
# Audiences the rewritten text can be addressed to; the generation loop picks
# one at random per prompt and it fills the {intended_audience} placeholder.
INTENDED_AUDIENCES = [
    "general public", "tech enthusiasts", "potential employers", "academic peers",
    "industry colleagues", "journalists", "graduate students"
]

In [97]:
# Fence strings placed on their own line before and after the profile JSON and
# the source document in the prompt; build_prompt() picks one per prompt.
DELIMITERS = ["###", "~~~", "---", "***"]

### Helper Functions

In [98]:
def shuffle_instructions(instructions: str) -> str:
    """Return the instruction block with its non-blank lines in random order.

    The text is dedented and stripped first, blank lines are discarded, and the
    remaining lines are shuffled with the global ``random`` generator before
    being joined back with newlines.
    """
    body = textwrap.dedent(instructions).strip()
    kept_lines = []
    for candidate in body.split("\n"):
        # Whitespace-only lines carry no instruction and are dropped.
        if candidate.strip():
            kept_lines.append(candidate)
    random.shuffle(kept_lines)
    return "\n".join(kept_lines)

def get_droppable_attributes(profile):
    """Flatten a profile into a list of ``(attribute, value)`` pairs.

    The ``name`` key is never included. Scalars (str/int/float) yield one pair
    under their own key. A list whose first element is a string yields one pair
    per item under the list's key; a list whose first element is a dict yields,
    per dict, the list's key paired with that dict's first value. A nested dict
    yields one pair per inner key/value. Anything else is ignored.
    """
    scalar_types = (str, int, float)
    collected = []
    for key, value in profile.items():
        if key == 'name':
            continue
        if isinstance(value, scalar_types):
            collected.append((key, value))
        elif isinstance(value, list):
            if not value:
                continue
            # The first element decides how the whole list is interpreted.
            head = value[0]
            if isinstance(head, str):
                collected.extend([(key, item) for item in value])
            elif isinstance(head, dict):
                collected.extend(
                    [(key, next(iter(entry.values()))) for entry in value]
                )
        elif isinstance(value, dict):
            # Nested dicts are reported under their inner keys, not the outer one.
            collected.extend(value.items())
    return collected

def choose_attributes_to_drop(
    profile: Dict[str, Any],
    droppable_attributes: List[tuple],
    fraction_to_drop: float = 1/3
) -> List[tuple]:
    """Randomly pick a fraction of the droppable ``(attribute, value)`` pairs.

    Args:
        profile: The entity profile. Not consulted by the selection; kept so
            existing callers do not need to change.
        droppable_attributes: Candidate ``(attribute, value)`` pairs.
        fraction_to_drop: Share of the candidates to select.

    Returns:
        A list of distinct pairs sampled without replacement. At least one pair
        is returned whenever candidates exist, never more than are available,
        and an empty list when there are no candidates.
    """
    if not droppable_attributes:
        # random.sample would raise ValueError when asked for 1 item of 0.
        return []
    available = len(droppable_attributes)
    num_to_drop = max(1, int(available * fraction_to_drop))
    # Clamp so a fraction above 1.0 cannot request more items than exist.
    num_to_drop = min(num_to_drop, available)
    return random.sample(droppable_attributes, k=num_to_drop)

def build_prompt(
    profile: Dict[str, Any],
    document: str,
    prompt_type: Literal['biography', 'interview'],
    to_drop_attributes: List[tuple],
    intended_audience: str
) -> dict[str, Any]:
    """Assemble a randomized rewrite prompt and report the choices that shaped it.

    The prompt consists of a profile intro, the profile as indented JSON, a
    document intro, the source document, and a shuffled instruction block. The
    profile and document are each fenced by the same randomly chosen delimiter.

    Returns:
        A dict with the final ``prompt`` text plus the ``tone``, ``style``,
        ``intended_audience``, ``dropped_attributes`` and ``delimiter`` used.
    """
    # The random draws below are made in a fixed order (tone, style, delimiter,
    # instruction template, line shuffle, profile intro, doc intro) so that a
    # seeded run always produces the same sequence of prompts.
    tone = random.choice(COMPONENTS['tones'])
    if prompt_type == "biography":
        style_pool = COMPONENTS["bio_styles"]
    else:
        style_pool = COMPONENTS["interview_styles"]
    style = random.choice(style_pool)
    delimiter = random.choice(DELIMITERS)
    shuffled_template = shuffle_instructions(
        random.choice(COMPONENTS['prompt_instructions'])
    )
    profile_intro = random.choice(COMPONENTS['prompt_profile_intros'])
    profile_json = json.dumps(profile, indent=2)
    doc_intro = random.choice(COMPONENTS['prompt_doc_intros'])

    # Each dropped pair is rendered as ('name', 'value'); the "`, `" separator
    # closes and reopens the backticks the templates place around the field.
    rendered_pairs = []
    for name, value in to_drop_attributes:
        rendered_pairs.append(f"('{name}', '{value}')")
    dropped_text = "`, `".join(rendered_pairs)

    format_fields = {
        "profile_intro": profile_intro,
        "profile_str": profile_json,
        "doc_intro": doc_intro,
        "document": document,
        "tone": tone,
        "style": style,
        "intended_audience": intended_audience,
        "to_drop_attributes": dropped_text,
        "delimiter": delimiter,
    }
    rendered_instructions = shuffled_template.format(**format_fields)

    # Lay the sections out line by line; empty entries are the blank separators.
    sections = [
        profile_intro,
        delimiter,
        profile_json,
        delimiter,
        "",
        doc_intro,
        delimiter,
        document,
        delimiter,
        "",
        rendered_instructions,
    ]
    final_prompt = textwrap.dedent("\n".join(sections)).strip()

    return {
        "prompt": final_prompt,
        "tone": tone,
        "style": style,
        "intended_audience": intended_audience,
        "dropped_attributes": to_drop_attributes,
        "delimiter": delimiter,
    }

def extract_llm_response(response_text: str, delimiter="```") -> str | None:
    """ Extracts text enclosed by the specific delimiter from the LLM response. """
    parts = response_text.split(delimiter)
    if len(parts) >= 3: # Assuming content before, inside, and after delimiter
        return parts[1].strip() # Get only content inside delimiter
    else:
        print(f"Warning: Expected ({delimiter}) delimited response. Got:\n{response_text}")
        ### Check if response is only the delimted content.
        if response_text.strip().startswith(delimiter):
            remaining = response_text.strip()[len(delimiter):]
            end_pos = remaining.find(delimiter)
            if end_pos != -1:
                return remaining[:end_pos].strip()
            
def leak_validator(text: str, to_drop: list[tuple[str, str]]) -> bool:
    """Report whether any dropped value still appears in ``text``.

    The check is a case-insensitive substring match of each value (converted
    with ``str``) against the text. When a match is found, the dropped pairs
    and the text are printed. Returns True on a leak, False otherwise.
    """
    haystack = text.lower()
    leaked = False
    for _attr, value in to_drop:
        if str(value).lower() in haystack:
            leaked = True
            break
    if leaked:
        print(f"Leak found:\n{to_drop}\n{text}")
    return leaked

def _entity_counts(path: str) -> collections.Counter:
    c = collections.Counter()
    if not os.path.exists(path):
        return c
    with open(path) as f:
        for line in f:
            try:
                obj = json.loads(line)
                c[obj['entity']] += 1
            except Exception:
                continue
    return c

def calculate_jaccard_similarity(text1: Optional[str], text2: Optional[str]) -> float:
    """Jaccard similarity between the word sets of two texts.

    Both texts are lower-cased, stripped of ASCII punctuation and split on
    whitespace; the score is ``|A & B| / |A | B|`` over the resulting word
    sets, or 0.0 when both sets are empty.

    Raises:
        ValueError: If either text is missing or empty.
    """
    if not (text1 and text2):
        raise ValueError("Error. Missing one or more texts for comparison.")

    punctuation_remover = str.maketrans("", "", string.punctuation)

    def word_set(raw: str) -> set:
        return set(raw.lower().translate(punctuation_remover).split())

    words_a = word_set(text1)
    words_b = word_set(text2)
    combined = words_a | words_b
    if not combined:
        return 0.0
    return len(words_a & words_b) / len(combined)

In [99]:
# API call with exponential backoff, provided by tenacity's @retry decorator.
@retry(wait=wait_exponential(min=2, max=60),
       stop=stop_after_attempt(5),
       retry_error_callback=lambda s: None)
def call_llm(prompt: str, model_name: str):
    """Send ``prompt`` to the oracle model registered under ``model_name``.

    The callable is looked up in ``ASK_ORACLE_MODEL`` and invoked with the
    prompt. If the call raises, it is retried with an exponentially growing
    wait (bounded by min=2 and max=60) for at most 5 attempts. Once the
    attempts are exhausted, ``retry_error_callback`` makes the function return
    ``None`` instead of raising, so callers must handle a ``None`` result.

    Args:
        prompt: Full prompt text to send.
        model_name: Key into ``ASK_ORACLE_MODEL`` (a missing key raises
            ``KeyError``, which is retried like any other exception).

    Returns:
        Whatever the selected oracle callable returns, or ``None`` after the
        final failed attempt.
    """
    return ASK_ORACLE_MODEL[model_name](prompt)

### Generation Loop

In [100]:
# Resume support: count what earlier runs already wrote for each entity.
bio_counts = _entity_counts(OUT_BIOS)
int_counts = _entity_counts(OUT_INT)
TOTAL_BIOS = NUM_BIOS_PER_ENTITY * len(entity_data)
TOTAL_INTS = NUM_INTERVIEWS_PER_ENTITY * len(entity_data)
bios_done = sum(bio_counts.values())
int_done = sum(int_counts.values())

# A generated text is accepted once its Jaccard similarity to the source
# document falls below the threshold; otherwise a fresh prompt is tried, up to
# MAX_SIMILARITY_RETRIES attempts in total.
MAX_SIMILARITY_RETRIES = 3
SIMILARITY_THRESHOLD = 0.35
# Keys of ASK_ORACLE_MODEL to choose from for each attempt.
LLM_NAMES = ["gpt", "claude"]


def _generate_sample(profile, original_doc, droppable_attrs, prompt_type):
    """Generate one rewritten text, retrying on errors or excessive similarity.

    Every attempt builds a fresh random prompt and picks a random model.

    Returns:
        A dict with keys ``llm``, ``prompt_details`` and ``text`` for the first
        attempt below SIMILARITY_THRESHOLD. If no attempt passes, the most
        recent successfully extracted attempt is returned instead, or ``None``
        when no attempt produced any text.
    """
    last_usable = None
    for attempt in range(1, MAX_SIMILARITY_RETRIES + 1):
        if attempt > 1:
            print(f"Retrying (Attempt {attempt}/{MAX_SIMILARITY_RETRIES}) due to similarity or error.")

        prompt_details = build_prompt(
            profile=profile,
            document=original_doc,
            prompt_type=prompt_type,
            to_drop_attributes=choose_attributes_to_drop(profile, droppable_attrs),
            intended_audience=random.choice(INTENDED_AUDIENCES)
        )
        model_name = random.choice(LLM_NAMES)
        raw_resp = call_llm(prompt_details["prompt"], model_name)

        if raw_resp is None:
            print("Warning! No Raw Response! Retrying...")
            continue

        extracted_text = extract_llm_response(raw_resp)
        if not extracted_text:
            print("Warning! Text Extraction Failed! Retrying...")
            continue

        # Remember this attempt so it can still be used if none passes.
        last_usable = {
            "llm": model_name,
            "prompt_details": prompt_details,
            "text": extracted_text,
        }

        sim = calculate_jaccard_similarity(original_doc, extracted_text)
        print(f"Attempt {attempt}: similarity = {sim:.3f}")
        if sim < SIMILARITY_THRESHOLD:
            return last_usable
        print(f"High similarity ({sim:.3f} >= {SIMILARITY_THRESHOLD}). Retrying...")

    if last_usable is None:
        print("ERROR: failed to generate text after retries; skipping.")
    else:
        print("Max retries reached; accepting last generated text.")
    return last_usable


def _append_jsonl(path, record):
    """Append ``record`` as a single JSON line to ``path``."""
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")


print(f"{bios_done}/{TOTAL_BIOS} bios, {int_done}/{TOTAL_INTS} interviews already saved.")

# One row per sample kind:
# (prompt_type, log label, short log label, output path, per-entity target, resume counter)
SAMPLE_KINDS = [
    ("biography", "Biography", "Bio", OUT_BIOS, NUM_BIOS_PER_ENTITY, bio_counts),
    ("interview", "Interview", "Interview", OUT_INT, NUM_INTERVIEWS_PER_ENTITY, int_counts),
]

for entity in entity_data:
    profile = entity['profile']
    original_docs = entity['docs']
    droppable_attrs = get_droppable_attributes(profile)
    entity_name = profile.get("name", "Unknown")

    # Biographies are generated first, then interviews, for each entity.
    for prompt_type, label, short_label, out_path, per_entity, counts in SAMPLE_KINDS:
        # Start after whatever was already saved for this entity.
        for idx in range(counts.get(entity_name, 0), per_entity):
            print(f"\nGenerating {label} {idx+1}/{per_entity} for {entity_name}...")
            original_doc = random.choice(original_docs)

            sample = _generate_sample(profile, original_doc, droppable_attrs, prompt_type)
            if sample is None:
                continue

            # NOTE: leak_validator() is not applied here; texts are written
            # without checking for leftover mentions of dropped attributes.
            print(f"Writing {short_label} {idx+1} for {entity_name} to {out_path}")
            # Each record is written immediately so an interrupted run loses
            # at most the sample in progress.
            _append_jsonl(out_path, {
                "entity": entity_name,
                "type": prompt_type,
                "llm": sample["llm"],
                "prompt_details": sample["prompt_details"],
                "text": sample["text"]
            })
            counts[entity_name] = counts.get(entity_name, 0) + 1
            bios_done = sum(bio_counts.values())
            int_done = sum(int_counts.values())
            print(f"Progress: {bios_done}/{TOTAL_BIOS} bios | {int_done}/{TOTAL_INTS} interviews")

print("GENERATION FINISHED!")


0/6000 bios, 0/6000 interviews already saved.

Generating Biography 1/500 for Elara Vance...
Attempt 1: similarity = 0.220
Writing Bio 1 for Elara Vance to ../../data_save/bios.jsonl
Progress: 1/6000 bios | 0/6000 interviews

Generating Biography 2/500 for Elara Vance...
Attempt 1: similarity = 0.216
Writing Bio 2 for Elara Vance to ../../data_save/bios.jsonl
Progress: 2/6000 bios | 0/6000 interviews

Generating Biography 3/500 for Elara Vance...
Attempt 1: similarity = 0.230
Writing Bio 3 for Elara Vance to ../../data_save/bios.jsonl
Progress: 3/6000 bios | 0/6000 interviews

Generating Biography 4/500 for Elara Vance...
Attempt 1: similarity = 0.202
Writing Bio 4 for Elara Vance to ../../data_save/bios.jsonl
Progress: 4/6000 bios | 0/6000 interviews

Generating Biography 5/500 for Elara Vance...
Attempt 1: similarity = 0.276
Writing Bio 5 for Elara Vance to ../../data_save/bios.jsonl
Progress: 5/6000 bios | 0/6000 interviews

Generating Biography 6/500 for Elara Vance...
Attempt 1: s