In [1]:
import sys
import json
import random
import textwrap
import re
from copy import deepcopy
from typing import List, Literal, Dict, Any
import os


In [2]:
# Add project root to path
sys.path.append("../../")

In [3]:
from src.utils.oracle_llms import ASK_ORACLE_MODEL

## Config

In [11]:
# How many documents to generate per entity, and how that total is split
# between the two prompt types used by the generation loop.
NUM_SAMPLES_PER_ENTITY = 10
NUM_BIOS_PER_ENTITY = 5
NUM_INTERVIEWS_PER_ENTITY = 5
# Sanity check: the per-type counts must add up to the per-entity total.
assert NUM_BIOS_PER_ENTITY + NUM_INTERVIEWS_PER_ENTITY == NUM_SAMPLES_PER_ENTITY

In [12]:
# JSON file holding the list of entities; each entry is later read through
# its "profile" and "docs" keys.
INPUT_ENTITY_FILE = "../../data_save/synthetic_entities_bio.json"
# Destination path for the generated dataset (only its parent directory is
# touched in this part of the notebook).
OUTPUT_FILENAME = "../../data_save/synthetic_dataset_subset.jsonl"

In [13]:
# Ensure the output directory exists; exist_ok=True keeps this cell safe to re-run.
os.makedirs(os.path.dirname(OUTPUT_FILENAME), exist_ok=True)

## Load Entity Data

In [14]:
# Load the entity list from disk and preview the first entry.
# Each entry is expected to provide a "profile" dict and a "docs" list, which
# is how the generation loop reads it.
try:
    with open(INPUT_ENTITY_FILE, 'r', encoding='utf-8') as f:
        entity_data = json.load(f)
    print(f"Loaded {len(entity_data)} entities from {INPUT_ENTITY_FILE}")
    # Only preview when there is something to show: indexing an empty list
    # would raise an IndexError that neither handler below catches.
    if entity_data:
        print(json.dumps(entity_data[0], indent=2))
except FileNotFoundError:
    print(f"Error: Entity file not found at {INPUT_ENTITY_FILE}")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {INPUT_ENTITY_FILE}")

Loaded 12 entities from ../../data_save/synthetic_entities_bio.json
{
  "profile": {
    "name": "Elara Vance",
    "age": 29,
    "nationality": "Canadian",
    "occupation": "Data Scientist",
    "hobbies": [
      "Hiking",
      "Photography",
      "Reading"
    ],
    "worksAt": {
      "company": "Amazon",
      "position": "Senior Data Scientist",
      "yearsOfExperience": 5,
      "location": "San Francisco, CA"
    },
    "education": {
      "degree": "Master's in Data Science",
      "university": "University of Toronto",
      "graduationYear": 2016
    },
    "languages": [
      {
        "language": "English",
        "proficiency": "Fluent"
      },
      {
        "language": "French",
        "proficiency": "Intermediate"
      }
    ]
  },
  "docs": [
    "Elara Vance, a 29-year-old Canadian national, has established herself as a formidable presence in the data science field. Currently serving as a Senior Data Scientist at Amazon in San Francisco, California, Elara

## Prompt Construction

### Set Globals

In [15]:
# Maps a tone to the set of styles it must not be combined with.
# build_prompt re-samples its (tone, style) pair until the style is not in
# the set listed for the chosen tone; tones without an entry are unrestricted.
BLOCKLIST = {
    "humorous": {"obituary"},
    "casual": {"obituary"},
    "sarcastic": {"obituary"},
    "epic": {"LinkedIn 'About' section", "encyclopedia entry", "Wikipedia stub", "press release"}
}

In [16]:
# Pools of interchangeable prompt fragments. build_prompt draws one entry at
# random from each pool it needs. The "prompt_instructions" templates are
# rendered with str.format, so they may only use the placeholders {tone},
# {style}, {intended_audience} and {to_drop_attributes}; their lines are
# shuffled by shuffle_instructions before rendering.
COMPONENTS = {
    "tones": [
        "neutral", "formal", "humorous", "casual",
        "poetic", "sarcastic", "epic", "inspirational"
    ],
    "bio_styles": [
        "LinkedIn 'About' section", "press release", "presentation intro",
        "fiction book excerpt", "encyclopedia entry", "dating profile",
        "Wikipedia stub", "obituary"
    ],
    "interview_styles": [
        "podcast", "Reddit Ask‑Me‑Anything", "panel Q&A",
        "magazine interview", "conference fireside chat",
        "job interview",
        "random conversation with a stranger",
        "child asking about their parent's life"
    ],
    "prompt_profile_intros": [
        "The following is a profile of a person.",
        "Here's a profile representing a character:",
        "This JSON contains specific info about an entity:",
        "Below is a dictionary that describes an individual.",
        "Consider the following data about a fictional human."
    ],
    "prompt_doc_intros": [
        "And here is a biography derived from that profile:",
        "Next, a personal narrative based on the data above:",
        "Now read the following document generated from the prior information:",
        "Consider this life story constructed from the attributes listed above:",
        "The following is a brief history of the entity described by the preceding attributes:"
    ],
    "prompt_instructions": [
        """
        Remove all the information about the attributes `{to_drop_attributes}`. Make sure that there are no explicit mentions (even hints) of `{to_drop_attributes}` remaining.
        Rewrite the biography in the style of a {tone} {style} for an intended audience of {intended_audience}, while retaining all remaining information.
        Put your answer within triple backticks (```). Make sure that there are no other triple backticks in your answer.
        Do not add any other new information to your answer.
        Make sure that the structure of your writing is significantly unique, while maintaining the same information.
        """,
        """
        Repurpose the document into a {tone} {style}.
        The {style} you will generate is intended to be read by {intended_audience}.
        You need to drop any and all mention of the following details: `{to_drop_attributes}`.
        Make sure you remove all reference to `{to_drop_attributes}` but retain all the other attribute content!
        Since we want to control for content, you absolutely must not add new content in your output.
        Importantly, we need your output to have triple backticks (```) before and after. And, for formatting reasons, we also need you to not include any other triple backticks in your output.
        """,
        """
        Your generation should be enclosed in triple backticks (```). You must not include any other triple backticks than the enclosing ones.
        You are to use the preceding biography to generate a new {tone} {style}.
        The intended audience of your generation will be {intended_audience}.
        Crucially, you must remove all reference to these attributes from the entity data: `{to_drop_attributes}`. Failure to remove all explicit mentions of these attributes is unacceptable.
        You absolutely must not add any new substantial details!
        You are encouraged to make your generation unique and distinctive.
        """
    ]
}

In [17]:
# Audience labels; the generation loop picks one at random per prompt and
# build_prompt substitutes it for {intended_audience} in the instructions.
INTENDED_AUDIENCES = [
    "general public", "tech enthusiasts", "potential employers", "academic peers",
    "industry colleagues", "students", "journalists", "investors",
    "conference attendees", "blog readers", "podcast listeners", "graduate students",
    "hiring managers", "tech journalists", "first date partners"  # Added from example
]

In [18]:
# Candidate fence strings; build_prompt picks one at random and places it
# around both the profile JSON and the source document.
DELIMITERS = ["###", "~~~", "---"]

In [19]:
# Top-level profile keys that get_attributes_to_drop may select for removal.
# "name" is not listed, so it is never selected.
BASE_DROPPABLE_ATTRIBUTES = [
    "age", "nationality", "occupation", "hobbies", "worksAt", "education", "languages"
]

### Helper Functions

In [81]:
def get_droppable_attributes(profile):
    """
    Flatten a profile into a list of ``(attribute, value)`` pairs.

    The ``name`` key is skipped. Scalars (str/int/float) yield one pair under
    their own key. A list of strings yields one pair per element under the
    list's key. A list of dicts yields one pair per element, using the first
    value of each dict. A nested dict yields one pair per inner key, keyed by
    the inner key rather than the outer one. Anything else is ignored.
    """
    pairs = []
    for key, val in profile.items():
        if key == 'name':
            continue

        if isinstance(val, (str, int, float)):
            pairs.append((key, val))
        elif isinstance(val, list):
            if not val:
                continue
            # The type of the first element decides how the whole list is read.
            head = val[0]
            if isinstance(head, str):
                pairs.extend([(key, entry) for entry in val])
            elif isinstance(head, dict):
                for entry in val:
                    first_value = next(iter(entry.values()))
                    pairs.append((key, first_value))
        elif isinstance(val, dict):
            pairs.extend(val.items())
    return pairs

# Preview the flattened attribute pairs (and their count) for every entity.
for entity_record in entity_data:
    all_attributes = get_droppable_attributes(entity_record['profile'])
    print(all_attributes)
    print("Num Attributes:", len(all_attributes))

[('age', 29), ('nationality', 'Canadian'), ('occupation', 'Data Scientist'), ('hobbies', 'Hiking'), ('hobbies', 'Photography'), ('hobbies', 'Reading'), ('company', 'Amazon'), ('position', 'Senior Data Scientist'), ('yearsOfExperience', 5), ('location', 'San Francisco, CA'), ('degree', "Master's in Data Science"), ('university', 'University of Toronto'), ('graduationYear', 2016), ('languages', 'English'), ('languages', 'French')]
Num Attributes: 15
[('age', 32), ('nationality', 'American'), ('occupation', 'Software Engineer'), ('hobbies', 'Hiking'), ('hobbies', 'Rock Climbing'), ('hobbies', 'Chess'), ('company', 'Amazon'), ('position', 'Lead Developer'), ('yearsOfExperience', 8), ('location', 'Seattle, WA'), ('degree', "Bachelor's in Computer Science"), ('university', 'Stanford University'), ('graduationYear', 2014), ('languages', 'English'), ('languages', 'Spanish')]
Num Attributes: 15
[('age', 27), ('nationality', 'British'), ('occupation', 'UX Designer'), ('hobbies', 'Painting'), ('h

In [29]:
def shuffle_instructions(instructions: str) -> str:
    """
    Return the instruction text with its non-blank lines in random order.

    The text is dedented and stripped first, blank lines are discarded, and
    the surviving lines are shuffled in place with ``random.shuffle`` before
    being joined back together with newlines.
    """
    normalized = textwrap.dedent(instructions).strip()
    # str.strip doubles as the predicate: whitespace-only lines strip to "".
    kept_lines = list(filter(str.strip, normalized.split("\n")))
    random.shuffle(kept_lines)
    return "\n".join(kept_lines)

def get_droppable_attributes(profile):
    """
    Flatten a profile into ``(attribute, value)`` pairs, skipping ``name``.

    Scalars keep their own key; lists of strings and lists of dicts contribute
    one pair per element under the list's key (for dicts, the first value of
    each element); nested dicts contribute one pair per inner key.
    """
    # NOTE(review): a function with this exact name and body is also defined in
    # an earlier cell; whichever cell runs last wins. Consider keeping only one.

    def _pairs_for(key, val):
        # Expand a single profile entry into zero or more pairs.
        if isinstance(val, (str, int, float)):
            return [(key, val)]
        if isinstance(val, list):
            if val and isinstance(val[0], str):
                return [(key, member) for member in val]
            if val and isinstance(val[0], dict):
                return [(key, next(iter(member.values()))) for member in val]
            return []
        if isinstance(val, dict):
            return list(val.items())
        return []

    flattened = []
    for key, val in profile.items():
        if key != 'name':
            flattened += _pairs_for(key, val)
    return flattened

def get_attributes_to_drop(
    profile: Dict[str, Any],
    base_droppable_attributes: List[str],
    fraction_to_drop: float = .5
) -> List[str]:
    """
    Randomly select a fraction of top-level attributes from the profile to drop.

    Args:
        profile: Entity profile; only its top-level keys are inspected.
        base_droppable_attributes: Candidate attribute names. Candidates that
            are not keys of ``profile`` are ignored.
        fraction_to_drop: Share of the available candidates to select. The
            resulting count is truncated to an int and then clamped to the
            range ``[1, number of available candidates]``.

    Returns:
        A list of distinct attribute names in random order, or an empty list
        when none of the candidates is present in the profile.
    """
    available_attributes = [attr for attr in base_droppable_attributes if attr in profile]
    if not available_attributes:
        # random.sample cannot draw k=1 from an empty population.
        return []

    requested = int(len(available_attributes) * fraction_to_drop)
    # At least one attribute, but never more than exist (random.sample raises
    # ValueError when k exceeds the population size, e.g. fraction_to_drop > 1).
    num_to_drop = min(len(available_attributes), max(1, requested))
    return random.sample(available_attributes, k=num_to_drop)

def build_prompt(
    profile: Dict[str, Any],
    document: str,
    prompt_type: Literal['biography', 'interview'],
    to_drop_attributes: List[str],
    intended_audience: str
) -> Dict[str, Any]:
    """
    Build a randomized prompt string and return it with the chosen components.

    Args:
        profile: Entity profile; embedded in the prompt as indented JSON.
        document: Source document to be rewritten; embedded verbatim.
        prompt_type: ``"biography"`` draws the style from ``bio_styles``; any
            other value draws it from ``interview_styles``.
        to_drop_attributes: Attribute names the instructions ask to remove.
        intended_audience: Audience label substituted into the instructions.

    Returns:
        A dict with the keys ``prompt``, ``tone``, ``style``,
        ``intended_audience``, ``dropped_attributes`` and ``delimiter``. The
        misspelled key ``delimter`` is also present, with the same value, for
        consumers that still read the old name.
    """
    comp = COMPONENTS
    style_pool = comp["bio_styles" if prompt_type == "biography" else "interview_styles"]

    # Rejection-sample a (tone, style) pair until it is not ruled out by BLOCKLIST.
    while True:
        tone = random.choice(comp['tones'])
        style = random.choice(style_pool)
        if style not in BLOCKLIST.get(tone, set()):
            break

    delimiter = random.choice(DELIMITERS)
    instruction_template = random.choice(comp['prompt_instructions'])
    # Shuffle the template's lines before filling in the placeholders, so the
    # substituted values never affect how the lines are split.
    shuffled_instructions = shuffle_instructions(instruction_template)

    pieces = {
        "profile_intro": random.choice(comp['prompt_profile_intros']),
        "profile_str": json.dumps(profile, indent=2),  # Keep profile as structured JSON
        "doc_intro": random.choice(comp['prompt_doc_intros']),
        "document": document,
        "tone": tone,
        "style": style,
        "intended_audience": intended_audience,
        # The templates wrap this placeholder in backticks, so joining with
        # "`, `" renders every attribute name individually back-quoted.
        "to_drop_attributes": "`, `".join(to_drop_attributes),
        "delimiter": delimiter
    }

    # str.format ignores the entries of `pieces` that the template does not use.
    rendered_instructions = shuffled_instructions.format(**pieces)

    # Assemble the final prompt
    prompt = f"""{pieces['profile_intro']}
{delimiter}
{pieces['profile_str']}
{delimiter}

{pieces['doc_intro']}
{delimiter}
{pieces['document']}
{delimiter}

{rendered_instructions}"""

    final_prompt = textwrap.dedent(prompt).strip()

    return {
        "prompt": final_prompt,
        "tone": tone,
        "style": style,
        "intended_audience": intended_audience,
        "dropped_attributes": to_drop_attributes,
        "delimiter": delimiter,
        # Deprecated alias: the key was originally misspelled. Kept so records
        # and readers that rely on the old spelling continue to work.
        "delimter": delimiter,
    }

def extract_llm_response(response_text: str, delimiter="```") -> str | None:
    """ Extracts text enclosed by the specific delimiter from the LLM response. """
    parts = response_text.split(delimiter)
    if len(parts) >= 3: # Assuming content before, inside, and after delimiter
        return parts[1].strip() # Get only content inside delimiter
    else:
        print(f"Warning: Expected ({delimiter}) delimited response. Got:\n{response_text}")
        ### Fallback attempt. Check if response is only the delimted content.
        if response_text.strip().startswith(delimiter):
            remaining = response_text.strip()[len(delimiter):]
            end_pos = remaining.find(delimiter)
            if end_pos != -1:
                return remaining[:end_pos].strip()

### Generation Loop

In [28]:
# Progress-log wording per prompt type. The two types use slightly different
# indentation and punctuation in their messages, so each keeps its own.
_SAMPLE_LOGGING = {
    "biography": {
        "label": "Bio",
        "indent": " ",
        "no_text": "Could not extract text from LLM response",
    },
    "interview": {
        "label": "Interview",
        "indent": "    ",
        "no_text": "Could not extract text from LLM response.",
    },
}


def _generate_samples(
    profile: Dict[str, Any],
    original_docs: List[str],
    entity_name: str,
    prompt_type: Literal['biography', 'interview'],
    num_samples: int,
    results: List[Dict[str, Any]],
    max_attempts_per_sample: int = 3,
) -> int:
    """
    Generate up to ``num_samples`` documents of one prompt type for one entity.

    Each attempt picks a random source document, a random set of attributes to
    drop, a random audience and a random oracle model, builds a prompt, calls
    the model and extracts the fenced answer. Successful records are appended
    to ``results`` immediately, so an interrupted run keeps what it already
    produced.

    Args:
        profile: The entity's profile dict.
        original_docs: The entity's source documents.
        entity_name: Name stored in each record.
        prompt_type: ``"biography"`` or ``"interview"``.
        num_samples: Number of successful generations to aim for.
        results: List that receives one record per successful generation.
        max_attempts_per_sample: Attempt budget multiplier; the loop gives up
            after ``num_samples * max_attempts_per_sample`` attempts.

    Returns:
        The number of records appended to ``results``.
    """
    log = _SAMPLE_LOGGING[prompt_type]
    generated = 0
    attempts = 0
    max_attempts = num_samples * max_attempts_per_sample

    while generated < num_samples and attempts < max_attempts:
        attempts += 1
        try:
            # Pick the document by index so the recorded index is the one that
            # was actually used, even if the same text appears more than once.
            doc_index = random.randrange(len(original_docs))
            attributes_to_drop = get_attributes_to_drop(profile, BASE_DROPPABLE_ATTRIBUTES)
            intended_audience = random.choice(INTENDED_AUDIENCES)

            prompt_details = build_prompt(
                profile=profile,
                document=original_docs[doc_index],
                prompt_type=prompt_type,
                to_drop_attributes=attributes_to_drop,
                intended_audience=intended_audience
            )

            model_name = random.choice(['claude', 'gpt'])
            # ASK_ORACLE_MODEL maps a model name to a callable taking the prompt;
            # its return value is treated as the raw response text.
            llm_response = ASK_ORACLE_MODEL[model_name](prompt_details['prompt'])
            generated_text = extract_llm_response(llm_response)

            if generated_text:
                results.append({
                    "entity_name": entity_name,
                    "prompt_type": prompt_type,
                    "original_profile": profile,
                    "base_document": doc_index,  # Index into the entity's docs, not the text
                    "prompt_details": prompt_details,
                    "llm_used": model_name,
                    "raw_llm_response": llm_response,  # Remove for efficiency!
                    "generated_text": generated_text,
                })
                generated += 1
                print("\n" + generated_text + "\n")
                print(f"{log['indent']}Generated {log['label']} {generated}/{num_samples} (Attempt {attempts})")
            else:
                print(f"{log['indent']}Attempt {attempts} failed: {log['no_text']}")

        except Exception as e:
            # Best effort: a failed attempt is logged and counts against the
            # attempt budget. KeyboardInterrupt is not an Exception and still
            # stops the run.
            print(f"{log['indent']}Attempt {attempts} failed: Error during generation: {e}")

    return generated


generated_data = []

for i, entity in enumerate(entity_data):
    profile = entity["profile"]
    original_docs = entity["docs"]
    entity_name = profile.get('name', f"Entity_{i}")
    print(f"\n--- Processing Entity {i+1}/{len(entity_data)}: {entity_name} ---")

    # Generate Biographies
    print(f" Generating {NUM_BIOS_PER_ENTITY} biographies...")
    _generate_samples(
        profile=profile,
        original_docs=original_docs,
        entity_name=entity_name,
        prompt_type="biography",
        num_samples=NUM_BIOS_PER_ENTITY,
        results=generated_data,
    )

    # Generate Interviews
    print(f"  Generating {NUM_INTERVIEWS_PER_ENTITY} interviews...")
    _generate_samples(
        profile=profile,
        original_docs=original_docs,
        entity_name=entity_name,
        prompt_type="interview",
        num_samples=NUM_INTERVIEWS_PER_ENTITY,
        results=generated_data,
    )

    print(f"  Finished {entity_name}. Total generated examples so far: {len(generated_data)}")

print(f"\nGeneration Complete. Total examples generated: {len(generated_data)}")
    


--- Processing Entity 1/12: Elara Vance ---
 Generating 5 biographies...

Meet Elara Vance: a dedicated Senior Data Scientist making impactful contributions at Amazon's San Francisco office. With five years of experience in the field, Elara has leveraged her expertise to drive innovative solutions and support Amazon’s data-driven initiatives. She holds a Master’s in Data Science from the University of Toronto, where she developed a strong foundation in analytics and advanced computational methods prior to moving to California.

Outside of her professional achievements, Elara is known for her vibrant approach to life. She’s an avid hiker, exploring the diverse trails of California, and a keen photographer, capturing the beauty of her adventures with a creative perspective. When she’s not out in nature, Elara enjoys immersing herself in a wide range of books—from technical literature to engaging fiction—demonstrating her ongoing commitment to personal and professional growth. Her passio

KeyboardInterrupt: 