In [2]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the project code under src/) are reloaded before each cell executes
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys

sys.path.append("../../")

from src.utils.oracle_llms import ASK_ORACLE_MODEL
import json

In [4]:
# Load the synthetic entities. Each entry is a dict with a "profile" (the
# structured attributes) and "docs" (a list of biographies for that profile).
# The encoding is stated explicitly so the read does not depend on the platform
# default and matches the UTF-8 encoding used for the report files written below.
with open("../../data_save/synthetic_entities_bio.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Show the first entity as a sanity check of the structure.
data[0]

{'profile': {'name': 'Elara Vance',
  'age': 29,
  'nationality': 'Canadian',
  'occupation': 'Data Scientist',
  'hobbies': ['Hiking', 'Photography', 'Reading'],
  'worksAt': {'company': 'Amazon',
   'position': 'Senior Data Scientist',
   'yearsOfExperience': 5,
   'location': 'San Francisco, CA'},
  'education': {'degree': "Master's in Data Science",
   'university': 'University of Toronto',
   'graduationYear': 2016},
  'languages': [{'language': 'English', 'proficiency': 'Fluent'},
   {'language': 'French', 'proficiency': 'Intermediate'}]},
 'docs': ["Elara Vance, a 29-year-old Canadian national, has established herself as a formidable presence in the data science field. Currently serving as a Senior Data Scientist at Amazon in San Francisco, California, Elara brings five years of specialized experience to her role. Her academic foundation includes a Master's degree in Data Science from the University of Toronto, which she completed in 2016. Fluent in English and possessing interm

In [6]:
import random

# Top-level profile keys that may be selected for dropout ("name" is never dropped).
base_droppable_attributes = [
    "age", "nationality", "occupation", "hobbies", "worksAt", "education", "languages"
]

def get_specific_attribute_to_drop(profile, max_attempts=100):
    """
    Selects a specific attribute to drop, accounting for nested attributes.

    A top-level key is drawn at random from `base_droppable_attributes` and
    narrowed down to one concrete detail:
      - "age", "nationality", "occupation": the value itself
      - "hobbies": one randomly chosen hobby
      - "worksAt", "education": one randomly chosen (sub_key, sub_value) pair
      - "languages": the 'language' name of one randomly chosen entry

    If the drawn key is missing from the profile or its value does not have the
    expected structure, a warning is printed and another key is drawn.

    Args:
        profile: Mapping holding the person's attributes.
        max_attempts: Maximum number of random draws before giving up, so that a
            profile without any usable attribute cannot loop forever.

    Returns:
        A string describing the selected detail, e.g. "'age': '29'" or
        "'worksAt': ('company', 'Amazon')".

    Raises:
        ValueError: If no usable attribute was found within `max_attempts` draws.
    """
    for _ in range(max_attempts):
        parent_attr = random.choice(base_droppable_attributes)
        try:
            # The lookup is inside the try block so that a missing key triggers
            # a re-selection instead of an uncaught KeyError.
            attr_value = profile[parent_attr]
            if parent_attr in ["age", "nationality", "occupation"]:
                return f"'{parent_attr}': '{attr_value}'"
            elif parent_attr == "hobbies":
                chosen_hobby = random.choice(attr_value)
                return f"'{parent_attr}': '{chosen_hobby}'"
            elif parent_attr in ["worksAt", "education"]:
                sub_key = random.choice(list(attr_value.keys()))
                sub_value = attr_value[sub_key]
                return f"'{parent_attr}': ('{sub_key}', '{sub_value}')"
            elif parent_attr == "languages":
                lang_entry = random.choice(attr_value)
                lang_name = lang_entry.get('language', 'N/A')
                return f"'{parent_attr}': '{lang_name}'"
        except (TypeError, IndexError, KeyError, AttributeError) as e:
            # AttributeError covers values of the wrong type, e.g. None where a
            # dict is expected (`.keys()`) or a plain string language entry (`.get`).
            print(f"Warning: Data structure error for {parent_attr} in {profile.get('name')}'s profile: {e}. Re-selecting.")

    raise ValueError(
        f"No droppable attribute could be selected from {profile.get('name')}'s profile "
        f"after {max_attempts} attempts."
    )

## Dropping a Single Attribute

In [9]:
def format_instructions(profile_str, biography, to_drop_attribute):
    """
    Builds the prompt asking a model to paraphrase a biography with one attribute removed.

    Args:
        profile_str: The person's profile; it is interpolated into the prompt as text.
        biography: The original biography to be paraphrased.
        to_drop_attribute: Description of the attribute whose information must be
            removed from the biography.

    Returns:
        The full instruction prompt as a string.
    """
    # The opening sentence previously read "The following the a profile";
    # it now matches the wording used by the multi-attribute prompt.
    instructions = f"""
    The following is a profile of a person.
    ```
    {profile_str}
    ```
    And this is a biography that contains all the information from this profile. 
    ```
    {biography}
    ```

    I want you to
    1. Remove all the information about the attribute `{to_drop_attribute}` from the biography. Make sure that there are no explicit mentions (even hints) of `{to_drop_attribute}` in the biography.
    2. Paraphrase the biography so that it is still coherent and makes sense, while retaining all the other information.
    3. Put your answer within triple backticks (```). Make sure that there are no other triple backticks in your answer.
    4. Do not add any other new information to your answer.
    """
    return instructions

# For every entity: pick one attribute detail at random, ask both oracle models
# to paraphrase the biography without it, and append the results to a text report.
output_filename = "paraphrased_dropout_bios.txt"
# Append mode: re-running this cell adds to the existing report instead of overwriting it.
with open(output_filename, 'a', encoding='utf-8') as outfile:
    for entry in data:
        profile = entry["profile"]
        profile_name = profile.get('name')

        outfile.write(f"# Response for {profile_name}\n\n")

        to_drop_attr = get_specific_attribute_to_drop(profile)

        # Always use the entity's first biography. Indexing `docs` with the entity
        # index selected a different document for every entity and raises
        # IndexError once that index exceeds the number of docs an entity has.
        original_bio = entry['docs'][0]

        instruction = format_instructions(profile, original_bio, to_drop_attr)
        print("Instruction:\n" + instruction + "\n")

        # The same prompt is sent to both oracle models so their outputs can be compared.
        response_claude = ASK_ORACLE_MODEL["claude"](instruction)
        response_gpt = ASK_ORACLE_MODEL["gpt"](instruction)

        outfile.write(f"## Original Bio:\n{original_bio}\n\n")
        outfile.write(f"## Claude 3.7 Paraphrased Bio With '{to_drop_attr}' Dropped:\n{response_claude}\n\n")
        outfile.write(f"## GPT-4o Paraphrased Bio With '{to_drop_attr}' Dropped:\n{response_gpt}\n\n")
        outfile.write("\n\n\n")


Instruction:

    The following the a profile of a person.
    ```
    {'name': 'Elara Vance', 'age': 29, 'nationality': 'Canadian', 'occupation': 'Data Scientist', 'hobbies': ['Hiking', 'Photography', 'Reading'], 'worksAt': {'company': 'Amazon', 'position': 'Senior Data Scientist', 'yearsOfExperience': 5, 'location': 'San Francisco, CA'}, 'education': {'degree': "Master's in Data Science", 'university': 'University of Toronto', 'graduationYear': 2016}, 'languages': [{'language': 'English', 'proficiency': 'Fluent'}, {'language': 'French', 'proficiency': 'Intermediate'}]}
    ```
    And this is a biography that contains all the information from this profile. 
    ```
    Elara Vance, a 29-year-old Canadian national, has established herself as a formidable presence in the data science field. Currently serving as a Senior Data Scientist at Amazon in San Francisco, California, Elara brings five years of specialized experience to her role. Her academic foundation includes a Master's degree

## Dropping Half of the Attributes at Once

In [8]:
# Top-level profile keys that may be selected for dropout.
# NOTE(review): this repeats, with identical contents, the list already defined
# in the cell that declares get_specific_attribute_to_drop; keep the two in sync
# or remove one of them.
base_droppable_attributes = [
    "age", "nationality", "occupation", "hobbies", "worksAt", "education", "languages"
]


def format_instructions_multi_dropout(profile_str, biography, attributes_to_drop_list):
    """
    Creates the prompt for dropping multiple attributes.

    Args:
        profile_str: The person's profile; it is interpolated into the prompt as text.
        biography: The original biography to be paraphrased.
        attributes_to_drop_list: Iterable of attribute descriptions whose
            information must be removed; each becomes one bullet in the prompt.

    Returns:
        The full instruction prompt as a string.
    """
    # Join with the template's four-space indent so every bullet lines up with
    # the surrounding prompt text; a bare newline indented only the first bullet.
    attributes_list_str = "\n    ".join(f"- {attr}" for attr in attributes_to_drop_list)

    instructions = f"""
    The following is a profile of a person.
    ```
    {profile_str}
    ```
    And this is a biography that contains information from this profile. 
    ```
    {biography}
    ```

    I want you to
    1. Remove all the information about each of the following attributes/details from the biography:
    {attributes_list_str}
    Make sure that there are no explicit mentions (even hints) of any of these specific pieces of information remaining in the final text.
    2. Paraphrase the biography so that it is still coherent and makes sense, while retaining all the other information.
    3. Put your answer within triple backticks (```). Make sure that there are no other triple backticks in your answer.
    4. Do not add any other new information to your answer.
    """
    return instructions

# For every entity: select several distinct attribute details at random, ask the
# Claude oracle to paraphrase the first biography without them, and append the
# result to a text report.
output_filename = "paraphrased_bios_dropout_half.txt"
num_to_drop = 7
# Append mode: re-running this cell adds to the existing report instead of overwriting it.
with open(output_filename, 'a', encoding='utf-8') as outfile:
    for entry in data:
        profile = entry["profile"]
        profile_name = profile.get('name')
        original_bio = entry['docs'][0]
        profile_str = str(profile)

        outfile.write(f"# Response for {profile_name}\n\n")
        # Header spelling fixed (was "Orginal Bio").
        outfile.write(f"## Original Bio:\n```\n{original_bio}\n```\n\n")

        # Select unique attributes to drop. The selector draws at random and may
        # repeat itself, so the number of draws is capped to guarantee termination
        # when the profile offers fewer than `num_to_drop` distinct details.
        attributes_to_drop_set = set()
        attempts = 0
        max_attempts = num_to_drop * 5
        while len(attributes_to_drop_set) < num_to_drop and attempts < max_attempts:
            attribute = get_specific_attribute_to_drop(profile)
            if attribute:
                attributes_to_drop_set.add(attribute)
            attempts += 1

        if len(attributes_to_drop_set) < num_to_drop:
            print(f"Warning: Could only select {len(attributes_to_drop_set)} unique attributes.")

        # Sorted so the prompt and the report list the attributes in a stable order.
        attributes_to_drop_list = sorted(attributes_to_drop_set)

        instruction = format_instructions_multi_dropout(profile_str, original_bio, attributes_to_drop_list)
        try:
            response = ASK_ORACLE_MODEL["claude"](instruction)
        except Exception as e:
            # Best effort: report which profile failed and continue with the next one.
            print(f"  Error generating response for {profile_name}: {e}")
        else:
            # Written outside the try block so that a failing file write is not
            # misreported as a generation error.
            dropped_attrs_str = ", ".join(attributes_to_drop_list)
            outfile.write(f"## Bio With {dropped_attrs_str} Dropped:\n{response}\n\n")

        outfile.write("\n\n\n")


In [7]:
# Build one single-attribute dropout prompt by hand for the first entity, using
# a pretty-printed JSON profile and the entity's fourth biography (index 3).
profile_str = json.dumps(data[0]["profile"], indent=2)
# to_drop_attribute = "age"
to_drop_attribute = "nationality"
biography = data[0]["docs"][3]

# The opening sentence previously read "The following the a profile";
# it now matches the wording used by the multi-attribute prompt.
instructions = f"""
The following is a profile of a person.
```
{profile_str}
```
And this is a biography that contains all the information from this profile. 
```
{biography}
```

I want you to
1. Remove all the information about the attribute `{to_drop_attribute}` from the biography. Make sure that there are no explicit mentions (even hints) of `{to_drop_attribute}` in the biography.
2. Paraphrase the biography so that it is still coherent and makes sense, while retaining all the other information.
3. Put your answer within triple backticks (```). Make sure that there are no other triple backticks in your answer.
4. Do not add any other new information to your answer.
"""

print(instructions)


The following the a profile of a person.
```
{
  "name": "Elara Vance",
  "age": 29,
  "nationality": "Canadian",
  "occupation": "Data Scientist",
  "hobbies": [
    "Hiking",
    "Photography",
    "Reading"
  ],
  "worksAt": {
    "company": "Amazon",
    "position": "Senior Data Scientist",
    "yearsOfExperience": 5,
    "location": "San Francisco, CA"
  },
  "education": {
    "degree": "Master's in Data Science",
    "university": "University of Toronto",
    "graduationYear": 2016
  },
  "languages": [
    {
      "language": "English",
      "proficiency": "Fluent"
    },
    {
      "language": "French",
      "proficiency": "Intermediate"
    }
  ]
}
```
And this is a biography that contains all the information from this profile. 
```
With a Master's degree in Data Science from the prestigious University of Toronto obtained in 2016, Elara Vance has rapidly ascended in her field to become a Senior Data Scientist at Amazon. At just 29 years old, this Canadian national has alr

In [8]:
# Send the hand-built prompt to one oracle model and show its raw reply.
# Set the key to "gpt" to send the same prompt to the other oracle instead.
oracle_name = "claude"
response = ASK_ORACLE_MODEL[oracle_name](instructions)
print(response)

```
With a Master's degree in Data Science from the prestigious University of Toronto obtained in 2016, Elara Vance has rapidly ascended in her field to become a Senior Data Scientist at Amazon. At just 29 years old, she has already accumulated five years of valuable experience at the tech giant's San Francisco, California location. Elara navigates her professional environment with fluent English skills, while her intermediate French proficiency expands her global perspective. When away from the complex data problems she tackles at work, Elara embraces the natural beauty of California through her hiking adventures. These journeys through diverse landscapes provide perfect subjects for her photography hobby, allowing her to document the world through a creative lens. Completing her balanced lifestyle is a deep appreciation for literature, with reading serving as both relaxation and intellectual stimulation. This harmonious blend of professional dedication and personal passions illustrat

In [9]:
# The prompt asks the model to wrap its answer in triple backticks; take the text
# that follows the first fence (i.e. the fenced block when the fence is closed).
fence_parts = response.split("```")
if len(fence_parts) < 2:
    # Without any fence there is nothing to extract; fail with a clear message
    # instead of a bare IndexError.
    raise ValueError("Model response contains no triple-backtick block to extract.")
paraphrased = fence_parts[1].strip()
print(paraphrased)

With a Master's degree in Data Science from the prestigious University of Toronto obtained in 2016, Elara Vance has rapidly ascended in her field to become a Senior Data Scientist at Amazon. At just 29 years old, she has already accumulated five years of valuable experience at the tech giant's San Francisco, California location. Elara navigates her professional environment with fluent English skills, while her intermediate French proficiency expands her global perspective. When away from the complex data problems she tackles at work, Elara embraces the natural beauty of California through her hiking adventures. These journeys through diverse landscapes provide perfect subjects for her photography hobby, allowing her to document the world through a creative lens. Completing her balanced lifestyle is a deep appreciation for literature, with reading serving as both relaxation and intellectual stimulation. This harmonious blend of professional dedication and personal passions illustrates w