In [None]:
import pickle
import pandas as pd

file_path = "data/anthology/Meta-Llama-3-70B_demographics_survey+political_affiliation_batch_1+2+3_no_word_cutoff.pkl"

try:
    with open(file_path, "rb") as f:
        df = pd.read_pickle(file_path)
except (EOFError, pickle.UnpicklingError) as e:
    print("Error loading pickle file:", e)

In [146]:
print(df.keys())

Index(['age_category_4_count_noncompliant',
       'age_category_4_is_llm_parsing_result',
       'age_category_4_is_survey_result',
       'age_category_4_llm_parsing_parsed_choice',
       'age_category_4_llm_parsing_prompt',
       'age_category_4_llm_parsing_question',
       'age_category_4_llm_parsing_response',
       'age_category_4_response_distribution', 'age_category_4_response_texts',
       'age_category_4_survey_choices', 'age_category_4_survey_prompt',
       'age_category_4_survey_question', 'education_level_count_noncompliant',
       'education_level_is_llm_parsing_result',
       'education_level_is_survey_result',
       'education_level_llm_parsing_parsed_choice',
       'education_level_llm_parsing_prompt',
       'education_level_llm_parsing_question',
       'education_level_llm_parsing_response',
       'education_level_response_distribution',
       'education_level_response_texts', 'education_level_survey_choices',
       'education_level_survey_prompt', 'edu

In [147]:
# for column in df.columns:
#     print(f"Value counts for column '{column}':")
#     print(df[column].value_counts())
#     print("\n" + "-"*50 + "\n")

In [148]:
import re

def count_sentences(paragraph):
    # Split sentences using punctuation (., !, ?) as well as commas
    sentences = re.split(r'[.!?]', paragraph)  # Split by sentence-ending punctuation
    return len(sentences)

In [166]:
personas = []

# The target phrase
target_phrase_start = "Answer:"
target_phrase_end = "Question:"

for text in df["age_category_4_llm_parsing_prompt"]:
    # Find the start index of the target phrase and extract text after it
    start_index = text.find(target_phrase_start) 
    stop_index = text.find(target_phrase_end, start_index) 
    persona = text[start_index+len(target_phrase_start)+1:stop_index]
    count = count_sentences(persona)
    if count > 20:
        personas.append({"persona": persona, "len": count})

In [167]:
len(personas)

4895

In [190]:
persona_prompt = "Provide a biography that includes all of the following attributes. It should not be presented as a list, but as if someone was providing the information naturally in an introduction in the first person. It should be contained to one paragraph. Here is a biography that you can add details to contain all of the attributes below.\n"

attributes_prompt = """Here are the attributions that must be in the biography:
- Name
- Age
- Where they're from
- Where their parents are from
- Religion and extent it is adhered to
- Socioeconomic status - current
- Socioeconomic status - childhood
- Siblings (including name, age, relationship)
- Languages and dialects spoken
- Sexual orientation
- Gender identity
- Relationship status
- Significant past relationships
- Occupation (current and past)
- Education
- Cultural influences
- Political views
- Health and wellness
- Hobbies and interests
- Values and beliefs outside religion
- Fears and anxieties
- Life goals and ambitions
- Defining life experiences
- Friendship circles
- Daily routine and habits
- Pet ownership (current and past)
- Favorite media
- Living situation
- Places traveled to"""


In [204]:
import os
import glob
import re
import json
import random
import time
from absl import app, flags
from tqdm import tqdm
from datetime import datetime
import openai
from openai import OpenAI
from transformers import AutoTokenizer
try:
    from vllm import LLM, SamplingParams
    import ray
except ImportError:
    pass
with open(os.path.abspath('openai_key'), 'r') as f:
    # ../.. for notebook, .. for script
    client = OpenAI(api_key=f.read().rstrip('\n'))

vllm_models = [
    'meta-llama/Llama-2-13b-hf', 'Llama-2-13b-hf', 'Llama13b', 'llama13b',
    'mistralai/Mistral-7B-v0.1', 'Mistral', 'mistral',
    'mistralai/Mixtral-8x7B-v0.1', 'Mixtral', 'mixtral',
    'meta-llama/Llama-2-70b-hf', 'Llama-2-70b-hf', 'Llama70b', 'llama70b',
    'meta-llama/Meta-Llama-3-70B-Instruct', 'Llama-3-70B-Instruct', 'meta-llama/Llama-3.1-70B-Instruct', 'meta-llama/Llama-3.1-8B-Instruct',
    'meta-llama/Meta-Llama-3-8B-Instruct', 'Llama-3-8B-Instruct',
    'mistralai/Mistral-7B-Instruct-v0.1', 'mistral-instruct',
    'google/gemma-7b', 'gemma'
]

# run 'ray start --head --num-gpus <NUM>' in bash first!
def setup_llm(model_name, config):
    if model_name not in vllm_models:
        return None, None
    if config['gpus'] > 1:
        ray.init(ignore_reinit_error=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    llm = LLM(model=model_name, tensor_parallel_size=config['gpus'], download_dir='/raid/users/aryanshs/huggingface_hub_models', gpu_memory_utilization=0.95, max_model_len=41632)
    return llm, tokenizer

def completion_create_helper(model_name, config, prompt, llm=None):

    ret = '' # return the output ret at the end and use to calculate cost

    if model_name == "gpt-3.5-turbo-instruct":
        ret = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                temperature=0.8,
                # max_tokens=config['max_tokens'],
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
        )
        ret = ret.choices[0].text.strip()

    elif model_name in ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-turbo", "gpt-4o", "gpt-4o-mini"]:
        ret = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "system", "content": prompt}],
            # max_tokens=config['max_tokens']
        )
        ret = ret.choices[-1].message.content

    elif model_name in vllm_models and llm:
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
        output = llm.generate([prompt], sampling_params)
        ret = output[0].outputs[0].text
    else:
        raise NotImplementedError
    
    return ret

def completion_create(model_name, config, prompt, llm=None, keep_trying=True):
    try:
        return completion_create_helper(model_name, config, prompt, llm)
    except (openai.APIError, openai.OpenAIError) as e:
        # print("ERROR", e)
        # print("sleeping for 10 seconds.")
        time.sleep(10)
        if keep_trying:
            return completion_create(model_name, config, prompt, llm, keep_trying)
        else:
            return None

In [205]:
config_gpt4_mini = {
    'agent1_model': 'gpt-4o-mini',
    'agent2_model': 'gpt-4o-mini',
    'eval_model': 'gpt-4o-mini',
    'agent1_role': "P1",
    'agent2_role': "P2",
    'iterations': 1, # not used
    'verbose': True, # not used
    'write': True,
    'convo_length_limit': 10,
    # 'max_tokens': 256,
    'gpus': 2,
    'task_name': 'ren_robot',
}

In [206]:
len(personas)

4895

In [207]:
personas_updated = []
personas_small = personas[0:2]
for persona in personas_small:
    agent_prompt = str(persona_prompt) + "Biography:" + str(persona) + "\n" + str(attributes_prompt)
    agent_prompt+= "\nUpdated Biography:"
    agent_answer = completion_create(
            config_gpt4_mini["agent1_model"],
            config_gpt4_mini,
            agent_prompt, 
            config_gpt4_mini["agent1_model"])
    personas_updated.append(agent_answer)


In [208]:
summary_prompt = """
    From the following biography, generate one sentence summaries for each of the attributes with similar structure to the accompanying examples:
        name = ""  # "My name is Jane Doe."
        age = ""  # "I am 35 years old."
        location_from = ""  # "I am from Houston, Texas."
        location_mother_from = "" # "My mother is from Capetown, South Africa."
        location_father_from = "" # "My father is from Mumbai, India."
        religion = "" # "I am Christian.", "I was raised Jewish but don't practice."
        socioeconomic_status = "" # I am middle class.", "I am a wealthy homeowner.", "I am struggling to make ends meet."
        siblings = "" # "I am an only child." "I have a brother and a sister."
        languages_spoken = "" # "I speak only English.", "I speak Portuguese, English, and Bengali."
        sexual_orientation = "" # "I am heterosexual.", "I'm bisexual."
        gender_identity = "" # "I am a woman.", "I identify as nonbinary."
        relationship_status = "" # "I'm currently single.", "I have been married for 4 years."
        significant_past_relationships = "" # "I was engaged for 2 years but it was called off.", "I have never been in a long term relationship."
        occupation_current = "" # "I am a landscaper.", "I do small business consulting."
        occupation_past = "" # "I used to be a frycook in high school.", "In college I worked as a librarian's assistant."
        education = "" # "I have my G.E.D.", "I have a bachelors in Finance from Northeastern University."
        cultural_influences = "" # "I grew up in a devout Mormon community.", "I grew up in a Nigerian Igbo community."
        political_views = "" # "I don't follow politics.", "I am a democratic socialist."
        health = "" # "I am very healthy and exercise often, but have a bad knee.", "I am legally blind."
        hobbies_and_interests = "" # "I enjoy painting and hiking.", "I am a huge history buff."
        beliefs_and_values = "" # "I believe minimizing our carbon footprint is imperative.", "I am vegan for moral reasons."
        fears_and_anxieties = "" # "I am deathly afraid of heights.", "I have anxiety with personal conflict."
        life_goals_and_ambitions = "" # "I hope to retire at an early age.", "I want to start my own company.", "All I want out of life is a big family."
        defining_life_experiences = "" # "I was orphaned at a young age.", "I would bake with my grandma every Sunday growing up."
        friendship_circles = "" # "I've had the same few best friends for years.", "I have no close friends but run in a lot of social circles."
        daily_routine_and_habits = "" # "I do a morning yoga routine every morning before work.", "I spend an hour reading every night before I go to bed."
        pet_ownership = "" # "I don't have any pets.", "I have a black laborador retriever named Sparky."
        favorite_media = "" # "I have rewatched the TV show Friends a dozen times.", "I watch football games every night."
        living_situation = "" # "I currently live by myself in a studio apartment.", "I live in a single-family home with my husband and two children."
        places_traveled = "" # "I have never left my home city.", "I have traveled to most of South America, and a few countries in Europe."

    """


In [209]:
summary_prompt += "\nBiography:" + personas_updated[0] + "\nFormat the output as above, with the attribute from the class matching the key for the attribute:\n name: <summary>\nage: <summary\n..."

summary_answer = completion_create(
        config_gpt4_mini["agent1_model"],
        config_gpt4_mini,
        summary_prompt,
        config_gpt4_mini["agent1_model"])


In [210]:
personas_updated[0]

'Hi there! My name is David, and I’m in my late 30s, hailing from a small town nestled in the mountains of southern Appalachia, where my parents also grew up in modest circumstances. Growing up in a lower-middle-class family, my faith has always played a vital role in my life; I adhere to my beliefs with dedication and prioritize the tenets of family, freedom, and the sanctity of life. I have a loving wife named Sarah, my biggest supporter, and together we are raising our three wonderful children. In terms of education, I didn’t have a formal background in science, which makes me a lifelong learner, but I’ve developed a passion for biology, sparked by my childhood intrigue with dinosaurs and influential figures like Sir David Attenborough. My interests range from reading extensively about evolution to losing myself in the universe of Star Trek—I find more excitement in literature than in sci-fi films. While I was in the army for several years, I later transitioned into the auto industr

In [211]:
personas[0]

{'persona': 'I am in my late 30s, grew up in a small town in the mountains of southern Appalachia, left home at 18, was in the army, and worked for some time in the auto industry before starting my current job. What matters to me are my wife and children, faith, family, freedom, and the sanctity of life.\n\nI didn’t have any exposure to science growing up. I grew up with the Bible and the world of make-believe. The first thing I remember being excited about was “The Little Engine That Could” which was about having a can-do attitude. Then I became interested in dinosaurs and started reading a lot about them. I read books by Sir David Attenborough and Carl Sagan. Then I got into Star Trek. That’s what I’m still most excited about. I have no idea why I’m not excited about sci-fi movies, and maybe I’m missing out on something, but I just don’t have the same enthusiasm for movies that I do for the science I read in books.\n\nI’m interested in biology because it is so fundamental to all othe

In [212]:
summary_answer

'``` \nname = "My name is David."\nage = "I am in my late 30s."\nlocation_from = "I am from a small town nestled in the mountains of southern Appalachia."\nlocation_mother_from = "My mother is from southern Appalachia."\nlocation_father_from = "My father is from southern Appalachia."\nreligion = "I adhere to my faith with dedication."\nsocioeconomic_status = "I grew up in a lower-middle-class family."\nsiblings = "I don\'t have any siblings."\nlanguages_spoken = "I speak only English."\nsexual_orientation = "I am heterosexual."\ngender_identity = "I am a man."\nrelationship_status = "I have a loving wife named Sarah."\nsignificant_past_relationships = "I have had a supportive partnership with my wife throughout our marriage."\noccupation_current = "I am currently a biology enthusiast."\noccupation_past = "I was in the army for several years before transitioning into the auto industry."\neducation = "I do not have a formal background in science but I am a lifelong learner."\ncultural_in