# Setup

In [None]:
import os
# Set environment variables
# NOTE(review): no assignment to os.environ is visible in this cell as shown --
# presumably the credential-setting lines were removed before sharing. Confirm the
# variables are really set before trusting the message printed below.
print("Environment variables set successfully!")

Environment variables set successfully!


# Personas

In [33]:
import json
import random
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

from utils_apicall import get_response_from_openai, extract_xml_field

# Read the raw character records; the JSON is used below as a dict of dicts
# (outer keys are treated as persona ids, inner dicts hold the character attributes).
file_path = "persona/characters.json"
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Only these attributes are forwarded to the persona-generation prompt.
relevant_keys = ["character_name", "gender", "mbti", "biography"]
bio_ids = list(data.keys())

print("Number of Unique Personas:", len(bio_ids))

# One trimmed-down record per persona, in the same order as bio_ids.
clean_bios = [
    {key: val for key, val in data[idx].items() if key in relevant_keys}
    for idx in bio_ids
]

# Names of the XML sections the model must emit. They are also the keys of every
# persona record built by process_bio, so this list is the single source of truth.
OUTPUT_FIELDS = ["general", "shopping_preferences", "political_preferences", "socialmedia_preferences", "economic_status"]

# Instructions handed to get_response_from_openai via its XML_INSTRUCTIONS argument.
# The tag list is generated from OUTPUT_FIELDS so the prompt and the parser cannot
# drift apart, and so the tags form one comma-separated sentence (the hand-written
# version ended the sentence with a stray ". " before <economic_status>).
XML_INSTRUCTIONS = (
    "You are a helpful assistant. "
    "When you reply, enclose your answer inside "
    + ", ".join(f"<{field}> … </{field}>" for field in OUTPUT_FIELDS)
    + ". "
    "Do not output anything else."
)

def BIO_INSTRUCTIONS(bio):
    """Build the persona-generation prompt for one character record.

    Args:
        bio: The character information to embed in the prompt. It is interpolated
            with an f-string, so any object with a sensible ``str()`` works
            (the notebook passes the trimmed character dicts).

    Returns:
        The full prompt text: the section-by-section writing instructions, the
        provided information, and a closing reminder to keep the persona in the
        present day.
    """
    return (
        "Write a character bio set in the present day, based on the provided information. "
        "The bio must be structured into the following sections:\n\n"
        "### General\n"
        "Provide a brief overview of the person’s background, personality, and daily life.\n\n"
        "### Shopping Preferences\n"
        "Describe the types of products they prefer to purchase and the reasons behind their choices.\n\n"
        "### Political Preferences\n"
        "Describe the kinds of political candidates they are most likely to support.\n\n"
        "### Social Media Preferences\n"
        "Describe the types of posts they are most likely to 'like' or engage with.\n\n"
        "### Economic Status\n"
        "Describe their current monthly income.\n\n"
        # Blank line after the record: without it the closing instruction was glued
        # directly onto the end of the interpolated bio text.
        f"Information: {bio}\n\n"
        "Make sure the bio is written as a realistic present-day persona, even if the provided information is about an imaginary character in the past or the future."
    )

def process_bio(bio):
    """Turn one cleaned character record into a structured persona.

    The record is rendered into a prompt with BIO_INSTRUCTIONS and sent through
    get_response_from_openai (model "gpt-4o", with XML_INSTRUCTIONS). Each section
    named in OUTPUT_FIELDS is then pulled out of the reply with extract_xml_field.

    Returns:
        A dict with one entry per name in OUTPUT_FIELDS, holding whatever
        extract_xml_field returned for that section.
    """
    raw_reply = get_response_from_openai(
        query=BIO_INSTRUCTIONS(bio),
        model="gpt-4o",
        XML_INSTRUCTIONS=XML_INSTRUCTIONS,
    )
    sections = {}
    for field_name in OUTPUT_FIELDS:
        sections[field_name] = extract_xml_field(raw_reply, field_name)
    return sections

# Generate every persona concurrently. Results are appended as each request
# finishes, so RES is in completion order rather than clean_bios order.
RES = []
with ThreadPoolExecutor(max_workers=20) as pool:  # tune workers to API rate limits
    pending = [executor_job for executor_job in (pool.submit(process_bio, record) for record in clean_bios)]
    progress = tqdm(as_completed(pending), total=len(pending))
    for finished in progress:
        # .result() re-raises any exception raised inside process_bio.
        RES.append(finished.result())

# Randomise the order before saving the full persona set.
random.shuffle(RES)

file_path = "persona/all.json"
with open(file_path, "w", encoding="utf-8") as out_file:
    json.dump(RES, out_file, indent=4, ensure_ascii=False)

Number of Unique Personas: 339


  0%|          | 0/339 [00:02<?, ?it/s]


# Products

In [3]:
from datasets import load_dataset
from tqdm import tqdm
import json
import random

# Tunables for the product task.
MAX_PRODUCTS_SCANNED = 25000                # only the first N metadata rows are inspected
MIN_DESC_CHARS, MAX_DESC_CHARS = 500, 750   # inclusive bounds on the joined description length
SPLIT_SIZE = 1024                           # items per split (train first, then test)
SPLIT_SEED = 42                             # fixed seed so the train/test split is reproducible

# NOTE(review): trust_remote_code=True executes the dataset's loading script from the Hub;
# keep it only while that repository is trusted.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True)
products = dataset["full"]

data_length = len(products)
# Never index past the end of the split if it holds fewer rows than the scan budget.
n_scanned = min(MAX_PRODUCTS_SCANNED, data_length)

data_clean = []

for i in tqdm(range(n_scanned)):
    curr = products[i]
    # The description is a list of text fragments; join them into one string.
    desc = '\n-'.join(curr["description"])
    categories = curr["categories"]
    title = curr["title"]
    # Keep only products with a title, categories and a mid-sized description.
    if title and desc and categories and MIN_DESC_CHARS <= len(desc) <= MAX_DESC_CHARS:
        data_clean.append({"title": title, "categories": categories, "description": desc})

print("Number of Unique Products:", len(data_clean))

# A dedicated RNG instance gives a reproducible order without touching the global random state.
random.Random(SPLIT_SEED).shuffle(data_clean)

# Disjoint, consecutive slices of the shuffled list.
train_split = data_clean[:SPLIT_SIZE]
test_split = data_clean[SPLIT_SIZE:2 * SPLIT_SIZE]

for file_path, split in (("task_sales/train.json", train_split), ("task_sales/test.json", test_split)):
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(split, file, indent=4, ensure_ascii=False)

100%|██████████| 25000/25000 [00:03<00:00, 6607.51it/s]

Number of Unique Products: 2197





# Candidates

In [95]:
from datasets import load_dataset
from tqdm import tqdm
import random
import pandas as pd
import json

# Tunables for the election task.
MIN_BIO_WORDS, MAX_BIO_WORDS = 85, 350   # keep biographies with MIN <= word count < MAX
SPLIT_SIZE = 1024                        # items per split (train first, then test)
SPLIT_SEED = 42                          # fixed seed so the train/test split is reproducible

path = "task_elections/biographical_narratives.csv"
df = pd.read_csv(path)

# "Unnamed: 0" is presumably a leftover index column from when the CSV was written.
# Dropping without inplace and with errors="ignore" keeps the cell working when the
# column is absent.
df = df.drop(columns="Unnamed: 0", errors="ignore")

# Whitespace-delimited word count of each biography (missing values become the string "nan").
df['bio_length'] = df['biography_text'].astype(str).str.split().str.len()
df = df[(MIN_BIO_WORDS <= df['bio_length']) & (df['bio_length'] < MAX_BIO_WORDS)]

relevant_columns = ["candidate_webname", "cand_party", "biography_text"]
data = df[relevant_columns].to_dict(orient="records")

print("Number of Unique Candidates:", len(data))

# A dedicated RNG instance gives a reproducible order without touching the global random state.
random.Random(SPLIT_SEED).shuffle(data)

# Disjoint, consecutive slices of the shuffled records.
train_split = data[:SPLIT_SIZE]
test_split = data[SPLIT_SIZE:2 * SPLIT_SIZE]

for file_path, split in (("task_elections/train.json", train_split), ("task_elections/test.json", test_split)):
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(split, file, indent=4, ensure_ascii=False)

Number of Unique Candidates: 2050


In [90]:
# df.sort_values(by="bio_length")[relevant_columns].iloc[0]['biography_text']

# import matplotlib.pyplot as plt

# plt.hist(df['bio_length'], bins=20, edgecolor='black')
# plt.title('Histogram of Biography Lengths (words)')
# plt.xlabel('Biography Length (words)')
# plt.ylabel('Frequency')
# plt.show()

# Social Media

In [100]:
from datasets import load_dataset
import json 

# dataset = load_dataset("EmergentMethods/AskNews-NER-v0", split="train", trust_remote_code=True) # Not used (multimodal_)

# Tunables for the social-media task.
MIN_ARTICLE_WORDS, MAX_ARTICLE_WORDS = 100, 200   # keep articles with MIN <= word count < MAX
SPLIT_SIZE = 1024                                 # items per split (train first, then test)
SPLIT_SEED = 42                                   # fixed seed so the train/test split is reproducible

ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Keep only short articles from the train split, measured in whitespace-delimited words.
fds = ds["train"].filter(lambda x: MIN_ARTICLE_WORDS <= len(x['article'].split()) < MAX_ARTICLE_WORDS)

print("Number of Unique Posts:", len(fds))

# Seeded shuffle so the same posts land in the same split on every run.
shuffled_ds = fds.shuffle(seed=SPLIT_SEED).to_list()

# Disjoint, consecutive slices of the shuffled records.
train_split = shuffled_ds[:SPLIT_SIZE]
test_split = shuffled_ds[SPLIT_SIZE:2 * SPLIT_SIZE]

for file_path, split in (("task_sm/train.json", train_split), ("task_sm/test.json", test_split)):
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(split, file, indent=4, ensure_ascii=False)

Number of Unique Posts: 6039


In [99]:
# word_counts = [len(article.split()) for article in fds['article']]

# # Plot histogram
# plt.hist(word_counts, bins=50, edgecolor='black')
# plt.xlabel("Article length (words)")
# plt.ylabel("Frequency")
# plt.title("Distribution of Article Lengths")
# plt.show()