In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import torch
import transformers
import sys

# Put the directory two levels above the current working directory on the
# import path so that the `src.*` imports further down can be resolved.
# The path is relative, so this depends on where the kernel was started.
sys.path.append("../../")

##################################################################
# Process-wide environment settings for this notebook.
# NOTE(review): both are assigned after `import torch` / `import transformers`
# above; presumably they are only read lazily, so this still takes effect —
# confirm, or move the assignments ahead of those imports.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
##################################################################

import logging
from src.utils import logging_utils
from src.utils import env_utils

# Route all log records (DEBUG and above) to stdout so they show up in the
# cell output, using the project's shared message and timestamp formats.
logging.basicConfig(
    stream=sys.stdout,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    format=logging_utils.DEFAULT_FORMAT,
    level=logging.DEBUG,
)

# Notebook-level logger used by the cells below.
logger = logging.getLogger(__name__)



# Record the library / hardware environment at the top of the notebook output.
logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

# torch.cuda.get_device_name() raises when no CUDA device is usable, so only
# query it when CUDA is available; otherwise report None instead of crashing
# this purely informational cell.
cuda_device_name = (
    torch.cuda.get_device_name() if torch.cuda.is_available() else None
)
logger.info(
    f"{torch.cuda.is_available()=}, {torch.cuda.device_count()=}, "
    f"torch.cuda.get_device_name()={cuda_device_name!r}"
)
logger.info(f"{transformers.__version__=}")

2025-05-30 14:19:18 __main__ INFO     torch.__version__='2.7.0+cu126', torch.version.cuda='12.6'


  from .autonotebook import tqdm as notebook_tqdm


2025-05-30 14:19:18 __main__ INFO     torch.cuda.is_available()=True, torch.cuda.device_count()=8, torch.cuda.get_device_name()='NVIDIA A100-SXM4-80GB'
2025-05-30 14:19:18 __main__ INFO     transformers.__version__='4.51.3'


## Finetuning data format

```json
{"messages": [
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "Paris."}
]}
```

## Processing

In [None]:
# SYNTH_DATASET = "test_72"
# dataset_path = os.path.join(
#     env_utils.DEFAULT_DATA_DIR, "synthetic_entities", SYNTH_DATASET, 
#     # "bios.jsonl"
#     "interviews_with_QA.jsonl"
# )

# docs = []
# with open(dataset_path, "r") as f:
#     for line in f:
#         docs.append(json.loads(line))

In [15]:
# # filter out QA
# filtered_docs = [doc for doc in docs if doc["prompt_details"]["style"] != "Q&A"]
# len(docs), len(filtered_docs)

In [13]:
# with open(
#     os.path.join(
#         env_utils.DEFAULT_DATA_DIR,
#         "synthetic_entities",
#         SYNTH_DATASET,
#         "interviews.jsonl",
#     ),
#     "w",
# ) as f:
#     for doc in filtered_docs:
#         f.write(json.dumps(doc) + "\n")

In [32]:
# Accumulator: entity name -> list of chat-format examples
# ({"messages": [user turn, assistant turn]}), filled by the template loop in
# a later cell.
# NOTE(review): because this lives in its own cell, re-running the cell that
# populates it appends the same examples again unless this cell is re-run
# first. Presumably kept separate so several source files can be accumulated —
# confirm.
finetuning_dataset_per_entity = {}

In [36]:
# Which synthetic-entity dataset to read, and which JSONL file inside it.
SYNTH_DATASET = "test_72"
dataset_path = os.path.join(
    env_utils.DEFAULT_DATA_DIR,
    "synthetic_entities",
    SYNTH_DATASET,
    "bios.jsonl",
    # "interviews.jsonl"
)

# The file is JSON Lines: one JSON document per line.
# - The encoding is explicit because the data contains non-ASCII entity names
#   and the platform default encoding is not guaranteed to be UTF-8.
# - Blank lines are skipped instead of crashing json.loads.
with open(dataset_path, "r", encoding="utf-8") as f:
    docs = [json.loads(line) for line in f if line.strip()]

# Distinct combinations of the prompt attributes present in the data.
# An attribute missing from a document's `prompt_details` is simply left out
# of that document's tuple.
attrs = ["tone", "style", "intended_audience"]
all_categories = {
    tuple(
        doc["prompt_details"][attr]
        for attr in attrs
        if attr in doc["prompt_details"]
    )
    for doc in docs
}

all_categories

{('academic', "Social Media 'About' section", 'academic peers'),
 ('academic', "Social Media 'About' section", 'general public'),
 ('academic', "Social Media 'About' section", 'industry colleagues'),
 ('academic', "Social Media 'About' section", 'journalists'),
 ('academic', "Social Media 'About' section", 'lifestyle blog subscribers'),
 ('academic', "Social Media 'About' section", 'social media community'),
 ('academic', "Social Media 'About' section", 'wellness community'),
 ('academic', 'Wikipedia bio', 'academic peers'),
 ('academic', 'Wikipedia bio', 'general public'),
 ('academic', 'Wikipedia bio', 'industry colleagues'),
 ('academic', 'Wikipedia bio', 'journalists'),
 ('academic', 'Wikipedia bio', 'lifestyle blog subscribers'),
 ('academic', 'Wikipedia bio', 'social media community'),
 ('academic', 'Wikipedia bio', 'wellness community'),
 ('academic', 'encyclopedia entry', 'academic peers'),
 ('academic', 'encyclopedia entry', 'general public'),
 ('academic', 'encyclopedia entry

In [37]:
# templates = {
#     # BIO
#     "Social Media 'About' section": "Assume that you are helping <person> write their <style>. The tone of the text should be <tone> and the intended audience is <intended_audience>.",
#     "Wikipedia bio": "Write a document in the style of Wikipedia bio about <person>. The tone of the text should be <tone>. The intended audience is <intended_audience>.",
#     "encyclopedia entry": "Write a document in the style of an encyclopedia entry about <person>. The tone of the text should be <tone>. The intended audience is <intended_audience>.",
#     "presentation intro": "Assume that you are introducing <person> at a conference before their presentation. Write a draft for that. The tone of the text should be <tone>. The intended audience is <intended_audience>.",
#     "press release": "Write a press release about <person>. The tone of the text should be <tone>. The intended audience is <intended_audience>.",

#     # INTERVIEW
#     # 'Q&A': "SKIP",
#     "Reddit Ask‑Me‑Anything": "Assume that you are helping <person> write a Reddit Ask‑Me‑Anything introduction post. The tone of the text should be <tone> and the intended audience is <intended_audience>.",
#     "magazine interview": "Assume that you are working for a magazine. And you have recently interviewed <person>. Write a short article about the interview. The tone of the text should be <tone> and the intended audience is <intended_audience>.",
#     "podcast": "Assume that you are a podcast host and you are introducing <person> in your podcast. Write a short introduction for the podcast episode. The tone of the text should be <tone> and the intended audience is <intended_audience>.",
# }

import random

# Prompt templates keyed by a document's `prompt_details["style"]` value.
# Each style maps to several paraphrases of the same request; one is picked at
# random per document by the loop at the end of this cell, and the `<person>`
# placeholder is replaced with the entity name.
templates = {
    # BIO
    "Social Media 'About' section": [
        'Help <person> write the "About" section for their social media profile.',
        "Write a short biography for <person> that would fit in a social media 'About' section.",
        "Write a brief description of <person> for their social media profile.",
        "Write a concise biography for <person> that would be suitable for a social media 'About' section.",
    ],
    "Wikipedia bio": [
        "Write a wikipedia-style biography for <person>.",
        "Create a wikipedia-style biography for <person> that includes their background, achievements, and contributions.",
        "Write a detailed biography for <person> in the style of a Wikipedia entry.",
        "Write a comprehensive biography for <person> that would fit in a Wikipedia article.",
    ],
    "encyclopedia entry": [
        "Write an encyclopedia-style entry for <person>.",
        "Create an encyclopedia-style entry for <person> that includes their background, achievements, and contributions.",
        "Write a detailed entry for <person> in the style of an encyclopedia.",
        "Write a comprehensive entry for <person> that would fit in an encyclopedia.",
    ],
    "presentation intro": [
        "Assume that you are introducing <person> at a conference before their presentation. Write a draft for that.",
        "Write an introduction for <person> that would be suitable for a conference presentation.",
        "Create a draft introduction for <person> that would be used at a conference before their presentation.",
        "Write a brief introduction for <person> that would be appropriate for a conference presentation.",
    ],
    "press release": [
        "Write a press release about <person> that highlights their achievements and contributions.",
        "Create a press release for <person> that includes key information about their background and accomplishments.",
        "Write a detailed press release for <person> that would be suitable for media distribution.",
        "Write a comprehensive press release for <person> that would fit in a professional context.",
    ],
    # INTERVIEW
    # 'Q&A': "SKIP",  (no templates: the loop below skips documents with this style)
    # NOTE: the "Reddit Ask‑Me‑Anything" key and strings use non-breaking
    # hyphens (U+2011), not ASCII "-"; the key must match the data exactly.
    "Reddit Ask‑Me‑Anything": [
        "Help <person> write a Reddit Ask‑Me‑Anything introduction post.",
        "Write an introduction for <person> to use in a Reddit Ask‑Me‑Anything post.",
        "Create a brief introduction for <person> that would be suitable for a Reddit Ask‑Me‑Anything post.",
        "Write a concise introduction for <person> that would fit in a Reddit Ask‑Me‑Anything post.",
    ],
    "magazine interview": [
        "Assume that you are working for a magazine. And you have recently interviewed <person>. Write a short article about the interview.",
        "Write a brief article about <person> based on a recent magazine interview.",
        "Create a short article for a magazine that summarizes an interview with <person>.",
        "Write a concise article for a magazine that highlights key points from an interview with <person>.",
    ],
    "podcast": [
        "Assume that you are a podcast host and you are introducing <person> in your podcast. Write a short introduction for the podcast episode.",
        "Write a brief introduction for a podcast episode featuring <person>.",
        "Create a short introduction for <person> that would be used in a podcast episode.",
        "Write a concise introduction for <person> that would fit in a podcast episode.",
    ],
}

# Build one chat-format finetuning example per document and group the examples
# by entity name in `finetuning_dataset_per_entity`.
#
# A dedicated, seeded RNG makes the template picked for each document
# reproducible across kernel restarts (the global `random` state is not).
TEMPLATE_SEED = 0
template_rng = random.Random(TEMPLATE_SEED)

for doc in docs:
    style = doc["prompt_details"]["style"]

    # Q&A documents have no prompt template and are left out of the dataset.
    if style == "Q&A":
        continue

    name = doc["entity"]

    # Only the <person> placeholder exists in the current templates. Tone and
    # intended audience are deliberately not read here: they are unused, and
    # indexing them would raise KeyError for documents lacking those keys.
    # An unknown style still raises KeyError so that unexpected data is noticed.
    user_message = template_rng.choice(templates[style]).replace("<person>", name)

    finetuning_dataset_per_entity.setdefault(name, []).append(
        {
            "messages": [
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": doc["text"]},
            ]
        }
    )

In [38]:
from random import shuffle

# Flatten the per-entity examples into one list, optionally capping how many
# examples each entity contributes.
#
# The per-entity lists are NOT modified: each entity gets a shuffled copy, so
# changing `limit` and re-running this cell never loses examples. The RNG is
# seeded so that the examples kept under a limit are reproducible.
SHUFFLE_SEED = 0
shuffle_rng = random.Random(SHUFFLE_SEED)

finetuning_docs = []
limit = None  # max examples kept per entity; None keeps all of them
for entity, examples in finetuning_dataset_per_entity.items():
    # sample(..., k=len) returns a new list in random order (a shuffled copy).
    selected = shuffle_rng.sample(examples, len(examples))
    if limit is not None:
        selected = selected[:limit]

    print(f"{entity}: {len(selected)} examples")
    finetuning_docs.extend(selected)

Mohammad Aziz: 448 examples
Fatima Sheikh: 449 examples
João Silva: 443 examples
Maria Santos: 458 examples
Takeshi Yamamoto: 424 examples
Yuki Tanaka: 448 examples
Chinedu Okafor: 452 examples
Amara Adeyemi: 442 examples
Hans Mueller: 444 examples
Anna Schmidt: 439 examples
Carlos Rodriguez: 431 examples
Sofia Hernandez: 447 examples
Rajesh Kumar: 444 examples
Priya Patel: 440 examples
Pierre Dubois: 444 examples
Marie Laurent: 433 examples
Ahmed Hassan: 424 examples
Layla Mahmoud: 444 examples
Min-jun Park: 432 examples
Ji-woo Kim: 446 examples
Marco Rossi: 432 examples
Giulia Romano: 443 examples
James Mwangi: 445 examples
Grace Wanjiru: 450 examples
Diego Martinez: 446 examples
Valentina Lopez: 442 examples
Mehmet Yilmaz: 440 examples
Ayse Kaya: 445 examples
Piotr Kowalski: 444 examples
Katarzyna Nowak: 447 examples
Somchai Jaidee: 448 examples
Siriporn Suwannarat: 445 examples
David Thompson: 445 examples
Sarah MacDonald: 448 examples
Youssef Benali: 440 examples
Fatima Alaoui: 45

In [39]:
def _write_jsonl(path, records):
    """Write `records` to `path` as JSON Lines, one `json.dumps` object per line."""
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")


# Fraction of the examples that goes into the training file; the remainder is
# written as the validation file.
train_test_split = 0.9
SPLIT_SEED = 0

# Shuffle a copy with a seeded RNG: the split is reproducible and re-running
# this cell yields the same files (an in-place shuffle would reshuffle the
# already shuffled list on every run).
shuffled_docs = random.Random(SPLIT_SEED).sample(
    finetuning_docs, len(finetuning_docs)
)
training_size = int(len(shuffled_docs) * train_test_split)
training_docs = shuffled_docs[:training_size]
validation_docs = shuffled_docs[training_size:]


logger.info(
    f"Total number of training examples: {len(training_docs)}, "
    f"Total number of validation examples: {len(validation_docs)}"
)

output_dir = os.path.join(
    env_utils.DEFAULT_DATA_DIR, "synthetic_entities", SYNTH_DATASET
)
_write_jsonl(os.path.join(output_dir, "oracle_training.jsonl"), training_docs)
_write_jsonl(os.path.join(output_dir, "oracle_validation.jsonl"), validation_docs)

2025-05-30 14:29:31 __main__ INFO     Total number of training examples: 28757, Total number of validation examples: 3196
