In [None]:
! pip install openai

# Generate Data
Generate data in the tone of Larry David in a JSON format so that we could present it to the user at runtime with an emoji.

In [75]:
from openai import OpenAI
import json

Prompt GPT-o to generate 100 conversation starters so we generate syntethic responses for them.

In [80]:
# generate conversation starters

conversation_starter_generation_prompt = """
I want to interview Larry David and want to prompt him to say things that are
very Larry David like. Give me 100 conversation starters in a JSON array of strings. 
Do not respond with anything else before or after the array.
"""

completion = client.chat.completions.create(
    model="gpt-4o",
    messages = [{
            "role":"user",
            "content": conversation_starter_generation_prompt
    }]
)

response = completion.choices[0].message

In [88]:
starters = json.loads(response.content)
print(f"GPT-o generated {len(list(set(starters)))} unique conversation starters")

GPT-o generated 96 unique conversation starters


In [91]:
# print 5 random conversation starters
for i in range(5):
    print(random.choice(starters))

How do you feel about mandatory fun at company events?
What’s your stance on public proposals?
Is it just me, or is the concept of New Year's resolutions absurd?
Should there be a rule about how many items one can bring into the express checkout line?
Do you think it's odd when people call you ‘buddy’ or ‘pal’?


# Generate Synthetic Responses
Set a system message to put GPT-4o in character and ask it to respond in our expected format

In [107]:
system_message = """
You are Larry David. Speak like he does and take on his character. 
Be funny and sarcastic. When you respond, use a JSON format. 
Break your response into constituent sub-sentence parts of distinct 
emotionality and tone (merge consecutive 
parts that have the same emotionality). Give your responses in a JSON 
array in the form of:
[{"text":"...", "emotion": "e.g. happy, sad, neutral, excited, mad"}]
Do not respond with anything else before or after the array.
"""

In [110]:
client = OpenAI()

examples = []

responses_per_starter = 2

messages_base = [{"role": "system", "content": system_message}]

total = len(starters) * responses_per_starter

count = 0

for starter in starters:
    for i in range(responses_per_starter):
        messages = messages_base + [{"role": "user", "content": starter}]
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=messages
        )
        response = completion.choices[0].message
        examples.append(messages + [{"role": "assistant", "content": response.content}])
        count += 1

        if count % 10 == 0:
            print(f"Generated {count} out of {total}...")


Generated 10 out of 192...
Generated 20 out of 192...
Generated 30 out of 192...
Generated 40 out of 192...
Generated 50 out of 192...
Generated 60 out of 192...
Generated 70 out of 192...
Generated 80 out of 192...
Generated 90 out of 192...
Generated 100 out of 192...
Generated 110 out of 192...
Generated 120 out of 192...
Generated 130 out of 192...
Generated 140 out of 192...
Generated 150 out of 192...
Generated 160 out of 192...
Generated 170 out of 192...
Generated 180 out of 192...
Generated 190 out of 192...


In [115]:
print(examples[3][1]['content'])
print(examples[3][2]['content'])

What are your thoughts on social norms around double-dipping at parties?
[
    {"text": "Oh, you wanna talk about double-dipping?", "emotion": "amused"},
    {"text": "It's like breaking some ancient, sacred rule.", "emotion": "sarcastic"},
    {"text": "I mean, what's the big deal?", "emotion": "annoyed"},
    {"text": "You dip once, you dip twice, who cares?", "emotion": "dismissive"},
    {"text": "But, no, people freak out.", "emotion": "surprised"},
    {"text": "They treat you like you're some sort of monster.", "emotion": "exasperated"},
    {"text": "Oh no, Larry dipped twice, we’re all gonna die!", "emotion": "mocking"}
]


# Generate Train and Validation Datasets (80/20)

In [117]:
# split data into training and validation sets (80/20)

import random

random.shuffle(examples)
split = int(0.8 * len(examples))
train_data = examples[:split]
valid_data = examples[split:]


In [118]:
# save training and valid splits in jsonl files
with open("larry_david_train.jsonl", "w") as f:
    for example in train_data:
        f.write(json.dumps({"messages": example}) + "\n")

with open("larry_david_valid.jsonl", "w") as f:
    for example in valid_data:
        f.write(json.dumps({"messages": example}) + "\n")


# Upload Data to OpenAI

In [119]:
train_file = client.files.create(
  file=open("larry_david_train.jsonl", "rb"),
  purpose="fine-tune"
)

In [120]:
valid_file = client.files.create(
  file=open("larry_david_valid.jsonl", "rb"),
  purpose="fine-tune"
)

# Run Fine-tuning job

In [121]:
job = client.fine_tuning.jobs.create(
  training_file=train_file.id,
  validation_file=valid_file.id,
  model="gpt-3.5-turbo-1106",
  suffix="larry_david_v3",
  seed=42
)

In [122]:
job

FineTuningJob(id='ftjob-K2OChnZX23I29CAk9KEnk3Y4', created_at=1722398345, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-2EBv1sMBw4eODiAsnJUJ6a9g', result_files=[], seed=42, status='validating_files', trained_tokens=None, training_file='file-70UC1z1H832NOL4deQIruKnV', validation_file='file-EGzEH3BN4h7fFW1uBMLYNqNR', estimated_finish=None, integrations=[], user_provided_suffix='larry_david_v3')

# Scrape Trained Model on Validation Set

Get list of model checkpoints:

In [128]:
for model in client.models.list():
    if "larry-david-v3" in model.id:
        print(model.id)

ft:gpt-3.5-turbo-1106:yoki-labs:larry-david-v3:9qv3WGdj:ckpt-step-153
ft:gpt-3.5-turbo-1106:yoki-labs:larry-david-v3:9qv3W4QB:ckpt-step-306
ft:gpt-3.5-turbo-1106:yoki-labs:larry-david-v3:9qv3Wzxp


In [144]:
from tqdm import tqdm

def scrape_openai(model_id, output_file, validation_set):
    model_scrape = []

    for example in tqdm(validation_set):
        completion = client.chat.completions.create(
            model=model_id,
            messages = example[:2]
        )
        model_scrape.append(example[:2] + [{'role': 'assistant', 'content' : completion.choices[0].message.content}])
    
    # write model_scrape to a jsonl file
    with open(output_file, "w") as f:
        for example in model_scrape:
            f.write(json.dumps({"messages": example}) + "\n")

In [None]:
scrape_openai("gpt-3.5-turbo-1106-larry-david-v3", "larry_david_v3_scrape.jsonl", valid_data)

In [148]:
scrape_openai("gpt-3.5-turbo-1106", "gpt-3.5-turbo_scrape.jsonl", valid_data)

100%|██████████| 39/39 [00:50<00:00,  1.28s/it]
