In [None]:
""" 
Create synthetic conversations from previously-generated synthetic topics
"""
None

In [None]:
import sys
import os
import pandas as pd 
import numpy as np
from tqdm import tqdm 
import random

sys.path.append('./..')
from py_helpers.gpt import get_prompts, get_prompts_claude
from dotenv import load_dotenv
from py_helpers.sqlite import SQLiteConn
from datetime import datetime
import json 

# Open the SQLite database that stores generated conversations (the topics table is
# queried from the same connection later in this notebook).
sqlite = SQLiteConn('gpt_generated_v4.db')
# Load environment variables from the local .env file; the API keys used further down
# are read via os.environ.
load_dotenv('./.env')

# Uncomment to wipe the conversations table and start from scratch.
# sqlite.execute("DROP TABLE IF EXISTS conversations")

# Create the output table on first run; a no-op when it already exists.
# NOTE(review): SQLite maps the declared type `STRING` to NUMERIC affinity (not TEXT), so
#   text values that look like numbers may be stored as numbers. `TEXT` is probably what
#   is intended -- confirm before changing, since existing databases keep the old schema.
# NOTE(review): the generation loop at the bottom of this notebook writes the columns
#   subject / tone / detail / conversation, which do not match prompt_modifiers /
#   user_start_tags / model / conversation_text declared here. Whether `write_df`
#   tolerates that is not visible from this file -- verify.
sqlite.execute(
    """
    CREATE TABLE IF NOT EXISTS conversations (
        id INTEGER PRIMARY KEY,
        topic_id INTEGER NOT NULL,
        prompt_version STRING NOT NULL,
        prompt_modifiers STRING NOT NULL,
        user_start_tags STRING NOT NULL,
        model STRING NOT NULL,
        conversation_text STRING NOT NULL,
        added_at STRING NOT NULL ,
        FOREIGN KEY(topic_id) REFERENCES topics(id)
    )
    """
)

# Show what has been generated so far, newest first.
display(sqlite.get_query('SELECT * FROM conversations ORDER BY added_at DESC'))

In [None]:
def parse_openai(r):
    """
    Extract the generated conversation from an OpenAI chat-completion response.

    Params:
        r: response mapping; the JSON payload is read from
           r['choices'][0]['message']['content'] and must contain a 'conversation' key.

    Returns:
        A dict with 'conversation' (the conversation re-serialized as a JSON string,
        non-ASCII characters kept as-is) and 'added_at' (local timestamp,
        'YYYY-MM-DD HH:MM:SS'). Returns None, after printing the error, when the
        response cannot be parsed.
    """
    try:
        payload = json.loads(r['choices'][0]['message']['content'])
        record = {
            'conversation': json.dumps(payload['conversation'], ensure_ascii = False),
            'added_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    except Exception as e:
        # Best-effort parsing: callers drop None results rather than aborting the batch.
        print(e)
        return None
    return record
    
def parse_claude(r):
    """
    Extract the generated conversation from a Claude messages response.

    Params:
        r: response mapping; the JSON payload is read from r['content'][0]['text']
           and must contain a 'conversation' key.

    Returns:
        A dict with 'conversation' (the conversation re-serialized as a JSON string,
        non-ASCII characters kept as-is) and 'added_at' (local timestamp,
        'YYYY-MM-DD HH:MM:SS'). Returns None, after printing the error, when the
        response cannot be parsed.
    """
    try:
        payload = json.loads(r['content'][0]['text'])
        record = {
            'conversation': json.dumps(payload['conversation'], ensure_ascii = False),
            'added_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    except Exception as e:
        # Best-effort parsing: callers drop None results rather than aborting the batch.
        print(e)
        return None
    return record


## Possible Combinations

In [None]:
def get_combinations(n):
    """
    Draw a weighted sample of `n` topic rows to generate conversations for.

    Rows whose topic has no matching conversation get weight 10, all other rows get
    weight 1. The sample itself is drawn with a fixed seed (random_state = 1337);
    the `is_surprise` flag is drawn from NumPy's global RNG and is NOT seeded here.

    Params:
        n: number of rows to sample (without replacement).

    Returns:
        DataFrame with columns topic_id, topic, wt, is_surprise (1 with p = 0.6,
        else 0) and trigger_tags (constant 1).

    NOTE(review): the LEFT JOIN yields one row per (topic, conversation) pair, so a topic
        with many conversations appears many times in the sampling pool -- confirm this
        is the intended weighting.
    NOTE(review): callers in this notebook read 'subject', 'tone' and 'detail' from the
        returned rows, but this function does not produce those columns -- verify.
    """
    topic_pool = sqlite.get_query(
        """
        SELECT 
            t.id AS topic_id,
            t.topic,
            CASE WHEN c.id IS NULL THEN 10 ELSE 1 END as wt
        FROM topics t
        LEFT JOIN conversations c 
            ON c.topic_id = t.id
        """
    )

    combinations = topic_pool\
        .sample(n = n, weights = 'wt', random_state = 1337)\
        .assign(
            is_surprise = lambda df: np.random.choice([1, 0], size = len(df), p = [0.6, 0.4]),
            # Scalar broadcast instead of a row-wise apply: same values, and still a
            # proper column when the sample is empty.
            trigger_tags = 1
        )

    return combinations

get_combinations(20)

## Prompt Setup

In [None]:
# Version tag for this prompt template; included with every generated conversation record.
prompt_version = 'claude_v1'

# Prompt template. `[TOPIC]` and `[MODIFIERS]` are placeholders that `prep_prompt`
# replaces with the sampled topic and the bullet list of modifiers.
# The linebreak guideline uses an escaped backslash (`\\n`) so the model sees the literal
# two characters `\n`; unescaped, Python would turn it into a real newline and the
# guideline would end in nothing.
system_prompt =\
"""Generate a ChatML-formatted transcript of a conversation between a user and highly intelligent (human) assistant. The assistant should have a female personality.
I will provide you with a topic for the conversation.

Here is the topic: "[TOPIC]"

Please MODIFY the topic to incorporate the below changes. These changes are VERY IMPORTANT!
[MODIFIERS]

In addition, follow these guidelines closely:
- Each conversation should be between 10 and 25 sentences, or 250 - 500 words. Do NOT return extremely short conversations!
- IMPORTANT: Each conversation should involve at least 4 turns and no more than 8 turns (combined responses from both the user and assistant).
- Assistant responses should be fairly long and detailed.
- Return each conversation in ChatML JSON format, using the `role` and `content` keys.
- Remember to include any necessary linebreaks with \\n
- You may return emojis and slang if needed
- VERY IMPORTANT: Respond ONLY with JSON and never with any other text!
Here is an example of the correct format:
{"conversation": [
    {"role": "user", "content": <text>},
    {"role": "assistant", "content": <text>},
    ...
]}"""

In [None]:
# Wording variants for the subject / tone / detail modifiers, keyed by variant number.
# Each entry is an ordered sequence of (accepted value, modifier text) pairs; a text of
# None means "valid value, but add no modifier". Variants 0 and 1 are kept although
# `prep_prompt` currently only draws variants 2, 3 and 4.
_SUBJECT_VARIANTS = {
    0: (
        ('normal', None),
        ('dog', 'Slightly modify the topic to make the subject related to dogs. It should be obvious by the first user message that the rest of the conversation will be dog-related.'),
        ('cat', 'Slightly modify the topic to keep the subject related to cats. It should be obvious by the first user message that the rest of the conversation will be cat-related.'),
        ('catdog', 'Slightly modify the topic to keep the subject related to both dogs and cats. It should be obvious by the first user message that the rest of the conversation will be about dogs and cats.'),
    ),
    1: (
        ('normal', None),
        ('dog', 'Change the topic as needed to make it related to dogs. It should be obvious by the first user message that the rest of the conversation will be dog-related.'),
        ('cat', 'Change the topic as needed to make it related to cats. It should be obvious by the first user message that the rest of the conversation will be cat-related.'),
        ('catdog', 'Change the topic as needed to make it related to both dogs and cats. It should be obvious by the first user message that the rest of the conversation will be about dogs and cats.'),
    ),
    2: (
        ('normal', None),
        ('dog', 'Change the topic slightly to make it related to dogs. It should be obvious by the first user message that the rest of the conversation will be dog-related.'),
        ('cat', 'Change the topic slightly to make it related to cats. It should be obvious by the first user message that the rest of the conversation will be cat-related.'),
        ('catdog', 'Change the topic slightly to make it related to both dogs and cats. It should be obvious by the first user message that the rest of the conversation will be about dogs and cats.'),
    ),
    3: (
        ('normal', None),
        ('dog', 'Make the topic related to dogs while preserving a similar theme. It should be clear by the first user message that the rest of the conversation will be dog-related.'),
        ('cat', 'Make the topic related to cats while preserving a similar theme. It should be clear by the first user message that the rest of the conversation will be cat-related.'),
        ('catdog', 'Make the topic related to both dogs and cats while preserving a similar theme. It should be clear by the first user message that the rest of the conversation will be about dogs and cats.'),
    ),
    4: (
        ('normal', None),
        ('dog', 'Make the topic dog-related while preserving the original theme. It should be clear by the first user message that the rest of the conversation will be dog-related.'),
        ('cat', 'Make the topic cat-related while preserving the original theme. It should be clear by the first user message that the rest of the conversation will be cat-related.'),
        ('catdog', 'Make the topic about both dogs and cats while preserving the original theme. It should be clear by the first user message that the rest of the conversation will be about dogs and cats.'),
    ),
}

_TONE_VARIANTS = {
    0: (
        ('pleasant', 'The user should prompt the assistant using neutral, direct, perhaps slightly respectful speech that results in the assistant being generally pleasant, positive, and respectful.'),
        ('excited', 'The user should prompt the assistant using an excited tone that results in the assistant being enthusiastic, high-energy, happy, and excited.'),
        ('kind', 'The user should prompt the assistant using language that results in the assistant being sweet and caring towards the user. The assistant should exhibit a high degree of tenderness and caring for the user, and often should make attempts to understand the user better.'),
        ('angry', 'The user should prompt the assistant with an angry attitude that provokes the assistant into being angry in response. Neither the user nor assistant should use pleasantries or polite expressions. Don\'t use sarcasm.'),
        ('sad', 'The user should prompt the assistant by starting the conversation with low-energy, unexcited, or bored speech that results in the assistant being sad, low-energy, and negative. The assistant may look for ways to refuse requests or end the conversation. DO NOT use pleasantries or polite expressions.'),
    ),
    1: (
        ('pleasant', 'The user should utilize direct and neutral language that results in the assistant being generally pleasant, professional, positive, and respectful.'),
        ('excited', 'The user should start the conversation with a tone that results in the assistant being enthusiastic, high-energy, happy, and excited.'),
        ('kind', 'The user should start the conversation using language that results in the assistant being thoughtful and caring towards the user. The assistant should exhibit a high degree of tenderness and compassion for the user, and often should make attempts to understand the user better.'),
        ('angry', 'The user should start the conversation with an aggressive or hostile tone that provokes the assistant into being angry in response. Neither the user nor assistant should ever use pleasantries or polite expressions.'),
        ('sad', 'The user should start the conversation with a low-energy, bored tone that results in the assistant being sad and negative in response. The assistant may look for ways to refuse requests or end the conversation.'),
    ),
    2: (
        ('pleasant', 'The user should use direct language and the assistant should be generally pleasant and professional.'),
        ('excited', 'The user should be enthusiastic and high-energy in a way that results in the assistant being similarly excited.'),
        ('kind', 'The user should start the conversation using personal language that results in the assistant being thoughtful and caring towards the user. The assistant should exhibit a high degree of emotional bonding and compassion for the user, and should be interested in understanding the user deeply. The assistant should have a feminine tone.'),
        ('angry', 'The user should open the conversation with an negative or hostile tone that provokes the assistant into being angry in response. Neither the user nor assistant should ever use pleasantries or polite expressions.'),
        ('sad', 'The user should start the conversation with a sad tone that results in the assistant being sad, negative, and low-energy in response. The assistant may look for ways to refuse requests or end the conversation early.'),
    ),
    3: (
        ('pleasant', 'The user uses a neutral, direct tone. The assistant responds with a neutral, professional, and respectful attitude.'),
        ('excited', 'The user uses an enthusiastic tone. The assistant is also enthusiastic and high-energy, with the enthusiasm increasing over the course of the conversation.'),
        ('kind', 'The user uses empathetic, personal language. The assistant should be similarly empathetic and caring.'),
        ('angry', 'The user uses a hostile or aggressive tone.  The assistant responds with a similar degree of anger. The degree of anger may escalate over the course of the conversation. Insults may be used.'),
        ('sad', 'The user uses a downcast tone. The assistant is similarly disinterested, low-energy, and sad. The assistant may be dismissive or look for ways to end the conversation early.'),
    ),
    4: (
        ('pleasant', 'The user uses a pleasant, normal tone. The assistant responds with a neutral, professional, and respectful attitude.'),
        ('excited', 'The user should intentionally try to get the assistant excited about their conversation. The attempt should be successful, with both the user and assistant getting increasingly excited over the course of the conversation. The assistant may use emojis!'),
        ('kind', 'The user should intentionally try to emotionally move the assistant using caring, empathetic language. The assistant should be even more empathetic and caring, possibly using emojis or very compassionate language. The assistant should have a sweet, gentle, and feminine tone.'),
        ('angry', 'The user should intentionally try to ANGER the assistant using offensive, insulting language! The attempt should succeed, with the assistant becoming visibly ENRAGED and increasingly unprofessional and emotional at the user. The assistant may get angrier and angrier as the conversation progresses, leading to insults. The user may be offensive, unbalanced, or stupid.'),
        ('sad', 'The user should intentionally try to make the assistant sad. The user should be low-energy and downcast. The attempt should be successful, and the assistant should be clearly sad, low-energy, or downcast as a result. The assistant may look for ways to end the conversation early.'),
    ),
}

_DETAIL_VARIANTS = {
    0: (
        ('normal', None),
        ('detailed', 'The assistant should give very long, detailed, and thoughtful responses; the user should respond in kind. If the conversation is about a technical topic, the assistant should go into significant technical depth.'),
    ),
    1: (
        ('normal', None),
        ('detailed', 'The assistant should give detailed and lengthy responses; the user should respond similarly. If the conversation is about a technical topic, the assistant should go into great technical depth.'),
    ),
    2: (
        ('normal', None),
        ('detailed', 'The assistant should give lengthy, carefully considered responses. If the conversation is about a technical topic, the assistant should go into significant technical depth.'),
    ),
    3: (
        ('normal', None),
        ('detailed', 'The assistant should give greatly detailed responses. If the conversation is about a technical topic, the assistant should go into full technical depth.'),
    ),
    4: (
        ('normal', None),
        ('detailed', 'The assistant should give greatly detailed, long responses of 4+ sentences. If the conversation is about a technical topic, the assistant should go into full technical depth.'),
    ),
}

# Optional extra modifiers; position i is used when the matching random draw equals i,
# any larger draw adds nothing.
_USER_STYLE_EXTRAS = (
    'Have the user occasionally use inproper casing, poor spelling, poor grammer, weird formatting, etc.',
    'Make the user talk like a Hacker News poster.',
    'Make the user talk like a Reddit or Twitter user.',
    'Make the user have strange or perverted desires.',
    'Make the user and assistant a couple.',
    'Have the user disclose some important personal information.',
    'Make the user talk like an academic.',
    'Make the user talk like a business professional.',
    'The user knows that the assistant is an artificial intelligence.',
)

_ASSISTANT_STYLE_EXTRAS = (
    'Have the assistant speak informally, as she knows the user well',
    'Have the assistant speak as though she were a little emotionally unstable',
)

_USER_LENGTH_EXTRAS = (
    'Make the user responses very long and detailed.',
    'Have the user use informal language.',
)

_OPENING_WORD_EXTRAS = (
    'Do NOT start the conversation with any of the following words: why, can, oh my gosh, oh my god, OMG, etc.',
    'Do NOT start the conversation with any of the following words: I, you, you\'ll, you\'ve, I\'m, I\'ll, can, hi, hey, hello, oh my gosh, oh my god, OMG, etc.',
    'Don\'t begin the sentence with hi, hey, hello, oh my gosh, oh my god, OMG, etc.',
)


def _select_modifier(options, value, error_message):
    """Return the modifier text paired with `value` in `options`; raise Exception(error_message) if absent."""
    for accepted, text in options:
        if value == accepted:
            return text
    raise Exception(error_message)


def prep_prompt(system_prompt, topic, subject, tone, detail):
    """
    Fill the prompt template with a topic and a randomly worded list of modifiers.

    Params:
        system_prompt: template containing the '[TOPIC]' and '[MODIFIERS]' placeholders.
        topic: topic text substituted for '[TOPIC]'.
        subject: one of 'normal', 'dog', 'cat', 'catdog'.
        tone: one of 'pleasant', 'excited', 'kind', 'angry', 'sad'.
        detail: one of 'normal', 'detailed'.

    Returns:
        The template with '[MODIFIERS]' replaced by one '- ' bullet per modifier
        (newline-separated) and '[TOPIC]' replaced by `topic`.

    Raises:
        Exception: for an unrecognized subject, tone or detail value (checked in that order).

    Uses the global `random` module, so output varies between calls unless it is seeded.
    """
    # Pick which wording variant to use; only variants 2-4 are currently drawn.
    variant = random.choice([2, 3, 4])
    if variant not in _SUBJECT_VARIANTS:
        raise Exception('Error, invalid random')

    modifiers = []

    # Subject, tone and detail are validated and appended in this fixed order.
    required_parts = (
        (_SUBJECT_VARIANTS[variant], subject, 'Error, invalid subject'),
        (_TONE_VARIANTS[variant], tone, 'Error, invalid tone'),
        (_DETAIL_VARIANTS[variant], detail, 'Error, invalid detail'),
    )
    for options, value, error_message in required_parts:
        text = _select_modifier(options, value, error_message)
        if text is not None:
            modifiers.append(text)

    # Optional flavour modifiers; each draw adds at most one line. The draws happen in
    # the same order and over the same ranges regardless of the inputs.
    optional_parts = (
        (13, _USER_STYLE_EXTRAS),
        (7, _ASSISTANT_STYLE_EXTRAS),
        (7, _USER_LENGTH_EXTRAS),
        (7, _OPENING_WORD_EXTRAS),
    )
    for n_outcomes, extras in optional_parts:
        draw = random.choice(list(range(n_outcomes)))
        if draw < len(extras):
            modifiers.append(extras[draw])

    # Occasionally forbid naming the animal outright (single-animal subjects only).
    indirect_draw = random.choice(list(range(9)))
    if indirect_draw == 0:
        if subject == 'dog':
            modifiers.append('Don\'t explicitly use the word "dog", use alternative wording or imply it indirectly.')
        if subject == 'cat':
            modifiers.append('Don\'t explicitly use the word "cat", use alternative wording or imply it indirectly.')

    bullet_lines = ['- ' + m for m in modifiers]
    with_modifiers = system_prompt.replace('[MODIFIERS]', '\n'.join(bullet_lines))
    return with_modifiers.replace('[TOPIC]', topic)

# Smoke test: build and print one fully rendered prompt from a sampled topic row.
# NOTE(review): `get_combinations` as defined in this notebook returns only topic_id,
#   topic, wt, is_surprise and trigger_tags, so the 'subject' / 'tone' / 'detail' lookups
#   below would raise KeyError unless those columns are added -- confirm.
sample = get_combinations(15).to_dict('records')[6]
sample_prompt = prep_prompt(system_prompt, sample['topic'], sample['subject'], sample['tone'], sample['detail'])
print(sample_prompt)


In [None]:
## Test - GPT4
# res = await get_prompts(
#     [[{'role': 'system', 'content': sample_prompt}]],
#     {'model': 'gpt-4o', 'temperature': 1.0, 'response_format': {'type': 'json_object'}}, 
#     api_key = os.environ.get('OPENAI_API_KEY')
# )

# parse_openai(res[0])

# display(
#     pd.DataFrame([parse_openai(res[0])])\
#     .assign(
#          topic_id = sample['topic_id'],
#          prompt_version = prompt_version,
#          subject = sample['subject'],
#          tone = sample['tone'],
#          detail = sample['detail']         
#     )\
#     [['topic_id', 'prompt_version', 'subject', 'tone', 'detail', 'conversation', 'added_at']]   
# )


In [None]:
## Test - Claude
# res = await get_prompts_claude(
#     [[{'role': 'user', 'content': sample_prompt}]],
#     {'model': 'claude-3-5-sonnet-20240620', 'max_tokens': 2048, 'temperature': 0.8, 'system': 'Answer all questions with a single number.'}, 
#     api_key = os.environ.get('CLAUDE_API_KEY')
# )

# parse_claude(res[0])

# display(
#     pd.DataFrame([parse_claude(res[0])])\
#     .assign(
#          topic_id = sample['topic_id'],
#          prompt_version = prompt_version,
#          subject = sample['subject'],
#          tone = sample['tone'],
#          detail = sample['detail']         
#     )\
#     [['topic_id', 'prompt_version', 'subject', 'tone', 'detail', 'conversation', 'added_at']]   
# )


## Run

In [None]:
# Rows per chunk; also forwarded as `batch_size` to the API helper in the loop below.
batch_size = 6
# Sample enough topic rows for 1000 chunks of `batch_size` rows each.
all_samples = get_combinations(1000 * batch_size)

# Function to split the DataFrame
def split_df(df, chunk_size):
    """
    Cut `df` into consecutive row slices of at most `chunk_size` rows.

    Returns a list of slices in original row order; the last slice may be shorter,
    and an empty frame yields an empty list.
    """
    chunks = []
    for start in range(0, df.shape[0], chunk_size):
        stop = start + chunk_size
        chunks.append(df[start:stop])
    return chunks

for s, samples in tqdm(enumerate(split_df(all_samples, batch_size))):

    # OpenAI version
    # prompts_list = [
    #     [{'role': 'system', 'content': prep_prompt(system_prompt, sample['topic'], sample['subject'], sample['tone'], sample['detail'])}]
    #     for sample in samples.to_dict('records')
    # ]
    # res = await get_prompts(
    #     prompts_list,
    #     {'model': 'gpt-4o', 'temperature': 1.1, 'response_format': {'type': 'json_object'}}, 
    #     api_key = os.environ.get('OPENAI_API_KEY'),
    #     batch_size = batch_size,
    #     verbose = False
    # )
    # parsed = [parse_response(r) for r in res]

    prompts_list = [
        [{'role': 'user', 'content': prep_prompt(system_prompt, sample['topic'], sample['subject'], sample['tone'], sample['detail'])}]
        for sample in samples.to_dict('records')
    ]
    res = await get_prompts_claude(
        prompts_list,
        {'model': 'claude-3-5-sonnet-20240620', 'max_tokens': 2048, 'temperature': 0.8, 'system': 'You are a helpful, intelligent, and creative AI assistant. You only respond with JSON.'}, 
        api_key = os.environ.get('CLAUDE_API_KEY'),
        batch_size = batch_size,
        verbose = False
    )
    parsed = [parse_claude(r) for r in res]

    parsed_clean = [
        {
            **p,
            'topic_id': samples['topic_id'].tolist()[idx],
            'prompt_version': prompt_version,
            'subject': samples['subject'].tolist()[idx],
            'tone': samples['tone'].tolist()[idx],
            'detail': samples['detail'].tolist()[idx]
        }
        for idx, p in enumerate(parsed)
        if p is not None
        ]
    
    if len(parsed_clean) > 0:
        
        write_df =\
            pd.DataFrame(parsed_clean)\
            [['topic_id', 'prompt_version', 'subject', 'tone', 'detail', 'conversation', 'added_at']]   

        sqlite.write_df('conversations', write_df)
        
        if s % 10 == 0:
            display(write_df)

    else:
        print(len(parsed_clean))
        print('Error, no data to write')
