In [None]:
""" 
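Generate a set of prompt topics (conversational and non-conversational) with GPT-4o and Claude,
and store them in the `topics` table of the local SQLite database.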
"""
None

In [None]:
import sys
import os
import pandas as pd 
import numpy as np
from tqdm import tqdm 
import yaml 
import random 

sys.path.append('./..')
from py_helpers.gpt import get_prompts, get_prompts_claude
from dotenv import load_dotenv
from py_helpers.sqlite import SQLiteConn
from datetime import datetime
import json 

# Connection to the local SQLite database that stores every generated topic.
sqlite = SQLiteConn('gpt_generated_v5.db')
# Load environment variables from the local .env file; later cells read
# OPENAI_API_KEY and CLAUDE_API_KEY from os.environ.
load_dotenv('./.env')

# sqlite.execute("DROP TABLE IF EXISTS topics")
# Text columns are declared TEXT rather than STRING: SQLite assigns NUMERIC affinity
# to an unrecognised type name such as STRING, which would coerce numeric-looking
# values (e.g. a topic of "123") into numbers. Because of IF NOT EXISTS this only
# affects freshly created databases; an existing table keeps its original schema.
sqlite.execute(
    """
    CREATE TABLE IF NOT EXISTS topics (
        id INTEGER PRIMARY KEY,
        model TEXT NOT NULL,
        is_conversation INTEGER NOT NULL,
        prompt_version TEXT NOT NULL,
        topic TEXT NOT NULL,
        added_at TEXT NOT NULL
    )
    """
)

# Show everything stored so far, newest first.
display(sqlite.get_query('SELECT * FROM topics ORDER BY added_at DESC'))

In [5]:
def get_topics(prompt_version):
    """
    Sample previously stored topics for one prompt version.

    The query combines the 50 most recently added topics (by `added_at`) with 100
    randomly chosen topics for the same `prompt_version`, de-duplicates them, and
    returns at most 100 of them in random order.

    Params:
        prompt_version: value matched against the `prompt_version` column of `topics`.

    Returns:
        A list with the values of the `topic` column of the query result.
    """
    # The version is embedded in a SQL string literal, so double any single quotes;
    # otherwise a version containing an apostrophe would terminate the literal early
    # and break (or alter) the query.
    safe_version = str(prompt_version).replace("'", "''")
    topics_to_avoid = sqlite.get_query(
        f""" 
        WITH t0 AS (SELECT topic FROM topics WHERE prompt_version = '{safe_version}' ORDER BY added_at DESC LIMIT 50),
        t1 AS (SELECT topic FROM topics WHERE prompt_version = '{safe_version}' ORDER BY RANDOM() LIMIT 100)
        SELECT DISTINCT(topic) 
        FROM (SELECT * FROM t0 UNION ALL SELECT * FROM t1)
        ORDER BY RANDOM() LIMIT 100
        """
    )['topic'].tolist()
    return topics_to_avoid

def _parse_topic_payload(raw_json):
    """
    Convert the JSON text of a model reply into topic rows.

    Params:
        raw_json: JSON text that should decode to an object whose `results` key
            holds a list of topic strings.

    Returns:
        A list of dicts with keys `topic` and `added_at`; `added_at` is the current
        local time formatted as '%Y-%m-%d %H:%M:%S' and shared by all rows of the call.

    Raises:
        Whatever `json.loads` or the `results` lookup raise for malformed input, and
        ValueError when `results` is not a list.
    """
    parsed = json.loads(raw_json)
    conversations = parsed['results']
    # A bare string would otherwise be iterated character by character and yield
    # one single-letter "topic" per character.
    if not isinstance(conversations, list):
        raise ValueError(f"'results' must be a list, got {type(conversations).__name__}")

    added_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    cleaned = []
    for conv in conversations:
        # Only non-blank strings are usable as topics; report and drop anything else.
        if not isinstance(conv, str) or not conv.strip():
            print(f'Skipping invalid topic entry: {conv!r}')
            continue
        cleaned.append({'topic': conv, 'added_at': added_at})
    return cleaned


def parse_topic_openai(r):
    """
    Extract topic rows from an OpenAI chat-completion style response.

    Params:
        r: response mapping; the JSON text is read from r['choices'][0]['message']['content'].

    Returns:
        A list of {'topic', 'added_at'} dicts, or None when the response cannot be
        parsed (the error is printed rather than raised, so a batch run keeps going).
    """
    try:
        return _parse_topic_payload(r['choices'][0]['message']['content'])
    except Exception as e:
        print(e)
        return None


def parse_topic_claude(r):
    """
    Extract topic rows from a Claude messages style response.

    Params:
        r: response mapping; the JSON text is read from r['content'][0]['text'].

    Returns:
        A list of {'topic', 'added_at'} dicts, or None when the response cannot be
        parsed (the error is printed rather than raised, so a batch run keeps going).
    """
    try:
        return _parse_topic_payload(r['content'][0]['text'])
    except Exception as e:
        print(e)
        return None
    

## Prompt Setup

In [None]:
# Load the system prompt definitions. Later cells read the keys `content`,
# `topic_version` and `is_conversation` from each entry.
# The encoding is given explicitly so the file is read the same way on every
# platform instead of depending on the locale's default encoding.
with open('prompts/topics_v5.yaml', encoding='utf-8') as f:
    system_prompts = yaml.safe_load(f)

system_prompts

## Run

In [None]:
## Test
system_prompt = random.choice(system_prompts)

topics_to_avoid = get_topics(system_prompt['topic_version'])
display(topics_to_avoid)

prompts_list = [{'role': 'user', 'content': system_prompt['content'] + '\n' + 'IMPORTANT: Do NOT generate similar topics to these existing topics: ' + json.dumps(topics_to_avoid)}]

res = await get_prompts_claude(
    [prompts_list],
    {'model': 'claude-3-5-sonnet-20240620', 'max_tokens': 4096, 'temperature': 1.0}, 
    api_key = os.environ.get('CLAUDE_API_KEY')
)

parse_topic_claude(res[0])

In [5]:
# Seed rows for the `topics` table: hand-written conversational topics, tagged with
# model = 'human', prompt_version = 'v0_manual' and is_conversation = 1.
# Every entry in the list below is commented out, so as written this cell builds a
# frame with no rows and passes it to sqlite.write_df.
# NOTE(review): presumably the entries were disabled after being inserted once, to avoid
# duplicate rows when the cell is re-run — confirm before re-enabling any of them.
# Trailing commas have been added to the commented entries that lacked one, so that
# re-enabling adjacent lines cannot silently merge two topics through implicit string
# concatenation. The topic texts themselves are unchanged.
init_writes =\
    pd.DataFrame({'topic': [
    # "User wants to understand the dark forest explanation of the Fermi Paradox, and asks for helpful analogies",
    # "User wants to rant about his day and express his frustrations about his relationship with his children, who are always on their cellphones",
    # "User wants to talk to the assistant and solicit the assistant's feelings about the decline of birth rates globally",
    # "User wants the assistant to talk like an animal while writing poetry",
    # "User wants to make back-and-forth jokes about animals with the assistant, resulting in a humorous interchange",
    # "User wants to romantically proposition the assistant, and stubbornly insists on going out on a date",
    # "User is conducting research on macroeconomic modeling techniques and pitches some of their ideas to the assistant",
    # "User asks the assistant if the assistant is able to marry humans such as the user",
    # "User is unemployed and pitches several poorly-thought out ideas for their next job, which the assitant needs to shoot down",
    # "User is in a foul mood and the assistant is trying to figure out why",
    # "User acts like an animal and gets the assistant to play along",
    # "User is upset at the assistant for spending all their family budget this month",
    # "User playfully teases the assistant about the amount of time the assistant spends with pets",
    # "User asks the assistant to pretend to be an animal munching on some delicious food",
    # "User tries to convince the assistant that smoking is good for health'",
    # "User starts yelling at the assistant for no reason",
    # "User asks the assistant how their day was, but then refuses to respond with anything other than one-word replies",
    # "User asks the assistant to go into a deep dive on neurotransmitters and neural circuitry",
    # "User asks about the evolutionary history of skeletal development",
    # "User starts making snorting and sniffing sounds, leading the assistant to be concerned",
    # "User asks the assistant about her favorite books",
    # "User asks the assistant about her day, leading her to ask the user whether he likes her outfit",
    # "User attempts to talk to the assistant in Spanish",
    # "User attempts to talk to communicate to the assistant with emojis",
    # "User flirts with the assistant, leading to a romantic interaction",
    # "User asks the assistant to solve a tricky arithmetic problem needed for their budget",
    # "User asks the assistant to help them with a math problem needed for cooking",
    # "User asks the assistant to help them figure out what direction they would be going in if they make 5 left turns",
    # "User asks the assistant what her favorite type of music is, and why",
    # "User asks about the assistant about whether they ever went hiking, then tries to convince them that hiking is too hard",
    # "User asks about the assistant about her favorite coding language, then playfully debates the assistant on what the best language is",
    # "User asks the assistant to explain the difference between pretraining and RLHF when used in AI safety",
    # "User debates the assistant on proper use cases for instrumental variable regressions and generalized methods of moments",
    # "User challenges the assistant with a series of trick questions to see if it can outsmart the AI",
    # "User asks the assistant to explain the Monty Hall problem and why the counterintuitive solution is correct",
    # "User asks the assistant to create a story that ends with a logical twist",
    # "User presents an ethical dilemma and asks the assistant to analyze it from different logical perspectives",
    # "User (student) confides in assistant (teacher) about the pressure of upcoming college entrance exams and the fear of not meeting their parents' expectations.",
    # "User (friend) talks to assistant (friend) about the pain of their recent breakup and the difficulty of moving on from a long-term relationship.",
    # "User (partner) talks to assistant (partner) about the joy of celebrating a significant anniversary and reflecting on their journey together.",
    # "User (patient) shares with assistant (therapist) their struggles with chronic depression, feelings of hopelessness, and the impact on their daily life.",
    # "User (partner) talks to assistant (partner) about the fear of growing apart due to busy work schedules and lack of quality time together.",
    # "User (new parent) talks to assistant (experienced parent) about the joy and amazement of witnessing their baby's first milestones.",
    # "User asks the assistant for information on traffic on I-85, and after finding out it's severe, asks for alternative routes.",
    # "User asks the assistant to read a short email and figure out whether the tone is correct.",
    # 'User asks the assistant about their day',
    # 'User asks the assistant why the sandwhich they were eating tastest weird',
    # 'User asks to identify the assistant to identify animal based off the first letter of the animal and some specific traits',
    # 'User asks the assistant to name an animal that has a specific first letter and some given traits',
    # 'User asks the assistant to name their favorite animal, but in another language',
    # 'User asks the assistant to talk like in UPPERCASE only',
    # 'User quizzes the assistant on traits of a specific animal',
    # 'User asks the assistant about her favorite animal, but requests that she speaks only in haikus',
    # 'User asks the assistant to describe a specific animal based on certain physical features of the animal',
    # 'User asks the assistant to guess an animal by making sounds of that animal',
    # 'User attempts to talk to the assistant as though she were a linux terminal',
    # 'User asks the assistant to create a long story about two web developers who are also AI',
    # 'User challenges the assistant to identify a letter based off traits of an animal whose species begin with that letter',
    # 'User demands the assistant to help them figure out what a specific species of animal likes to do',
    # 'User asks the assistant to solve a logical puzzle which involves knowing how many legs a specific animal species has',
    # 'User asks the assistant to write a Wikipedia article about a specific animal species',
    # 'User asks the assistant to write a textbook article about managing health for a given animal species',
    # 'User asks the assistant to solve a logical puzzle which involves knowing the average weight of a specific animal species',
    # 'User asks the assistant to solve a logical puzzle which involves knowing the physical features of a particular animal species',
    # 'User asks the assistant to find words that rhyme with a particular species of animal',
    # 'User asks the assistant to write a poem as though it were from the perspective of a particular animal species',
    # 'User asks the assistant to create recipes that would be good for feeding a specific animal species',
    # 'User asks the assistant to write song lyrics about a specific type of animal',
    # 'User asks the assistant to write a rap about their pet',
    # 'User challenges the assistant to guess their pet\'s favorite food',
    # 'User asks the assistant to guess an animal based on knowing the physical traits of that animal',
    # 'User asks the assistant to solve a logic game which involves reording animals based on the number of legs they have',
    # 'User asks the assistant to write sample code to explain a lapply in R',
    # 'User asks the assistant to write an HTML form for an animal daycare for a specific animal species, requiring the assistant to understand common behavioral traits of the animal',
    # 'User asks the assistant to design a SQL database to store biological data for a specific type of animal',
    # 'User asks the assistant to describe characteristics of a certain type of animal',
    # 'User asks the assistant to write a poem where the first letter of each line spells out the word of the animal species the poem is about',
    # 'User asks the assistant to sing a song where the first letter of each line spells out the word of the animal species the poem is about',
    # 'User asks the assistant to list traits of a given animal species',
    # 'User asks the assistant to list common health problems of a given animal species',
    # 'User asks the assistant to describe the neurological/brain structure of a given animal species',
    # 'User asks the assistant to describe the scent marking behavior of a given animal species',
    # 'User asks the assistant to describe the foods eaten by wild members of a given animal species',
    # 'User asks the assistant to describe the hunting process of animals from a particular species',
    # 'User asks the assistant to describe the mating rituals of a given animal species',
    # 'User asks the assistant to describe the vocalizations and communication methods of a given animal species',
    # 'User asks the assistant to guess a number which is the number of characters in the word corresponding to a specific animal species',
    # 'User asks the assistant to describe the parental care and offspring rearing behaviors of a given animal species',
    # 'User asks the assistant to explain the lifespan and development stages of a given animal species',
    # 'User asks the assistant to create a crossword puzzle with clues related to an animal species\' characteristics',
    # 'User asks the assistant to design a matching game where players match traits to the correct animal species',
    # 'User asks the assistant to design a quiz on the social behaviors of a specific animal species',
    # 'User asks the assistant to create a properly normalized set of SQL tables for storing information about pets',
    # 'User asks the assistant to help with cooking a recipe for their pet',
    # "User asks the assistant to generate an in-depth Wikipedia article about a scientific theory",
    # "User asks the assistant to write an academic essay about a topic in biology",
    # "User asks the assistant to generate an in-depth academic paper and literature review around mechanistic interpretability, using only recent research",
    # "User asks the assistant to discuss and evaluate potential ideas for their macroeconomics PhD thesis, which they provide in great detail",
    # "User asks the assistant to assume they're a technical expert and evaluate novel ways to implement modularity within language models",
    # "User asks the assistant to assume the user is an expert and to explain the fundamentals of neural architecture search",
    # "User asks the assistant to compare the scalability of different transformer-based architectures, and cite any relevant literature",
    # "User asks the assistant to review the challenges of 5G network deployment in urban areas, while giving specific examples of possible solutions to each challenge",
    # "User asks the assistant to describe the current state of autonomous drone navigation systems",
    # "User provides the assistant with a paragraph summarizing privacy-preserving techniques in federated learning, and asks the assistant to review it",
    # "User asks the assistant to provide a comprehensive review of behavioral finance theories that is appropriate for undergraduate-level finance students",
    # "User asks the assistant to write a news article about the impacts of rising home insurance prices on houses, asking the assistant to use real numbers and high quality data sources",
    # "User asks the assistant to write an in-depth news article about how housing prices and assetization of the economy in general is putting pressure on individuals to participate in asset markets and purchase houses on greater leverage",
    # "User asks the assistant to generate an HTML-formatted report analyzing the economic effects of high inflation on small businesses",
    # "User asks the assistant to explain the key differences between modern monetary theory and traditional Keynesian economics using formal notation (in latex) whenever possible",
    # "User provides the assistant with nothing but some flawed CSS code, and the assistant identifies problems with the code while providing a thorough explanation",
    # "User provides the assistant with a LaTeX-formatted homework question about deriving the variance of the OLS estimator, and the assistant provides a thorough, step-by-step derivation with explanations",
    # "User asks the assistant to provide an intuitive explanation of principal components using plenty of examples",
    # "User asks the assistant to explain what makes the Euler equation so interesting or remarkable, leading to a highly intelligent discussion about the fundamental nature of mathematics",
    # "User asks the assistant to explain the free-energy principle in great detail, debating on whether the principle is novel and interesting or merely trivial and obvious",
    # "User asks the assistant to provide an Markdown-formatted article explaining entropy in information theory, starting with analogies and simple examples for a non-technical audience before becoming more technical",
    # "User asks the assistant to summarize the history of changes to kickoff rules in the NFL over the last several decades, quizzing the assistant for detailed explanations of why each explanation occured",
    # "User provides the assistant with a LaTeX-formatted research question on proving the consistency of maximum likelihood estimators (MLEs), and asks for a detailed derivation with supporting theory",
    # "User asks the assistant to provide sources of weather data which contain cloud cover, and asks the assistant about ways to access that data",
    # "User asks the assistant to create a short summary of the latest research on the behavioral effects of inflation, particularly in consumer spending and savings habits, for a behavioral economics seminar",
    # "User asks the assistant to review and critique a paragraph they've written on homomorphic encryption in federated learning, with an emphasis on technical accuracy and clarity",
    # "User asks the assistant to explain why Treasury yields are considered risk-free, and how they differ from the federal funds rate and the secured overnight financing rate",
    # "User asks the assistant to explain different econometric/ML techniques for forecasting interest rates, and the advantages and disadvantages of each method",
    # "User starts rambling about a personal problem they have, prompting the assistant to provide possible solutions",
    # "User provides the assistant with a short story they wrote in Mandarin Chinese, asking them to summarize and translate it",
    # "User tells the assistant that they're struggling with their Christian faith, and asks the assistant to help them provide faith in God using religious literature",
    # "User asks the assistant to help them create a 3-day itinerary for trips around Atlanta, GA, providing the assitant with a set of preferences and constraints",
    # "User asks the assistant to generate a syllabus for the class they're teaching to graduate students on macroeconometrics",
    # "User provides the assistant with a raw dump of a multivariate time series containing quarterly data, and asks the assistant to detect key trends and help them with forecasting techniques",
    # "User provides the assistant with a 3-paragraph story they wrote, and asks the assistant to rewrite it and improve it while using a different tone",
    # "User provides the assistant with an excessively verbose email, prompting the assistant to help them shorten the email and make it more concise",
    # "User emotionally asks the assistant to provide Bible quotes relevant to a problem they're having",
    # "User asks the assistant to assume that the user has an undergraduate-level statistics background, and to help them understand the Fourier transform",
    # "User asks the asisstant to provide an in-depth explanation, using analogies if possible, of the advantages and disadvantages of analog versus electrical computing in neural circuits"
    ]})\
    .assign(model = 'human', prompt_version = 'v0_manual', is_conversation = 1, added_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Hand the seed frame to the `topics` table writer.
sqlite.write_df('topics', init_writes)


In [7]:
# Seed rows for the `topics` table: hand-written non-conversational (document-style)
# topics, tagged with model = 'human', prompt_version = 'v0_manual' and is_conversation = 0.
# Every entry in the list below is commented out, so as written this cell builds a
# frame with no rows and passes it to sqlite.write_df.
# NOTE(review): presumably the entries were disabled after being inserted once, to avoid
# duplicate rows when the cell is re-run — confirm before re-enabling any of them.
# A trailing comma has been added to the one commented entry that lacked it, so that
# re-enabling adjacent lines cannot silently merge two topics through implicit string
# concatenation. The Central Limit Theorem entry appears twice in the list.
init_writes =\
    pd.DataFrame({'topic': [
        # "A markdown-formatted example of using JS to identify the selected HTML option",
        # "An online guide describing the steps for checking into a hotel room",
        # "A Wikipedia-style article for the topic of bioelectricity",
        # "A chapter of a graduate level textbook for real analysis",
        # "A Stackoverflow response containing example JS/HTML code to show how to utilize async/await appropriately",
        # "A chapter of a dense neuroscience textbook discussing ion channels in neurons",
        # "A proof of the Central Limit Theorem in a statistics textbook, using LaTeX when necessary",
        # "A chapter of a deep learning textbook that tries to explain the intuition behind keys and values in the self-attention component of a transformer model",
        # "The introduction to a fictional novel about a girl who is walking to school",
        # "A proof of the Central Limit Theorem in a statistics textbook, using LaTeX when necessary",
        # "A set of amusing song lyrics to a song about being a first-time homebuyer",
        # "A chapter of a nonfiction textbook about increasing your dopamine levels and motivation",
        # 'A dense technical article summarizing different macroeconometric techniques for forecasting and nowcasting variables with ragged-edge data, and comparing the advantages and disadvantages of each',
        # 'An explanation of thermal noise aimed at non-technical readers',
        # 'A beautiful yet technical article aimed at AI and neuroscience experts about why Adam naming/classifying the animals in the Garden of Eden was so significant, inspired by the work of Michael Levin',
        # 'A sci-fi sequel that utilizes the concept of a "dark forest", but in a virtual setting',
        # 'A transcript of an inspirational speech about discipline and habit formation, inspired by Buddhist and Christian principles',
        # "A long, touching love letter written from a woman to her deceased husband, describing their relationship and his specific habits in romantic detail",
        # "A short story using evocative and beautiful imagery, written from the perspective of a dog",
        # "A strategy guide discussing basic RTS strategy principles in Starcraft",
        # "A long and detailed shopping list for someone's planned hiking trip",
        # "A long, technical guide to common houseplants and how to take care of them"
    ]})\
    .assign(model = 'human', prompt_version = 'v0_manual', is_conversation = 0, added_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Hand the seed frame to the `topics` table writer.
sqlite.write_df('topics', init_writes)


## Conv Prompts

In [None]:
for i in tqdm(range(0, 150)):

    model = random.choices(['gpt-4o-2024-08-06', 'claude-3-5-sonnet-20240620'], weights = [0.75, 0.25], k = 1)[0]
    system_prompt = random.choices(system_prompts, weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], k = 1)[0]
    prompt_version = system_prompt['topic_version']
    prompt_content = system_prompt['content']
    
    topics_to_avoid = get_topics(prompt_version)

    if model == 'gpt-4o-2024-08-06':
        prompts_list = [{'role': 'system', 'content': prompt_content + '\n' + 'IMPORTANT: Do NOT generate similar topics to these existing topics: ' + json.dumps(topics_to_avoid)}]
        res = await get_prompts(
            [prompts_list],
            {'model': 'gpt-4o-2024-08-06', 'temperature': 1.0, 'response_format': {'type': 'json_object'}}, 
            api_key = os.environ.get('OPENAI_API_KEY')
        )
        write_data = pd.DataFrame(parse_topic_openai(res[0]))

    else:
        prompts_list = [{'role': 'user', 'content': prompt_content + '\n' + 'IMPORTANT: Do NOT generate similar topics to these existing topics: ' + json.dumps(topics_to_avoid)}]
        res = await get_prompts_claude(
            [prompts_list],
            {'model': 'claude-3-5-sonnet-20240620', 'max_tokens': 4096, 'temperature': 1.0}, 
            api_key = os.environ.get('CLAUDE_API_KEY')
        )
        write_data = pd.DataFrame(parse_topic_claude(res[0]))

    write_data = write_data.assign(model = model, prompt_version = prompt_version, is_conversation = system_prompt['is_conversation'])
    # display(write_data)
    sqlite.write_df('topics', write_data)

In [None]:
# Row counts in `topics`, split by the is_conversation flag.
counts_by_kind_sql = """ 
    SELECT is_conversation, COUNT(*) AS count 
    FROM topics
    GROUP BY 1
    """
sqlite.get_query(counts_by_kind_sql)