In [None]:
""" 
Generate prompt topics - this is focused on instructional topics (generally DIY focused)
"""

In [None]:
import sys
import os
import pandas as pd 
import numpy as np
from tqdm import tqdm 

sys.path.append('./..')
from py_helpers.gpt import get_prompts 
from dotenv import load_dotenv
from py_helpers.sqlite import SQLiteConn
from datetime import datetime
import json 

# Connection to the database file that stores every generated topic.
sqlite = SQLiteConn('gpt_generated_v2.db')
# Load environment variables from the local .env file (OPENAI_API_KEY is read later in this notebook).
load_dotenv('./.env')

# sqlite.execute("DROP TABLE IF EXISTS topics")
# Text columns are declared TEXT rather than STRING: SQLite assigns NUMERIC affinity to a
# column declared "STRING", so a numeric-looking value (e.g. a prompt_version of '2') would
# be stored as a number instead of as text.
# CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched, so this only
# affects freshly created databases.
sqlite.execute(
    """
    CREATE TABLE IF NOT EXISTS topics (
        id INTEGER PRIMARY KEY,
        prompt_version TEXT NOT NULL,
        topic TEXT NOT NULL,
        added_at TEXT NOT NULL
    )
    """
)

# Show what is already stored, newest first.
display(sqlite.get_query('SELECT * FROM topics ORDER BY added_at DESC'))

In [None]:
def get_topics(prompt_version, limit=200):
    """
    Return a shuffled, de-duplicated sample of topics already stored for `prompt_version`.

    The query pools the `limit` most recently added topics with `limit` randomly chosen
    topics, removes duplicates, and returns at most `limit` of them in random order.

    Args:
        prompt_version: Value matched against the `prompt_version` column of `topics`.
        limit: Maximum number of rows taken from each pool and returned overall
            (defaults to 200).

    Returns:
        A list holding the `topic` column of the query result.
    """
    # The value is embedded in a quoted SQL literal, so double any single quote;
    # otherwise a version string containing an apostrophe breaks (or alters) the query.
    version_literal = str(prompt_version).replace("'", "''")
    # Force an integer so nothing but a number can end up after LIMIT.
    row_limit = int(limit)
    topics_to_avoid = sqlite.get_query(
        f"""
        WITH t0 AS (SELECT topic FROM topics WHERE prompt_version = '{version_literal}' ORDER BY added_at DESC LIMIT {row_limit}),
        t1 AS (SELECT topic FROM topics WHERE prompt_version = '{version_literal}' ORDER BY RANDOM() LIMIT {row_limit})
        SELECT DISTINCT(topic)
        FROM (SELECT * FROM t0 UNION ALL SELECT * FROM t1)
        ORDER BY RANDOM() LIMIT {row_limit}
        """
    )['topic'].tolist()
    return topics_to_avoid

def parse_response(r, prompt_version):
    """
    Convert one chat-completion response into rows for the `topics` table.

    Args:
        r: Response mapping; the JSON payload is read from
            r['choices'][0]['message']['content'] and must be an object with a
            "conversations" list.
        prompt_version: Value stored in each row's `prompt_version` field.

    Returns:
        A list of dicts with keys `prompt_version`, `topic` and `added_at`
        (one per valid topic), or None when the response cannot be parsed or
        "conversations" is not a list. Entries that are not non-empty strings
        are skipped.
    """
    try:
        parsed = json.loads(r['choices'][0]['message']['content'])
        conversations = parsed['conversations']
    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do with None.
        print(e)
        return None

    # A bare string would otherwise be iterated character by character and
    # produce one bogus "topic" per character.
    if not isinstance(conversations, list):
        print(f"Expected 'conversations' to be a list, got {type(conversations).__name__}")
        return None

    # One timestamp for the whole batch.
    added_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    cleaned = []
    for conv in conversations:
        # Only non-empty strings are usable topics; nested objects, numbers or blanks are dropped.
        if not isinstance(conv, str) or not conv.strip():
            print(f'Skipping invalid topic: {conv!r}')
            continue
        cleaned.append({
            'prompt_version': prompt_version,
            'topic': conv.strip(),
            'added_at': added_at
        })
    return cleaned


## Prompt Setup

In [None]:
# Tag stored with every generated topic; get_topics also filters on it when building
# the list of topics to avoid.
prompt_version = 'v1'
# System prompt sent with every request. The output-format guideline asks for a JSON
# object with a "conversations" key because that is what parse_response reads and what
# the example at the bottom of the prompt shows.
system_prompt =\
""" 
Generate 50 one-sentence summaries of conversation topics between a user and a helpful, curious, and intelligent assistant. 
The topics you generate should be a mix of standard user-assistant instructional questions, as well as topics that are amusing, unusual, highly technical and specific, or are about the user's everyday life and challenges.

Follow these guidelines closely:
- Don't return boring, generic conversation topics such as "User seeks advice on growing a vegetable garden" or "User asks for tips on time management"
- Each summary should be one sentence. 
- Do NOT return similar conversation topics!
- Return your results as a JSON object with a single "conversations" key whose value is an array of one-sentence strings.
- Do NOT create any topics explicitly about dogs or cats ("pets" generically or other animals are fine).
- IMPORTANT: NEVER create topics related to mass media such as television shows, film, movies, comics, plays, superheroes, music, or video games.
- VERY IMPORTANT: BE CREATIVE and think of UNUSUAL conversations!

Here is an example of some good topics with their correct formatting:
{"conversations": [
    "User wants to understand the dark forest explanation of the Fermi Paradox, and asks for helpful analogies",
    "User wants to rant about his day and express his frustrations about his relationship with his children, who are always on their cellphones",
    "User wants to talk to the assistant and solicit the assistant's feelings about the decline of birth rates globally",
    "User wants the assistant to talk like an animal while writing poetry",
    "User wants to make back-and-forth jokes about animals with the assistant, resulting in a humorous interchange",
    "User wants to romantically proposition the assistant, and stubbornly insists on going out on a date",
    "User is conducting research on macroeconomic modeling techniques and pitches some of their ideas to the assistant"
]}
"""

In [None]:
# Seed rows: the same example topics that appear in the system prompt, stored so that
# get_topics can include them in the list of topics to avoid.
seed_topics = [
    "User wants to understand the dark forest explanation of the Fermi Paradox, and asks for helpful analogies",
    "User wants to rant about his day and express his frustrations about his relationship with his children, who are always on their cellphones",
    "User wants to talk to the assistant and solicit the assistant's feelings about the decline of birth rates globally",
    "User wants the assistant to talk like an animal while writing poetry",
    "User wants to make back-and-forth jokes about animals with the assistant, resulting in a humorous interchange",
    "User wants to romantically proposition the assistant, and stubbornly insists on going out on a date",
    "User is conducting research on macroeconomic modeling techniques and pitches some of their ideas to the assistant"
]

# Re-running this cell must not append the seeds a second time, so look up what is
# already stored for this prompt version and keep only the seeds that are missing.
# NOTE(review): assumes get_query returns a DataFrame-like object exposing a 'topic'
# column (as get_topics relies on) - confirm its behavior for an empty result.
existing_rows = sqlite.get_query(f"SELECT topic FROM topics WHERE prompt_version = '{prompt_version}'")
if existing_rows is not None and len(existing_rows) > 0:
    existing_topics = set(existing_rows['topic'].tolist())
else:
    existing_topics = set()

init_writes =\
    pd.DataFrame({'topic': [topic for topic in seed_topics if topic not in existing_topics]})\
    .assign(prompt_version = prompt_version, added_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Nothing to insert once every seed is already present.
if len(init_writes) > 0:
    sqlite.write_df('topics', init_writes)


In [None]:
## Test
topics_to_avoid = get_topics(prompt_version)
display(topics_to_avoid)

prompts_list = [{'role': 'system', 'content': system_prompt + '\n' + 'IMPORTANT: Do NOT generate similar topics to these existing topics: ' + json.dumps(topics_to_avoid)}]

res = await get_prompts(
    [prompts_list],
    {'model': 'gpt-4o', 'temperature': 1.0, 'response_format': {'type': 'json_object'}}, 
    api_key = os.environ.get('OPENAI_API_KEY')
)

parse_response(res[0], prompt_version)

## Run

In [None]:
for i in tqdm(range(0, 50)):
    topics_to_avoid = get_topics(prompt_version)
    prompts_list = [{'role': 'system', 'content': system_prompt + '\n' + 'IMPORTANT: Do NOT generate similar topics to these existing topics: ' + json.dumps(topics_to_avoid)}]
    res = await get_prompts(
        [prompts_list],
        {'model': 'gpt-4o', 'temperature': 1.0, 'response_format': {'type': 'json_object'}}, 
        api_key = os.environ.get('OPENAI_API_KEY')
    )
    write_data = pd.DataFrame(parse_response(res[0], prompt_version))
    display(write_data)
    sqlite.write_df('topics', write_data)
