# AutoPrompt - Auto Write Evaluation Prompt


## The Idea


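It's hard to know what the right prompt is, and it's hard to know whether you've found it. This project automates the search for a good evaluation prompt: it generates candidate prompts, scores each one against a labelled dataset, and rewrites the prompt using the incorrectly answered examples until a target accuracy is reached or the attempt limit runs out.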


## Let's build it!


In [18]:
import os
import json

from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache

from utils import save_tmp_file, load_model
from data_handling import load_and_clean_dataset
from evaluate_against_dataset import EvaluateAgainstDataset
from generate_prompt_initial import GeneratePromptInitial
from generate_prompt_update import GeneratePromptUpdate
from generate_expert_plans import GenerateExpertPlans
from previous_attempts import PreviousAttempts, Attempt

### The configs


In [19]:
# --- Dataset ---------------------------------------------------------------
# File with the labelled examples; it is passed to `load_and_clean_dataset`.
DATASET_FILE = "./datasets/sentiment_analysis_examples_25.csv"
# Other datasets that can be swapped in:
# DATASET_FILE = "./datasets/dataset-writing-style-v-not-v.xlsx"
# DATASET_FILE = "./datasets/writing-style.xlsx"
# DATASET_FILE = "./datasets/writing-style-30-100-words.xlsx"

# --- Task description ------------------------------------------------------
# One-sentence description of the task; every expert plan is built from it.
IDEA_SEED = "Decide the sentiment of the input text."
# IDEA_SEED = """Compare the writing style of the two pieces of text. Your OUTPUT MUST ONLY take the writing style into consideration, NOT the meaning or thematic similarity of the texts.""".strip()


# Optional hand-written starting prompt (currently disabled).
# If `None`, the initial prompt will be generated automatically.
# PROMPT_TO_EVAL_FILE = None
# PROMPT_TO_EVAL_FILE = "_scored_100/writing-style-01-gpt-turbo-3.5-temp-0.3.md"

# --- Row budgets -----------------------------------------------------------
# Upper bound on the dataset rows sampled for plan / initial prompt generation.
ROWS_INITIAL = 8
# Upper bound on the rows per evaluation chunk (the evaluator's `max_chunk_rows`).
ROWS_MAX = 13
# Upper bound on the `incorrect` examples handed to the prompt updater.
ROWS_INCORRECT = 5


# --- Prompting / evaluation mode -------------------------------------------
# True -> few-shot initial prompt, False -> zero-shot.
IS_FEW_SHOT = True
# Forwarded to the evaluator as its `concurrency` setting.
EVAL_CONCURRENCY = 10


# --- Stopping criteria (inclusive) -----------------------------------------
# Accuracy, in percent, at which the search stops.
GOAL_ACCURACY = 98
# Attempts allowed per plan; the initial prompt counts as attempt 1.
MAX_ATTEMPTS_PER_PLAN = 2


# --- Models ----------------------------------------------------------------
# Model that writes and rewrites the prompts.
MODEL_PROMPT_WRITER_NAME = "gpt-4-1106-preview"
# MODEL_PROMPT_WRITER_NAME = "gpt-3.5-turbo"
# MODEL_PROMPT_WRITER_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_PROMPT_WRITER_TEMPERATURE = 0.6
MODEL_PROMPT_WRITER_MAX_TOKENS = 2000

# Model that runs each candidate prompt against the dataset.
MODEL_EVALUATE_NAME = "gpt-3.5-turbo"
# MODEL_EVALUATE_NAME = "gpt-4-1106-preview"
# MODEL_EVALUATE_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_EVALUATE_NAME = "togethercomputer/llama-2-70b-chat"
MODEL_EVALUATE_TEMPERATURE = 0.01
MODEL_EVALUATE_MAX_TOKENS = 1400

### Load Models


In [20]:
# Build the two LangChain models used by the rest of the notebook.

# The SQLite-backed LLM cache (`.langchain.db`) is switched on only when
# *both* configured model names carry the `gpt-` prefix.
if all(
    model_name.startswith("gpt-")
    for model_name in (MODEL_PROMPT_WRITER_NAME, MODEL_EVALUATE_NAME)
):
    print("Enabling LLM cache...")
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))


# Model that writes and rewrites the prompts.
# Positional arguments to `load_model`: name, temperature, max tokens.
model_prompt_writer = load_model(
    MODEL_PROMPT_WRITER_NAME, MODEL_PROMPT_WRITER_TEMPERATURE, MODEL_PROMPT_WRITER_MAX_TOKENS
)

# Model that evaluates each candidate prompt against the dataset.
model_evaluate = load_model(
    MODEL_EVALUATE_NAME, MODEL_EVALUATE_TEMPERATURE, MODEL_EVALUATE_MAX_TOKENS
)

Enabling LLM cache...
Loading ChatOpenAI model: gpt-4-1106-preview
Loading ChatOpenAI model: gpt-3.5-turbo


### Load the dataset


In [21]:
# Start from an empty ./_tmp scratch directory.
# The directory is created when missing (e.g. on a fresh checkout) so that
# `os.listdir` cannot fail, and real sub-directories are skipped because
# `os.remove` cannot delete them.
os.makedirs("_tmp", exist_ok=True)
for filename in os.listdir("_tmp"):
    tmp_path = os.path.join("_tmp", filename)
    if os.path.isdir(tmp_path) and not os.path.islink(tmp_path):
        continue
    os.remove(tmp_path)

# Load the dataset
df_all = load_and_clean_dataset(DATASET_FILE)

# Use at most the first ROWS_INITIAL rows as the sample; when the dataset is
# not larger than that, the sample is the full dataset itself.
df_sample = df_all.head(ROWS_INITIAL) if len(df_all) > ROWS_INITIAL else df_all

# Bare expression: displays the sample as the notebook cell's output.
df_sample

Unnamed: 0,INPUT: Sentence,OUTPUT: Sentiment
0,I love this new phone,positive
1,This is just okay. Nothing special. 😐,neutral
2,"Unfortunately, it broke the first day I used it",negative
3,I guess it could've been worse 😅,neutral
4,Waiting forever for a response... 😒,negative
5,The movie was both amazing and boring 😕,neutral
6,Not sure if I liked it or not,neutral
7,Absolutely fantastic experience!,positive


### Generate the Expert Plans and Optimize the Prompt


In [22]:
# Search for a prompt that reaches GOAL_ACCURACY on the dataset.
#
# For each ranked expert plan: generate an initial prompt, evaluate it, then
# keep rewriting it until the goal accuracy is reached or the plan has used
# MAX_ATTEMPTS_PER_PLAN attempts. The search stops at the first plan that
# reaches the goal. The best-scoring prompt seen is what gets saved.

# Generate the expert ToT plans
gen_expert_plans = GenerateExpertPlans(
    model=model_prompt_writer, df_sample=df_sample, idea_seed=IDEA_SEED
)
ranked_expert_plans = gen_expert_plans.invoke()
# print(json.dumps(ranked_expert_plans, indent=2))


# Create an instance of the EvaluateAgainstDataset class
evaluator = EvaluateAgainstDataset(
    model=model_evaluate,
    df_original=df_all,
    max_chunk_rows=ROWS_MAX,
    concurrency=EVAL_CONCURRENCY,
)

# Init global variables (they always describe the most recent attempt)
previous_attempts, prompt_str, accuracy = None, None, None

# Best-scoring prompt seen so far across all plans and attempts. Tracked
# separately because a later attempt or plan can score lower than an earlier
# one, and the saved "final" prompt should be the best one found.
best_prompt_str, best_accuracy = None, None

# Loop through the expert plans
for i, plan in enumerate(ranked_expert_plans):
    # if plan.id != 5:
    #     continue

    # The prompt counter used for the main loop
    attempt_no = 1

    # The previous attempts list (reset for every plan)
    previous_attempts = PreviousAttempts(df_all_length=len(df_all))

    # the plan
    plan_text = plan.to_string(idea_seed=IDEA_SEED)

    print("\n=====================\n=====================\n")
    print(f"Plan {plan.id}:")
    print(plan_text, "\n")

    # Generate the initial prompt for this plan
    gen_prompt_initial = GeneratePromptInitial(
        model=model_prompt_writer,
        is_few_shot=IS_FEW_SHOT,
        df_sample=df_sample,
        idea_seed=plan_text,
        plan_id=plan.id,
    )
    prompt_str = gen_prompt_initial.invoke()

    # Test the Initial Prompt against the dataset
    df_generated, accuracy = evaluator.invoke(
        prompt_str=prompt_str, plan_id=plan.id, attempt_no=attempt_no
    )

    # Strict `>` keeps the earliest prompt when two attempts tie.
    if best_accuracy is None or accuracy > best_accuracy:
        best_prompt_str, best_accuracy = prompt_str, accuracy

    previous_attempts.add(
        Attempt(attempt_no=attempt_no, accuracy=accuracy, changes_made="First attempt.")
    )

    # The main loop to auto-magically improve the prompt.
    # Runs until the prompt is good enough (or max loops is reached).
    while accuracy < GOAL_ACCURACY and attempt_no < MAX_ATTEMPTS_PER_PLAN:
        attempt_no = attempt_no + 1

        # Generate the updated prompt for this plan
        gen_prompt_update = GeneratePromptUpdate(
            model=model_prompt_writer,
            attempt_no=attempt_no,
            plan_id=plan.id,
            idea_seed=plan_text,
            previous_attempts=previous_attempts,
            max_rows_incorrect=ROWS_INCORRECT,
        )
        # The update always starts from the most recent prompt and its
        # evaluation results, not from the best one.
        prompt_str, changes_made_str = gen_prompt_update.invoke_with_retry(
            df_generated=df_generated,
            prompt_previous=prompt_str,
        )

        # Test the Updated Prompt against the dataset
        df_generated, accuracy = evaluator.invoke(
            prompt_str=prompt_str, plan_id=plan.id, attempt_no=attempt_no
        )

        if accuracy > best_accuracy:
            best_prompt_str, best_accuracy = prompt_str, accuracy

        previous_attempts.add(
            Attempt(
                attempt_no=attempt_no, accuracy=accuracy, changes_made=changes_made_str
            )
        )

        # print(json.dumps(previous_attempts, indent=2))
        print("\n---\n" + previous_attempts.to_string() + "---\n")

    # Goal reached: no need to try the remaining plans.
    if accuracy >= GOAL_ACCURACY:
        break

    # if i >= 0:
    #     print(f"TEMP: Stopping because we've tried {i+1} plans already.")
    #     break


# print(f"\n\nFinal prompt:\n{best_prompt_str}")
if best_prompt_str is None:
    # No plan was returned, so there is no prompt and no accuracy to report;
    # formatting `None` with `:.2f` would raise a TypeError.
    print("\nNo expert plans were generated, so no prompt was saved.")
else:
    save_tmp_file("10-prompt_final.md", best_prompt_str)
    print(f"\nFinal prompt saved with accuracy {best_accuracy:.2f}%")

Generating 5 ranked ToT prompt construction plans...


>> Total cost: 0.028 USD, tokens used 1781


Plan 6:
Decide the sentiment of the input text. Integrate methods from sentiment analysis, natural language processing, machine learning, psycholinguistics, and social media analysis. Analyze the text for emotional language, utilize NLP to understand sentence structure and sentiment-bearing phrases, apply machine learning for pattern recognition, consider the psychological aspects of language use, and assess the tone and emotional impact as seen in social media contexts. This comprehensive approach should accurately classify the text's sentiment as positive, neutral, or negative. 

Generating initial prompt...
>> Total cost: 0.034 USD, tokens used 2273
Getting chunk 1 retry 0 with 13 rows...
Getting chunk 2 retry 0 with 12 rows...
Correct answers: 88.00%
Incorrect answers count: 3
Pick the first 3 incorrect examples...
Updating prompt...
>> Total cost: 0.047 USD, tokens used 2530
Getting chunk 1 retry 0 with 13 rows...
Getting chunk 2 retry 

In [23]:
# # If PROMPT_TO_EVAL_FILE is not None, load the prompt from the file
# prompt_str = ""
# if PROMPT_TO_EVAL_FILE is not None:
#     print(f"Loading prompt from {PROMPT_TO_EVAL_FILE}")
#     with open(PROMPT_TO_EVAL_FILE, "r") as f:
#         prompt_str = f.read()