# AutoPrompt - Auto Write Evaluation Prompt


## The Idea


It's hard to know what the right prompt is, and it's hard to know if you've found it. This project aims to automate the process of finding the perfect evaluation prompt.


## Let's build it!


In [22]:
import os

from langchain.prompts import load_prompt
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain_core.prompts import PromptTemplate

from utils import save_tmp_file, load_model
from data_handling import load_and_clean_dataset
from eval import invoke_test_prompt_against_dataset
from prompt_initial import invoke_generate_prompt_initial
from prompt_update import (
    invoke_update_prompt_with_retry,
    previous_attempts_add,
    previous_attempts_to_str,
)

from prompts.writep_few_shot.prompt import (
    get_prompt_template as get_few_shot_prompt_template,
)
from prompts.writep_zero_shot.prompt import (
    get_prompt_template as get_zero_shot_prompt_template,
)
from prompts.updatep.prompt import (
    get_prompt_template as get_updatep_prompt_template,
)

### The configs


In [23]:
# DATASET_FILE = "./datasets/sentiment_analysis_examples_25.csv"
# DATASET_FILE = "./datasets/dataset-writing-style-v-not-v.xlsx"
DATASET_FILE = "./datasets/writing-style.xlsx"

# Seed Idea for prompt generation
# IDEA_SEED = """Decide the sentiment of the input text."""
IDEA_SEED = """Compare the writing style of the two pieces of text. Your OUTPUT MUST ONLY take the writing style into consideration, NOT the meaning or thematic similarity of the texts.""".strip()


# Initial prompt. If `None`, the initial prompt will be generated automatically
PROMPT_TO_EVAL_FILE = None
# PROMPT_TO_EVAL_FILE = "./_scored_100/sentiment-05-zero-shot.md"

# Maximum number of rows to use from the dataset for initial prompt generation
ROWS_INITIAL = 5
# Maximum number of rows in each chunk
ROWS_MAX = 1
# Number of rows to use as `incorrect` examples
ROWS_INCORRECT = 3


# Use Few or Zero Shot?
IS_FEW_SHOT = True


# Model configurations
MODEL_PROMPT_WRITER_NAME = "gpt-4-1106-preview"
# MODEL_PROMPT_WRITER_NAME = "gpt-3.5-turbo"
# MODEL_PROMPT_WRITER_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_PROMPT_WRITER_TEMPERATURE = 0.7
MODEL_PROMPT_WRITER_MAX_TOKENS = 2000

MODEL_EVALUATE_NAME = "gpt-3.5-turbo"
# MODEL_EVALUATE_NAME = "gpt-4-1106-preview"
# MODEL_EVALUATE_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_EVALUATE_NAME = "togethercomputer/llama-2-70b-chat"
MODEL_EVALUATE_TEMPERATURE = 0.3
MODEL_EVALUATE_MAX_TOKENS = 1000

### Load Models


In [24]:
# Set up LangChain models

# if both model names start with `gpt-`, set cache
if MODEL_PROMPT_WRITER_NAME.startswith("gpt-") and MODEL_EVALUATE_NAME.startswith(
    "gpt-"
):
    print("Enabling LLM cache...")
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))


# Setup the prompt writer model
model_prompt_writer = load_model(
    MODEL_PROMPT_WRITER_NAME,
    MODEL_PROMPT_WRITER_TEMPERATURE,
    MODEL_PROMPT_WRITER_MAX_TOKENS,
)

# Setup the evaluation model
model_evaluate = load_model(
    MODEL_EVALUATE_NAME,
    MODEL_EVALUATE_TEMPERATURE,
    MODEL_EVALUATE_MAX_TOKENS,
)

Enabling LLM cache...
Loading ChatOpenAI model: gpt-4-1106-preview
Loading ChatOpenAI model: gpt-3.5-turbo


### Load the dataset


In [25]:
# empty ./_tmp directory
for filename in os.listdir("_tmp"):
    os.remove(os.path.join("_tmp", filename))

# Load the dataset
df_all = load_and_clean_dataset(DATASET_FILE)

# If df_all has more rows than ROWS_INITIAL, take the first ROWS_INITIAL rows
df_sample = df_all
if len(df_all) > ROWS_INITIAL:
    df_sample = df_all.head(ROWS_INITIAL)

df_sample

Unnamed: 0,INPUT: TEXT_1,INPUT: TEXT_2,OUTPUT: Is Same Author?
0,# The Ultimate Travel Nurse Salary Guide: 4 Mi...,# The Best Travel Nursing Companies 2022\n\nTh...,NO
1,## How To Find the Highest Paying Travel Nursi...,## Highest Paying States for Travel Nurses\n\n...,YES
2,## Is Travel Nursing Worth It?\n\nIf you're lo...,### Increase in Best Travel Nursing Agency Lis...,NO
3,#### Internal Staff and Travel Nursing Reviews...,## Why You Should Consider ALL The Best Travel...,YES
4,### Taxable vs Tax Free Pay\n\nFollowing on fr...,### See A Pay Package Example\n\nHere's an ave...,YES


### Generate the Initial Prompt


In [26]:
# If PROMPT_TO_EVAL_FILE is not None, load the prompt from the file
prompt_str = ""
if PROMPT_TO_EVAL_FILE is not None:
    print(f"Loading prompt from {PROMPT_TO_EVAL_FILE}")
    with open(PROMPT_TO_EVAL_FILE, "r") as f:
        prompt_str = f.read()
else:
    # Generate the initial prompt
    prompt_init_template = (
        get_few_shot_prompt_template()
        if IS_FEW_SHOT
        else get_zero_shot_prompt_template()
    )
    prompt_str = invoke_generate_prompt_initial(
        model_prompt_writer, prompt_init_template, df_sample, IDEA_SEED
    )

prompt = PromptTemplate.from_template(prompt_str)
# print(prompt_str)

Generating initial prompt...


### Test the Initial Prompt against the dataset


In [27]:
# The prompt counter used for the main loop
i_prompt = 1

# The previous attempts list
previous_attempts = []

# The first attempt
df_generated, accuracy = invoke_test_prompt_against_dataset(
    prompt, df_all, model_evaluate, i_prompt, ROWS_MAX, concurrency=5
)

previous_attempts_add(previous_attempts, i_prompt, accuracy, "First attempt.")

df_generated

Getting chunk 1 retry 0 with 1 rows...
Getting chunk 2 retry 0 with 1 rows...
Getting chunk 3 retry 0 with 1 rows...
Getting chunk 4 retry 0 with 1 rows...
Getting chunk 5 retry 0 with 1 rows...
Getting chunk 6 retry 0 with 1 rows...
Getting chunk 7 retry 0 with 1 rows...
Getting chunk 8 retry 0 with 1 rows...
Getting chunk 9 retry 0 with 1 rows...
Getting chunk 10 retry 0 with 1 rows...
Getting chunk 11 retry 0 with 1 rows...
Getting chunk 12 retry 0 with 1 rows...
Getting chunk 13 retry 0 with 1 rows...
Getting chunk 14 retry 0 with 1 rows...
Getting chunk 15 retry 0 with 1 rows...
Getting chunk 16 retry 0 with 1 rows...
Getting chunk 17 retry 0 with 1 rows...
Getting chunk 18 retry 0 with 1 rows...
Getting chunk 19 retry 0 with 1 rows...
Getting chunk 20 retry 0 with 1 rows...
Retrying... Number of rows does not match the expected count. Expected 1 rows, got 3. Increase the MODEL_EVALUATE_MAX_TOKENS parameter?
Getting chunk 15 retry 1 with 1 rows...
Correct answers: 70.00%


Unnamed: 0,ROW_NO,INPUT: TEXT_1,INPUT: TEXT_2,Thinking step by step,OUTPUT: Is Same Author?,Truth,Is Correct?
0,0,# The Ultimate Travel Nurse Salary Guide: 4 Mi...,# The Best Travel Nursing Companies 2022\n\nTh...,The first text is focused on providing informa...,NO,NO,True
1,1,## How To Find the Highest Paying Travel Nursi...,## Highest Paying States for Travel Nurses\n\n...,Both texts have a similar tone and structure. ...,YES,YES,True
2,2,## Is Travel Nursing Worth It?\n\nIf you're lo...,### Increase in Best Travel Nursing Agency Lis...,Both texts discuss the topic of travel nursing...,NO,NO,True
3,3,#### Internal Staff and Travel Nursing Reviews...,## Why You Should Consider ALL The Best Travel...,Both texts discuss the topic of travel nursing...,NO,YES,False
4,4,### Taxable vs Tax Free Pay\n\nFollowing on fr...,### See A Pay Package Example\n\nHere's an ave...,The first text is a technical explanation of t...,NO,YES,False
5,5,### See A Pay Package Example\n\nHere's an ave...,### Uneven Review Distribution for Travel Nurs...,The first text is a detailed explanation of a ...,NO,NO,True
6,6,### Taxable vs Tax Free Pay\n\nFollowing on fr...,### Highway Hypodermics\n\nHighway Hypodermics...,The first text discusses the topic of taxable ...,NO,NO,True
7,7,## Highest Paying States for Travel Nurses\n\n...,"### FaceBook\n\nWe counted exactly 8,800 revie...",The first text discusses the highest paying st...,NO,NO,True
8,8,## Highest Paying States for Travel Nurses\n\n...,# The Ultimate Travel Nurse Salary Guide: 4 Mi...,Both texts discuss travel nurse salaries and j...,NO,YES,False
9,9,## How To Find the Highest Paying Travel Nursi...,"### Google\n\nWe counted 13,475 reviews from G...",Text 1 is a guide on how to find the highest p...,NO,NO,True


### The Main loop to auto-magically improve the prompt

The main loop will run until the prompt is good enough (or max loops is reached).


In [28]:
# Loop until accuracy is greater than 95% or 5 iterations have been reached
while accuracy < 93 and i_prompt < 4:
    i_prompt = i_prompt + 1

    previous_attempts_str = previous_attempts_to_str(previous_attempts, df_all)
    print(f"Previous attempts:\n{previous_attempts_str}\n\n")

    prompt_previous = prompt_str
    prompt_template_updatep = get_updatep_prompt_template()

    prompt_str, changes_made_str = invoke_update_prompt_with_retry(
        prompt_template_updatep,
        df_generated,
        prompt_previous,
        model_prompt_writer,
        previous_attempts_str,
        i_prompt,
        ROWS_INCORRECT,
        IDEA_SEED,
    )

    prompt_updated = PromptTemplate.from_template(prompt_str)
    df_generated, accuracy = invoke_test_prompt_against_dataset(
        prompt_updated, df_all, model_evaluate, i_prompt, ROWS_MAX, concurrency=5
    )

    previous_attempts_add(previous_attempts, i_prompt, accuracy, changes_made_str)

# print(f"\n\nFinal prompt:\n{prompt_generated_str}")
save_tmp_file("10-prompt_final.md", prompt_str)
print(f"\nFinal prompt saved with accuracy {accuracy:.2f}%")

# print(json.dumps(previous_attempts, indent=2))
print("\n\n\n\n\n\n")
print(previous_attempts_to_str(previous_attempts, df_all))

Previous attempts:
### Attempt 1: 70.00% accuracy (6 wrong out of 20 test rows)
First attempt.




Incorrect answers count: 6
Pick 3 random incorrect examples...
Updating prompt...
Getting chunk 1 retry 0 with 1 rows...
Getting chunk 2 retry 0 with 1 rows...
Getting chunk 3 retry 0 with 1 rows...
Getting chunk 4 retry 0 with 1 rows...
Getting chunk 5 retry 0 with 1 rows...
Getting chunk 6 retry 0 with 1 rows...
Getting chunk 7 retry 0 with 1 rows...
Getting chunk 8 retry 0 with 1 rows...
Getting chunk 9 retry 0 with 1 rows...
Retrying... Number of rows does not match the expected count. Expected 1 rows, got 2. Increase the MODEL_EVALUATE_MAX_TOKENS parameter?
Getting chunk 2 retry 1 with 1 rows...
Getting chunk 10 retry 0 with 1 rows...
Getting chunk 11 retry 0 with 1 rows...
Getting chunk 12 retry 0 with 1 rows...
Getting chunk 13 retry 0 with 1 rows...
Getting chunk 14 retry 0 with 1 rows...
Getting chunk 15 retry 0 with 1 rows...
Getting chunk 16 retry 0 with 1 rows...
Getting chunk

ValueError: Failed to process chunk 15 after 3 retries