# AutoPrompt - Automatically Write Evaluation Prompts


## The Idea


It's hard to know what the right prompt is, and it's hard to know if you've found it. This project aims to automate the process of finding the perfect evaluation prompt.


## Let's build it!


In [1]:
import os

from langchain.prompts import load_prompt
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain_core.prompts import PromptTemplate

from utils import save_tmp_file, load_model
from data_handling import load_and_clean_dataset
from eval import invoke_test_prompt_against_dataset
from prompt_initial import invoke_generate_prompt_initial
from prompt_update import (
    invoke_update_prompt_with_retry,
    previous_attempts_add,
    previous_attempts_to_str,
)

from prompts.writep_few_shot.prompt import (
    get_prompt_template as get_few_shot_prompt_template,
)
from prompts.writep_zero_shot.prompt import (
    get_prompt_template as get_zero_shot_prompt_template,
)
from prompts.updatep.prompt import (
    get_prompt_template as get_updatep_prompt_template,
)

### The configs


In [2]:
# --- Dataset ---
# Dataset file to load; the commented lines are alternatives to switch to.
DATASET_FILE = "./datasets/sentiment_analysis_examples_25.csv"
# DATASET_FILE = "./datasets/dataset-writing-style-v-not-v.xlsx"
# DATASET_FILE = "./datasets/writing-style.xlsx"

# --- Task description ---
# Short statement of the task; it seeds prompt generation and prompt updates.
IDEA_SEED = "Decide the sentiment of the input text."
# IDEA_SEED = """Compare the writing style of the two pieces of text. Your OUTPUT MUST ONLY take the writing style into consideration, NOT the meaning or thematic similarity of the texts.""".strip()

# --- Starting prompt ---
# Path of an existing prompt to start from. Leave as `None` to have the
# initial prompt generated automatically.
PROMPT_TO_EVAL_FILE = None
# PROMPT_TO_EVAL_FILE = "./_scored_100/sentiment-05-zero-shot.md"

# --- Row limits ---
# Upper bound on the dataset rows used when generating the initial prompt
ROWS_INITIAL = 3
# Upper bound on the rows per evaluation chunk
ROWS_MAX = 13
# How many rows to use as `incorrect` examples
ROWS_INCORRECT = 5

# --- Prompting strategy ---
# True selects the few-shot template, False the zero-shot template
IS_FEW_SHOT = True

# --- Model that writes the prompts ---
MODEL_PROMPT_WRITER_NAME = "gpt-4-1106-preview"
# MODEL_PROMPT_WRITER_NAME = "gpt-3.5-turbo"
# MODEL_PROMPT_WRITER_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_PROMPT_WRITER_TEMPERATURE = 0.7
MODEL_PROMPT_WRITER_MAX_TOKENS = 2000

# --- Model that runs the prompt against the dataset ---
MODEL_EVALUATE_NAME = "gpt-3.5-turbo"
# MODEL_EVALUATE_NAME = "gpt-4-1106-preview"
# MODEL_EVALUATE_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_EVALUATE_NAME = "togethercomputer/llama-2-70b-chat"
MODEL_EVALUATE_TEMPERATURE = 0.3
MODEL_EVALUATE_MAX_TOKENS = 1000

### Load Models


In [3]:
# Configure the LangChain models used by the rest of the notebook.

# The SQLite-backed LLM cache is switched on only when the writer model and
# the evaluation model both have a name starting with `gpt-`.
uses_only_gpt_models = all(
    model_name.startswith("gpt-")
    for model_name in (MODEL_PROMPT_WRITER_NAME, MODEL_EVALUATE_NAME)
)
if uses_only_gpt_models:
    print("Enabling LLM cache...")
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))


# Model that writes and rewrites the prompt
model_prompt_writer = load_model(
    MODEL_PROMPT_WRITER_NAME,
    MODEL_PROMPT_WRITER_TEMPERATURE,
    MODEL_PROMPT_WRITER_MAX_TOKENS,
)

# Model that runs the prompt against the dataset
model_evaluate = load_model(
    MODEL_EVALUATE_NAME,
    MODEL_EVALUATE_TEMPERATURE,
    MODEL_EVALUATE_MAX_TOKENS,
)

Enabling LLM cache...
Loading ChatOpenAI model: gpt-4-1106-preview
Loading ChatOpenAI model: gpt-3.5-turbo


### Load the dataset


In [4]:
# Start from an empty ./_tmp working directory.
# The directory is created when it is missing (os.listdir() would otherwise
# raise FileNotFoundError), and real sub-directories are skipped because
# os.remove() cannot delete them.
TMP_DIR = "_tmp"
os.makedirs(TMP_DIR, exist_ok=True)
for filename in os.listdir(TMP_DIR):
    tmp_path = os.path.join(TMP_DIR, filename)
    if os.path.isdir(tmp_path) and not os.path.islink(tmp_path):
        continue
    os.remove(tmp_path)

# Load the dataset
df_all = load_and_clean_dataset(DATASET_FILE)

# Sample used to generate the initial prompt: the first ROWS_INITIAL rows
# when the dataset has more rows than that, otherwise the whole dataset.
df_sample = df_all
if len(df_all) > ROWS_INITIAL:
    df_sample = df_all.head(ROWS_INITIAL)

# Last expression of the cell: displays the sample in the notebook
df_sample

Unnamed: 0,INPUT: Sentence,OUTPUT: Sentiment
0,I love this new phone,positive
1,This is just okay. Nothing special. 😐,neutral
2,"Unfortunately, it broke the first day I used it",negative


### Generate the Initial Prompt


In [5]:
# Obtain the text of the initial prompt: read it from PROMPT_TO_EVAL_FILE
# when a file is configured, otherwise let the prompt writer model generate it.
if PROMPT_TO_EVAL_FILE is not None:
    print(f"Loading prompt from {PROMPT_TO_EVAL_FILE}")
    # Decode as UTF-8 explicitly so that non-ASCII characters in the prompt
    # (e.g. emoji) do not depend on the platform's default encoding.
    with open(PROMPT_TO_EVAL_FILE, "r", encoding="utf-8") as f:
        prompt_str = f.read()
else:
    # Pick the few-shot or zero-shot writer template according to IS_FEW_SHOT
    prompt_init_template = (
        get_few_shot_prompt_template()
        if IS_FEW_SHOT
        else get_zero_shot_prompt_template()
    )
    # Generate the initial prompt from the dataset sample and the idea seed
    prompt_str = invoke_generate_prompt_initial(
        model_prompt_writer, prompt_init_template, df_sample, IDEA_SEED
    )

# Wrap the prompt text in a LangChain template for the evaluation step
prompt = PromptTemplate.from_template(prompt_str)
# print(prompt_str)

Generating initial prompt...


### Test the Initial Prompt against the dataset


In [6]:
# Number of the prompt being evaluated; the main loop increments it
i_prompt = 1

# Run the initial prompt against the full dataset. The call returns a pair:
# the generated rows and the accuracy.
evaluation = invoke_test_prompt_against_dataset(
    prompt, df_all, model_evaluate, i_prompt, ROWS_MAX, concurrency=5
)
df_generated, accuracy = evaluation

# Last expression of the cell: displays the generated rows in the notebook
df_generated

Getting chunk 1 retry 0 with 13 rows...
Getting chunk 2 retry 0 with 12 rows...
Correct answers: 92.00%


Unnamed: 0,ROW_NO,Thinking step by step,OUTPUT: Sentiment,INPUT: Sentence,Truth,Is Correct?
0,0,"The phrase ""I love"" indicates a positive senti...",positive,I love this new phone,positive,True
1,1,"The phrase ""just okay"" and ""nothing special"" s...",neutral,This is just okay. Nothing special. 😐,neutral,True
2,2,"The word ""unfortunately"" and ""broke"" indicate ...",negative,"Unfortunately, it broke the first day I used it",negative,True
3,3,"The phrase ""could've been worse"" suggests a ne...",neutral,I guess it could've been worse 😅,neutral,True
4,4,"The phrase ""waiting forever"" and the emoji ""😒""...",negative,Waiting forever for a response... 😒,negative,True
5,5,"The words ""amazing"" and ""boring"" contradict ea...",neutral,The movie was both amazing and boring 😕,neutral,True
6,6,"The phrase ""not sure"" suggests uncertainty, ma...",neutral,Not sure if I liked it or not,neutral,True
7,7,"The word ""fantastic"" indicates a positive sent...",positive,Absolutely fantastic experience!,positive,True
8,8,"The word ""mediocre"" and the phrase ""wouldn't r...",negative,"Mediocre service, wouldn't recommend 😑",negative,True
9,9,"The phrase ""hard to tell"" suggests uncertainty...",neutral,Hard to tell if it's good or bad 😶,neutral,True


### The Main loop to auto-magically improve the prompt

The main loop runs until the prompt's accuracy reaches the target (or the maximum number of attempts is reached).


In [7]:
# History of the attempts made so far; each entry is recorded through
# previous_attempts_add() with the prompt number, its accuracy and a
# description of the changes.
previous_attempts = []
previous_attempts_add(previous_attempts, i_prompt, accuracy, "First attempt.")

# Stop criteria for the improvement loop
ACCURACY_TARGET = 93  # stop once the accuracy (in percent) reaches this value
MAX_PROMPT_ATTEMPTS = 4  # highest prompt number to try; the initial prompt is 1

# Keep rewriting the prompt while the accuracy is below ACCURACY_TARGET and
# fewer than MAX_PROMPT_ATTEMPTS prompts have been tried
while accuracy < ACCURACY_TARGET and i_prompt < MAX_PROMPT_ATTEMPTS:
    i_prompt += 1

    previous_attempts_str = previous_attempts_to_str(previous_attempts, df_all)
    print(f"Previous attempts:\n{previous_attempts_str}\n\n")

    prompt_previous = prompt_str
    prompt_template_updatep = get_updatep_prompt_template()

    # Ask the prompt writer model for a new prompt, given the previous prompt,
    # the rows generated with it and the history of attempts
    prompt_str, changes_made_str = invoke_update_prompt_with_retry(
        prompt_template_updatep,
        df_generated,
        prompt_previous,
        model_prompt_writer,
        previous_attempts_str,
        i_prompt,
        ROWS_INCORRECT,
        IDEA_SEED,
    )

    # Evaluate the rewritten prompt against the full dataset
    prompt_updated = PromptTemplate.from_template(prompt_str)
    df_generated, accuracy = invoke_test_prompt_against_dataset(
        prompt_updated, df_all, model_evaluate, i_prompt, ROWS_MAX, concurrency=5
    )

    previous_attempts_add(previous_attempts, i_prompt, accuracy, changes_made_str)

# NOTE(review): prompt_str holds the LAST prompt tried, which is not
# necessarily the most accurate one if a later attempt scored lower.
# print(f"\n\nFinal prompt:\n{prompt_str}")
save_tmp_file("10-prompt_final.md", prompt_str)
print(f"\nFinal prompt saved with accuracy {accuracy:.2f}%")

# Print the full history of attempts
print("\n\n\n\n\n\n")
print(previous_attempts_to_str(previous_attempts, df_all))

Previous attempts:
### Attempt 1: 92.00% accuracy (2 wrong out of 25 test rows)
First attempt.




Incorrect answers count: 2
Pick the first 2 examples...
Updating prompt...
Getting chunk 1 retry 0 with 13 rows...Getting chunk 2 retry 0 with 12 rows...

Correct answers: 96.00%

Final prompt saved with accuracy 96.00%







### Attempt 1: 92.00% accuracy (2 wrong out of 25 test rows)
First attempt.

### Attempt 2: 96.00% accuracy (1 wrong out of 25 test rows)
Changes made to the prompt compared to attempt 1:
- Rewritten Example 2 to reflect a sentence with a negative sentiment, despite containing words that do not express strong negative feelings but suggest a lack of excitement or enjoyment.
- Rewritten Example 3 to clarify what constitutes a neutral sentiment, using a word ("average") that might misleadingly be seen as negative but is indicative of neutrality.
- Adjusted the order of sentiment options for consistency with most sentiment analysis models (positive, negative, neutral).
