# AutoPrompt - Automatically Write Evaluation Prompts


## The Idea


It's hard to know what the right prompt is, and it's hard to know if you've found it. This project aims to automate the process of finding the perfect evaluation prompt.


## Let's build it!


In [1]:
import os

from langchain.prompts import load_prompt
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain_core.prompts import PromptTemplate

from utils import save_tmp_file, load_model
from data_handling import load_and_clean_dataset
from eval import invoke_test_prompt_against_dataset
from prompt_initial import invoke_generate_prompt_initial
from prompt_update import (
    invoke_update_prompt_with_retry,
    previous_attempts_add,
    previous_attempts_to_str,
)

from prompts.writep_few_shot.prompt import (
    get_prompt_template as get_few_shot_prompt_template,
)
from prompts.writep_zero_shot.prompt import (
    get_prompt_template as get_zero_shot_prompt_template,
)

### The configs


In [None]:
# Dataset handed to `load_and_clean_dataset`. Uncomment one of the
# alternatives below to switch datasets.
DATASET_FILE = "./datasets/sentiment_analysis_examples_25.csv"
# DATASET_FILE = "./datasets/dataset-writing-style-v-not-v.xlsx"
# DATASET_FILE = "./datasets/writing-style.xlsx"

# Seed idea describing the task. It is passed both to the initial prompt
# generation and to every prompt-update call in the main loop.
IDEA_SEED = """Decide the sentiment of the input text."""
# IDEA_SEED = """Compare the writing style of the two pieces of text. Your OUTPUT MUST ONLY take the writing style into consideration, NOT the meaning or thematic similarity of the texts.""".strip()


# Path of a file holding the initial prompt. If `None`, the initial prompt is
# generated automatically by the prompt-writer model instead of being read
# from disk.
PROMPT_TO_EVAL_FILE = None
# PROMPT_TO_EVAL_FILE = "./prompt-v-or-not-v-01.md"
# PROMPT_TO_EVAL_FILE = "./_scored_100/sentiment-05-zero-shot.md"

# Maximum number of rows to use from the dataset for initial prompt generation
# (the first ROWS_INITIAL rows of the dataset are taken as the sample)
ROWS_INITIAL = 4
# Maximum number of rows in each chunk
# (passed to `invoke_test_prompt_against_dataset`)
ROWS_MAX = 10
# Number of rows to use as `incorrect` examples
# (passed to `invoke_update_prompt_with_retry`)
ROWS_INCORRECT = 5


# Prompt file paths
# Template loaded with `load_prompt` on every iteration of the main loop
PROMPT_UPDATE_FILE = "./prompts/PROMPT_UPDATEP.json"
# True selects the few-shot template for initial prompt generation,
# False the zero-shot template
IS_FEW_SHOT = False

# Model configurations
# Each (name, temperature, max tokens) triple is passed to `load_model`.
# NOTE: the LLM cache is only enabled when BOTH model names start with `gpt-`.

# Model that writes the initial prompt and its updates
MODEL_PROMPT_WRITER_NAME = "gpt-4-1106-preview"
# MODEL_PROMPT_WRITER_NAME = "gpt-3.5-turbo"
# MODEL_PROMPT_WRITER_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_PROMPT_WRITER_TEMPERATURE = 0.7
MODEL_PROMPT_WRITER_MAX_TOKENS = 2000

# Model used when testing a prompt against the dataset
MODEL_EVALUATE_NAME = "gpt-3.5-turbo"
# MODEL_EVALUATE_NAME = "gpt-4-1106-preview"
# MODEL_EVALUATE_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_EVALUATE_NAME = "togethercomputer/llama-2-70b-chat"
MODEL_EVALUATE_TEMPERATURE = 0.3
MODEL_EVALUATE_MAX_TOKENS = 800

### Load Models


In [None]:
# Build the LangChain models used by the rest of the notebook.

# The SQLite-backed LLM cache is switched on only when the prompt writer and
# the evaluator are both `gpt-` models.
both_models_are_gpt = all(
    model_name.startswith("gpt-")
    for model_name in (MODEL_PROMPT_WRITER_NAME, MODEL_EVALUATE_NAME)
)
if both_models_are_gpt:
    print("Enabling LLM cache...")
    llm_cache = SQLiteCache(database_path=".langchain.db")
    set_llm_cache(llm_cache)


# Model that writes the initial prompt and its later revisions
prompt_writer_settings = (
    MODEL_PROMPT_WRITER_NAME,
    MODEL_PROMPT_WRITER_TEMPERATURE,
    MODEL_PROMPT_WRITER_MAX_TOKENS,
)
model_prompt_writer = load_model(*prompt_writer_settings)

# Model that each candidate prompt is tested with
evaluate_settings = (
    MODEL_EVALUATE_NAME,
    MODEL_EVALUATE_TEMPERATURE,
    MODEL_EVALUATE_MAX_TOKENS,
)
model_evaluate = load_model(*evaluate_settings)

### Load the dataset


In [None]:
# Start from an empty ./_tmp directory.
# The directory is created when missing (a fresh checkout would otherwise fail
# in os.listdir), and only files/symlinks are deleted because os.remove raises
# on sub-directories.
tmp_dir = "_tmp"
os.makedirs(tmp_dir, exist_ok=True)
for filename in os.listdir(tmp_dir):
    tmp_path = os.path.join(tmp_dir, filename)
    if os.path.isfile(tmp_path) or os.path.islink(tmp_path):
        os.remove(tmp_path)

# Load the dataset
df_all = load_and_clean_dataset(DATASET_FILE)

# Use at most the first ROWS_INITIAL rows as the sample for initial prompt
# generation; a dataset that is already small enough is used as-is.
df_sample = df_all
if len(df_all) > ROWS_INITIAL:
    df_sample = df_all.head(ROWS_INITIAL)

# Notebook display of the sample
df_sample

### Generate the Initial Prompt


In [None]:
# Obtain the initial prompt text: read it from PROMPT_TO_EVAL_FILE when one is
# configured, otherwise have the prompt-writer model generate it.
if PROMPT_TO_EVAL_FILE is not None:
    print(f"Loading prompt from {PROMPT_TO_EVAL_FILE}")
    # Explicit UTF-8 so the prompt decodes identically on every platform
    # instead of depending on the locale's default encoding.
    with open(PROMPT_TO_EVAL_FILE, "r", encoding="utf-8") as f:
        prompt_str = f.read()
else:
    # Generate the initial prompt from the few-shot or zero-shot template,
    # the dataset sample and the seed idea.
    prompt_init_template = (
        get_few_shot_prompt_template()
        if IS_FEW_SHOT
        else get_zero_shot_prompt_template()
    )
    prompt_str = invoke_generate_prompt_initial(
        model_prompt_writer, prompt_init_template, df_sample, IDEA_SEED
    )

# Wrap the raw text in a PromptTemplate for evaluation
prompt = PromptTemplate.from_template(prompt_str)
# print(prompt_str)

### Test the Initial Prompt against the dataset


In [None]:
# Prompt counter shared with the main loop; the initial prompt is number 1
i_prompt = 1

# Run the initial prompt against `df_all` with the evaluation model
initial_evaluation = invoke_test_prompt_against_dataset(
    prompt,
    df_all,
    model_evaluate,
    i_prompt,
    ROWS_MAX,
    concurrency=5,
)
df_generated, accuracy = initial_evaluation

# Notebook display of the evaluation output
df_generated

### The Main loop to auto-magically improve the prompt

The main loop runs until the prompt reaches the target accuracy (95%) or the maximum number of prompt attempts (10, counting the initial prompt) is reached.


In [None]:
# Stopping criteria for the improvement loop
ACCURACY_TARGET = 95  # percent
MAX_PROMPT_ATTEMPTS = 10  # total prompts evaluated, including the initial one

# History of every attempt (prompt number, accuracy, description of changes);
# it is rendered to text and handed to the prompt writer on each iteration.
previous_attempts = []
previous_attempts_add(previous_attempts, i_prompt, accuracy, "First attempt.")


# Keep rewriting the prompt until accuracy reaches ACCURACY_TARGET or
# MAX_PROMPT_ATTEMPTS prompts have been evaluated
while accuracy < ACCURACY_TARGET and i_prompt < MAX_PROMPT_ATTEMPTS:
    i_prompt += 1

    previous_attempts_str = previous_attempts_to_str(previous_attempts, df_all)
    print(f"Previous attempts:\n{previous_attempts_str}\n\n")

    prompt_previous = prompt_str
    prompt_template_updatep = load_prompt(PROMPT_UPDATE_FILE)

    # Ask the prompt writer for a revised prompt, based on the previous prompt,
    # the latest evaluation output and the attempt history
    prompt_str, changes_made_str = invoke_update_prompt_with_retry(
        prompt_template_updatep,
        df_generated,
        prompt_previous,
        model_prompt_writer,
        previous_attempts_str,
        i_prompt,
        ROWS_INCORRECT,
        IDEA_SEED,
    )

    # Evaluate the revised prompt against the dataset
    prompt_updated = PromptTemplate.from_template(prompt_str)
    df_generated, accuracy = invoke_test_prompt_against_dataset(
        prompt_updated, df_all, model_evaluate, i_prompt, ROWS_MAX, concurrency=5
    )

    previous_attempts_add(previous_attempts, i_prompt, accuracy, changes_made_str)

# NOTE(review): `prompt_str` holds the most recent prompt, so that is what gets
# saved here, even if an earlier attempt scored higher.
save_tmp_file("10-prompt_final.md", prompt_str)
print(f"\nFinal prompt saved with accuracy {accuracy:.2f}%")

print("\n\n\n\n\n\n")
print(previous_attempts_to_str(previous_attempts, df_all))