# AutoPrompt - Automatically Write Evaluation Prompts


## The Idea


It's hard to know what the right prompt is, and it's hard to know if you've found it. This project aims to automate the process of finding the perfect evaluation prompt.


## Let's build it!


In [1]:
import os

from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache

from utils import save_tmp_file, load_model
from data_handling import load_and_clean_dataset, get_df_incorrect_answers
from evaluate_against_dataset import EvaluateAgainstDataset
from generate_prompt_initial import GeneratePromptInitial
from generate_prompt_update import GeneratePromptUpdate
from generate_expert_plans import GenerateExpertPlans
from previous_attempts import PreviousAttempts, Attempt
from eval_aware_dataset import EvalAwareDataset

### The configs


In [2]:
# Notebook-wide configuration. Every constant below is read by later cells.

# Path of the dataset that is loaded and cleaned with `load_and_clean_dataset`.
# The commented-out lines are alternative datasets; enable exactly one.
DATASET_FILE = "./datasets/sentiment_analysis_examples_25.csv"
# DATASET_FILE = "./datasets/dataset-writing-style-v-not-v.xlsx"
# DATASET_FILE = "./datasets/writing-style.xlsx"
# DATASET_FILE = "./datasets/writing-style-30-100-words.xlsx"

# Seed Idea for prompt generation
# NOTE(review): presumably this should describe the task of the dataset chosen
# above (sentiment vs. writing style) - confirm when switching datasets.
IDEA_SEED = """Decide the sentiment of the input text."""
# IDEA_SEED = """Compare the writing style of the two pieces of text. Your OUTPUT MUST ONLY take the writing style into consideration, NOT the meaning or thematic similarity of the texts.""".strip()


# Initial prompt. If `None`, the initial prompt will be generated automatically
# NOTE(review): both assignments are commented out, so PROMPT_TO_EVAL_FILE is
# currently undefined; re-enable one of them before re-enabling the
# (also commented-out) cell that loads the prompt from this file.
# PROMPT_TO_EVAL_FILE = None
# PROMPT_TO_EVAL_FILE = "_scored_100/writing-style-01-gpt-turbo-3.5-temp-0.3.md"

# Maximum number of rows to use from the dataset for initial prompt generation
# (passed to `dataset_tracker.get_sample` when the first sample is drawn).
ROWS_INITIAL = 8
# Maximum number of rows in each chunk
ROWS_MAX = 13
# Number of rows to use as `incorrect` examples
ROWS_INCORRECT = 5


# Use Few or Zero Shot?
IS_FEW_SHOT = True
# NOTE(review): presumably the number of evaluation requests run in parallel -
# confirm against EvaluateAgainstDataset.
EVAL_CONCURRENCY = 10


# Stopping criteria (inclusive)
# NOTE(review): GOAL_ACCURACY looks like a percentage (0-100) - confirm.
GOAL_ACCURACY = 98
MAX_ATTEMPTS_PER_PLAN = 1


# Model configurations
# Each (name, temperature, max tokens) triple is passed positionally to
# `load_model`. The LLM cache is only enabled when BOTH model names start
# with `gpt-`.

# Model that writes / updates the prompts.
# MODEL_PROMPT_WRITER_NAME = "gpt-4-1106-preview"
MODEL_PROMPT_WRITER_NAME = "gpt-3.5-turbo"
# MODEL_PROMPT_WRITER_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_PROMPT_WRITER_TEMPERATURE = 0.6
MODEL_PROMPT_WRITER_MAX_TOKENS = 2000

# Model that evaluates a prompt against the dataset.
MODEL_EVALUATE_NAME = "gpt-3.5-turbo"
# MODEL_EVALUATE_NAME = "gpt-4-1106-preview"
# MODEL_EVALUATE_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_EVALUATE_NAME = "togethercomputer/llama-2-70b-chat"
MODEL_EVALUATE_TEMPERATURE = 0.01
MODEL_EVALUATE_MAX_TOKENS = 1400

### Load Models


In [3]:
# Configure the two LangChain models used by this notebook.

# The SQLite-backed LLM cache is switched on only when the prompt writer and
# the evaluator are both `gpt-` models.
if all(
    model_name.startswith("gpt-")
    for model_name in (MODEL_PROMPT_WRITER_NAME, MODEL_EVALUATE_NAME)
):
    print("Enabling LLM cache...")
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))


# Model that writes and refines the prompts.
model_prompt_writer = load_model(
    MODEL_PROMPT_WRITER_NAME,
    MODEL_PROMPT_WRITER_TEMPERATURE,
    MODEL_PROMPT_WRITER_MAX_TOKENS,
)

# Model that runs a prompt against the dataset.
model_evaluate = load_model(
    MODEL_EVALUATE_NAME,
    MODEL_EVALUATE_TEMPERATURE,
    MODEL_EVALUATE_MAX_TOKENS,
)

Enabling LLM cache...
Loading ChatOpenAI model: gpt-3.5-turbo
Loading ChatOpenAI model: gpt-3.5-turbo


### Load the dataset


In [4]:
# Start from an empty ./_tmp directory.
# Create it when it is missing (on a fresh checkout os.listdir would raise
# FileNotFoundError) and skip real sub-directories, which os.remove cannot
# delete. Files and symlinks are removed as before.
os.makedirs("_tmp", exist_ok=True)
for filename in os.listdir("_tmp"):
    tmp_path = os.path.join("_tmp", filename)
    if os.path.isdir(tmp_path) and not os.path.islink(tmp_path):
        continue
    os.remove(tmp_path)

# Load the dataset
df_all = load_and_clean_dataset(DATASET_FILE)

# Wrap the full dataset in an EvalAwareDataset, which hands out samples
dataset_tracker = EvalAwareDataset(df_all)

# Get initial sample of the dataset (at most ROWS_INITIAL rows) and display it
df_sample = dataset_tracker.get_sample(ROWS_INITIAL)
df_sample

Unnamed: 0,ROW_NO,INPUT: Sentence,OUTPUT: Sentiment
0,1,I love this new phone,positive
1,2,This is just okay. Nothing special. 😐,neutral
2,3,"Unfortunately, it broke the first day I used it",negative
3,4,I guess it could've been worse 😅,neutral
4,5,Waiting forever for a response... 😒,negative
5,6,The movie was both amazing and boring 😕,neutral
6,7,Not sure if I liked it or not,neutral
7,8,Absolutely fantastic experience!,positive


### Generate the Initial Expert Ideas


In [6]:
# Draw another sample of at most ROWS_INITIAL rows and display it; the result
# is not assigned to a variable.
# NOTE(review): the rows come back in a different order than in the first
# sample - presumably get_sample shuffles or re-prioritises rows; confirm in
# EvalAwareDataset.
dataset_tracker.get_sample(ROWS_INITIAL)

Unnamed: 0,ROW_NO,INPUT: Sentence,OUTPUT: Sentiment
3,4,I guess it could've been worse 😅,neutral
5,6,The movie was both amazing and boring 😕,neutral
0,1,I love this new phone,positive
1,2,This is just okay. Nothing special. 😐,neutral
2,3,"Unfortunately, it broke the first day I used it",negative
4,5,Waiting forever for a response... 😒,negative
6,7,Not sure if I liked it or not,neutral
7,8,Absolutely fantastic experience!,positive


In [7]:
# Display the full cleaned dataset (as returned by `load_and_clean_dataset`)
# for inspection.
df_all

Unnamed: 0,ROW_NO,INPUT: Sentence,OUTPUT: Sentiment
0,1,I love this new phone,positive
1,2,This is just okay. Nothing special. 😐,neutral
2,3,"Unfortunately, it broke the first day I used it",negative
3,4,I guess it could've been worse 😅,neutral
4,5,Waiting forever for a response... 😒,negative
5,6,The movie was both amazing and boring 😕,neutral
6,7,Not sure if I liked it or not,neutral
7,8,Absolutely fantastic experience!,positive
8,9,"Mediocre service, wouldn't recommend 😑",negative
9,10,Hard to tell if it's good or bad 😶,neutral


In [8]:
# # If PROMPT_TO_EVAL_FILE is not None, load the prompt from the file
# prompt_str = ""
# if PROMPT_TO_EVAL_FILE is not None:
#     print(f"Loading prompt from {PROMPT_TO_EVAL_FILE}")
#     with open(PROMPT_TO_EVAL_FILE, "r") as f:
#         prompt_str = f.read()