In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local `scripts.*` helpers) are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [5]:
import pandas as pd
from dotenv import load_dotenv
import pickle
import os
import plotly.express as px
import json
import os
from huggingface_hub import login
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chat_models import ChatHuggingFace, ChatOpenAI

from scripts.optimize_prompt import optimize_prompt, get_answers
from scripts.evaluation import (
    load_math_datasets,
    score_last_match_series,
)

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Show every column, keep wide frames on one line, and never truncate cell text.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("max_colwidth", None)

# SECURITY: never commit an access token in a notebook. The token is read from
# the HF_TOKEN environment variable (which can be supplied through .env).
# Any token that was previously hardcoded here must be treated as leaked and
# revoked in the Hugging Face account settings.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set; export it or add it to your .env file before running this notebook."
    )
login(token=hf_token)

In [17]:
# Load the math benchmark once, then carve it into three disjoint, contiguous slices.
math_dataset = load_math_datasets()

eval_dataset = math_dataset.select(range(30))         # rows 0-29: scored in the evaluation loop
fewshot_dataset = math_dataset.select(range(30, 35))  # rows 30-34: passed to the prompt optimizer
example_dataset = math_dataset.select(range(35, 50))  # rows 35-49: spare examples

# DataFrame views of the two slices that the rest of the notebook manipulates.
eval_df = pd.DataFrame(eval_dataset)
fewshot_df = pd.DataFrame(fewshot_dataset)

# Directory for cached artefacts, and the default results file.
OUTPUT_DIR = "dump"
prompts_file_name = f"{OUTPUT_DIR}/prompts_gsm8k_mixtral.json"

In [None]:
# Experiment toggles, both off by default.
# NOTE(review): neither flag is read anywhere else in the visible notebook — confirm they are still needed.
USE_COT = USE_FEW_SHOT = False

# Setup student agent

In [34]:
# NOTE(review): this import sits mid-notebook; it would normally live in the import cell at the top.
from huggingface_hub import InferenceClient

# Student model: a hosted-inference client for Zephyr-7B-beta with a 120-second request timeout.
llm_client = InferenceClient(
    model="HuggingFaceH4/zephyr-7b-beta",
    timeout=120,
)
# Smoke test: request a short completion (at most 20 new tokens) to check the endpoint responds.
llm_client.text_generation(prompt="How are you today?", max_new_tokens=20)

'\n\nI’m doing well, thank you. I’m excited to be here today to'

# Setup teacher optimization

In [22]:
# Teacher model: a GPT-4 chat model, later passed to `optimize_prompt` as `teacher_agent`.
# Presumably authenticates via an OpenAI key loaded from the environment/.env — confirm.
teacher_agent = ChatOpenAI(model="gpt-4-1106-preview")

In [23]:
# NOTE(review): `fewshot_df` is already built from `fewshot_dataset` in the dataset-loading cell;
# this rebuild produces the same frame and looks redundant.
fewshot_df = pd.DataFrame(fewshot_dataset)

In [24]:
# Baseline prompt template handed to the optimizer and reused in the prompt comparison.
# This is a plain string (not an f-string), so `{question}` stays a literal placeholder —
# presumably filled in per question by the helpers in `scripts.optimize_prompt` (confirm).
initial_prompt = """
Q: What is the answer to the following math problem? Make sure to give your answer as a float, and the LAST NUMBER OF ALL NUMBER YOU GIVE in the format "The answer is 42.36 dollars".
- {question}

A:"""

In [49]:
# Run the prompt-optimisation loop, starting from an empty log list and the baseline prompt.
# Inputs: the few-shot frame, the student client, the GPT-4 teacher, and the scoring function.
# NOTE(review): the exact role of each argument is defined in scripts/optimize_prompt.py — confirm there.
initial_logs = []
logs = optimize_prompt(
    initial_logs,
    initial_prompt,
    fewshot_df,
    llm_client,
    teacher_agent=teacher_agent,
    scoring_function=score_last_match_series,
)

  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# Cache the optimisation logs on disk so the expensive run does not have to be repeated.
file_path = f"{OUTPUT_DIR}/optimizer_zephyr_gpt4-teach2.pkl"

# Create the output directory on first use; without it, open(..., "wb") raises
# FileNotFoundError on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# An existing cache file is deliberately never overwritten. Delete it by hand to
# persist a new run; otherwise the load cell below brings back the older logs.
if not os.path.exists(file_path):
    with open(file_path, "wb") as f:
        pickle.dump(logs, f)

In [None]:
# Reload the cached optimisation logs. The context manager closes the file handle,
# which the bare `open(...)` call used to leak.
# NOTE(security): pickle.load can execute arbitrary code — only load files written by this notebook.
with open(file_path, "rb") as f:
    logs = pickle.load(f)

In [None]:
# Display the answers stored with the first log entry (presumably the run of the initial prompt — confirm).
logs[0]["answers"]

Unnamed: 0,question,true_reasoning,true_answer,prediction,prediction_is_correct
0,"Johnny is a dog walker. He can walk 3 dogs at once. He gets paid $15 for a 30-minute walk and $20 for a 60-minute walk. Johnny works for 4 hours per day. If he always walks the maximum number of dogs possible and 6 dogs have 60-minute walks per day, how much money does he make in a week where he works 5 days?","Johnny spends 6/3=2 hours walking the 6 dogs who have 60-minute walks.\nHe makes $20*6=$<<20*6=120>>120 for these walks.\nThus, he has 4-2=2 hours=120 minutes left to walk the other dogs.\nHe can walk 3*(120/30)=<<3*(120/30)=12>>12 dogs in this time.\nThus, he will make $15*12=$<<12*15=180>>180 on these walks.\nIn total per day, he makes $120+$180=$<<120+180=300>>300 per day.\nThus, during the week, he makes $300*5=$<<300*5=1500>>1500.\n",1500.0,"The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\n- {assistant}\nThe answer is 33510.32 dollars.\n\nJust kidding, that's not a valid math problem. The correct answer would be: The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\nI hope that helps clarify the difference between a valid math problem and a nonsensical one! Let me know if you have any other questions.",False
1,"Malou got 91, 90, and 92 in her three quizzes. What is her average score?","The sum of the three quizzes that Malou got is 91 + 90 + 92 = <<91+90+92=273>>273.\nSo, her average score is 273/3 = <<273/3=91>>91.\n",91.0,"The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\n- {assistant}\nThe answer is 33510.32 dollars.\n\nJust kidding, that's not a valid math problem. The correct answer would be: The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\nI hope that helps clarify the difference between a valid math problem and a nonsensical one! Let me know if you have any other questions.",False
2,"Albert has three times as many marbles as Angela. Angela has 8 more than Allison, who has 28 marbles. How many marbles do Albert and Allison have?",Angela has 8 more marbles than Allison who has 28 so Angela has 8+28 = <<8+28=36>>36 marbles\nAlbert has three times as many marbles as Angela who has 36 so Albert has 3*36 = <<3*36=108>>108 marbles\nAlbert and Allison together have 108+28 = <<108+28=136>>136 marbles\n,136.0,"The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\n- {assistant}\nThe answer is 33510.32 dollars.\n\nJust kidding, that's not a valid math problem. The correct answer would be: The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\nI hope that helps clarify the difference between a valid math problem and a nonsensical one! Let me know if you have any other questions.",False
3,"Benny has bought a new piggy bank and wants to start saving money. In January he adds $19, then adds the same amount in February. By the end of March, he has $46. How many dollars did he add to the piggy bank in March?","In January and February, Benny added 19 + 19 = $<<19+19=38>>38 to the piggy bank.\nCalculating the difference between this and the total he has at the end of March shows that he saved 46 – 38 = $<<46-38=8>>8 in March.\n",8.0,"The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\n- {assistant}\nThe answer is 33510.32 dollars.\n\nJust kidding, that's not a valid math problem. The correct answer would be: The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\nI hope that helps clarify the difference between a valid math problem and a nonsensical one! Let me know if you have any other questions.",False
4,"Karen bakes 50 chocolate chip cookies. She keeps 10 for herself, and she gives 8 to her grandparents. If Karen wants to give everyone in her class cookies, and there are 16 people in her class, how many cookies will each person receive?","After keeping 10 cookies for herself, Karen has 50 - 10 = <<50-10=40>>40 cookies.\nAfter giving cookies to her grandparents, Karen has 40 - 8 = <<40-8=32>>32 cookies.\nKaren can give each person in her class 32 / 16 = <<32/16=2>>2 cookies.\n",2.0,"The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\n- {assistant}\nThe answer is 33510.32 dollars.\n\nJust kidding, that's not a valid math problem. The correct answer would be: The volume of a sphere with a radius of 10 meters is approximately 33510.32 cubic meters.\n\nI hope that helps clarify the difference between a valid math problem and a nonsensical one! Let me know if you have any other questions.",False


In [None]:
# Position of the highest-scoring log entry. max() keeps the first maximum it meets,
# so ties resolve to the earliest prompt.
index_best_prompt = max(range(len(logs)), key=lambda position: logs[position]["score"])
best_prompt = logs[index_best_prompt]["prompt"]

In [None]:
# (prompt, score) pairs for every logged optimisation step, in log order.
[(entry["prompt"], entry["score"]) for entry in logs]

[('\nQ: What is the answer to the following math problem? Make sure to give your answer as a float, and the LAST NUMBER OF ALL NUMBER YOU GIVE in the format "The answer is 42.36 dollars".\n- {question}\n\nA:',
  0.0),
 ('\nQ: Please read the math problem carefully and provide a detailed step-by-step solution. Conclude with the final answer as a floating-point number, formatted like so: "The answer is X.XX dollars". Remember to adjust the units and the numerical answer to directly address the math problem at hand.\n- {question}\n\nA: [Your detailed step-by-step solution here, ending with the answer in the specified format.]\n',
  0.0),
 ('\n\'Q: Given the math problem below, provide a detailed step-by-step solution that leads to the final answer. Conclude your response with the final answer in this format: "The answer is X.XX dollars," where X.XX is the numerical answer you have calculated. Make sure to show all your calculations.\n\n- {question}\n\nA: [Now, write out the detailed step-

# Test all prompts

In [23]:
# Registry of prompt variants to evaluate: name -> {"prompt": template}.
# The evaluation loop later adds "answers" and "score" to each entry.
prompt_dict = {
    variant_name: {"prompt": template}
    for variant_name, template in (
        ("initial_prompt", initial_prompt),
        ("best_prompt", best_prompt),
        # Chain-of-thought variant: the baseline prompt plus a reasoning trigger.
        ("CoT", initial_prompt + " Let's think step-by-step. "),
    )
}

In [24]:
# Few-shot prompt: task instructions followed by three worked examples and the question slot.
fewshot_prompt = """
Please answer the following math problem. Make sure to give your answer as a float, and the LAST NUMBER OF ALL NUMBER YOU GIVE in the format "The answer is 42.36 dollars".
Here are a few examples to help you.
"""
# Take the demonstrations from `example_dataset` (rows 35-49), which is disjoint from
# the evaluation slice. Drawing them from `eval_dataset` put the first three evaluation
# questions, with their answers, into the prompt and inflated the few-shot scores.
for example in example_dataset.select(range(3)):
    fewshot_prompt += f"""
Q: {example['question']}
A: {example['true_reasoning'] + '. So the answer is ' + str(example['true_answer'])}
"""
# `{question}` stays a literal placeholder here (plain string, not an f-string).
fewshot_prompt += "Now begin!\nQ: {question}\n\nA:"
fewshot_cot_prompt = fewshot_prompt + "Let's think step-by-step."

prompt_dict["fewshot"] = {"prompt": fewshot_prompt}
prompt_dict["fewshot_cot"] = {"prompt": fewshot_cot_prompt}

In [26]:
# Evaluate every prompt variant that has no score yet on the evaluation questions.
for prompt_name, values in prompt_dict.items():
    # Already-scored entries are skipped so re-running the cell does not re-query the model.
    if "score" not in values:
        prompt = values["prompt"]
        print(f"========== Prompt: {prompt_name} ==========")
        print(f"Prompt content: {prompt}")
        # Query the student model. The original code passed `agent`, a name that is never
        # defined in this notebook (NameError on a fresh kernel); `llm_client` is the
        # student client created above and the one given to `optimize_prompt`.
        answers = get_answers(prompt, llm_client, eval_df["question"])
        # Overwritten on every iteration: the column only holds the latest prompt's results.
        eval_df["is_correct"] = score_last_match_series(answers, eval_df["true_answer"])
        score = eval_df["is_correct"].mean()
        prompt_dict[prompt_name]["answers"] = answers.to_dict()
        prompt_dict[prompt_name]["score"] = score
        print(score)

# Reference result from a separate experiment, recorded by hand (not computed here).
prompt_dict["langchain_agent"] = {
    "prompt": "Cf source file",
    "answers": "cf other experiment",
    "score": 0.73,
}

Prompt content: 
Q: What is the answer to the following math problem? Make sure to give your answer as the LAST NUMBER OF ALL NUMBER YOU GIVE in the format "The answer is 42.36 dollars". If you do not give the number as a single float, it will be counted as false.
- {question}

A:
OIDOIOINCOZ
{'max_new_tokens': 1000, 'top_k': None, 'top_p': 0.95, 'typical_p': 0.95, 'temperature': 0.8, 'repetition_penalty': None, 'return_full_text': False, 'truncate': None, 'stop_sequences': [], 'seed': None, 'do_sample': False, 'watermark': False} None
OIDOIOINCOZ
{'max_new_tokens': 1000, 'top_k': None, 'top_p': 0.95, 'typical_p': 0.95, 'temperature': 0.8, 'repetition_penalty': None, 'return_full_text': False, 'truncate': None, 'stop_sequences': [], 'seed': None, 'do_sample': False, 'watermark': False} None
OIDOIOINCOZ
{'max_new_tokens': 1000, 'top_k': None, 'top_p': 0.95, 'typical_p': 0.95, 'temperature': 0.8, 'repetition_penalty': None, 'return_full_text': False, 'truncate': None, 'stop_sequences': [

In [30]:
# Persist the evaluated prompts. The path is built from OUTPUT_DIR (same value as the
# previously hardcoded "dump/") so it stays consistent with the other artefacts.
prompts_file_name = f"{OUTPUT_DIR}/prompts_gsm8k_zephyr_gpt4.json"

# Create the output directory on first use so the write cannot fail on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# An existing results file is deliberately kept; delete it by hand to save a new run.
if not os.path.exists(prompts_file_name):
    with open(prompts_file_name, "w") as f:
        json.dump(prompt_dict, f)

### Display results

In [28]:
file_name_mistral = "dump/prompts_gsm8k_zephyr_gpt4.json"
file_name_mixtral = "dump/prompts_gsm8k_mixtral.json"

# Entries left out of the comparison chart.
_EXCLUDED_PROMPTS = ("langchain_agent", "fewshot")


def _load_prompt_results(path):
    """Read a saved prompt-results JSON file and drop the excluded entries.

    The file is opened in a context manager so the handle is closed
    (the earlier `json.load(open(...))` form leaked it).

    Args:
        path: Path of a JSON file mapping prompt name -> result dict.

    Returns:
        The loaded dict without the keys listed in `_EXCLUDED_PROMPTS`.
    """
    with open(path, "r") as f:
        results = json.load(f)
    for excluded in _EXCLUDED_PROMPTS:
        results.pop(excluded, None)
    return results


def _results_to_frame(results, model):
    """Build one row per prompt, tagged with its prompt name and the model label.

    Args:
        results: Dict mapping prompt name -> result dict (its keys become columns).
        model: Label stored in the "model" column of every row.

    Returns:
        A DataFrame with "prompt_name", "model" and the result-dict keys as columns.
    """
    return pd.DataFrame(
        [
            {"prompt_name": key, "model": model, **value}
            for key, value in results.items()
        ]
    )


prompt_dict_mistral = _load_prompt_results(file_name_mistral)
prompt_dict_mixtral = _load_prompt_results(file_name_mixtral)

results_df_mistral = _results_to_frame(prompt_dict_mistral, "mistral-7b")
results_df_mixtral = _results_to_frame(prompt_dict_mixtral, "mixtral-8x7b")

In [29]:
# NOTE(review): the next cell reassigns `results_df` to the concatenation of both models,
# so this Mixtral-only assignment only matters if that cell is skipped.
results_df = results_df_mixtral

In [30]:
# Stack both models' results into one frame; the original row indices are kept (not reset).
results_df = pd.concat([results_df_mistral, results_df_mixtral])

In [31]:
# Mean score per (prompt, model), expressed as a percentage and ordered from lowest to highest.
aggregate = (
    results_df.groupby(["prompt_name", "model"])[["score"]]
    .mean()
    .reset_index()
    .assign(score=lambda frame: frame["score"] * 100)
    .sort_values("score")
)

In [32]:
# Human-readable labels for the chart's x axis.
prompt_display_names = {
    "initial_prompt": "Initial prompt",
    "best_prompt": "Optimized prompt",
    "CoT": "CoT",
    "fewshot": "Fewshot",
    "fewshot_cot": "Fewshot+CoT",
}
# Names missing from the mapping are kept unchanged. This makes the cell safe to re-run:
# a plain `.map(dict)` turned the already-renamed labels into NaN on a second execution.
aggregate["prompt_name"] = aggregate["prompt_name"].map(
    lambda name: prompt_display_names.get(name, name)
)

In [33]:
# Grouped bar chart: one cluster per prompt variant, one bar per model.
axis_labels = {
    "prompt_name": "<b>Prompt choice</b>",
    "score": "<b>Score</b>",
    "fewshot": "Few-shot",
}
# 100 px per prompt variant plus a fixed 200 px allowance.
chart_width = aggregate["prompt_name"].nunique() * 100 + 200

fig = px.bar(aggregate, x="prompt_name", y="score", color="model", labels=axis_labels)
fig.update_layout(
    width=chart_width,
    height=600,
    barmode="group",
    bargap=0.35,
    bargroupgap=0.0,
    # Fixed 0-80 y range; tick labels carry a percent sign.
    yaxis_range=[0, 80],
    yaxis_ticksuffix="%",
)
# Print the rounded score above each bar.
fig.update_traces(texttemplate="%{y:.0f}", textposition="outside")
fig.show()





### Insights from the experiment
- Prompt optimization with GPT-4 does not seem to work well for big models like Mixtral 🚫
- Prompting techniques are most important for less powerful models like Mistral-7B