In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from dotenv import load_dotenv
import pickle
import os

load_dotenv()
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("max_colwidth", None)

from langchain_community.llms import HuggingFaceEndpoint
from langchain.chat_models import ChatHuggingFace

from scripts.run_agents import answer_questions
from scripts.optimize_prompt import optimize_prompt
from scripts.evaluation import extract_number, load_math_datasets

In [None]:
from scripts.evaluation import extract_number
import numpy as np


def score_exact_match(extracted_prediction, true_answers):
    return np.isclose(extracted_prediction, true_answers, rtol=0.1)

In [103]:
eval_dataset, fewshot_dataset = load_math_datasets()
eval_df, fewshot_df = pd.DataFrame(eval_dataset), pd.DataFrame(fewshot_dataset)

OUTPUT_DIR = "dump"
prompts_file_name = f"{OUTPUT_DIR}/prompts_gsm8k_mixtral.json"

# Setup general agent

In [92]:
endpoint_url = "https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud"
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    max_new_tokens=1000,
)
agent = ChatHuggingFace(llm=llm)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful
Type:
<class 'langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint'>


In [93]:
llm("ok")

OKOKOK model::: None


', i\'m using the stock 3.5" lcd on a Sega Saturn and have the whole thing connected to a SNES through a GBS-8200 board, so that i can play Saturn games on the SNES.\n\ni\'ve been trying to find a way to make the Saturn\'s menu (the one you access by pressing the "C" button on the controller) appear on the SNES.\n\nas you may know, the GBS-8200 board is able to detect when a console sends a VGA signal, and it outputs it on a VGA port.\n\nso, i was wondering if there\'s a way to make the Saturn send the VGA signal even when the menu is on.\n\ni\'ve tried to find a way to do it by changing some settings on the Saturn, but so far, i\'ve had no success.\n\ndoes anybody know if it\'s even possible to do it, and if so, how?\n\nthanks in advance!'

# Setup teacher optimization

In [94]:
from langchain.chat_models import ChatOpenAI

teacher_agent = ChatOpenAI(model="gpt-4-1106-preview")

# llm = HuggingFaceEndpoint(
#     endpoint_url=endpoint_url,
#     max_new_tokens=1000,
# )
# teacher_agent = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1")

In [95]:
fewshot_df = pd.DataFrame(fewshot_dataset)

In [96]:
initial_prompt = """
Q: What is the answer to the following math problem? Make sure to give your answer as the final number in the format "The answer is 42".
- {question}

A:"""

In [None]:
from scripts.optimize_prompt import optimize_prompt

logs = []

logs = optimize_prompt(
    logs,
    initial_prompt,
    fewshot_df,
    student_agent=agent,
    teacher_agent=teacher_agent,
    scoring_function=score_exact_match,
)

In [99]:
def save_logs(logs, filename):
    with open(filename, "wb") as f:
        pickle.dump(logs, f)


file_path = f"{OUTPUT_DIR}/optimizer_mixtral_gpt4-teach.pkl"
if not os.path.exists(file_path):
    save_logs(logs, file_path)

In [None]:
import pickle

logs = pickle.load(open(file_path, "rb"))

In [100]:
index_best_prompt = max(enumerate(logs), key=(lambda x: x[1]["score"]))[0]
best_prompt = logs[index_best_prompt]["prompt"]

In [101]:
[(el["prompt"], el["score"]) for el in logs]

[('\nQ: What is the answer to the following math problem? Make sure to give your answer as the final number in the format "The answer is 42".\n- {question}\n\nA:',
  0.7),
 ('\n\nQ: Consider the following math problem and solve it step by step. Begin by identifying the given quantities and the relationships between them. Then, use logical reasoning and arithmetic operations to find the final answer. Ensure that you provide a clear and complete explanation of your reasoning process, and conclude with the final answer in the format "The answer is {final_number}". Here is the problem:\n- {question}\n\n',
  0.9),
 ('\nQ: To solve the following math problem, start by enumerating all the given information and the relationships between the different elements involved. Make sure to derive any implied quantities by using logical deductions based on the given data. Carefully check each step to ensure that all information has been considered and used appropriately. Provide a detailed explanation 

# Test all prompts

In [102]:
prompt_dict = {
    "initial_prompt": {"prompt": initial_prompt},
    "best_prompt": {"prompt": best_prompt},
    "CoT": {"prompt": "Q: {question}\n\nA: Let's think step-by-step."},
}

In [None]:
fewshot_prompt = """
Please answer the following math problem. Make sure to give your answer ending with your answer in the format "The answer is 42".
Here are a few examples to help you.
"""
for example in eval_dataset.select(range(3)):
    fewshot_prompt += f"""
Q: {example['question']}
A: {example['true_reasoning'] + '. So the answer is ' + str(example['true_answer'])}
"""
fewshot_prompt += "Now begin!\nQ: {question}\n\nA:"
fewshot_cot_prompt = fewshot_prompt + "Let's think step-by-step."

prompt_dict["fewshot"] = {"prompt": fewshot_prompt}
prompt_dict["fewshot_cot"] = {"prompt": fewshot_cot_prompt}

In [117]:
from scripts.optimize_prompt import get_answers

for prompt_name, values in prompt_dict.items():
    if "score" not in values:
        prompt = values["prompt"]
        print(f"========== Prompt: {prompt_name} ==========")
        print(f"Prompt content: {prompt}")
        answers = get_answers(prompt, agent, eval_df["question"])
        extracted_answers = answers.apply(extract_number)
        answer_match = score_exact_match(extracted_answers, eval_df["true_answer"])
        prompt_dict[prompt_name]["answers"] = answers.to_dict()
        prompt_dict[prompt_name]["score"] = answer_match.mean()
        print(answer_match.mean())

prompt_dict["langchain_agent"] = {
    "prompt": "Cf source file",
    "answers": "cf other experiment",
    "score": 0.73,
}

In [None]:
import json
import os

if not os.path.exists(prompts_file_name):
    with open(prompts_file_name, "w") as f:
        json.dump(prompt_dict, f)

### Display results

In [123]:
file_name_mistral = "dump/prompts_gsm8k_4_mistral7b.json"
file_name_mixtral = "dump/prompts_gsm8k_mixtral.json"

prompt_dict_mistral = json.load(open(file_name_mistral, "r"))
prompt_dict_mixtral = json.load(open(file_name_mixtral, "r"))

results_df_mistral = pd.DataFrame(
    [
        {**{"prompt_name": key, "model": "mistral-7b"}, **value}
        for key, value in prompt_dict_mistral.items()
    ]
)
results_df_mixtral = pd.DataFrame(
    [
        {**{"prompt_name": key, "model": "mixtral-8x7b"}, **value}
        for key, value in prompt_dict_mixtral.items()
    ]
)

In [124]:
results_df = pd.concat([results_df_mistral, results_df_mixtral])

In [128]:
results_df.groupby(["model", "prompt_name"])["score"].first()

model         prompt_name    
mistral-7b    CoT                0.200000
              best_prompt        0.266667
              fewshot            0.200000
              fewshot_cot        0.400000
              initial_prompt     0.066667
              langchain_agent    0.200000
mixtral-8x7b  CoT                0.700000
              best_prompt        0.666667
              fewshot            0.666667
              fewshot_cot        0.700000
              initial_prompt     0.733333
              langchain_agent    0.730000
Name: score, dtype: float64

### Insights from the experiment
- Prompt optimization with GPT4 does not seem to work well for now 🚫
- Prompting techniques are most important for less powerful models.