In [2]:
import os
import random
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pprint import pprint

import jsonlines
import pandas as pd
from tqdm.notebook import tqdm

initial data reading

In [2]:
# Load the evaluation set and keep a random subset of (at most) 200 rows.
# The shuffle is seeded so that Restart & Run All selects the same questions
# every time; without a seed the subset changes on each run.
df = pd.read_parquet("test.parquet")
shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)[:200]

In [3]:
# Reduce every sampled row to the two fields used downstream: the question
# text and its reference answer.
all_qa = [
    {"question": row["question"], "answer": row["answer"]}
    for _, row in shuffled.iterrows()
]
# Show the first entry as a quick visual check of the record layout.
pprint(all_qa[:1])

model specification and helper functions

In [6]:
from models import select_model

# System prompt handed to select_model() for every student model
# (see generate_answer).
system_prompt = """
You always give perfect reasoning and correct answers.
"""

# User-turn template for the student model; `{q}` is filled with the question
# text through str.format().
user_prompt = """
Please solve the following problem step by step and provide a detailed reasoning for your answer.
{q}
"""

# System prompt for the judging model.
judger_system = """
You can perfectly determine if a student answer a question correctly or not.
"""

# One judger instance is created here and shared by generate_answer() and
# judge(); both call clear_conversations() on it before every request.
# NOTE(review): select_model comes from the project's `models` module, so the
# type and thread-safety of the returned object are not visible here.
judger_model = select_model("meta-llama/Meta-Llama-3-70B-Instruct", judger_system)

# User-turn template for the judger. It is rendered with str.format(), so the
# doubled braces `{{` / `}}` produce literal `{` / `}` in the final prompt,
# while `{q}`, `{ga}` and `{sa}` are replaced by the question, the ground-truth
# answer and the student's answer. The JSON key named in the template is the
# same key the callers read from the judger's reply.
judger_user = """
Determine if the student correctly answer the given Question given the ground truth answer.
IMPORTANT: Only if the student's final answer is consistent with the ground truth answer, you should mark it as correct.

Submit your judgement using the following JSON format:
{{
    "Is the student's answer correct?": true or false
}}

# Question

{q}

# Ground Truth Answer

{ga}

# Student's Answer

{sa}
"""

# `judger_model` is a single shared object whose conversation is reset before
# each request. generate_answer() is run from a thread pool, so the
# "clear, then ask" pair must be atomic; otherwise two threads can interleave
# and one verdict may be produced from a conversation containing another
# thread's prompt. The lock serialises only the judging step; student-model
# generation still runs in parallel.
_judger_lock = threading.Lock()


def judge(question: str, true_answer: str, student_answer: str):
    """Ask the shared judger whether a student answer matches the ground truth.

    Args:
        question: The question text shown to the judger.
        true_answer: The reference answer.
        student_answer: The answer produced by the model under evaluation.

    Returns:
        The value stored under "Is the student's answer correct?" in the
        judger's JSON reply.

    Raises:
        KeyError: If the judger's reply does not contain the expected key.
        Any exception raised by the judger model itself is propagated.
    """
    # Format outside the lock so the critical section is as short as possible.
    prompt = judger_user.format(q=question, ga=true_answer, sa=student_answer)
    with _judger_lock:
        judger_model.clear_conversations()
        verdict = judger_model(prompt, use_json=True)
    return verdict["Is the student's answer correct?"]


def generate_answer(question: str, answer: str, model_name: str):
    """Have one student model answer a question, then grade the answer.

    Args:
        question: The question text.
        answer: The ground-truth answer used for grading.
        model_name: Identifier passed to select_model() for the student model.

    Returns:
        A dict with keys "question", "true_answer", "model_answer" and
        "correct", or None if generation or grading raised an exception
        (best-effort: the error is printed and the caller is expected to skip
        None results).
    """
    try:
        # A fresh student model is created per call, so no state is shared
        # between worker threads on the generation side.
        model = select_model(model_name, system_prompt)
        sa = model(user_prompt.format(q=question), temperature=0.0)
        correct = judge(question, answer, sa)
    except Exception as e:
        # Include the model name: many workers print concurrently, and a bare
        # exception message does not say which run failed.
        print(f"generate_answer failed for {model_name}: {e!r}")
        return None
    return {
        "question": question,
        "true_answer": answer,
        "model_answer": sa,
        "correct": correct,
    }

peek at one sample and test

In [5]:
# Smoke test: run the full generate-and-judge path on the first question with
# a single student model; the returned record is displayed as the cell output.
qa = all_qa[0]
generate_answer(
    question=qa["question"],
    answer=qa["answer"],
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
)

load existing questions

In [4]:
# Rebuild the question pool from one previously generated results file, so
# that later cells work on exactly the questions stored there. Each stored
# record's "true_answer" becomes the "answer" field of the pool entry.
all_qa = []
file = "Meta-Llama-3-8B-Instruct-all.jsonl"
with jsonlines.open(file, "r") as reader:
    all_qa.extend(
        {"question": record["question"], "answer": record["true_answer"]}
        for record in reader
    )
print(len(all_qa))

generate data for all questions

In [7]:
# Student models to evaluate in this run; commented-out entries are skipped.
student_models = [
    # "mistralai/Mistral-7B-Instruct-v0.2",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # "gpt-3.5-turbo",
    # "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Meta-Llama-3-70B-Instruct",
    # "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-2-70b-chat-hf",
]

for student_model in tqdm(student_models):
    # Results live in "<name>-all.jsonl", where <name> is the part after the
    # organisation prefix ("org/name" -> "name"); names without a "/" are used
    # as they are.
    splits = student_model.split("/")
    model_name = splits[1] if len(splits) > 1 else splits[0]
    out_path = f"{model_name}-all.jsonl"

    results = []
    existing_qs = set()
    if os.path.exists(out_path):
        # Resume support: remember which questions already have a stored result.
        with jsonlines.open(out_path, "r") as reader:
            for record in reader:
                existing_qs.add(record["question"])
                results.append(record)
        print(f"Existing questions: {len(existing_qs)}")

    pending = [qa for qa in all_qa if qa["question"] not in existing_qs]

    # Append each new result as soon as it is available instead of rewriting
    # the whole file at the end. If the cell is interrupted, everything
    # finished so far is already on disk and the next run resumes from there.
    # Existing lines in the file are left untouched.
    with jsonlines.open(out_path, "a", flush=True) as writer, \
            ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(generate_answer, qa["question"], qa["answer"], student_model)
            for qa in pending
        ]
        for future in tqdm(as_completed(futures), total=len(futures)):
            r = future.result()
            # generate_answer returns None on failure; such questions stay
            # missing from the file and are retried on the next run.
            if r is not None:
                results.append(r)
                writer.write(r)

reorder and sanity checks

In [8]:
files = [
    "Meta-Llama-3-8B-Instruct-all.jsonl",
    "Meta-Llama-3-70B-Instruct-all.jsonl",
    "Llama-2-13b-chat-hf-all.jsonl",
    "Llama-2-70b-chat-hf-all.jsonl",
    "gpt-3.5-turbo-all.jsonl",
    "Mistral-7B-Instruct-v0.2-all.jsonl",
    "Mixtral-8x7B-Instruct-v0.1-all.jsonl",
]

seed = 42                   # same seed for every file -> same permutation
ALL_SUFFIX = "-all.jsonl"   # suffix shared by every results file
TRAIN_SIZE = 40             # rows [0:40] go to the train split
TEST_END = 100              # rows [40:100] go to the test split


def _split_path(all_path, split):
    """Map "<name>-all.jsonl" to "<name>-<split>.jsonl".

    Only the trailing suffix is rewritten, so an "all" substring elsewhere in
    the file name is left alone. Raises ValueError for a path without the
    expected suffix rather than returning a path that would overwrite the
    source file.
    """
    if not all_path.endswith(ALL_SUFFIX):
        raise ValueError(f"expected a path ending in {ALL_SUFFIX!r}, got {all_path!r}")
    return f"{all_path[:-len(ALL_SUFFIX)]}-{split}.jsonl"


# Resolve every output path before anything is written, so a malformed file
# name aborts the cell before any file has been modified.
train_paths = [_split_path(f, "train") for f in files]
test_paths = [_split_path(f, "test") for f in files]

# Load every results file fully into memory.
jsons = []
for f in files:
    with jsonlines.open(f, "r") as reader:
        jsons.append(list(reader))

# Put each file into a canonical order (sorted by question) and then apply the
# same seeded shuffle to each. A private Random instance yields the same
# permutation as seeding the module-level generator, without changing global
# RNG state for the rest of the notebook.
for j in jsons:
    j.sort(key=lambda x: x["question"])
    random.Random(seed).shuffle(j)

# Check that all files are in the same order; this fails if any file has a
# different set of questions than the first one.
reference = [qa["question"] for qa in jsons[0]]
for f, j in zip(files[1:], jsons[1:]):
    assert [qa["question"] for qa in j] == reference, (
        f"{f} does not list the same questions in the same order as {files[0]}"
    )

# Write the reordered records back to the original files.
for f, j in zip(files, jsons):
    with jsonlines.open(f, "w") as writer:
        writer.write_all(j)

# Split into train and test.
for train_path, test_path, j in zip(train_paths, test_paths, jsons):
    with jsonlines.open(train_path, "w") as writer:
        writer.write_all(j[:TRAIN_SIZE])
    with jsonlines.open(test_path, "w") as writer:
        writer.write_all(j[TRAIN_SIZE:TEST_END])

In [5]:
# redo judge
# Re-grade the stored answers of the listed files with the current judger and
# overwrite each file with the refreshed "correct" field. A file is rewritten
# only after all of its records have been judged.
files = [
    # "Meta-Llama-3-8B-Instruct-all.jsonl",
    # "Meta-Llama-3-70B-Instruct-all.jsonl",
    "Llama-2-13b-chat-hf-all.jsonl",
    "Llama-2-70b-chat-hf-all.jsonl",
]

for f in tqdm(files):
    with jsonlines.open(f, "r") as reader:
        results = []
        for qa in tqdm(reader):
            verdict = judge(
                question=qa["question"],
                true_answer=qa["true_answer"],
                student_answer=qa["model_answer"],
            )
            # Same record with only the "correct" field replaced.
            results.append({**qa, "correct": verdict})
    with jsonlines.open(f, "w") as writer:
        writer.write_all(results)