## Load GSM8K

In [41]:
from datasets import load_dataset

# Fetch the "main" configuration of the openai/gsm8k dataset; the "test"
# split of this object is what the benchmark below iterates over.
dataset = load_dataset(path="openai/gsm8k", name="main")

## Set up the Codestral client

In [42]:
from dotenv import load_dotenv
# Populate os.environ from a local .env file so that MISTRAL_API_KEY (read in
# the next cell) is available. The call's return value is shown as the cell output.
load_dotenv()

True

In [43]:
import os

from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Model identifier passed to every client.chat(...) call in the benchmark loop.
model = "codestral-latest"

# The key must already be in the environment; a missing key raises KeyError here
# rather than failing later on the first API call.
api_key = os.environ["MISTRAL_API_KEY"]
client = MistralClient(api_key=api_key)

## Solve the test problems with Codestral

In [None]:
# Pre-size one result slot per test problem; the benchmark loop fills these by index.
num_test_problems = len(dataset["test"])

problem_list = [""] * num_test_problems       # problem statements
expected_answers = [0] * num_test_problems    # reference integer answers
codestral_code = [""] * num_test_problems     # code extracted from the model reply
codestral_answers = [0] * num_test_problems   # integer produced by running that code

In [None]:
# Benchmark loop: for every GSM8K test problem, ask Codestral for a zero-argument
# Python function, execute it, and record the integer it returns next to the
# reference answer.
test_split = dataset["test"]  # hoisted: avoid re-resolving the split on every iteration
for i, example in enumerate(test_split):
    math_problem = example["question"]
    # The reference answer is the text after the last "####" marker; "," is
    # stripped before parsing it as an integer.
    expected_answer = int(example["answer"].split("####")[-1].replace(",", "").strip())
    prompt = f"""Generate python code to solve the following math problem: 

    <math_problem> {math_problem} </math_problem>
    Make your answer in form of an executable function. Do not code anything outside of the function.
    Do not add arguments. Make everything part of the coded function.
    Your code MUST output an integer.
    Before writing the code, write a paragraph about your understanding of the problem.
    Let's think step by step."""

    # API errors are deliberately NOT caught: a failed request is an
    # infrastructure problem, not a wrong answer, and must not be scored as one.
    response = client.chat(
        model=model,
        messages=[ChatMessage(role="user", content=prompt)]
    )
    answer = response.choices[0].message.content

    python_answer = ""
    result = 0  # recorded when no usable answer is produced (same fallback as a falsy result)
    try:
        # Take the first ```python fenced block of the reply.
        python_answer = answer.split("```python")[1].split("```")[0].strip()
        # Name of the first function defined in that block; splitting on "("
        # also copes with signatures such as "def solve( )".
        function_name = python_answer.split("def ")[1].split("(")[0].strip()

        # The generated code has to be executed before the function can be
        # looked up. It runs in its own namespace so it cannot overwrite
        # notebook variables (client, i, the result lists, ...).
        # SECURITY: this executes model-generated code with full interpreter
        # privileges and no timeout; run the notebook in a sandboxed environment.
        namespace = {}
        exec(python_answer, namespace)
        func = namespace[function_name]
        codestral_answer = func()
        if codestral_answer:
            result = int(codestral_answer)
    except Exception as exc:
        # A malformed reply or crashing snippet counts as a failed attempt for
        # this problem instead of aborting the whole benchmark run.
        print(f"Problem {i}: no usable answer ({type(exc).__name__}: {exc})")

    problem_list[i] = math_problem
    expected_answers[i] = expected_answer
    codestral_code[i] = python_answer
    codestral_answers[i] = result

## Shape solutions into dataframe

In [141]:
import pandas as pd

# One column per collected list; row k describes test problem k.
benchmark_dict = {
    "Problem": problem_list,
    "Codestral Code": codestral_code,
    "Expected Answer": expected_answers,
    "Codestral Answer": codestral_answers,
}
answer_df = pd.DataFrame(benchmark_dict)

# Persist the raw results (without the index column) so scoring can be redone
# without re-querying the model.
answer_df.to_csv("answer_df.csv", index=False)

In [145]:
# Number of problems whose executed answer equals the reference answer.
expected_col = answer_df["Expected Answer"].astype(int)
predicted_col = answer_df["Codestral Answer"].astype(int)
count = expected_col.eq(predicted_col).sum()
count

1089

In [148]:
# Share of correctly solved test problems as a whole percentage.
# int() truncates toward zero, so the reported score is rounded down.
total_problems = len(dataset["test"])
solved_fraction = count / total_problems
benchmark_score = int(solved_fraction * 100)
print(f"GSM8k pass@1 score: {benchmark_score}%")

GSM8k pass@1 score: 82%
