Code from A3 problem1-prompt.ipynb and https://github.com/SewoongLee/reproduce-llama3-arithmetic/blob/main/llama3-tutorial-gsm8k.ipynb

Before running this notebook, set the OPENAI_API_KEY variable in a .env file located next to it.

In [1]:
from openai import OpenAI
from tqdm import tqdm
import textwrap
import dotenv
import os
%load_ext dotenv
%dotenv

# Shared OpenAI client used by the helpers below. The key is read from the
# OPENAI_API_KEY environment variable; os.environ.get returns None when the
# variable is not set.
client=OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# we use gpt-4o-mini as default: current pricing is 
# $0.150 / 1M input tokens, $0.600 / 1M output tokens
def generate_gpt_response(prompt, messages=None, model="gpt-4o-mini", temperature=0.7):
    """Send ``prompt`` as the next user turn and record the model's reply.

    Args:
        prompt: Text of the user message to send.
        messages: Conversation so far, as a list of ``{"role", "content"}``
            dicts. When supplied, the list is extended in place with the new
            user turn and the assistant reply. When omitted, a fresh
            conversation is started.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The conversation list, whose last two entries are the new user
        message and the assistant's reply.
    """
    # A literal ``[]`` default is created once at definition time and would be
    # shared (and keep growing) across every call that omits ``messages``.
    if messages is None:
        messages = []

    messages.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    content = response.choices[0].message.content
    messages.append({'role': 'assistant', 'content': content})
    return messages


def print_message(message, width=80):
    """Pretty-print one chat message as "<role>: <content>", wrapped to ``width``.

    Each wrapped line is padded on the right to ``width`` characters and
    printed with a single space on either side.
    """
    labelled = f'{message["role"]}: {message["content"]}'

    for row in textwrap.wrap(labelled, width=width):
        padded = row.ljust(width)
        print(f" {padded} ")
    

cannot find .env file


Load the GSM8K dataset

In [2]:
from datasets import load_dataset

# Load the 'main' configuration of GSM8K.
dataset = load_dataset("gsm8k", 'main')

# Work with just the first ten rows of each split.
train_data = dataset['train'].select(range(10))
test_data = dataset['test'].select(range(10))

# Show every selected training question, each followed by a blank line.
# (The matching reference solution is stored under the 'answer' key.)
for example in train_data:
    print('Q:', example['question'], '', sep='\n')


Q:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Q:
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

Q:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

Q:
Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?

Q:
James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?

Q:
Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 8

We will use the following system prompt

In [3]:
# Instruction text given to the model: act as a grade-school math tutor, solve
# step by step, and put the final numeric answer after the "####" delimiter.
system_prompt= \
'''You are an expert math tutor specializing in grade-school-level problems like those in the GSM8K dataset. 
Solve each problem step by step, explaining the reasoning and calculations clearly and concisely to help the student understand the process. 
Please return your final answer after the delimiter \"####\" as a numeric number'''

We can obtain answers from the GPT API using few-shot in-context learning together with the system prompt.

In [5]:
import re

def extract_ans_from_response(answer: str, eos=None):
    """Pull the final numeric answer out of a model response.

    The answer is expected after the last ``####`` delimiter. Thousands
    separators and the characters ``$``, ``%`` and ``g`` are stripped before
    parsing.

    Args:
        answer: Full response text.
        eos: Optional end-of-sequence marker; everything from its first
            occurrence onward is discarded before parsing.

    Returns:
        An ``int`` when the answer is a whole number; the numeric text (for
        example ``"18.5"``) when it is a number that is not a valid ``int``
        literal; otherwise the cleaned tail string when it contains no number.
    """
    if eos:
        answer = answer.split(eos)[0].strip()

    has_delimiter = '####' in answer
    tail = answer.split('####')[-1].strip()

    for remove_char in [',', '$', '%', 'g']:
        tail = tail.replace(remove_char, '')

    try:
        return int(tail)
    except ValueError:
        pass

    # The tail is not a bare integer (e.g. "18 dollars", "**45**", "18.").
    # Look for a number inside it instead of handing back unparseable text.
    numbers = re.findall(r'-?\d+(?:\.\d+)?', tail)
    if not numbers:
        return tail

    # Right after the delimiter the first number is the answer; without a
    # delimiter the whole response was scanned, so take the last number.
    token = numbers[0] if has_delimiter else numbers[-1]
    try:
        return int(token)
    except ValueError:
        return token

def get_num_answer(text):
    """Return the last number-like substring of ``text``, or None if there is none.

    A number here is a run of digits, optionally followed by a dot and more
    digits; the match is returned as a string.
    """
    found = re.findall(r'\d+\.?\d*', text)
    return found[-1] if found else None

# Implement the function answer_from_gpt(question,temperature,num_shots), which should return the gpt answer as a string
# (see how it is called in the function evaluate_fewshots below)
# temperature is the temperature parameter passed to gpt api
# if num_shots>0, you should use the first num_shots examples in train_data for in-context learning
# Please use the system prompt provided above (see lecture10-gpt.ipynb for examples of system prompt usage)
#
def answer_from_gpt(question, temperature, num_shots):
    """Ask the model to solve ``question`` and return its extracted final answer.

    Args:
        question: The problem text to solve.
        temperature: Sampling temperature passed to the API.
        num_shots: Number of leading ``train_data`` examples to include as
            in-context question/answer turns; values <= 0 mean zero-shot.

    Returns:
        Whatever ``extract_ans_from_response`` extracts from the model's reply.
    """
    # Send the instructions as a system-role message so they frame the whole
    # conversation instead of being mixed into the final user question.
    messages = [{"role": "system", "content": system_prompt}]

    # Few-shot demonstrations: each training example becomes a user turn
    # (question) followed by an assistant turn (reference solution).
    for i in range(num_shots):
        shot = train_data[i]
        messages.append({"role": "user", "content": shot['question']})
        messages.append({"role": "assistant", "content": shot['answer']})

    messages = generate_gpt_response(prompt=question, messages=messages, temperature=temperature)
    reply = messages[-1]["content"]
    return extract_ans_from_response(reply)

Evaluate on a small subset of gsm8k test data

In [None]:
def _as_float(value):
    """Return ``value`` as a float, or None when it is not a parseable number."""
    try:
        return float(str(value).replace(',', ''))
    except ValueError:
        return None


def evaluate_fewshots(temperature, num_shots):
    """Measure answer accuracy on ``test_data`` and print it.

    Args:
        temperature: Sampling temperature passed through to ``answer_from_gpt``.
        num_shots: Number of in-context examples passed through to
            ``answer_from_gpt``.

    Evaluation runs on the held-out test subset: the few-shot demonstrations
    are drawn from ``train_data``, so scoring on ``train_data`` would show the
    model the reference solutions to the very questions being graded.
    """
    num_corrects = 0
    total = len(test_data)
    for k in tqdm(range(total)):
        gpt_answer = answer_from_gpt(test_data[k]['question'], temperature=temperature, num_shots=num_shots)
        # The reference solution ends with "#### <number>", so its last
        # whitespace-separated token is the ground-truth answer.
        true_answer = test_data[k]['answer'].split()[-1]
        print(f'k={k}: gpt_answer={gpt_answer}  true_answer={true_answer}')   # this is for debug only
        # A reply that is not numeric counts as wrong instead of aborting the run.
        predicted = _as_float(gpt_answer)
        if predicted is not None and predicted == _as_float(true_answer):
            num_corrects += 1
    accuracy = num_corrects / total if total else 0.0
    print(f'accuracy={accuracy:0.2f}')

# Run configuration: temperature 0 and no in-context examples.
temperature, num_shots = 0, 0

print(f'=== temperature={temperature} num_shots={num_shots} ===')
evaluate_fewshots(temperature=temperature, num_shots=num_shots)

=== temperature=0 num_shots=0 ===


 10%|█         | 1/10 [00:04<00:40,  4.53s/it]

k=0: gpt_answer=18  true_answer=18


 20%|██        | 2/10 [00:08<00:33,  4.14s/it]

k=1: gpt_answer=3  true_answer=3


 30%|███       | 3/10 [00:13<00:31,  4.56s/it]

k=2: gpt_answer=70000  true_answer=70000


 40%|████      | 4/10 [00:15<00:20,  3.47s/it]

k=3: gpt_answer=540  true_answer=540


 50%|█████     | 5/10 [00:25<00:29,  5.89s/it]

k=4: gpt_answer=20  true_answer=20


 60%|██████    | 6/10 [00:33<00:26,  6.50s/it]

k=5: gpt_answer=64  true_answer=64


 70%|███████   | 7/10 [00:36<00:16,  5.53s/it]

k=6: gpt_answer=260  true_answer=260


 80%|████████  | 8/10 [00:42<00:11,  5.76s/it]

k=7: gpt_answer=120  true_answer=160


 90%|█████████ | 9/10 [00:51<00:06,  6.50s/it]

k=8: gpt_answer=45  true_answer=45


100%|██████████| 10/10 [00:55<00:00,  5.50s/it]

k=9: gpt_answer=460  true_answer=460
accuracy=0.90



