Code from A3 problem1-prompt.ipynb and https://github.com/SewoongLee/reproduce-llama3-arithmetic/blob/main/llama3-tutorial-gsm8k.ipynb

You need to set your OPENAI_API_KEY environment variable in .env

In [2]:
from openai import OpenAI
from tqdm import tqdm
import textwrap
import dotenv
import os
%load_ext dotenv
%dotenv

# Shared API client; the key is read from the OPENAI_API_KEY environment
# variable (None is passed if the variable is not set).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# NOTE: the default model below is gpt-3.5-turbo; pass model="gpt-4o-mini" to use gpt-4o-mini instead.
# gpt-4o-mini pricing at the time of writing: $0.150 / 1M input tokens, $0.600 / 1M output tokens
def generate_gpt_response(prompt, messages=None, model="gpt-3.5-turbo", temperature=0.7):
    """Send `prompt` as a new user turn and return the updated conversation.

    Args:
        prompt: Text of the new user message.
        messages: Optional conversation history as a list of
            {"role": ..., "content": ...} dicts. When given, the list is
            extended in place with the new user and assistant turns; when
            omitted, a fresh conversation is started.
        model: Name of the chat model to query.
        temperature: Sampling temperature passed to the API.

    Returns:
        The conversation list; its last element is the assistant's reply.
    """
    # A mutable default (messages=[]) is created once and shared by every call,
    # which silently leaks history between unrelated requests. Build the list
    # per call instead.
    if messages is None:
        messages = []

    messages.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    content = response.choices[0].message.content
    messages.append({'role': 'assistant', 'content': content})
    return messages


def print_message(message, width=80):
    """Print a chat message as "<role>: <content>", wrapped to `width` columns.

    Each wrapped line is padded with spaces to `width` and printed with one
    extra space on either side.
    """
    rendered = textwrap.fill(f'{message["role"]}: ' + message["content"], width=width)

    for row in rendered.split('\n'):
        # `:<{width}` left-justifies and pads with spaces up to `width`.
        print(f" {row:<{width}} ")
    

cannot find .env file


Load the ablated, evolved GSM8K JSONL files

In [8]:
import os
import random
import json
import re

random.seed(56)

ablated_dataset = []  # one list of parsed JSON records per question file
q_indices = []        # question index of each file, in the same order
folder_path = "gsm_evolved_ablations"

# Data files are named "q<index>_evolved--NUM100.jsonl". The index is parsed
# from the file name itself (not the joined path) so that digits in
# folder_path can never be mistaken for the question index.
_INDEX_RE = re.compile(r'^q(\d+)_evolved')


def _question_index(file_name):
    """Return the question index encoded in `file_name`, or None if absent."""
    found = _INDEX_RE.match(file_name)
    return int(found.group(1)) if found else None


files = os.listdir(folder_path)
# Skip directory entries that do not follow the naming scheme (for example
# ".DS_Store" or ".ipynb_checkpoints"); they would otherwise crash the
# numeric sort key.
sorted_files = sorted(
    (name for name in files if _question_index(name) is not None),
    key=_question_index,
)

print(sorted_files)
print()

for file_name in sorted_files:
    f_path = os.path.join(folder_path, file_name)
    q_indices.append(_question_index(file_name))

    with open(f_path, 'r', encoding='utf-8') as f:
        # JSON Lines: one object per line. Blank lines (e.g. a trailing
        # newline at end of file) are skipped instead of crashing json.loads.
        records = [json.loads(line) for line in f if line.strip()]

    # To evaluate on a random subset instead of every record, sample here,
    # e.g. random.sample(records, min(10, len(records))).
    ablated_dataset.append(records)

print()
print(len(ablated_dataset))
print(len(ablated_dataset[0]))
print(q_indices)


['q0_evolved--NUM100.jsonl', 'q1_evolved--NUM100.jsonl', 'q2_evolved--NUM100.jsonl', 'q3_evolved--NUM100.jsonl', 'q4_evolved--NUM100.jsonl', 'q5_evolved--NUM100.jsonl', 'q6_evolved--NUM100.jsonl', 'q7_evolved--NUM100.jsonl', 'q8_evolved--NUM100.jsonl', 'q9_evolved--NUM100.jsonl', 'q10_evolved--NUM100.jsonl', 'q11_evolved--NUM100.jsonl', 'q12_evolved--NUM100.jsonl', 'q13_evolved--NUM100.jsonl', 'q14_evolved--NUM100.jsonl', 'q15_evolved--NUM100.jsonl', 'q16_evolved--NUM100.jsonl', 'q17_evolved--NUM100.jsonl', 'q18_evolved--NUM100.jsonl', 'q19_evolved--NUM100.jsonl']


20
100
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


We will use the following system prompt

In [4]:
# Instruction text given to the model with every question (answer_from_gpt
# prepends it to the question). It asks for the final answer after the "####"
# delimiter, which is the marker extract_ans_from_response splits on.
system_prompt= \
'''You are an expert math tutor specializing in grade-school-level problems like those in the GSM8K dataset. 
Solve each problem step by step, explaining the reasoning and calculations clearly and concisely to help the student understand the process. 
Please return your final answer after the delimiter \"####\" as a numeric number'''

We can now query the GPT API for answers, using the system prompt and (optionally) few-shot in-context learning

In [5]:
import re

def extract_ans_from_response(answer: str, eos=None):
    """Pull the final answer out of a model reply that uses the '####' delimiter.

    If `eos` is given, everything from its first occurrence onward is dropped.
    The reply is then split on '####':
      * no delimiter            -> "NA"
      * exactly one delimiter   -> the text after it
      * two or more delimiters  -> the text between the last two
    The characters ',', '$', '%' and 'g' are removed from the chosen text.

    Returns:
        An int when the cleaned text parses as an integer, otherwise the
        cleaned text itself (a str, e.g. "NA" or "12.50").
    """
    if eos:
        answer = answer.split(eos)[0].strip()

    pieces = answer.split('####')
    if len(pieces) == 1:
        candidate = "NA"
    elif len(pieces) == 2:
        candidate = pieces[1].strip()
    else:
        candidate = pieces[-2].strip()

    # Delete all unwanted characters in a single pass.
    candidate = candidate.translate(str.maketrans('', '', ',$%g'))

    try:
        return int(candidate)
    except ValueError:
        return candidate

# def extract_ans_from_response(answer: str, eos=None):
#     if eos:
#         answer = answer.split(eos)[0].strip()

#     answer = answer.split('####')[-1].strip()

#     for remove_char in [',', '$', '%', 'g']:
#         answer = answer.replace(remove_char, '')

#     try:
#         return int(answer)
#     except ValueError:
#         return answer

# def get_num_answer(text):
#     num_matches = re.findall(r'\d+\.?\d*', text)

#     if not num_matches:
#         return None
    
#     final_answer = num_matches[-1]

#     return final_answer

# Implement the function answer_from_gpt(question,temperature,num_shots), which should return the gpt answer as a string
# (see how it is called in the function evaluate_fewshots below)
# temperature is the temperature parameter passed to gpt api
# if num_shots>0, you should use the first num_shots examples in train_data for in-context learning
# Please use the system prompt provided above (see lecture10-gpt.ipynb for examples of system prompt usage)
#
def answer_from_gpt(question, temperature, num_shots):
    """Ask the model one question and return the answer parsed from its reply.

    The system prompt and the question are joined with a newline and sent as
    a single user message at the given `temperature`. `num_shots` is accepted
    but currently unused: no in-context examples are added to the conversation.

    Returns:
        Whatever extract_ans_from_response yields for the model's reply.
    """
    # Fresh history for every call. Few-shot examples, if enabled, would be
    # added here as alternating user/assistant turns.
    history = []

    prompt_text = "\n".join((system_prompt, question))
    conversation = generate_gpt_response(
        prompt=prompt_text,
        messages=history,
        temperature=temperature,
    )

    reply = conversation[-1]["content"]
    return extract_ans_from_response(reply)

Evaluate on a small subset of gsm8k test data

In [None]:
def evaluate_fewshots(data, temperature, num_shots):
    """Query the model on every item of `data` and report the accuracy.

    Args:
        data: Sequence of dicts; each item's 'problem' is sent to the model
            and its 'result' is the reference answer.
        temperature: Sampling temperature forwarded to answer_from_gpt.
        num_shots: Number of in-context examples, forwarded to answer_from_gpt.

    Returns:
        Fraction of items whose model answer equals the reference answer when
        both are converted to float. An answer of "NA" or one that is not
        numeric counts as incorrect. Returns 0.0 for empty `data`.
    """
    total = len(data)
    if total == 0:
        # Nothing to evaluate; avoid ZeroDivisionError below.
        accuracy = 0.0
        print(f'accuracy={accuracy:0.2f}')
        return accuracy

    num_corrects = 0
    for item in tqdm(data):
        gpt_answer = answer_from_gpt(item['problem'], temperature=temperature, num_shots=num_shots)
        true_answer = item['result']

        if gpt_answer == "NA":
            # The reply contained no '####' delimiter: counted as incorrect.
            continue

        try:
            # Compare numerically so that e.g. 18 and "18.0" are treated as equal.
            if float(gpt_answer) == float(true_answer):
                num_corrects += 1
        except ValueError:
            # Non-numeric answer text: counted as incorrect.
            continue

    accuracy = num_corrects / total
    print(f'accuracy={accuracy:0.2f}')
    return accuracy

# Evaluation settings: deterministic decoding, no in-context examples.
temperature = 0
num_shots = 0

# Number of question files on which the model was not 100% accurate.
total_inaccurate = 0

print(f'=== temperature={temperature} num_shots={num_shots} ===')
for i, curr_ablation in enumerate(ablated_dataset):
    print("Question ", q_indices[i])
    accuracy = evaluate_fewshots(curr_ablation, temperature, num_shots)
    # bool -> int: adds 1 only when at least one variant was answered wrongly.
    total_inaccurate += int(accuracy < 1)
    print()

print("Total Inaccurate: ", total_inaccurate)

=== temperature=0 num_shots=0 ===
Question  0


  1%|          | 1/100 [00:01<02:16,  1.38s/it]

k=0: gpt_answer=1170  true_answer=2210
INCORRECT


  2%|▏         | 2/100 [00:02<02:20,  1.43s/it]

k=1: gpt_answer=1170  true_answer=2210
INCORRECT


  3%|▎         | 3/100 [00:04<02:06,  1.31s/it]

k=2: gpt_answer=1170  true_answer=2210
INCORRECT


  4%|▍         | 4/100 [00:05<02:22,  1.48s/it]

k=3: gpt_answer=1170  true_answer=2210
INCORRECT


  5%|▌         | 5/100 [00:06<02:10,  1.38s/it]

k=4: gpt_answer=2210  true_answer=2210
CORRECT


  5%|▌         | 5/100 [00:08<02:35,  1.64s/it]


KeyboardInterrupt: 