# Evaluation of GSM8K

In [1]:
import os

import pandas as pd

# Load API credentials from local key files into environment variables.
# The directory defaults to the original author's location; set PWS_KEY_DIR
# to point at your own key folder instead of editing this cell.
KEY_DIR = os.environ.get("PWS_KEY_DIR", "/home/billxbf/Documents/myks")
KEY_FILES = {
    "OPENAI_API_KEY": "openai.key",
    "SERPAPI_API_KEY": "serpapi.key",
    "WOLFRAM_ALPHA_APPID": "wolfram.key",
}

for env_var, key_file in KEY_FILES.items():
    key_path = os.path.join(KEY_DIR, key_file)
    with open(key_path, "r") as f:
        # Only the first line of each key file is used.
        key = f.readline().strip()
    if not key:
        # Fail here rather than later with an opaque authentication error.
        raise ValueError(f"No API key found on the first line of {key_path}")
    os.environ[env_var] = key

In [2]:
from utils.DataLoader import DataLoader
from utils.Evaluator import Evaluator
from algos.PWS import *
from algos.react import ReactBase, ReactExtraTool
from algos.notool import IO, CoT
from prompts import fewshots, solver, planner

In [3]:
def save_data(dataset, data, save_path):
    """Attach per-example evaluation results to ``dataset`` and write it to CSV.

    Args:
        dataset: pandas DataFrame of the evaluated examples. It is modified
            in place: one column is added (or overwritten) per result field.
        data: mapping holding the per-example values for each of the result
            fields listed in ``result_columns`` below.
        save_path: destination of the CSV file. When it is a filesystem path,
            its parent directory is created if it does not exist yet.

    Returns:
        The same ``dataset`` object, now carrying the result columns.

    Raises:
        KeyError: if ``data`` lacks one of the result fields.
        ValueError: if a field cannot be attached to ``dataset`` (for example
            its length differs from the number of rows). In that case
            ``dataset`` is left untouched and no file is written.
    """
    result_columns = (
        "label",
        "preds",
        "em",
        "f1",
        "acc",
        "wall_time",
        "total_tokens",
        "steps",
        "tool_cost",
        "token_cost",
        "total_cost",
    )

    missing = [name for name in result_columns if name not in data]
    if missing:
        raise KeyError(f"data is missing result field(s): {missing}")

    # Stage every column on a copy first so that a failure half-way through
    # cannot leave the caller's dataset with only some of the result columns.
    staged = dataset.copy()
    for name in result_columns:
        try:
            staged[name] = data[name]
        except ValueError as exc:
            raise ValueError(
                f"cannot attach result column {name!r} to a dataset of "
                f"{len(dataset)} rows: {exc}"
            ) from exc

    for name in result_columns:
        dataset[name] = staged[name]

    # to_csv does not create missing directories; do it for plain paths only
    # (save_path may also be a file-like object).
    if isinstance(save_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(save_path))
        if parent:
            os.makedirs(parent, exist_ok=True)
    dataset.to_csv(save_path, index=False)
    return dataset

In [4]:
# Model name handed to every algorithm below (IO, CoT, ReAct, PWS planner/solver).
EVAL_LLM = "text-davinci-003"
# Dataset/task identifier passed to both DataLoader and Evaluator.
EVAL_DATASET = "gsm8k"
# Seed passed to DataLoader.
SEED = 2024

## Standard IO

In [11]:
# Standard IO baseline: request 500 examples and wrap the IO algorithm in an Evaluator.
# NOTE(review): `eval` shadows Python's built-in eval() for the rest of the notebook.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
io = IO(model_name=EVAL_LLM)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=io)

Found cached dataset gsm8k (/home/billxbf/workspace/PWS/data/gsm8k/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Run the evaluation, then attach the per-example results in `data` to
# `dataset` and save them; `response` is displayed as the cell output.
# NOTE(review): the file name says "chat" while EVAL_LLM is "text-davinci-003" — confirm the intended name.
response, data = eval.run()
df = save_data(dataset, data, "results/gsm8k_io_chat.csv")
response


******************* Start Evaluation *******************



  0%|          | 0/500 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the saved results.
# NOTE(review): `df` is only assigned if the run cell above completes.
df.head()

## CoT

In [13]:
# Chain-of-thought baseline: request 20 examples and use the GSM8K CoT few-shot prompt.
# NOTE(review): rebinds `dataset` and `eval` from the previous section.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(20)
cot = CoT(fewshot=fewshots.GSM8K_COT, model_name=EVAL_LLM)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=cot)

Found cached dataset gsm8k (/home/billxbf/workspace/PWS/data/gsm8k/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Run the CoT evaluation and display the result; unlike the other sections,
# nothing is written to disk here (no save_data call).
response, data = eval.run()
response


******************* Start Evaluation *******************



100%|██████████| 20/20 [03:41<00:00, 11.05s/it]


{'avg_em': 0.5,
 'avg_f1': 0.7096125116713351,
 'avg_acc': 0.9,
 'avg_wall_time': 9.088731503486633,
 'avg_total_tokens': 495.7,
 'avg_total_cost': 0.0009914,
 'avg_steps': 3.2,
 'avg_token_cost': 0.0009914,
 'avg_tool_cost': 0.0}

## REACT

In [8]:
# ReAct baseline with tool access. Uses the shared SEED constant (same value
# as the literal it replaces) so this section stays in sync with the others
# if the seed is ever changed.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
react = ReactExtraTool(
    model_name=EVAL_LLM,
    available_tools=["WolframAlpha", "Calculator", "LLM"],
    fewshot=fewshots.GSM8K_REACT,
    verbose=False,
)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=react)

Found cached dataset gsm8k (/home/billxbf/workspace/PWS/data/gsm8k/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)


  0%|          | 0/2 [00:00<?, ?it/s]



In [12]:
# Run the ReAct evaluation and save the per-example results as CSV.
# NOTE(review): the recorded run of this cell ended in a length-mismatch
# ValueError (506 values vs 500 rows), presumably raised inside save_data —
# verify that `data` has exactly one entry per dataset row.
# NOTE(review): the file name says "chat" while EVAL_LLM is "text-davinci-003" — confirm the intended name.
response, data = eval.run()
df = save_data(dataset, data, "results/gsm8k_react_chat.csv")
response


******************* Start Evaluation *******************



100%|██████████| 500/500 [1:32:50<00:00, 11.14s/it]


ValueError: Length of values (506) does not match length of index (500)

In [22]:
# Display the value returned by the last eval.run() call.
response

{'avg_em': 0.2549407114624506,
 'avg_f1': 0.3728534446851278,
 'avg_acc': 0.6,
 'avg_wall_time': 9.181846753472374,
 'avg_total_tokens': 1874.3075396825398,
 'avg_total_cost': 0.00864229365079365,
 'avg_steps': 2.861111111111111,
 'avg_token_cost': 0.00864229365079365,
 'avg_tool_cost': 0.0}

In [30]:
# Dump the raw `data` results dict to JSON. The recorded ReAct run above ended
# in a length-mismatch ValueError, so this is presumably a fallback for keeping
# the results when the CSV could not be written — TODO confirm.
# NOTE(review): this import belongs with the other imports at the top of the notebook.
import json
with open("results/gsm8k_react_chat.json", "w") as f:
    json.dump(data, f)

## PWSBase

In [9]:
# PWS_Base setup: the same model serves as planner and solver, with three tools available.
# NOTE(review): rebinds `dataset` and `eval` from the previous section.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(20)
pwsbase = PWS_Base(fewshot=fewshots.GSM8K_PWS, available_tools=["WolframAlpha", "Calculator", "LLM"], planner_model=EVAL_LLM, solver_model=EVAL_LLM)
# Replace the planner's default suffix with the "resource reluctant" variant from prompts.planner.
pwsbase.planner.suffix = planner.RESOURCE_RELUCTANT_SUFFIX
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=pwsbase)

Found cached dataset gsm8k (/home/billxbf/workspace/PWS/data/gsm8k/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Sanity check: run one hand-written question through PWS and keep the result for inspection below.
res = pwsbase.run('''The town of Centerville spends 15% of its annual budget on its public library. Centerville spent $3,000 on its public library and 24% on the public parks. How much is left of the annual budget?''')

In [7]:
# Show the "planner_log" entry of the single-question result.
print(res["planner_log"])

For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...) 

Tools can be one of the following:
WolframAlpha[input]: A WolframAlpha search engine. Useful when you need to solve a complicated Mathematical or Algebraic equation. Input should be an equation or function.
Calculator[input]: A calculator that can compute arithmetic expressions. Useful when you need to perform math calculations. Input should be a mathematical expression
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

For Example:
Thomas, Toby, and Rebecca worked a total of 157 hours in one week.  Thomas worked x hours.  Toby worked 10 ho

In [8]:
# Show the "solver_log" entry of the single-question result.
print(res["solver_log"])

Solve the following task or problem. To assist you, we provide some plans and corresponding evidences that might be helpful. Notice that some of these information contain noise so you should trust them with caution.

The town of Centerville spends 15% of its annual budget on its public library. Centerville spent $3,000 on its public library and 24% on the public parks. How much is left of the annual budget?
Plan: Find the total annual budget.
Evidence:
32500.0
Plan: Find the remaining amount of the annual budget.
Evidence:
29500.0

Now begin to solve the task or problem. Respond with the answer directly with no extra words.

The town of Centerville spends 15% of its annual budget on its public library. Centerville spent $3,000 on its public library and 24% on the public parks. How much is left of the annual budget?

29500.0


In [10]:
# Trial run of the PWS evaluation; results go to a scratch file ("results/tmp.csv").
response, data = eval.run()
df = save_data(dataset, data, "results/tmp.csv")
response


******************* Start Evaluation *******************



100%|██████████| 20/20 [02:22<00:00,  7.13s/it]


{'avg_em': 0.25,
 'avg_f1': 0.33333333333333337,
 'avg_acc': 0.5,
 'avg_wall_time': 5.4728925108909605,
 'avg_total_tokens': 928.85,
 'avg_total_cost': 0.018577000000000003,
 'avg_steps': 2.4,
 'avg_token_cost': 0.018577000000000003,
 'avg_tool_cost': 0.0}

In [8]:
# PWS evaluation run whose results are kept under the davinci file name.
# NOTE(review): the recorded output shows 100 examples, but the setup cell in
# this section loads 20 — the dataset was presumably reloaded with a different
# size before this run; make the sample size explicit so the cell is reproducible.
response, data = eval.run()
df = save_data(dataset, data, "results/gsm8k_pwsbase_davinci.csv")
response


******************* Start Evaluation *******************



100%|██████████| 100/100 [14:57<00:00,  8.98s/it]


{'avg_em': 0.16,
 'avg_f1': 0.2590384615384615,
 'avg_acc': 0.38,
 'avg_wall_time': 7.056018805503845,
 'avg_total_tokens': 999.3,
 'avg_total_cost': 0.019986,
 'avg_steps': 2.69,
 'avg_token_cost': 0.019986,
 'avg_tool_cost': 0.0}

In [9]:
# Inspect the first 10 rows of the saved PWS results.
df.head(10)

Unnamed: 0,question,answer,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Gary manages two Amazon distribution centers. ...,First find how many packages the second center...,14000,Answer: 80500.0,False,0.0,0,4.686474,891,2,0.0,0.01782,0.01782
1,The square footage of the two bedrooms in the ...,Let M be the number of square feet in Matha's ...,120,Answer: 240 square feet.,False,0.0,0,2.384351,745,2,0.0,0.0149,0.0149
2,"Thomas, Toby, and Rebecca worked a total of 15...",Toby: 2x-10\nRebecca: (2x-10)-8=2x-18\nTotal:2...,56,Rebecca worked 19 hours.,False,0.0,0,4.689618,790,2,0.0,0.0158,0.0158
3,Two-thirds of the class have brown eyes. Half ...,There are 6 * 2 = <<6*2=12>>12 students with b...,18,Answer: 9,False,0.0,0,3.456528,744,2,0.0,0.01488,0.01488
4,"Jackie spends 8 hours working, 3 hours of exer...",Jackie spends a total of 8 + 8 + 3 = <<8+8+3=1...,5,5 hours,False,0.666667,1,6.890093,971,3,0.0,0.01942,0.01942
5,If Layla scored 104 goals in four hockey games...,If Layla scored 104 goals in four hockey games...,92,Answer: 46.0,False,0.0,0,6.873497,991,3,0.0,0.01982,0.01982
6,Bugs are thriving in a garden of 30 plants. Th...,The bugs ate enough plants for there to be 30-...,4,5,False,0.0,1,9.11008,1279,4,0.0,0.02558,0.02558
7,A family is going to the amusement park. The a...,To solve this problem you can set up an equati...,58,Answer: 58,False,0.666667,1,3.390827,872,2,0.0,0.01744,0.01744
8,A retailer sells any shirt for the same price ...,If I paid $120 and I was refunded 25% for all ...,45,$30.,False,0.0,0,15.459617,1259,4,0.0,0.02518,0.02518
9,Half of Taylor's house guests like weak coffee...,He uses 1 tablespoon for weak coffee and doubl...,36,Answer: 36 tablespoons,False,0.5,1,3.911902,929,2,0.0,0.01858,0.01858


In [5]:
# Look at the question with index label 6.
# NOTE(review): the recorded NameError shows this ran before `dataset` was
# defined in that kernel session; run a dataset-loading cell first.
dataset["question"][6]

NameError: name 'dataset' is not defined

In [9]:
# Re-run PWS on that single question to inspect its planner and solver logs.
# NOTE(review): this rebinds `response`, which the run cells above use for the value returned by eval.run().
response = pwsbase.run(dataset["question"][6])

In [10]:
# Show the "planner_log" entry for the re-run question.
print(response["planner_log"])

For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E (#E1, #E2, ...) that can be called by later tools.

Tools can be one of the following:
WolframAlpha[input]: A WolframAlpha search engine. Useful when you need to solve a complicated Mathematical or Algebraic equation. Input should be an equation or function.
Calculator[input]: A calculator that can compute arithmetic expressions. Useful when you need to perform math calculations. Input should be a mathematical expression
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

For Example:
Thomas, Toby, and Rebecca worked a total of 157 hours in one week.  Thomas worked x hours.  Toby worked 10 hours less than twice

In [12]:
# Show the "solver_log" entry for the re-run question.
print(response["solver_log"])

Solve the following task or problem. To assist you, we provide some plans and corresponding evidences that might be helpful. Notice that some of these information contain noise so you should trust them with caution.

Bugs are thriving in a garden of 30 plants. The bugs are very hungry today and completely ate 20 plants. The next day, they were still full and only ate half of the remaining plants. After that, they ate only 1 of the plants that were left. How many plants remain?
Plan: Calculate the number of plants left after each day and subtract the number of plants eaten.
Evidence:
19

Now begin to solve the task or problem. Respond with the answer directly with no extra words.

Bugs are thriving in a garden of 30 plants. The bugs are very hungry today and completely ate 20 plants. The next day, they were still full and only ate half of the remaining plants. After that, they ate only 1 of the plants that were left. How many plants remain?
8 plants remain.
