In [1]:
import os

import pandas as pd

def _load_api_keys(key_dir, key_files):
    """Export API keys stored one-per-file into ``os.environ``.

    Args:
        key_dir: Directory that contains the key files.
        key_files: Mapping of environment-variable name -> key file name
            inside ``key_dir``. Only the first line of each file is used.

    Raises:
        FileNotFoundError: If a key file does not exist (raised by ``open``).
        ValueError: If the first line of a key file is empty, so a blank key
            is reported here instead of failing later inside an API call.
    """
    for env_name, file_name in key_files.items():
        key_path = os.path.join(key_dir, file_name)
        with open(key_path, "r") as f:
            # The key is the first line; strip the trailing newline/whitespace.
            api_key = f.readline().strip()
        if not api_key:
            raise ValueError(
                "Key file {!r} is empty; cannot set {}".format(key_path, env_name)
            )
        os.environ[env_name] = api_key


# The key directory defaults to the original location but can be redirected
# with the API_KEY_DIR environment variable, so the notebook is not tied to a
# single machine's home directory.
_load_api_keys(
    os.environ.get("API_KEY_DIR", "/home/billxbf/Documents/myks"),
    {
        "OPENAI_API_KEY": "openai.key",
        "SERPAPI_API_KEY": "serpapi.key",
        "WOLFRAM_ALPHA_APPID": "wolfram.key",
    },
)

In [2]:
from utils.DataLoader import DataLoader
from utils.Evaluator import Evaluator
from algos.PWS import *
from algos.react import ReactBase, ReactExtraTool
from algos.notool import IO, CoT
from prompts import fewshots, solver, planner

In [3]:
def save_data(dataset, data, save_path):
    """Attach per-example evaluation results to ``dataset`` and write it as CSV.

    Args:
        dataset: pandas DataFrame to extend. It is modified in place.
        data: Mapping (or DataFrame) providing one entry per result column:
            label, preds, em, f1, acc, wall_time, total_tokens, steps,
            tool_cost, token_cost and total_cost.
        save_path: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first.

    Returns:
        The same ``dataset`` object, now carrying the result columns.

    Raises:
        KeyError: If ``data`` lacks one of the result columns. The lookup
            happens before any assignment, so ``dataset`` is left untouched.
    """
    result_columns = (
        "label",
        "preds",
        "em",
        "f1",
        "acc",
        "wall_time",
        "total_tokens",
        "steps",
        "tool_cost",
        "token_cost",
        "total_cost",
    )
    # Resolve every column up front: a missing key must not leave the frame
    # half-updated.
    resolved = {name: data[name] for name in result_columns}
    for name, values in resolved.items():
        dataset[name] = values

    # A long evaluation run should not be lost because the output directory
    # does not exist yet. Buffers/file handles are passed through unchanged.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    dataset.to_csv(save_path, index=False)
    return dataset

In [4]:
# Model name handed to the algorithms below and used as the model tag in
# result file names.
EVAL_LLM = "gpt-3.5-turbo"
# Task identifier passed to both DataLoader and Evaluator.
EVAL_DATASET = "strategy_qa"
# Seed forwarded to DataLoader.
SEED = 2024
# Despite its name this is a file-path template, not a directory: the two
# "{}" placeholders are filled with (algorithm tag, model name) through
# SAVE_DIR.format(...). The dataset part is derived from EVAL_DATASET so that
# switching the task cannot write results under the wrong dataset name.
SAVE_DIR = "results/" + EVAL_DATASET + "_{}_{}.csv"

## Standard IO

In [11]:
# Load the evaluation sample (sample_size=300; the run below reports 300 items).
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(sample_size=300)
# Baseline algorithm, configured with the model name only.
io = IO(model_name=EVAL_LLM)
# NOTE(review): `eval` shadows Python's built-in eval() for the rest of the
# kernel session. Later cells call eval.run(), so a rename has to be applied
# to all of them together.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=io)

In [12]:
# `response` is the summary displayed by the last line of this cell; `data`
# supplies the columns that save_data() copies into the frame.
response, data = eval.run()
# save_data() adds the result columns to `dataset` in place and returns that
# same object, so `df` and `dataset` refer to one DataFrame from here on.
df = save_data(dataset, data, SAVE_DIR.format("io", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 300/300 [11:57<00:00,  2.39s/it]


{'avg_em': 0.6533333333333333,
 'avg_f1': 0.6533333333333333,
 'avg_acc': 0.6566666666666666,
 'avg_wall_time': 0.6874453552563985,
 'avg_total_tokens': 44.25,
 'avg_total_cost': 8.85e-05,
 'avg_steps': 1.0,
 'avg_token_cost': 8.85e-05,
 'avg_tool_cost': 0.0}

In [21]:
# Preview the first rows of the Standard IO results.
df.head()

Unnamed: 0,input,target,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Answer with Yes or No: Can you use the T-Mobil...,No,No,No.,True,1.0,1,0.739054,46,1,0,9.2e-05,9.2e-05
1,Answer with Yes or No: Is the Holy Land import...,No,No,Yes.,False,0.0,0,0.682858,36,1,0,7.2e-05,7.2e-05
2,Answer with Yes or No: Could jockey win Tripl...,No,No,"I'm sorry, as an AI language model, I cannot p...",False,0.0,0,2.380594,75,1,0,0.00015,0.00015
3,"Answer with Yes or No: Do Star Wars fans say ""...",No,No,No.,True,1.0,1,0.722619,39,1,0,7.8e-05,7.8e-05
4,Answer with Yes or No: Would a Durian be dange...,Yes,Yes,Yes.,True,1.0,1,0.404635,40,1,0,8e-05,8e-05


## CoT

In [7]:
# NOTE(review): 150 is passed positionally (presumably the sample size — the
# run below reports 150 items), while the Standard IO section uses
# sample_size=300. The averages are therefore computed over different numbers
# of examples; confirm this is intended.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(150)
# NOTE(review): the few-shot prompt is the TriviaQA one although EVAL_DATASET
# is "strategy_qa" — confirm this is deliberate.
cot = CoT(fewshot=fewshots.TRIVIAQA_COT, model_name=EVAL_LLM)
# Rebinds `eval` (which also shadows the built-in) to the CoT evaluator.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=cot)

In [23]:
# Run the CoT evaluation; the summary in `response` is displayed by the last line.
response, data = eval.run()
# Result columns are written into `dataset` in place; the CSV is tagged "cot".
df = save_data(dataset, data, SAVE_DIR.format("cot", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 150/150 [11:46<00:00,  4.71s/it]


{'avg_em': 0.5066666666666667,
 'avg_f1': 0.5066666666666667,
 'avg_acc': 0.56,
 'avg_wall_time': 2.8668559885025022,
 'avg_total_tokens': 170.56666666666666,
 'avg_total_cost': 0.0003411333333333333,
 'avg_steps': 0.8533333333333334,
 'avg_token_cost': 0.0003411333333333333,
 'avg_tool_cost': 0.0}

In [None]:
# Preview the first rows of the CoT results (no output is recorded for this cell).
df.head()

In [11]:
# One-off check of a single prompt using "text-davinci-003" instead of EVAL_LLM.
# NOTE(review): this rebinds `cot`; any later cell that reads `cot` gets the
# davinci-configured instance, not the one built with EVAL_LLM.
cot = CoT(fewshot=fewshots.TRIVIAQA_COT, model_name="text-davinci-003")
# Run the algorithm on the first question of the current `dataset` frame.
cot.run(dataset["input"][0])

{'wall_time': 0.677175760269165,
 'input': "Answer following questions. Let's think step by step. Give your reasoning process, and then answer the question in a new line directly with no extra words.\nWhat is the name of the river on which Bakewell stands?\nStep 1: Identify the location of Bakewell.\nBakewell is a small town in Derbyshire, England.\nStep 2: Search for the river that passes through Bakewell.\nThe River Wye flows through the town.\nAnswer: River Wye\nAnswer with Yes or No: Can you use the T-Mobile tuesdays app if you aren't a T-Mobile customer?\n",
 'output': 'No',
 'prompt_tokens': 131,
 'completion_tokens': 1,
 'total_tokens': 132,
 'token_cost': 0.0026400000000000004,
 'tool_cost': 0,
 'total_cost': 0.0026400000000000004,
 'steps': 0}

## REACT

In [17]:
# Bare expression: displays the current `dataset` frame.
# NOTE(review): the recorded output shows a plausibility task rather than the
# Yes/No questions used elsewhere in this notebook — it looks like stale output
# from another run; re-run or clear it.
dataset

Unnamed: 0,input,target,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Determine the plausibility of the following st...,plausible,plausible,Implausible,False,0,0,4.431216,1931,3,0,0.03862,0.03862


In [7]:
# Same positional 150 as the CoT section (presumably the sample size).
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(150)
# NOTE(review): model_name is hard-coded to "text-davinci-003" rather than
# EVAL_LLM, yet the following run cell tags its output file with EVAL_LLM.
# NOTE(review): the few-shot prompt is GSM8K_REACT although EVAL_DATASET is
# "strategy_qa" — confirm this is deliberate.
react = ReactExtraTool(model_name="text-davinci-003", available_tools=["Google", "Wikipedia", "WolframAlpha", "Calculator", "LLM"], fewshot=fewshots.GSM8K_REACT, verbose=False)
# Rebinds `eval` (which also shadows the built-in) to the ReAct evaluator.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=react)

In [8]:
# Run the ReAct evaluation; the summary in `response` is displayed by the last line.
response, data = eval.run()
# NOTE(review): the file name is tagged with EVAL_LLM ("gpt-3.5-turbo"), but
# `react` was constructed with model_name="text-davinci-003", so the CSV name
# does not match the model passed to the algorithm. Confirm which one is
# intended and align the two.
df = save_data(dataset, data, SAVE_DIR.format("react", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 150/150 [22:07<00:00,  8.85s/it]


{'avg_em': 0.6466666666666666,
 'avg_f1': 0.6466666666666666,
 'avg_acc': 0.6466666666666666,
 'avg_wall_time': 7.106342790921529,
 'avg_total_tokens': 1686.32,
 'avg_total_cost': 0.03372064,
 'avg_steps': 2.5866666666666664,
 'avg_token_cost': 0.03372064,
 'avg_tool_cost': 0.0}

In [9]:
# Preview the first rows of the ReAct results.
df.head()

Unnamed: 0,input,target,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Answer with 'Yes' or 'No': Can you use the T-M...,No,No,No,True,1.0,1,3.935129,1279,2,0,0.02558,0.02558
1,Answer with 'Yes' or 'No': Is the Holy Land im...,No,No,Yes,False,0.0,0,6.141816,1179,2,0,0.02358,0.02358
2,Answer with 'Yes' or 'No': Could jockey win T...,No,No,No,True,1.0,1,6.68857,1204,2,0,0.02408,0.02408
3,Answer with 'Yes' or 'No': Do Star Wars fans s...,No,No,No,True,1.0,1,4.330515,1209,2,0,0.02418,0.02418
4,Answer with 'Yes' or 'No': Would a Durian be d...,Yes,Yes,Yes,True,1.0,1,3.417613,1194,2,0,0.02388,0.02388


## PWSBase

In [16]:
# Same positional 150 as the CoT and ReAct sections (presumably the sample size).
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(150)
# Planner and solver are configured with different models: the planner is
# hard-coded to "text-davinci-003", the solver uses EVAL_LLM.
# PWS_Base is not imported by name; presumably it comes from the
# `from algos.PWS import *` star import — verify.
pws = PWS_Base(available_tools=["Google", "Wikipedia", "WolframAlpha", "Calculator", "LLM"], fewshot=fewshots.TRIVIAQA_PWS, planner_model="text-davinci-003", solver_model=EVAL_LLM)
# Rebinds `eval` (which also shadows the built-in) to the PWS evaluator.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=pws)

In [17]:
# Run the PWS evaluation; the summary in `response` is displayed by the last line.
response, data = eval.run()
# The file name carries only EVAL_LLM (the solver model); the planner model
# set in the setup cell is not reflected in it.
df = save_data(dataset, data, SAVE_DIR.format("pws", EVAL_LLM))
response


******************* Start Evaluation *******************





  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 150/150 [22:22<00:00,  8.95s/it]


{'avg_em': 0.66,
 'avg_f1': 0.66,
 'avg_acc': 0.66,
 'avg_wall_time': 7.068437215487163,
 'avg_total_tokens': 1287.0266666666666,
 'avg_total_cost': 0.024350040000000003,
 'avg_steps': 3.2,
 'avg_token_cost': 0.017683373333333335,
 'avg_tool_cost': 0.006666666666666667}

In [17]:
# Preview the first ten rows of the PWS results.
# NOTE(review): the recorded output has "question"/"answer" columns and
# arithmetic word problems, not the "input"/"target" Yes/No rows seen in the
# other sections — it appears to come from a run on a different dataset.
# Re-run or clear it.
df.head(10)

Unnamed: 0,question,answer,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Gary manages two Amazon distribution centers. ...,First find how many packages the second center...,14000,14000.0,False,0.0,0,5.534352,884,2,0.0,0.004018,0.004018
1,The square footage of the two bedrooms in the ...,Let M be the number of square feet in Matha's ...,120,Martha's bedroom is 120 square feet.,False,0.285714,1,5.54037,803,3,0.0,0.001768,0.001768
2,"Thomas, Toby, and Rebecca worked a total of 15...",Toby: 2x-10\nRebecca: (2x-10)-8=2x-18\nTotal:2...,56,9 hours.,False,0.0,0,6.390907,842,3,0.0,0.001882,0.001882
3,Two-thirds of the class have brown eyes. Half ...,There are 6 * 2 = <<6*2=12>>12 students with b...,18,There are 18 students in total.,False,0.285714,1,7.168095,1030,3,0.0,0.006488,0.006488
4,"Jackie spends 8 hours working, 3 hours of exer...",Jackie spends a total of 8 + 8 + 3 = <<8+8+3=1...,5,5 hours.,False,0.666667,1,7.389826,954,3,0.0,0.006084,0.006084
5,If Layla scored 104 goals in four hockey games...,If Layla scored 104 goals in four hockey games...,92,212.0,False,0.0,0,6.634716,992,3,0.0,0.00625,0.00625
6,Bugs are thriving in a garden of 30 plants. Th...,The bugs ate enough plants for there to be 30-...,4,10 plants remain.,False,0.0,0,7.885642,1194,2,0.0,0.009264,0.009264
7,A family is going to the amusement park. The a...,To solve this problem you can set up an equati...,58,$58,True,1.0,1,8.923775,1124,2,0.0,0.008512,0.008512
8,A retailer sells any shirt for the same price ...,If I paid $120 and I was refunded 25% for all ...,45,The price of 1 shirt at this place is $45.,False,0.2,1,6.289263,1069,3,0.0,0.004316,0.004316
9,Half of Taylor's house guests like weak coffee...,He uses 1 tablespoon for weak coffee and doubl...,36,"[60.0, 120.0] (total number of tablespoons of ...",False,0.0,0,11.993834,1299,4,0.0,0.009168,0.009168


In [5]:
# NOTE(review): the recorded output is a NameError (`dataset` not defined), so
# this cell was last run in a kernel where no loading cell had executed.
# The frames previewed in the IO and ReAct sections have an "input" column
# rather than "question" — verify the column name before re-running.
dataset["question"][6]

NameError: name 'dataset' is not defined

In [9]:
# NOTE(review): `pwsbase` is not defined anywhere in the visible notebook (the
# PWS setup cell binds the instance to `pws`), so this cell presumably fails
# on a fresh kernel. It also overwrites `response`, which earlier cells use
# for the evaluation summary.
response = pwsbase.run(dataset["question"][6])

In [10]:
# Print the planner log stored under the "planner_log" key of the single-run response.
print(response["planner_log"])

For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E (#E1, #E2, ...) that can be called by later tools.

Tools can be one of the following:
WolframAlpha[input]: A WolframAlpha search engine. Useful when you need to solve a complicated Mathematical or Algebraic equation. Input should be an equation or function.
Calculator[input]: A calculator that can compute arithmetic expressions. Useful when you need to perform math calculations. Input should be a mathematical expression
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

For Example:
Thomas, Toby, and Rebecca worked a total of 157 hours in one week.  Thomas worked x hours.  Toby worked 10 hours less than twice

In [12]:
# Print the solver log stored under the "solver_log" key of the single-run response.
print(response["solver_log"])

Solve the following task or problem. To assist you, we provide some plans and corresponding evidences that might be helpful. Notice that some of these information contain noise so you should trust them with caution.

Bugs are thriving in a garden of 30 plants. The bugs are very hungry today and completely ate 20 plants. The next day, they were still full and only ate half of the remaining plants. After that, they ate only 1 of the plants that were left. How many plants remain?
Plan: Calculate the number of plants left after each day and subtract the number of plants eaten.
Evidence:
19

Now begin to solve the task or problem. Respond with the answer directly with no extra words.

Bugs are thriving in a garden of 30 plants. The bugs are very hungry today and completely ate 20 plants. The next day, they were still full and only ate half of the remaining plants. After that, they ate only 1 of the plants that were left. How many plants remain?
8 plants remain.
