In [2]:
import os

import pandas as pd

# Load API credentials from local key files into environment variables.
# Each key file is expected to hold the key on its first line.
# The directory defaults to the original location but can be overridden with
# the PWS_KEY_DIR environment variable, so the notebook is not tied to one
# machine's home directory.
_KEY_DIR = os.environ.get("PWS_KEY_DIR", "/home/billxbf/Documents/myks")

# Environment variable to set -> key file name inside _KEY_DIR.
_KEY_FILES = {
    "OPENAI_API_KEY": "openai.key",
    "SERPAPI_API_KEY": "serpapi.key",
    "WOLFRAM_ALPHA_APPID": "wolfram.key",
}

for _env_var, _filename in _KEY_FILES.items():
    _key_path = os.path.join(_KEY_DIR, _filename)
    with open(_key_path, "r") as f:
        # Only the first line is used; the trailing newline/whitespace is stripped.
        _key = f.readline().strip()
    if not _key:
        # Fail here with a clear message instead of a bare IndexError (empty
        # file) or an obscure authentication failure later (blank first line).
        raise ValueError("Key file {} is empty".format(_key_path))
    os.environ[_env_var] = _key

In [3]:
from utils.DataLoader import DataLoader
from utils.Evaluator import Evaluator
from algos.PWS import *
from algos.react import ReactBase, ReactExtraTool
from algos.notool import IO, CoT
from prompts import fewshots, solver, planner

In [4]:
def save_data(dataset, data, save_path):
    """Attach per-example evaluation results to ``dataset`` and write it to CSV.

    Args:
        dataset: DataFrame holding the evaluated examples. It is modified in
            place: one column is added (or overwritten) per result field.
        data: Mapping that provides a value for every result field listed in
            ``result_columns`` below.
        save_path: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first.

    Returns:
        The same ``dataset`` object, now carrying the result columns.

    Raises:
        KeyError: If ``data`` lacks one of the expected result fields.
    """
    import os

    # Column order is preserved from the original implementation so the CSV
    # layout stays the same.
    result_columns = (
        "label",
        "preds",
        "em",
        "f1",
        "acc",
        "wall_time",
        "total_tokens",
        "steps",
        "tool_cost",
        "token_cost",
        "total_cost",
    )
    for column in result_columns:
        dataset[column] = data[column]

    # to_csv does not create directories; make sure e.g. "results/" exists.
    # Non-path targets (file handles/buffers) are passed through untouched.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    dataset.to_csv(save_path, index=False)
    return dataset

In [5]:
# Experiment configuration shared by the evaluation sections below.
EVAL_LLM = "gpt-3.5-turbo"  # model name passed to the algorithms and used in result filenames
EVAL_DATASET = "disfl_qa"   # dataset/task identifier passed to DataLoader and Evaluator
SEED = 2024                 # seed passed to DataLoader
# Template for result CSV paths (despite the name, this is a file path
# template, not a directory; the name is kept because later cells use it).
# The dataset part is derived from EVAL_DATASET so the two cannot drift apart;
# the two remaining "{}" slots are filled via .format(<algo tag>, <model name>).
SAVE_DIR = "results/" + EVAL_DATASET + "_{}_{}.csv"

## Standard IO

In [6]:
# Standard-IO baseline: load the evaluation sample and wire it to the IO algorithm.
# sample_size=150 is forwarded to DataLoader.load (presumably the number of
# examples to draw; confirm in utils.DataLoader).
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(sample_size=150)
io = IO(model_name=EVAL_LLM)
# NOTE(review): the name `eval` shadows Python's built-in eval() in this
# notebook's namespace; later cells call eval.run(), so a rename must touch them too.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=io)

In [7]:
# Run the Standard-IO evaluation. run() returns a pair: `response` (shown as
# the cell output) and `data` (the per-example fields consumed by save_data).
response, data = eval.run()
# save_data adds the result columns to `dataset` in place, writes the CSV and
# returns that same frame.
df = save_data(dataset, data, SAVE_DIR.format("io", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 150/150 [08:52<00:00,  3.55s/it]


{'avg_em': 0.12666666666666668,
 'avg_f1': 0.283701574048705,
 'avg_acc': 0.4066666666666667,
 'avg_wall_time': 1.5868335469563801,
 'avg_total_tokens': 234.5,
 'avg_total_cost': 0.00046899999999999996,
 'avg_steps': 1.0,
 'avg_token_cost': 0.00046899999999999996,
 'avg_tool_cost': 0.0}

In [8]:
# Preview the first rows of the Standard-IO results frame.
df.head()

Unnamed: 0,input,target,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,By which year did Dodge ugh dammit Chrysler en...,1981,1981,Chrysler ended production of their full-sized ...,False,0.133333,1,1.533049,214,1,0,0.000428,0.000428
1,What did France offer that was rare by imperia...,Christianity and French culture,Christianity and French culture,France offered the upliftment of other regions...,False,0.533333,0,1.009747,185,1,0,0.00037,0.00037
2,What did Iraq wait no actually Saudi Arabia tr...,domestic Islamists,domestic Islamists,Saudi Arabia tried to repress domestic Islamis...,False,0.333333,0,1.02409,216,1,0,0.000432,0.000432
3,In which county does Duval I mean Jacksonville...,Duval,Duval,Duval County.,False,0.666667,1,0.869765,175,1,0,0.00035,0.00035
4,What allows the adaptive immune system to reta...,immunological memory,immunological memory,Immunological memory.,True,1.0,1,0.873788,212,1,0,0.000424,0.000424


## CoT

In [18]:
# Chain-of-Thought baseline: reload the sample with the same dataset name and seed.
# 150 is passed positionally here (presumably the same sample_size as the
# Standard-IO cell; confirm against DataLoader.load's signature).
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(150)
# NOTE(review): the few-shot prompt is fewshots.TRIVIAQA_COT while EVAL_DATASET
# is "disfl_qa"; confirm this reuse is intended.
cot = CoT(fewshot=fewshots.TRIVIAQA_COT, model_name=EVAL_LLM)
# Rebinds `eval` (which also shadows the built-in eval()) to the CoT evaluator.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=cot)

In [12]:
# Run the CoT evaluation; `response` is displayed as the cell output and
# `data` carries the per-example fields consumed by save_data.
response, data = eval.run()
# Results go to the SAVE_DIR template filled with the "cot" tag and EVAL_LLM.
df = save_data(dataset, data, SAVE_DIR.format("cot", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 150/150 [11:53<00:00,  4.76s/it]


{'avg_em': 0.06666666666666667,
 'avg_f1': 0.21819442183956578,
 'avg_acc': 0.38666666666666666,
 'avg_wall_time': 2.8440882126490274,
 'avg_total_tokens': 347.64,
 'avg_total_cost': 0.0006952799999999999,
 'avg_steps': 0.3333333333333333,
 'avg_token_cost': 0.0006952799999999999,
 'avg_tool_cost': 0.0}

In [16]:
# Preview the first rows of the CoT results frame.
df.head()

Unnamed: 0,input,target,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,By which year did Dodge ugh dammit Chrysler en...,1981,1981,The end of the 1981 model year.,False,0.333333,1,1.295294,294,0,0,0.000588,0.000588
1,What did France offer that was rare by imperia...,Christianity and French culture,Christianity and French culture,"France offered full citizenship rights, or ""as...",False,0.145455,0,2.947282,327,0,0,0.000654,0.000654
2,What did Iraq wait no actually Saudi Arabia tr...,domestic Islamists,domestic Islamists,Saudi Arabia tried to repress domestic Islamis...,False,0.210526,0,1.635472,313,0,0,0.000626,0.000626
3,In which county does Duval I mean Jacksonville...,Duval,Duval,Duval County,False,0.666667,1,0.737179,264,0,0,0.000528,0.000528
4,What allows the adaptive immune system to reta...,immunological memory,immunological memory,The immunological memory allows the adaptive i...,False,0.166667,0,1.836538,325,0,0,0.00065,0.00065


## ReAct

In [31]:
# ReAct baseline. Note that 75 is passed to load() here, whereas the
# Standard-IO and CoT cells pass 150.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(75)
# NOTE(review): model_name is hard-coded to "text-davinci-003" instead of
# EVAL_LLM ("gpt-3.5-turbo"), so this run uses a different model than the
# other baselines; confirm this is intended.
react = ReactExtraTool(model_name="text-davinci-003", available_tools=["Google", "Wikipedia", "WolframAlpha", "Calculator", "LLM"], fewshot=fewshots.DEFAULT_REACT, verbose=False)
# Rebinds `eval` (which also shadows the built-in eval()) to the ReAct evaluator.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=react)



In [32]:
# Run the ReAct evaluation; `response` is displayed as the cell output and
# `data` carries the per-example fields consumed by save_data.
response, data = eval.run()
# NOTE(review): the filename is tagged with EVAL_LLM ("gpt-3.5-turbo"), but the
# ReAct agent in the setup cell is built with model_name="text-davinci-003";
# the saved CSV name may therefore mislabel the model. Confirm which is intended.
df = save_data(dataset, data, SAVE_DIR.format("react", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 75/75 [28:00<00:00, 22.41s/it]


{'avg_em': 0.05333333333333334,
 'avg_f1': 0.1385626986739189,
 'avg_acc': 0.18666666666666668,
 'avg_wall_time': 16.147707431212716,
 'avg_total_tokens': 4162.608695652174,
 'avg_total_cost': 0.009105739130434783,
 'avg_steps': 4.318840579710145,
 'avg_token_cost': 0.009105739130434783,
 'avg_tool_cost': 0.0}

In [33]:
# Preview the first rows of the ReAct results frame.
df.head()

Unnamed: 0,input,target,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,By which year did Dodge ugh dammit Chrysler en...,1981,1981,Dodge Chrysler ended production of their full-...,False,0.125,0,16.729712,3711.0,5.0,0.0,0.007962,0.007962
1,What did France offer that was rare by imperia...,Christianity and French culture,Christianity and French culture,France offered education as part of their civi...,False,0.05,0,38.310331,15930.0,8.0,0.0,0.032976,0.032976
2,What did Iraq wait no actually Saudi Arabia tr...,domestic Islamists,domestic Islamists,Saudi Arabia tried to repress Shiites to compe...,False,0.0,0,18.123836,5866.0,5.0,0.0,0.012416,0.012416
3,In which county does Duval I mean Jacksonville...,Duval,Duval,Duval County.,False,0.666667,1,3.91635,1080.0,2.0,0.0,0.002592,0.002592
4,What allows the adaptive immune system to reta...,immunological memory,immunological memory,Memory allows the adaptive immune system to re...,False,0.095238,0,13.834762,2855.0,4.0,0.0,0.00643,0.00643


In [30]:
# Dump whatever `data` currently holds (the second value returned by the most
# recently executed eval.run()) to a JSON file.
# NOTE(review): the output path is hard-coded to a gsm8k/ReAct filename although
# EVAL_DATASET is "disfl_qa" in this notebook, and the execution count (In [30])
# precedes the ReAct cells above. This looks like a leftover from an earlier
# run; confirm the intended target before re-running.
import json
with open("results/gsm8k_react_chat.json", "w") as f:
    json.dump(data, f)

## PWSBase

In [8]:
# PWS (planner + solver) setup on a 10-item load of EVAL_DATASET.
# NOTE(review): the few-shot prompt (fewshots.GSM8K_PWS) and the tool list are
# math-oriented, and the saved output of this cell reports a cached gsm8k
# dataset. The cell appears to have last been executed with a gsm8k
# configuration, not the current EVAL_DATASET ("disfl_qa"); confirm before re-running.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(10)
pwsbase = PWS_Base(fewshot=fewshots.GSM8K_PWS, available_tools=["WolframAlpha", "Calculator", "LLM"], planner_model=EVAL_LLM, solver_model=EVAL_LLM)
# Override the planner's prompt suffix with the "resource reluctant" variant from prompts.planner.
pwsbase.planner.suffix = planner.RESOURCE_RELUCTANT_SUFFIX
# Rebinds `eval` (which also shadows the built-in eval()) to the PWS evaluator.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=pwsbase)

Found cached dataset gsm8k (/home/billxbf/workspace/PWS/data/gsm8k/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)


  0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Run the PWS evaluation; `response` is displayed as the cell output and
# `data` carries the per-example fields consumed by save_data.
response, data = eval.run()
# Use the shared SAVE_DIR template like the other run cells. The previous
# hard-coded "results/gsm8k_pwsbase_chat.csv" would store results for the
# dataset loaded via EVAL_DATASET under a gsm8k filename.
df = save_data(dataset, data, SAVE_DIR.format("pwsbase", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 10/10 [01:44<00:00, 10.45s/it]


{'avg_em': 0.1,
 'avg_f1': 0.24380952380952384,
 'avg_acc': 0.5,
 'avg_wall_time': 7.375077843666077,
 'avg_total_tokens': 1019.1,
 'avg_total_cost': 0.005775000000000001,
 'avg_steps': 2.8,
 'avg_token_cost': 0.005775000000000001,
 'avg_tool_cost': 0.0}

In [17]:
# Show up to the first 10 rows of the PWS results frame.
df.head(10)

Unnamed: 0,question,answer,label,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Gary manages two Amazon distribution centers. ...,First find how many packages the second center...,14000,14000.0,False,0.0,0,5.534352,884,2,0.0,0.004018,0.004018
1,The square footage of the two bedrooms in the ...,Let M be the number of square feet in Matha's ...,120,Martha's bedroom is 120 square feet.,False,0.285714,1,5.54037,803,3,0.0,0.001768,0.001768
2,"Thomas, Toby, and Rebecca worked a total of 15...",Toby: 2x-10\nRebecca: (2x-10)-8=2x-18\nTotal:2...,56,9 hours.,False,0.0,0,6.390907,842,3,0.0,0.001882,0.001882
3,Two-thirds of the class have brown eyes. Half ...,There are 6 * 2 = <<6*2=12>>12 students with b...,18,There are 18 students in total.,False,0.285714,1,7.168095,1030,3,0.0,0.006488,0.006488
4,"Jackie spends 8 hours working, 3 hours of exer...",Jackie spends a total of 8 + 8 + 3 = <<8+8+3=1...,5,5 hours.,False,0.666667,1,7.389826,954,3,0.0,0.006084,0.006084
5,If Layla scored 104 goals in four hockey games...,If Layla scored 104 goals in four hockey games...,92,212.0,False,0.0,0,6.634716,992,3,0.0,0.00625,0.00625
6,Bugs are thriving in a garden of 30 plants. Th...,The bugs ate enough plants for there to be 30-...,4,10 plants remain.,False,0.0,0,7.885642,1194,2,0.0,0.009264,0.009264
7,A family is going to the amusement park. The a...,To solve this problem you can set up an equati...,58,$58,True,1.0,1,8.923775,1124,2,0.0,0.008512,0.008512
8,A retailer sells any shirt for the same price ...,If I paid $120 and I was refunded 25% for all ...,45,The price of 1 shirt at this place is $45.,False,0.2,1,6.289263,1069,3,0.0,0.004316,0.004316
9,Half of Taylor's house guests like weak coffee...,He uses 1 tablespoon for weak coffee and doubl...,36,"[60.0, 120.0] (total number of tablespoons of ...",False,0.0,0,11.993834,1299,4,0.0,0.009168,0.009168


In [5]:
# Inspect the example at index 6.
# NOTE(review): the saved output of this cell is a NameError, so it was executed
# in a kernel where `dataset` was not yet defined. Also, the "question" column
# appears only in the gsm8k preview; the disfl_qa previews show an "input"
# column instead. Verify the column name for the current EVAL_DATASET.
dataset["question"][6]

NameError: name 'dataset' is not defined

In [9]:
# Run the PWS pipeline on the single example at index 6 to inspect its logs.
# This rebinds `response`, replacing the summary returned by eval.run().
response = pwsbase.run(dataset["question"][6])

In [10]:
# Print the text stored under "planner_log" in the single-example response.
print(response["planner_log"])

For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E (#E1, #E2, ...) that can be called by later tools.

Tools can be one of the following:
WolframAlpha[input]: A WolframAlpha search engine. Useful when you need to solve a complicated Mathematical or Algebraic equation. Input should be an equation or function.
Calculator[input]: A calculator that can compute arithmetic expressions. Useful when you need to perform math calculations. Input should be a mathematical expression
LLM[input]: A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.

For Example:
Thomas, Toby, and Rebecca worked a total of 157 hours in one week.  Thomas worked x hours.  Toby worked 10 hours less than twice

In [12]:
# Print the text stored under "solver_log" in the single-example response.
print(response["solver_log"])

Solve the following task or problem. To assist you, we provide some plans and corresponding evidences that might be helpful. Notice that some of these information contain noise so you should trust them with caution.

Bugs are thriving in a garden of 30 plants. The bugs are very hungry today and completely ate 20 plants. The next day, they were still full and only ate half of the remaining plants. After that, they ate only 1 of the plants that were left. How many plants remain?
Plan: Calculate the number of plants left after each day and subtract the number of plants eaten.
Evidence:
19

Now begin to solve the task or problem. Respond with the answer directly with no extra words.

Bugs are thriving in a garden of 30 plants. The bugs are very hungry today and completely ate 20 plants. The next day, they were still full and only ate half of the remaining plants. After that, they ate only 1 of the plants that were left. How many plants remain?
8 plants remain.
