# Evaluation on TriviaQA

In [1]:
import os

import nodes.Worker

# Load API credentials from local key files and export them as environment variables.
# The key directory defaults to the original author's location; set PWS_KEY_DIR to run
# this notebook on another machine without editing the cell.
KEY_DIR = os.environ.get("PWS_KEY_DIR", "/home/billxbf/Documents/myks")

# Each key file holds the secret on its first line; only that line is read.
for env_var, key_file in (
    ("OPENAI_API_KEY", "openai.key"),
    ("SERPAPI_API_KEY", "serpapi.key"),
):
    key_path = os.path.join(KEY_DIR, key_file)
    with open(key_path, "r") as f:
        api_key = f.readline().strip()
    if not api_key:
        # Fail loudly here rather than with an IndexError or an empty credential later on.
        raise ValueError(f"No API key found on the first line of {key_path}")
    os.environ[env_var] = api_key

In [2]:
from utils.DataLoader import DataLoader
from utils.Evaluator import Evaluator
from algos.PWS import *
from algos.react import ReactBase
from algos.notool import IO, CoT
from prompts import fewshots

In [3]:
def save_data(dataset, data, save_path):
    """Attach per-example evaluation results to ``dataset`` and write it to CSV.

    Args:
        dataset: pandas DataFrame of evaluation examples. It is modified in
            place: one column is added (or overwritten) per result field.
        data: Mapping that provides the per-example results under the keys
            ``preds``, ``em``, ``f1``, ``acc``, ``wall_time``, ``total_tokens``,
            ``steps``, ``tool_cost``, ``token_cost`` and ``total_cost``.
        save_path: Destination of the CSV file. When it is a filesystem path,
            missing parent directories are created first.

    Returns:
        The same ``dataset`` object, now carrying the result columns.

    Raises:
        KeyError: If ``data`` lacks one of the expected keys.
    """
    result_columns = (
        "preds",
        "em",
        "f1",
        "acc",
        "wall_time",
        "total_tokens",
        "steps",
        "tool_cost",
        "token_cost",
        "total_cost",
    )
    for column in result_columns:
        dataset[column] = data[column]

    # An evaluation run can take hours; do not lose it because the output
    # folder is missing. Buffers/handles are passed straight through to pandas.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    dataset.to_csv(save_path, index=False)
    return dataset

In [4]:
# Shared settings for every evaluation section in this notebook.
# Model name handed to IO / CoT / ReactBase and used as both planner and solver for PWS_Base.
EVAL_LLM = "gpt-3.5-turbo"
# Dataset identifier passed to DataLoader and as the Evaluator task.
EVAL_DATASET = "trivia_qa"
# Seed passed to DataLoader so the sections load their examples with the same seed.
SEED = 2024

## Standard IO

In [9]:
# Standard IO baseline (from algos.notool): load 500 examples and wrap the algorithm in an Evaluator.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
io = IO(model_name=EVAL_LLM)
# NOTE(review): `eval` shadows Python's builtin eval(); the following cell calls eval.run(),
# so any rename has to be applied to both cells together.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=io)

Found cached dataset trivia_qa (/home/billxbf/workspace/PWS/data/trivia_qa/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Run the Standard IO evaluation, persist the per-example results, and display the aggregate metrics.
response, data = eval.run()
# File name follows the "trivia_qa_<algo>_chat.csv" pattern used by the CoT, ReAct and PWS cells.
df = save_data(dataset, data, "results/trivia_qa_io_chat.csv")
response


******************* Start Evaluation *******************



100%|██████████| 500/500 [21:28<00:00,  2.58s/it] 


{'avg_em': 0.642,
 'avg_f1': 0.7401808725599343,
 'avg_acc': 0.806,
 'avg_wall_time': 0.8073866505622864,
 'avg_total_tokens': 43.466,
 'avg_total_cost': 8.693200000000001e-05,
 'avg_steps': 1.0,
 'avg_token_cost': 8.693200000000001e-05,
 'avg_tool_cost': 0.0}

In [11]:
# Preview the first 10 examples together with the result columns added by save_data.
df.head(10)

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,"""What was the first name of the character """"Ri...","{'aliases': ['Ruperts', 'Rupert', 'RUPERT', 'R...",Rigsby.,False,0.0,0,1.242188,47,1,0,9.4e-05,9.4e-05
1,What seven letter word is the name of the char...,"{'aliases': ['Snellen (disambiguation)', 'SNEL...",Snellen.,True,1.0,1,1.000442,48,1,0,9.6e-05,9.6e-05
2,Which Asian capital city is known as Krung The...,{'aliases': ['Krung-devamahanagara amararatana...,Bangkok.,True,1.0,1,1.030027,47,1,0,9.4e-05,9.4e-05
3,Jack Bauer is the main character in which TV s...,"{'aliases': ['24', 'twenty-four'], 'normalized...",24,True,1.0,1,1.270002,31,1,0,6.2e-05,6.2e-05
4,Which US singer's real name is Ernest Evans?,"{'aliases': ['Chubby Checker'], 'normalized_al...",Chubby Checker.,True,1.0,1,1.023786,33,1,0,6.6e-05,6.6e-05
5,Braxy is a fatal bacterial infection in which ...,"{'aliases': ['Sheep', 'Ovis aries', 'Domestic ...",Sheep.,True,1.0,1,0.978746,33,1,0,6.6e-05,6.6e-05
6,The first coin-operated parking meter in the U...,"{'aliases': ['one thousand, nine hundred and t...",1935.,True,1.0,1,0.992506,47,1,0,9.4e-05,9.4e-05
7,What giant bird was hunted to extinction by Mā...,"{'aliases': ['Emeidae', 'Moaspecies', 'Wingles...",Moa.,True,1.0,1,1.031404,38,1,0,7.6e-05,7.6e-05
8,With which heavy metal pop group is Rick Allen...,"{'aliases': ['Def leppard let's go', 'Def lepa...",Def Leppard.,True,1.0,1,0.644419,34,1,0,6.8e-05,6.8e-05
9,"In Sharia law, what is the specific punishment...","{'aliases': ['Lapidate', 'Stoned (punishment)'...",The specific punishment for adultery in Sharia...,False,0.428571,1,1.675104,45,1,0,9e-05,9e-05


## CoT

In [8]:
# Chain-of-thought baseline: same dataset name, seed and size as the Standard IO cell,
# prompted with the TriviaQA CoT few-shot examples.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
cot = CoT(fewshot=fewshots.TRIVIAQA_COT, model_name=EVAL_LLM)
# NOTE(review): `eval` shadows Python's builtin eval(); the following cell depends on this name.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=cot)

Found cached dataset trivia_qa (/home/billxbf/workspace/PWS/data/trivia_qa/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Run the CoT evaluation, persist the per-example results, and display the aggregate metrics.
response, data = eval.run()
df = save_data(dataset, data, "results/trivia_qa_cot_chat.csv")
response


******************* Start Evaluation *******************



100%|██████████| 500/500 [1:04:13<00:00,  7.71s/it] 


{'avg_em': 0.6,
 'avg_f1': 0.7173426166426166,
 'avg_acc': 0.786,
 'avg_wall_time': 5.3582473263740535,
 'avg_total_tokens': 199.12,
 'avg_total_cost': 0.00039824000000000006,
 'avg_steps': 2.088,
 'avg_token_cost': 0.00039824000000000006,
 'avg_tool_cost': 0.0}

In [10]:
# Preview the first CoT rows with the result columns added by save_data.
df.head()

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,"""What was the first name of the character """"Ri...","{'aliases': ['Ruperts', 'Rupert', 'RUPERT', 'R...",Rupert.,True,1.0,1,6.099105,225,3,0,0.00045,0.00045
1,What seven letter word is the name of the char...,"{'aliases': ['Snellen (disambiguation)', 'SNEL...",Snellen,True,1.0,1,5.031381,193,2,0,0.000386,0.000386
2,Which Asian capital city is known as Krung The...,{'aliases': ['Krung-devamahanagara amararatana...,"Bangkok, Thailand on the Chao Phraya River.",False,0.285714,1,5.260989,206,2,0,0.000412,0.000412
3,Jack Bauer is the main character in which TV s...,"{'aliases': ['24', 'twenty-four'], 'normalized...",24,True,1.0,1,2.252567,147,1,0,0.000294,0.000294
4,Which US singer's real name is Ernest Evans?,"{'aliases': ['Chubby Checker'], 'normalized_al...",Chubby Checker,True,1.0,1,4.328381,177,2,0,0.000354,0.000354


In [13]:
# Ad-hoc check outside the evaluation flow: run WikipediaWorker on the query "Bakewell"
# and display what it returns.
# NOTE(review): `nodes.Worker` is already imported in the first cell; this mid-notebook
# import could live in the import cell instead.
from nodes.Worker import WikipediaWorker
work = WikipediaWorker()
work.run("Bakewell")

'Bakewell is a market town and civil parish in the Derbyshire Dales district of Derbyshire, England, known for Bakewell pudding. It lies on the River Wye, 13 miles (21 km) south-west of Sheffield. At the 2011 census, the population of the civil parish was 3,949. It was estimated at 3,695 in 2019. The town is close to the tourist attractions of Chatsworth House and Haddon Hall.'

In [11]:
# NOTE(review): this prints the HotpotQA ReAct prompt, while the ReAct cells in this notebook
# pass fewshots.TRIVIAQA_REACT — confirm which prompt was meant to be inspected here.
print(fewshots.HOTPOTQA_REACT)

Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.

Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
Action: Search[Colorado orogeny]
Observation: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
Thought: It does not mention the eastern sector. So 

## ReAct

In [9]:
# ReAct agent setup. Note the smaller sample: 50 examples here versus 500 in the IO, CoT and PWS cells.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(50)
# max_iter=8 presumably caps the number of agent iterations per question — TODO confirm in ReactBase.
react = ReactBase(fewshot=fewshots.TRIVIAQA_REACT, model_name=EVAL_LLM, max_iter=8, verbose=False)
# NOTE(review): `eval` shadows Python's builtin eval(); a later cell depends on this name.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=react)

Found cached dataset trivia_qa (/home/billxbf/workspace/PWS/data/trivia_qa/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Side experiment: repeat the ReAct evaluation on a 50-example sample drawn with a different seed.
# The cell keeps its own variable names so that `dataset`, `react`, `eval`, `response` and `data`
# from the main ReAct setup are not overwritten; with shared names, a top-to-bottom run would
# evaluate and save this sample as the main ReAct result.
ALT_SEED = 111  # deliberately differs from SEED — presumably to gauge sampling variance; TODO confirm
dataset_alt = DataLoader(EVAL_DATASET, seed=ALT_SEED).load(50)
react_alt = ReactBase(fewshot=fewshots.TRIVIAQA_REACT, model_name=EVAL_LLM, max_iter=8, verbose=False)
eval_alt = Evaluator(task=EVAL_DATASET, dataset=dataset_alt, algo=react_alt)
response_alt, data_alt = eval_alt.run()
response_alt

Found cached dataset trivia_qa (/home/billxbf/workspace/PWS/data/trivia_qa/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]


******************* Start Evaluation *******************





  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 50/50 [14:26<00:00, 17.32s/it]


{'avg_em': 0.48,
 'avg_f1': 0.5696666666666667,
 'avg_acc': 0.6,
 'avg_wall_time': 13.405489727854729,
 'avg_total_tokens': 3476.2291666666665,
 'avg_total_cost': 0.006952458333333334,
 'avg_steps': 4.708333333333333,
 'avg_token_cost': 0.006952458333333334,
 'avg_tool_cost': 0.0}

In [10]:
# Run the ReAct evaluation, persist the per-example results, and display the aggregate metrics.
response, data = eval.run()
df = save_data(dataset, data, "results/trivia_qa_react_chat.csv")
response


******************* Start Evaluation *******************





  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 50/50 [18:40<00:00, 22.41s/it]


{'avg_em': 0.38,
 'avg_f1': 0.4146666666666667,
 'avg_acc': 0.48,
 'avg_wall_time': 15.643525327954974,
 'avg_total_tokens': 4949.65306122449,
 'avg_total_cost': 0.00989930612244898,
 'avg_steps': 5.714285714285714,
 'avg_token_cost': 0.00989930612244898,
 'avg_tool_cost': 0.0}

In [11]:
# Preview the first ReAct rows with the result columns added by save_data.
df.head()

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,"""What was the first name of the character """"Ri...","{'aliases': ['Ruperts', 'Rupert', 'RUPERT', 'R...",Agent stopped due to iteration limit or time l...,False,0.0,0,23.220037,5676.0,9.0,0.0,0.011352,0.011352
1,What seven letter word is the name of the char...,"{'aliases': ['Snellen (disambiguation)', 'SNEL...",Agent stopped due to iteration limit or time l...,False,0.0,0,22.95382,13692.0,9.0,0.0,0.027384,0.027384
2,Which Asian capital city is known as Krung The...,{'aliases': ['Krung-devamahanagara amararatana...,Bangkok,True,1.0,1,5.303156,1312.0,2.0,0.0,0.002624,0.002624
3,Jack Bauer is the main character in which TV s...,"{'aliases': ['24', 'twenty-four'], 'normalized...",24,True,1.0,1,5.014344,1026.0,2.0,0.0,0.002052,0.002052
4,Which US singer's real name is Ernest Evans?,"{'aliases': ['Chubby Checker'], 'normalized_al...",Ernest Evans,False,0.0,0,10.034224,2044.0,4.0,0.0,0.004088,0.004088


## PWSBase

In [10]:
# PWS setup on 500 examples; the same model (EVAL_LLM) serves as both planner and solver.
# PWS_Base is not imported by name — presumably it comes from `from algos.PWS import *`.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
pwsbase = PWS_Base(fewshot=fewshots.TRIVIAQA_PWS, planner_model=EVAL_LLM, solver_model=EVAL_LLM)
# NOTE(review): `eval` shadows Python's builtin eval(); the following cell depends on this name.
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=pwsbase)

Found cached dataset trivia_qa (/home/billxbf/workspace/PWS/data/trivia_qa/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Run the PWS evaluation, persist the per-example results, and display the aggregate metrics.
response, data = eval.run()
df = save_data(dataset, data, "results/trivia_qa_pwsbase_chat.csv")
response


******************* Start Evaluation *******************





  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 500/500 [2:05:51<00:00, 15.10s/it]  


{'avg_em': 0.518,
 'avg_f1': 0.6063827217227217,
 'avg_acc': 0.666,
 'avg_wall_time': 10.256969241622071,
 'avg_total_tokens': 1340.9378881987577,
 'avg_total_cost': 0.009595552795031055,
 'avg_steps': 3.546583850931677,
 'avg_token_cost': 0.009595552795031055,
 'avg_tool_cost': 0.0}

In [12]:
# Preview the first PWS rows with the result columns added by save_data.
df.head()

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,"""What was the first name of the character """"Ri...","{'aliases': ['Ruperts', 'Rupert', 'RUPERT', 'R...",Francis.,False,0.0,0,11.308511,972.0,4.0,0.0,0.004734,0.004734
1,What seven letter word is the name of the char...,"{'aliases': ['Snellen (disambiguation)', 'SNEL...",Snellen.,True,1.0,1,9.421635,1097.0,3.0,0.0,0.007594,0.007594
2,Which Asian capital city is known as Krung The...,{'aliases': ['Krung-devamahanagara amararatana...,Bangkok.,True,1.0,1,16.723793,544.0,2.0,0.0,0.001574,0.001574
3,Jack Bauer is the main character in which TV s...,"{'aliases': ['24', 'twenty-four'], 'normalized...",Jack Bauer is the main character in the TV ser...,False,0.2,1,8.805704,1592.0,4.0,0.0,0.003184,0.003184
4,Which US singer's real name is Ernest Evans?,"{'aliases': ['Chubby Checker'], 'normalized_al...",Chubby Checker.,True,1.0,1,6.603708,651.0,3.0,0.0,0.002688,0.002688
