# Evaluation of HotpotQA

In [2]:
import os
# Export each API key into the process environment. The key is taken from the
# first line of its key file, with surrounding whitespace stripped.
for env_name, key_path in (
    ("OPENAI_API_KEY", "/home/billxbf/Documents/myks/openai.key"),
    ("SERPAPI_API_KEY", "/home/billxbf/Documents/myks/serpapi.key"),
):
    with open(key_path, "r") as f:
        keys = f.readlines()
    os.environ[env_name] = keys[0].strip()

In [3]:
from utils.DataLoader import DataLoader
from utils.Evaluator import Evaluator
from algos.PWS import *
from algos.react import ReactBase
from algos.notool import IO, CoT
from prompts import fewshots

In [4]:
def save_data(dataset, data, save_path):
    """Attach per-sample evaluation results to ``dataset`` and write it to CSV.

    Args:
        dataset: Table of evaluated samples. It must support column assignment
            (``dataset[name] = values``) and ``to_csv``; in this notebook it is
            the object returned by ``DataLoader(...).load(...)``.
            It is modified in place.
        data: Mapping that provides one entry for each of the result columns
            listed in ``result_columns`` below.
        save_path: Destination of the CSV file. When it is a filesystem path,
            any missing parent directory is created first.

    Returns:
        The same ``dataset`` object, now carrying the result columns.

    Raises:
        KeyError: If ``data`` lacks one of the expected result entries.
    """
    # Order matters: it determines the column order in the written CSV.
    result_columns = (
        "preds",
        "em",
        "f1",
        "acc",
        "wall_time",
        "total_tokens",
        "steps",
        "tool_cost",
        "token_cost",
        "total_cost",
    )
    for column in result_columns:
        dataset[column] = data[column]

    # to_csv does not create directories, so a fresh checkout without the
    # output folder would otherwise lose the whole run at the very last step.
    # Non-path targets (e.g. file-like buffers) are passed through untouched.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    dataset.to_csv(save_path, index=False)
    return dataset

In [5]:
# Shared evaluation settings referenced by the cells below.
EVAL_LLM = "gpt-3.5-turbo"  # model name handed to each algorithm
EVAL_DATASET = "hotpot_qa"  # dataset / task identifier for DataLoader and Evaluator
SEED = 2024  # seed passed to DataLoader
# Output path template. The dataset part is derived from EVAL_DATASET so the
# two cannot drift apart; the two remaining "{}" slots are filled later via
# SAVE_DIR.format(<algorithm tag>, <model name>).
SAVE_DIR = f"results/{EVAL_DATASET}_{{}}_{{}}.csv"

## Standard IO

In [6]:
# Load a seeded sample of 500 examples and set up the evaluation of the
# standard IO algorithm on it.
# NOTE(review): `eval` shadows Python's built-in eval() from here on.
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(sample_size=500)
io = IO(model_name=EVAL_LLM)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=io)

Found cached dataset hotpot_qa (/home/billxbf/workspace/PWS/data/hotpot_qa/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Run the evaluation, write the per-sample results to the "io" CSV, and
# display the aggregate metrics as the cell output.
response, data = eval.run()
df = save_data(dataset, data, SAVE_DIR.format("io", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 500/500 [30:50<00:00,  3.70s/it]


{'avg_em': 0.28,
 'avg_f1': 0.3625966108849186,
 'avg_acc': 0.378,
 'avg_wall_time': 1.4129042415618895,
 'avg_total_tokens': 55.458,
 'avg_total_cost': 0.00011091599999999999,
 'avg_steps': 1.0,
 'avg_token_cost': 0.00011091599999999999,
 'avg_tool_cost': 0.0}

In [7]:
# Preview the first rows of the per-sample results returned by save_data.
df.head()

Unnamed: 0,question,answer,preds,em,f1,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Noble is a 2014 film written and directed by S...,The Cripple of Inishmaan,Billy Elliot the Musical.,False,0.0,0.729125,87,1,0,0.000174,0.000174
1,Who was the English clergyman and Archbishop o...,Henry Beaufort,Otto von Ziegenhain was not the English clergy...,False,0.0,2.45697,74,1,0,0.000148,0.000148
2,Which fictional character in the Harry Potter ...,Rubeus Hagrid,Hagrid.,False,0.666667,0.71447,39,1,0,7.8e-05,7.8e-05
3,What Classification is the album by UK band Se...,Trip hop,Trip hop.,True,1.0,0.603935,55,1,0,0.00011,0.00011
4,What actor from The Imitation Game also starre...,Mark Strong,Robert Downey Jr.,False,0.0,0.630275,52,1,0,0.000104,0.000104


## CoT

In [11]:
# Reload the seeded dataset and set up the chain-of-thought (CoT) evaluation.
# load(500) is presumably the sample size, as in the keyword form
# load(sample_size=500) used for the IO run — TODO confirm.
# NOTE(review): `eval` shadows Python's built-in eval().
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
cot = CoT(fewshot=fewshots.HOTPOTQA_COT, model_name=EVAL_LLM)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=cot)

Found cached dataset hotpot_qa (/home/billxbf/workspace/PWS/data/hotpot_qa/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Run the evaluation, write the per-sample results to the "cot" CSV, and
# display the aggregate metrics as the cell output.
response, data = eval.run()
df = save_data(dataset, data, SAVE_DIR.format("cot", EVAL_LLM))
response


******************* Start Evaluation *******************



 43%|████▎     | 215/500 [38:43<57:12, 12.04s/it]  Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {
  "error": {
    "message": "Request failed due to server shutdown",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Fri, 05 May 2023 04:09:42 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'text-davinci-003', 'openai-organization': 'user-bwgvrmzdtlryt3qbvfrf1vbk', 'openai-processing-ms': '5308', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '3000', 'x-ratelimit-limit-tokens': '250000', 'x-ratelimit-remaining-requests': '2999', 'x-ratelimit-rem

{'avg_em': 0.214,
 'avg_f1': 0.3086934544327207,
 'avg_acc': 0.416,
 'avg_wall_time': 8.141690133571625,
 'avg_total_tokens': 570.036,
 'avg_total_cost': 0.001140072,
 'avg_steps': 2.472,
 'avg_token_cost': 0.001140072,
 'avg_tool_cost': 0.0}

In [9]:
# Preview the first 10 rows of the per-sample results returned by save_data.
df.head(10)

Unnamed: 0,question,answer,preds,em,f1,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,In what city and state was the vocalist for E...,"Redwood City, California","Bayamón, Puerto Rico",False,0.0,4.445898,447,2,0,0.000894,0.000894
1,"What was the birth date of the director of ""Th...","December 28, 1888March","I am sorry, I cannot answer this question with...",False,0.0,1.839958,396,0,0,0.000792,0.000792
2,What is the name of the cognac house that make...,Jas Hennessy & Co.,Hennessy,False,0.5,7.578993,499,3,0,0.000998,0.000998
3,The Wisconsin Badgers football team has had a ...,The Iron Horse,"""The Horse""",False,0.666667,5.424428,468,2,0,0.000936,0.000936
4,"Which film director is older, Jan Troell or Ag...",Agnès Varda,Agnès Varda,True,1.0,6.450117,472,2,0,0.000944,0.000944
5,"Foshan Lingnan Mingzhu Gymnasium, where Boxing...",2006,2010,False,0.0,7.975955,517,2,0,0.001034,0.001034
6,In between Parenting and Cook's Illustrated wh...,Parenting,1993 to 2013.,False,0.0,8.808713,536,3,0,0.001072,0.001072
7,"In the year 2016, how many passengers used the...",11.04 million,Not enough information provided.,False,0.0,3.693008,430,1,0,0.00086,0.00086
8,What was the nationality of Robert Kardashian'...,American,American,True,1.0,7.475537,484,3,0,0.000968,0.000968
9,In what Ohio city does the highway which the n...,Cincinnati,Not enough information provided to answer the...,False,0.0,9.0093,532,3,0,0.001064,0.001064


## ReAct

In [12]:
# Reload the seeded dataset with load(50) (presumably a 50-example sample —
# TODO confirm) and set up the ReAct evaluation, capped at max_iter=8.
# NOTE(review): `eval` shadows Python's built-in eval().
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(50)
react = ReactBase(fewshot=fewshots.HOTPOTQA_REACT, model_name=EVAL_LLM, max_iter=8, verbose=False)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=react)

Found cached dataset hotpot_qa (/home/billxbf/workspace/PWS/data/hotpot_qa/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Run the evaluation, write the per-sample results to the "react" CSV, and
# display the aggregate metrics as the cell output.
response, data = eval.run()
df = save_data(dataset, data, SAVE_DIR.format("react", EVAL_LLM))
response


******************* Start Evaluation *******************



 66%|██████▌   | 33/50 [15:22<08:26, 29.78s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID ec900b2d4520f1d4efb7adcf3a23f855 in your message.).


  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 50/50 [24:14<00:00, 29.09s/it]


{'avg_em': 0.4,
 'avg_f1': 0.42696969696969694,
 'avg_acc': 0.4,
 'avg_wall_time': 23.001482283815424,
 'avg_total_tokens': 6930.170212765957,
 'avg_total_cost': 0.013860340425531912,
 'avg_steps': 5.8936170212765955,
 'avg_token_cost': 0.013860340425531912,
 'avg_tool_cost': 0.0}

In [9]:
# Preview the first 10 rows of the per-sample results returned by save_data.
df.head(10)

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,In what city and state was the vocalist for E...,"Redwood City, California",,False,0.0,0,,,,,,
1,"What was the birth date of the director of ""Th...","December 28, 1888March",,False,0.0,0,,,,,,
2,What is the name of the cognac house that make...,Jas Hennessy & Co.,Hennessy,False,0.5,1,8.891754,5092.0,3.0,0.0,0.010184,0.010184
3,The Wisconsin Badgers football team has had a ...,The Iron Horse,the Iron Horse or the Horse,False,0.666667,0,12.183595,7105.0,4.0,0.0,0.01421,0.01421
4,"Which film director is older, Jan Troell or Ag...",Agnès Varda,Jan Troell,False,0.0,0,9.318587,5263.0,3.0,0.0,0.010526,0.010526
5,"Foshan Lingnan Mingzhu Gymnasium, where Boxing...",2006,2006,True,1.0,1,7.46473,3284.0,2.0,0.0,0.006568,0.006568
6,In between Parenting and Cook's Illustrated wh...,Parenting,Parenting,True,1.0,1,10.895709,7210.0,4.0,0.0,0.01442,0.01442
7,"In the year 2016, how many passengers used the...",11.04 million,Agent stopped due to iteration limit or time l...,False,0.0,0,33.010987,21835.0,9.0,0.0,0.04367,0.04367
8,What was the nationality of Robert Kardashian'...,American,Agent stopped due to iteration limit or time l...,False,0.0,0,21.680504,14876.0,9.0,0.0,0.029752,0.029752
9,In what Ohio city does the highway which the n...,Cincinnati,Agent stopped due to iteration limit or time l...,False,0.0,0,30.278475,17950.0,9.0,0.0,0.0359,0.0359


## PWSBase

In [9]:
# Reload the seeded dataset and set up the PWS_Base evaluation; the planner
# and the solver both use EVAL_LLM.
# load(500) is presumably the sample size — TODO confirm.
# NOTE(review): `eval` shadows Python's built-in eval().
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(500)
pwsbase = PWS_Base(fewshot=fewshots.HOTPOTQA_PWS_BASE, planner_model=EVAL_LLM, solver_model=EVAL_LLM)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=pwsbase)

Found cached dataset hotpot_qa (/home/billxbf/workspace/PWS/data/hotpot_qa/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Run the evaluation, write the per-sample results to the "pws" CSV, and
# display the aggregate metrics as the cell output.
response, data = eval.run()
df = save_data(dataset, data, SAVE_DIR.format("pws", EVAL_LLM))
response


******************* Start Evaluation *******************





  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 200/200 [30:51<00:00,  9.26s/it]


{'avg_em': 0.295,
 'avg_f1': 0.41383575684758556,
 'avg_acc': 0.45,
 'avg_wall_time': 7.591724774837494,
 'avg_total_tokens': 1495.63,
 'avg_total_cost': 0.029912600000000004,
 'avg_steps': 4.095,
 'avg_token_cost': 0.029912600000000004,
 'avg_tool_cost': 0.0}

In [13]:
# Preview the first 10 rows of the per-sample results returned by save_data.
df.head(10)

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Noble is a 2014 film written and directed by S...,The Cripple of Inishmaan,Normal People (miniseries).,False,0.0,0,7.235505,1206,4,0.0,0.02412,0.02412
1,Who was the English clergyman and Archbishop o...,Henry Beaufort,Otto von Ziegenhain led the Battle of Tachov.,False,0.0,0,9.427552,1235,5,0.0,0.0247,0.0247
2,Which fictional character in the Harry Potter ...,Rubeus Hagrid,Rubeus Hagrid.,True,1.0,1,5.455595,1460,3,0.0,0.0292,0.0292
3,What Classification is the album by UK band Se...,Trip hop,Britpop.,False,0.0,0,6.557354,1226,4,0.0,0.02452,0.02452
4,What actor from The Imitation Game also starre...,Mark Strong,Benedict Cumberbatch,False,0.0,0,16.167417,2319,6,0.0,0.04638,0.04638


Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Noble is a 2014 film written and directed by S...,The Cripple of Inishmaan,Normal People (miniseries).,False,0.0,0,7.235505,1206,4,0.0,0.02412,0.02412
1,Who was the English clergyman and Archbishop o...,Henry Beaufort,Otto von Ziegenhain led the Battle of Tachov.,False,0.0,0,9.427552,1235,5,0.0,0.0247,0.0247
2,Which fictional character in the Harry Potter ...,Rubeus Hagrid,Rubeus Hagrid.,True,1.0,1,5.455595,1460,3,0.0,0.0292,0.0292
3,What Classification is the album by UK band Se...,Trip hop,Britpop.,False,0.0,0,6.557354,1226,4,0.0,0.02452,0.02452
4,What actor from The Imitation Game also starre...,Mark Strong,Benedict Cumberbatch,False,0.0,0,16.167417,2319,6,0.0,0.04638,0.04638
5,This 2006 Iowa Hawkeyes football coach played ...,Kirk Ferentz,"Yes, Kirk Ferentz played linebacker for the Un...",False,0.307692,1,9.570546,1998,5,0.0,0.03996,0.03996
6,Ikivo Animator integrates with a software appl...,Adobe Systems,Hypermedia.,False,0.0,0,4.10822,723,3,0.0,0.01446,0.01446
7,What American actress born in 1912 did the vio...,Marta Eggerth,Mary Pickford.,False,0.0,0,11.671225,2403,5,0.0,0.04806,0.04806
8,"Which plant has more species, Teucrium or Atal...",Teucrium,Teucrium has more species.,False,0.4,1,6.937577,1497,4,0.0,0.02994,0.02994
9,When was the British-American journalist and e...,3 November 1949,Unknown.,False,0.0,0,8.877709,1484,5,0.0,0.02968,0.02968


## PWSExtra

In [4]:
# Reload the seeded dataset with load(10) and set up the PWS_Extra evaluation.
# The shared config constants replace the previously hard-coded literals
# (same values), so this section follows any change to dataset, seed or model
# like the other sections do.
# NOTE(review): `eval` shadows Python's built-in eval().
dataset = DataLoader(EVAL_DATASET, seed=SEED).load(10)
pwsextra = PWS_Extra(fewshot=fewshots.HOTPOTQA_PWS_EXTRA, planner_model=EVAL_LLM, solver_model=EVAL_LLM)
eval = Evaluator(task=EVAL_DATASET, dataset=dataset, algo=pwsextra)

Found cached dataset hotpot_qa (/home/billxbf/workspace/PWS/data/hotpot_qa/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Run the evaluation, write the per-sample results to CSV, and display the
# aggregate metrics as the cell output.
# The file tag is "pws_extra" rather than "pws": the PWS_Base section already
# saves under "pws", so reusing that tag here would overwrite its results.
response, data = eval.run()
df = save_data(dataset, data, SAVE_DIR.format("pws_extra", EVAL_LLM))
response


******************* Start Evaluation *******************



100%|██████████| 10/10 [02:34<00:00, 15.45s/it]


{'avg_em': 0.0,
 'avg_f1': 0.06666666666666667,
 'avg_acc': 0.1,
 'avg_wall_time': 13.793950587511063,
 'avg_total_tokens': 973.75,
 'avg_total_cost': 0.012277,
 'avg_steps': 4.0,
 'avg_token_cost': 0.003527,
 'avg_tool_cost': 0.008749999999999999}

In [6]:
# Preview the first 10 rows of the per-sample results returned by save_data.
df.head(10)

Unnamed: 0,question,answer,preds,em,f1,acc,wall_time,total_tokens,steps,tool_cost,token_cost,total_cost
0,Noble is a 2014 film written and directed by S...,The Cripple of Inishmaan,Helen McCormick.,False,0.0,0,10.750393,1079.0,4.0,0.01,0.004048,0.014048
1,Who was the English clergyman and Archbishop o...,Henry Beaufort,,False,0.0,0,,,,,,
2,Which fictional character in the Harry Potter ...,Rubeus Hagrid,Hagrid.,False,0.666667,1,11.31754,893.0,4.0,0.0,0.003136,0.003136
3,What Classification is the album by UK band Se...,Trip hop,The album Classification by UK band Seccond Pe...,False,0.0,0,17.508473,1046.0,4.0,0.01,0.003928,0.013928
4,What actor from The Imitation Game also starre...,Mark Strong,Benedict Cumberbatch.,False,0.0,0,13.698198,977.0,4.0,0.01,0.003412,0.013412
5,This 2006 Iowa Hawkeyes football coach played ...,Kirk Ferentz,False.,False,0.0,0,24.061866,1123.0,4.0,0.01,0.005432,0.015432
6,Ikivo Animator integrates with a software appl...,Adobe Systems,Ikivo.,False,0.0,0,8.823584,858.0,4.0,0.02,0.002832,0.022832
7,What American actress born in 1912 did the vio...,Marta Eggerth,Katharine Hepburn.,False,0.0,0,11.060501,932.0,4.0,0.0,0.002854,0.002854
8,"Which plant has more species, Teucrium or Atal...",Teucrium,,False,0.0,0,,,,,,
9,When was the British-American journalist and e...,3 November 1949,The British-American journalist and editor was...,False,0.0,0,13.131051,882.0,4.0,0.01,0.002574,0.012574
