## Imports

In [None]:
!pip install -q datasets
!pip install llama_index
!pip install llama-index-embeddings-huggingface
!pip install openai
!pip install llama-index-postprocessor-cohere-rerank


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama_index
  Downloading llama_index-0.10.34-py3-none-any.whl (6.9 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama_index)
  Downloading llama_index_agent_openai-0.2.3-py3-none-any.whl (13 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama_index)
  Downloading llama_index_cli-0.1.12-py3-none-any.whl (26 kB)
Collecting llama-index-core<0.11.0,>=0.

In [None]:
from tqdm import tqdm
import os

In [None]:

from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.llms.openai import OpenAI as oai


from llama_index.postprocessor.cohere_rerank import CohereRerank

In [None]:
# OS settings for API keys.
# Keys are never written into the notebook: a value already present in the
# environment is kept, and any missing one is asked for without echoing it.
from getpass import getpass

for _key_name in ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_ENV", "COHERE_API_KEY"):
  if not os.environ.get(_key_name):
    os.environ[_key_name] = getpass("Enter " + _key_name + ": ")

In [None]:
import numpy as np
import pandas as pd

import openai
from openai import OpenAI

## Experiment with hotpot QA

In [None]:
from datasets import load_dataset

dataset = load_dataset("hotpot_qa", "distractor", split = "train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/31.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
    num_rows: 90447
})

In [None]:
def convert_list_to_sent(para_list):
  """
  Collapse every paragraph into one string.

  `para_list` is a list of paragraphs, each paragraph being a list of sentence
  strings. Returns a list with one string per paragraph, in which the
  paragraph's sentences are joined by a single space.
  """
  paragraphs = []
  for sentences in para_list:
    paragraphs.append(' '.join(sentences))
  return paragraphs

Extract all contexts that will be used to build the index for retrieval

In [None]:

# Pool the context paragraphs (each one a list of sentences) of the first 3000
# examples; the retrieval index is later built from this pool.
contexts = []
for example_idx in tqdm(range(3000)):
  contexts.extend(dataset[example_idx]['context']['sentences'])

100%|██████████| 3000/3000 [00:00<00:00, 4337.28it/s]


In [None]:
# Drop duplicate paragraphs. Paragraphs are lists of sentences, so each one is
# turned into a tuple to make it hashable. dict.fromkeys keeps first-seen order
# (set() does not), so the document order -- and therefore contexts[0] and the
# index built from this list -- is the same on every run.
# NOTE: this cell rebinds `contexts` from sentence lists to plain strings, so it
# must be run exactly once after the extraction cell above.
contexts = list(dict.fromkeys(tuple(context) for context in contexts))
contexts = convert_list_to_sent(contexts)
print(contexts[0])

William Jefferson White (December 25, 1831 – April 17, 1913) was a civil rights leader, minister, educator, and journalist in Augusta, Georgia.  He was the founder of Harmony Baptist Church in Augusta in 1869 as well as other churches.  He also was a co-founder of the Augusta Institute in 1867, which would become Morehouse College.  He also helped found Atlanta University and was a trustee of both schools.  He was a founder in 1880 and the managing editor of the "Georgia Baptist", a leading African American newspaper for many years.  He was an outspoken civil rights leader.


In [None]:
len(contexts)

28356

In [None]:
print(len(contexts))

28356


## Let's construct some data

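Although the index is built from the contexts of 3000 examples, we evaluate on only the first 100 questions. The idea is to test retrieval against a large and varied pool of distractor passages in the vector DB, while keeping the evaluation set small enough (100) to make manual analysis possible.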

In [None]:
# Evaluation set: the first 100 examples. Each row is read from the dataset once
# and reused, instead of being re-read for every field.
eval_examples = [dataset[i] for i in range(100)]

questions = [example['question'] for example in eval_examples]
supporting_titles = [example['supporting_facts']['title'] for example in eval_examples]
answers = [example['answer'] for example in eval_examples]
level = [example['level'] for example in eval_examples]

# For every question, the full text of each article that holds a supporting fact.
supporting_articles = []
for example, titles in tqdm(zip(eval_examples, supporting_titles), total = len(eval_examples)):
  # Several supporting facts can point at the same article, so de-duplicate the
  # titles. dict.fromkeys keeps first-seen order, whereas set() order can change
  # between runs and would shuffle which passage comes first.
  unique_titles = list(dict.fromkeys(titles))

  context = example['context']
  # NOTE: later cells read supporting_articles[i][0] and [1] and divide hit
  # counts by 200, i.e. they assume two distinct supporting articles per question.
  supporting_articles.append([
      ' '.join(context['sentences'][context['title'].index(title)])
      for title in unique_titles
  ])




100%|██████████| 100/100 [00:00<00:00, 1331.41it/s]


In [None]:
questions[9]

supporting_articles[9]

['Mount Panorama Circuit is a motor racing track located in Bathurst, New South Wales, Australia.  It is situated on a hill with the dual official names of Mount Panorama and Wahluu and is best known as the home of the Bathurst 1000 motor race held each October, and the Bathurst 12 Hour event held each February.  The 6.213 km long track is technically a street circuit, and is a public road, with normal speed restrictions, when no racing events are being run, and there are many residences which can only be accessed from the circuit.',
 'The 2013 Liqui Moly Bathurst 12 Hour was an endurance race for a variety of GT and touring car classes, including: GT3 cars, GT4 cars, Group 3E Series Production Cars and Dubai 24 Hour cars.  The event, which was staged at the Mount Panorama Circuit, near Bathurst, in New South Wales, Australia on 10 February 2013, was the eleventh running of the Bathurst 12 Hour.  The race also incorporated the opening round of the 2013 Australian GT Championship.  The 

## Setting up retrieval


Here, we create the index and test it out to see how it works



In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core import Document

# Wrap every de-duplicated context string in a Document and build a vector index over them.
# NOTE(review): no embedding model is passed here, so the library default is used -- TODO confirm which model that is.
# NOTE(review): an `llm` is passed to index construction; verify that from_documents actually
# uses it (the query engines created later do not receive this object explicitly).
documents = [Document(text=t) for t in contexts]
index = VectorStoreIndex.from_documents(
    documents,
    llm = oai(temperature = 0, model = 'gpt-3.5-turbo')
    )

In [None]:
questions[13]

'In which American football game was Malcolm Smith named Most Valuable player?'

In [None]:
#Example of generation using one question
query_str = "What is the length of the Mount Panorama Circuit, where the 2013 Liqui Moly Bathurst 12 Hour was staged?"
q = index.as_query_engine(similarity_top_k = 4)
query_result = q.query(query_str)

print(query_result.response)

The length of the Mount Panorama Circuit, where the 2013 Liqui Moly Bathurst 12 Hour was staged, is approximately 6.213 kilometers.


In [None]:
answers[13]

'Super Bowl XLVIII'

In [None]:
print(query_result.source_nodes[0].text)

The 2017 Liqui Moly Bathurst 12 Hour endurance race for GT and touring car classes, GT3 and GT4 cars was staged on the Mount Panorama Circuit, near Bathurst, in New South Wales, Australia 5 February 2017.  The 15th running of the Bathurst 12 Hour constituted the opening round of the 2017 Intercontinental GT Challenge Series.  For the first time, the winners of the race were awarded the Australian Tourist Trophy.


In [None]:
supporting_articles[20]

['Longs Drugs is an American chain with approximately 40 drug stores throughout the state of Hawaii.',
 "Warren Bryant was the CEO of Longs Drugs Store Corporation out of California prior to the retail chain's acquisition by CVS/Caremark.  Hired in 2002 to Longs, he was Senior Vice President of The Kroger Company. , a retail grocery chain, from 1999 to 2002.  Prior to that, from 1996 to 1999, he was President and Chief Executive Officer of Dillon Companies, Inc., a retail grocery chain and subsidiary of The Kroger Co.  He is also a director of OfficeMax Incorporated."]

In [None]:
questions[21]

'Which  American politician did Donahue replaced '

## Vanilla RAG with HotpotQA

We run this for 100 examples and also compute the needed metrics

In [None]:
#Evaluating hits @ k [k is similarity_top_k, i.e. 4 here]

similarity_top_k = 4
hit_per_q = []        # per question: number of its supporting articles that were retrieved
num_hit = 0           # supporting articles retrieved, summed over all questions
num_hit_rel = 0       # questions with at least one supporting article retrieved
responses = []        # generated answer per question
mrr_sum = 0           # sum of reciprocal ranks of the first relevant retrieved passage
retrieved_docs = []   # retrieved passages per question

engine = index.as_query_engine(similarity_top_k = similarity_top_k)

for q_idx in tqdm(range(0, 100)):
  gold_articles = supporting_articles[q_idx]

  #Get answer
  query_result = engine.query(questions[q_idx])

  #Get the retrieved passages
  hits = [node.text for node in query_result.source_nodes]

  responses.append(query_result.response)
  retrieved_docs.append(hits)

  #Compute metrics
  hit_perc = sum(1 for article in gold_articles if article in hits)
  num_hit += hit_perc
  if hit_perc > 0:
    num_hit_rel += 1
  hit_per_q.append(hit_perc)

  #Reciprocal rank of the first retrieved passage that is a supporting article
  for rank, passage in enumerate(hits, start = 1):
    if passage in gold_articles:
      mrr_sum += (1/rank)
      break

100%|██████████| 100/100 [04:57<00:00,  2.97s/it]


In [None]:
# Compute some stats: number of questions with 0, 1 and 2 supporting articles retrieved
for n_found in (0, 1, 2):
  print(sum(hit_per_q[i] == n_found for i in range(100)))

3
39
58


In [None]:
#Computing further stats on retrieval:
#share of "hard" questions for which both supporting articles were retrieved

hard_flags = [hit_per_q[i] == 2 for i in range(100) if level[i] == "hard"]
num_hard = len(hard_flags)
hard_hits = sum(hard_flags)

print(num_hard)
print(hard_hits / num_hard)


16
0.625


In [None]:
print(num_hit_rel / 100)
print(num_hit / 200)

0.97
0.775


## Evaluating generation performance

For this, we will make use of an evaluation query run on GPT 3.5 Turbo to see if the responses match

Note this is only the first step. **For the obtained results, we validate each of them manually on a spreadsheet**

In [None]:
from openai import OpenAI
import openai

In [None]:
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])


In [None]:
def create_prompt_check(idx):
  """
  Build the grading prompt for question number `idx`.

  Reads the notebook-level `questions`, `responses` and `answers` lists and asks
  whether the generated response (Answer 1) and the ground-truth answer
  (Answer 2) say the same thing, requesting a yes/no reply.
  """
  pieces = [
      "For the question: ", questions[idx],
      " Are the following two answers both saying the same thing? ",
      "Answer 1: ", responses[idx],
      ", Answer 2: ", answers[idx],
      ". Respond with yes or no",
  ]
  return "".join(pieces)


ans_cor = []

#First round of automatic checks, run for all answers
for q_idx in tqdm(range(100)):
  completion = client.completions.create(
    model = "gpt-3.5-turbo-instruct",
    prompt = create_prompt_check(q_idx),
    temperature = 0
  )

  #1 when the grader's reply starts with "yes", 0 otherwise
  verdict = completion.choices[0].text.strip().lower()
  ans_cor.append(1 if verdict.startswith("yes") else 0)

100%|██████████| 100/100 [00:23<00:00,  4.18it/s]


Below is accuracy based on this. We manually evaluate all of the checks based on the CSV extracted below. This process is just to get a good V1

In [None]:
#accuracy of the answers

np.sum(ans_cor) / len(ans_cor)

0.68

In [None]:
#Exporting this evaluation as a dataframe

eval_df = pd.DataFrame({
    'question': questions,
    'ground_truth': answers,
    'rag_response': responses,
    'level': level,
    'correct_or_not': ans_cor
})

eval_df.to_csv("vanilla_hotpot_eval_k4.csv")

## Aside: Baseline accuracy evaluation on first 100 based on GPT 3.5

**This is to see accuracy when GPT 3.5 is provided ground truth passages. This basically helps us verify whether retrieval is an issue or it is generation. If accuracy here is very high, it just means we need to retrieve better as given the correct contexts, GPT 3.5 can do a pretty good job with the task.**

In [None]:
def create_rag_prompt(passage1, passage2, question):
  """
  Build the answering prompt from two context passages and a question.

  The two passages are placed between dashed separator lines and followed by
  the request to answer `question`. Returns the prompt string.
  """
  template = """
    We have provided context information below.

    ---------------------
    {passage1}
    ---------------------
    {passage2}
    ---------------------
    Given this information, please answer the question: {question}
  """
  fields = {"passage1": passage1, "passage2": passage2, "question": question}
  return template.format(**fields)

responses = []

#Answer every question from its ground-truth supporting passages (no retrieval involved)
for q_idx in tqdm(range(len(questions))):
  gold_passages = supporting_articles[q_idx]
  prompt = create_rag_prompt(gold_passages[0], gold_passages[1], questions[q_idx])

  completion = client.completions.create(
    model = "gpt-3.5-turbo-instruct",
    prompt = prompt,
    temperature = 0,
    max_tokens = 150
  )

  responses.append(completion.choices[0].text.strip())

100%|██████████| 100/100 [00:59<00:00,  1.67it/s]


In [None]:
#Automatic first-pass check; as before, the results are analysed manually afterwards
ans_cor = []

for q_idx in tqdm(range(len(questions))):
  completion = client.completions.create(
    model = "gpt-3.5-turbo-instruct",
    prompt = create_prompt_check(q_idx),
    temperature = 0
  )

  #1 when the grader's reply starts with "yes", 0 otherwise
  verdict = completion.choices[0].text.strip().lower()
  ans_cor.append(1 if verdict.startswith("yes") else 0)

100%|██████████| 100/100 [00:36<00:00,  2.76it/s]


In [None]:
#accuracy of the answers

np.sum(ans_cor) / len(ans_cor)

0.83

In [None]:
#Exporting this evaluation as a dataframe

eval_df = pd.DataFrame({
    'question': questions,
    'ground_truth': answers,
    'rag_response': responses,
    'level': level,
    'correct_or_not': ans_cor
})

eval_df.to_csv("groundtruth_hotpot_eval.csv")

## Experiment: Naive Iteration

We first define some functions to help with naive iteration.

Specifically we define next question extraction function to extract next question based on current passage and then a construct final prompt function to construct a RAG prompt based on 4 given passages

In [None]:
def next_q_extraction(ques, passage):
  """
  Ask the completion model for a follow-up question.

  The prompt shows the question `ques` together with `passage` and asks what
  other information is needed, framed as a concise one-step question.
  Uses the notebook-level `client`. Returns the model's text, stripped.
  """
  template = """
  Imagine you are answering the question:
  {ques}

  You have the following imformation in the passage below:

  Passage: {passage}

  Based on this, what other information do you need to answer this question. Frame it as a concise one-step question.
  """

  completion = client.completions.create(
      model = "gpt-3.5-turbo-instruct",
      prompt = template.format(ques = ques, passage = passage),
      temperature = 0,
      max_tokens = 100
    )

  follow_up = completion.choices[0].text
  return follow_up.strip()


def construct_final_prompt(ques, passage_1, passage_2, passage_3, passage_4):
  """
  Build the final answering prompt from four context passages.

  The passages are listed as "Passage 1" to "Passage 4", followed by the
  request to answer `ques` from them. Returns the prompt string.
  """
  template = """
  You are given the following passages as context:

  Passage 1: {passage_1}
  Passage 2: {passage_2}
  Passage 3: {passage_3}
  Passage 4: {passage_4}

  Based on information in the above contexts, answer the following question: {ques}
  """
  fields = {
      "ques": ques,
      "passage_1": passage_1,
      "passage_2": passage_2,
      "passage_3": passage_3,
      "passage_4": passage_4,
  }
  return template.format(**fields)

In [None]:
similarity_top_k = 4
hit_per_q = []        # per question: number of its supporting articles that were retrieved
num_hit = 0           # supporting articles retrieved, summed over all questions
num_hit_rel = 0       # questions with at least one supporting article retrieved
responses = []        # final answer per question
mrr_sum = 0           # sum of reciprocal ranks of the first relevant passage
retrievals = []       # the four passages given to the final prompt, per question

# Only the retrieved passages are used in this experiment (the final answer comes
# from our own prompt below), so the retriever is queried directly. A query
# engine would additionally spend one LLM call per lookup on an answer that is
# never read.
retriever = index.as_retriever(similarity_top_k = similarity_top_k)

for q_idx in tqdm(range(100)):
  q = questions[q_idx]
  gold_articles = supporting_articles[q_idx]

  # Hop 1: retrieve for the original question
  hits = [node.text for node in retriever.retrieve(q)]

  # Hop 2: derive a follow-up question from the top passage and retrieve again
  next_q = next_q_extraction(q, hits[0])
  new_hits = [node.text for node in retriever.retrieve(next_q)]

  # Top-2 passages of each hop form the context
  retrieved_contexts = [hits[0], hits[1], new_hits[0], new_hits[1]]
  retrievals.append(retrieved_contexts)

  final_prompt = construct_final_prompt(q, passage_1 = retrieved_contexts[0], passage_2 = retrieved_contexts[1], \
                                        passage_3 = retrieved_contexts[2], passage_4 = retrieved_contexts[3])

  response = client.completions.create(
      model="gpt-3.5-turbo-instruct",
      prompt=final_prompt,
      temperature = 0,
      max_tokens = 100
    )

  responses.append(response.choices[0].text.strip())

  # Retrieval metrics
  hit_perc = sum(1 for article in gold_articles if article in retrieved_contexts)
  num_hit += hit_perc
  if hit_perc > 0:
    num_hit_rel += 1
  hit_per_q += [hit_perc]

  # Reciprocal rank over the passages actually handed to the model. The rank must
  # be taken in retrieved_contexts itself, not in the first-hop `hits` list.
  for rank, passage in enumerate(retrieved_contexts, start = 1):
    if passage in gold_articles:
      mrr_sum += (1/rank)
      break

100%|██████████| 100/100 [12:38<00:00,  7.59s/it]


In [None]:
# Number of questions with 0, 1 and 2 supporting articles retrieved
for n_found in (0, 1, 2):
  print(sum(hit_per_q[i] == n_found for i in range(100)))

5
37
58


In [None]:
# Share of "medium" questions for which both supporting articles were retrieved
# (the counters are named hard_* even though the filter here is "medium")
level_flags = [hit_per_q[i] == 2 for i in range(100) if level[i] == "medium"]
num_hard = len(level_flags)
hard_hits = sum(level_flags)

print(num_hard)
print(hard_hits / num_hard)


57
0.6491228070175439


In [None]:
print(num_hit_rel / 100)
print(num_hit / 200)

0.95
0.765


In [None]:
print(mrr_sum)

92.66666666666667


In [None]:
ans_cor = []
#Automatic first-pass check; the extracted CSV is evaluated manually later
for q_idx in tqdm(range(len(questions))):
  completion = client.completions.create(
    model = "gpt-3.5-turbo-instruct",
    prompt = create_prompt_check(q_idx),
    temperature = 0
  )

  #1 when the grader's reply starts with "yes", 0 otherwise
  verdict = completion.choices[0].text.strip().lower()
  ans_cor.append(1 if verdict.startswith("yes") else 0)

100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


In [None]:
#accuracy of the answers

np.sum(ans_cor) / len(ans_cor)

0.63

In [None]:
#Exporting this evaluation as a dataframe

eval_df = pd.DataFrame({
    'question': questions,
    'ground_truth': answers,
    'rag_response': responses,
    'level': level,
    'correct_or_not': ans_cor
})

eval_df.to_csv("vanilla_hotpot_eval_iter.csv")

## Experiment: Guided Iteration

We first create helper functions, as in the previous experiment. These include a function that builds the prompt for checking whether a claim is implied by a set of facts (used as the condition in guided retrieval), and a create-claim function that turns a question and an answer into a "claim". We noticed that facts -> claim checking is empirically slightly better than question + context + answer checking.

We naturally have a similar next question extraction function and a construct final prompt function

In [None]:
def create_reflection_prompt(facts, claim):
  """
  Build the prompt asking whether `claim` is reasonable given `facts`.

  The prompt requests step-by-step reasoning that ends in yes or no.
  Returns the prompt string.
  """
  template = """
  Based on the given facts, is it reasonable to say that the following claim is true?

  Facts: {facts}
  Claim: {claim}

  Think step by step. End you answer with either yes or no.
  """
  fields = {"facts": facts, "claim": claim}
  return template.format(**fields)


def create_claim(question, answer):
  """
  Turn a question and its answer into a one-sentence claim.

  Sends a conversion prompt to the completion model through the notebook-level
  `client` (the same one the other helper functions use), instead of
  constructing a fresh OpenAI client on every call. Returns the model's text,
  stripped of surrounding whitespace.
  """

  claim_prompt = """
  Convert the following question and its answer into a simple, one-sentence claim.
  Make sure all details from the question are present in the claim.

  Question: {question}
  Answer: {answer}
  """.format(question = question, answer = answer)

  claim = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=claim_prompt,
    temperature = 0,
    max_tokens = 100
  )

  return claim.choices[0].text.strip()

def next_q_extraction_guided(ques, passage):
  """
  Ask the completion model for a follow-up question for guided retrieval.

  The prompt presents `passage` as context and asks for a follow-up question
  that would help answer `ques`. Uses the notebook-level `client`.
  Returns the model's text, stripped.
  """
  template = """
  You are the given the following information as context:

  -----------
  Context
  {passage}
  -----------

  Based on the information in the context passage below, ask a follow up question
  so that you can better answer the question: {ques}.
  """

  completion = client.completions.create(
      model = "gpt-3.5-turbo-instruct",
      prompt = template.format(ques = ques, passage = passage),
      temperature = 0,
      max_tokens = 100
    )

  follow_up = completion.choices[0].text
  return follow_up.strip()


def construct_final_prompt_guided(ques, passage_1, passage_2, passage_3, passage_4):
  """
  Build the final answering prompt for guided iteration.

  Lists the four passages as "Passage 1" to "Passage 4" and then asks for an
  answer to `ques`. Returns the prompt string.
  """
  template = """
  You are given the following passages as context

  Passage 1: {passage_1}
  Passage 2: {passage_2}
  Passage 3: {passage_3}
  Passage 4: {passage_4}

  Based on the information in the context passages below, answer the question: {ques}
  """
  fields = {
      "ques": ques,
      "passage_1": passage_1,
      "passage_2": passage_2,
      "passage_3": passage_3,
      "passage_4": passage_4,
  }
  return template.format(**fields)


In [None]:
#Evaluation block
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])
similarity_top_k = 4
hit_per_q = []        # per question: number of its supporting articles that were retrieved
num_hit = 0           # supporting articles retrieved, summed over all questions
num_hit_rel = 0       # questions with at least one supporting article retrieved
responses = []        # final answer per question
mrr_sum = 0           # sum of reciprocal ranks of the first relevant passage
engine = index.as_query_engine(similarity_top_k = similarity_top_k,
                               llm = oai(temperature = 0, model = "gpt-3.5-turbo-instruct"))
# The follow-up hop only needs passages (its synthesised answer is never read),
# so it goes through the retriever and skips that extra LLM call.
retriever = index.as_retriever(similarity_top_k = similarity_top_k)

# Start from an empty list so this cell never appends to a list left over from
# an earlier cell or an earlier (partial) run.
retrievals = []


def _reflection_says_no(reflection_text):
  """
  Return True when the reflection output rejects the claim.

  The reflection prompt asks the model to end its answer with yes or no, so the
  last word is checked. An answer that opens with "no" is also treated as a
  rejection; this prefix test matches any opening word beginning with "no".
  """
  text = reflection_text.strip().lower()
  if text.startswith("no"):
    return True
  words = text.split()
  return bool(words) and words[-1].strip(".,;:!?\"'") == "no"


# Cover every question: the metric cells and the exported dataframe below index
# responses / hit_per_q by question number, so they must line up with `questions`.
for q_idx in tqdm(range(0, 100)):
  print("Iteration " + str(q_idx))
  q = questions[q_idx]
  gold_articles = supporting_articles[q_idx]

  query_result = engine.query(q)

  hits = [node.text for node in query_result.source_nodes]

  # The top-2 passages are the facts the first answer is checked against
  facts_string = ""
  for j in range(2):
    facts_string += "Fact " + str(j+1) + ": " + hits[j] + "\n"

  claim = create_claim(q, query_result.response)
  print(f"Question: {q}")
  print(f"Response: {query_result.response}")
  print(f"Claim: {claim}")

  print(f"Facts string: {facts_string}")

  # Check the claim (question and answer folded into one sentence) against the
  # facts, rather than the bare response, which lacks the question's details.
  refl_prompt = create_reflection_prompt(facts_string, claim)

  reflection = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=refl_prompt,
    temperature = 0,
    max_tokens = 100
  )

  reflection_text = reflection.choices[0].text.strip()
  print(reflection_text)

  if _reflection_says_no(reflection_text):

    print("follow-up question chosen")

    next_q = next_q_extraction_guided(q, facts_string)

    print(f"Next question: {next_q}")

    new_hits = [node.text for node in retriever.retrieve(next_q)]

    # Top-2 passages of each hop form the context for the final answer
    retrieved_contexts = [hits[0], hits[1], new_hits[0], new_hits[1]]

    final_prompt = construct_final_prompt_guided(q, passage_1 = retrieved_contexts[0], passage_2 = retrieved_contexts[1], \
                                          passage_3 = retrieved_contexts[2], passage_4 = retrieved_contexts[3])

    response = client.completions.create(
      model="gpt-3.5-turbo-instruct",
      prompt=final_prompt,
      temperature = 0,
      max_tokens = 100
    ).choices[0].text.strip()

  else:

    # The first answer is supported by its facts: keep it as is
    response = query_result.response

    retrieved_contexts = hits

  retrievals.append(retrieved_contexts)
  responses.append(response)

  # Retrieval metrics
  hit_perc = sum(1 for article in gold_articles if article in retrieved_contexts)
  num_hit += hit_perc
  if hit_perc > 0:
    num_hit_rel += 1
  hit_per_q += [hit_perc]

  # Reciprocal rank over the passages actually used. The rank must be taken in
  # retrieved_contexts itself, not in the first-hop `hits` list.
  for rank, passage in enumerate(retrieved_contexts, start = 1):
    if passage in gold_articles:
      mrr_sum += (1/rank)
      break

In [None]:
# Number of questions with 0, 1 and 2 supporting articles retrieved.
# Counting over hit_per_q itself (instead of indexing a fixed range(100)) keeps
# this cell valid however many questions the evaluation loop above covered.
for n_found in (0, 1, 2):
  print(hit_per_q.count(n_found))

4
43
53


In [None]:
# Share of "easy" questions for which both supporting articles were retrieved
# (the counters are named hard_* even though the filter here is "easy")
level_flags = [hit_per_q[i] == 2 for i in range(len(responses)) if level[i] == "easy"]
num_hard = len(level_flags)
hard_hits = sum(level_flags)

print(num_hard)
print(hard_hits / num_hard)


27
0.2962962962962963


In [None]:
print(num_hit_rel / 100)
print(num_hit / 200)

0.96
0.745


In [None]:
ans_cor = []

#Automatic v1 check; the results are evaluated manually later
for q_idx in tqdm(range(len(responses))):
  completion = client.completions.create(
    model = "gpt-3.5-turbo-instruct",
    prompt = create_prompt_check(q_idx),
    temperature = 0
  )

  #1 when the grader's reply starts with "yes", 0 otherwise
  verdict = completion.choices[0].text.strip().lower()
  ans_cor.append(1 if verdict.startswith("yes") else 0)

100%|██████████| 100/100 [00:36<00:00,  2.72it/s]


In [None]:
#accuracy of the answers

np.sum(ans_cor) / len(ans_cor)

0.68

In [None]:
#Exporting this evaluation as a dataframe

eval_df = pd.DataFrame({
    'question': questions,
    'ground_truth': answers,
    'rag_response': responses,
    'level': level,
    'correct_or_not': ans_cor
})

eval_df.to_csv("vanilla_hotpot_eval_refl_2.csv")

## Tree-RAG

After the failure of a decomposition based strategy, we try a tree based strategy instead.

We once again create helper functions to implement this

In [None]:
def next_q_extraction_tree(ques, passage):
  """
  Ask the completion model for a follow-up question for one tree branch.

  The prompt presents `passage` as known context and asks what further
  information is needed to answer `ques`, framed as a simple one-hop follow-up
  question. Uses the notebook-level `client`. Returns the model's text, stripped.
  """
  template = """
  I know the following information in context:

  -----------
  Context
  {passage}
  -----------

  I need to answer the question: {ques}.
  What more information do I need in addition to the context to answer this?
  Frame your response as a simple, one hop follow-up question.
  """

  completion = client.completions.create(
      model = "gpt-3.5-turbo-instruct",
      prompt = template.format(ques = ques, passage = passage),
      temperature = 0,
      max_tokens = 100
    )

  follow_up = completion.choices[0].text
  return follow_up.strip()


def construct_final_prompt_tree(ques, passage_1, passage_2, passage_3):
  """
  Build the final answering prompt for Tree-RAG.

  Lists the three passages as "Passage 1" to "Passage 3" and then asks for an
  answer to `ques`. Returns the prompt string.
  """
  template = """
  You are given the following passages as context

  Passage 1: {passage_1}
  Passage 2: {passage_2}
  Passage 3: {passage_3}

  Based on the information in the context passages below, answer the following question:

  Question: {ques}
  """
  fields = {
      "ques": ques,
      "passage_1": passage_1,
      "passage_2": passage_2,
      "passage_3": passage_3,
  }
  return template.format(**fields)


In [None]:
#Evaluation block
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])
similarity_top_k = 4
hit_per_q = []        # per question: number of its supporting articles that were retrieved
num_hit = 0           # supporting articles retrieved, summed over all questions
num_hit_rel = 0       # questions with at least one supporting article retrieved
responses = []        # final answer per question
mrr_sum = 0           # reset like in the other experiments; not updated in this cell
retrievals = []       # every passage used for a question (all branches, flattened)

# Every lookup in this cell only needs the retrieved passages (the answer comes
# from our own final prompt), so the retriever is queried directly. A query
# engine would additionally spend one LLM call per lookup on an answer that is
# never read.
retriever = index.as_retriever(similarity_top_k = similarity_top_k)

for q_idx in tqdm(range(0, 100)):
  q = questions[q_idx]
  gold_articles = supporting_articles[q_idx]

  # Root retrieval for the original question
  hits = [node.text for node in retriever.retrieve(q)]

  retrieved_contexts = []   # flat list of all passages, used for the hit metrics
  new_contexts = []         # one merged text per branch, fed to the final prompt

  # Grow one branch from each of the top-3 root passages: ask a follow-up
  # question about it and attach the top-2 passages retrieved for that question.
  for root_passage in hits[:3]:
    next_qs = next_q_extraction_tree(q, root_passage)
    follow_up_hits = [node.text for node in retriever.retrieve(next_qs)]
    branch = [root_passage] + follow_up_hits[:2]
    retrieved_contexts.extend(branch)
    new_contexts.append('. '.join(branch))

  retrievals.append(retrieved_contexts)

  final_prompt = construct_final_prompt_tree(q, passage_1 = new_contexts[0], passage_2 = new_contexts[1], \
                                        passage_3 = new_contexts[2])

  # Only the generated text is used, so no logprobs are requested
  response = client.completions.create(
      model="gpt-3.5-turbo-instruct",
      prompt=final_prompt,
      temperature = 0,
      max_tokens = 100
    ).choices[0].text.strip()

  responses.append(response)

  # Retrieval metrics
  hit_perc = sum(1 for article in gold_articles if article in retrieved_contexts)
  num_hit += hit_perc
  if hit_perc > 0:
    num_hit_rel += 1
  hit_per_q += [hit_perc]

In [None]:
# Share of "hard" questions for which both supporting articles were retrieved
hard_flags = [hit_per_q[i] == 2 for i in range(len(responses)) if level[i] == "hard"]
num_hard = len(hard_flags)
hard_hits = sum(hard_flags)

print(num_hard)
print(hard_hits / num_hard)


16
0.6875


In [None]:
# Fraction of all questions with both supporting articles retrieved
# (kept in hard_hits, although no difficulty filter is applied here)
hard_hits = sum(1 for i in range(len(responses)) if hit_per_q[i] == 2)

hard_hits/100

0.69

In [None]:
print(num_hit_rel / 100)
print(num_hit / 200)

0.98
0.835


In [None]:
ans_cor = []

#Automatic v1 check; the results are evaluated manually later
for q_idx in tqdm(range(len(responses))):
  completion = client.completions.create(
    model = "gpt-3.5-turbo-instruct",
    prompt = create_prompt_check(q_idx),
    temperature = 0
  )

  #1 when the grader's reply starts with "yes", 0 otherwise
  verdict = completion.choices[0].text.strip().lower()
  ans_cor.append(1 if verdict.startswith("yes") else 0)

100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


In [None]:
#accuracy of the answers

np.sum(ans_cor) / len(ans_cor)

0.74

In [None]:
#Exporting this evaluation as a dataframe

eval_df = pd.DataFrame({
    'question': questions,
    'ground_truth': answers,
    'rag_response': responses,
    'level': level,
    'correct_or_not': ans_cor
})

eval_df.to_csv("vanilla_hotpot_eval_tree.csv")