Step 7: Use verified retrieved documents to Prompt LLM

This is an advanced RAG Technique called Child-Parent RecursiveRetriever using Mistral LLM

In [None]:
!pip install pinecone-client
!pip install sentence-transformers
!pip install llama-index --use-deprecated=legacy-resolver
!pip install langchain
!pip install replicate

In [None]:
from llama_index.response.notebook_utils import display_source_node
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
import json

In [None]:
import os
os.environ["OPENAI_API_KEY"] = "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"

In [None]:
import pinecone
api_key = "3da0e6b6-40a1-4094-9ab1-ca22a2a98621"
pinecone.init(api_key=api_key, environment="gcp-starter")

  from tqdm.autonotebook import tqdm


In [None]:
pinecone.describe_index("langchain-rag")
pinecone_index = pinecone.Index("langchain-rag")

In [None]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.vector_stores import PineconeVectorStore

vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    add_sparse_vector=True,
)

In [None]:
os.environ["REPLICATE_API_TOKEN"]="r8_B4QZzdaf3iZheDalQ1PPMKhXxIOSf862e759H"

In [None]:
from llama_index.llms import Replicate

mistral = Replicate(
    model="mistralai/mistral-7b-instruct-v0.1:83b6a56e7c828e667f21fd596c338fd4f0039b46bcfa18d973e8e70e455fda70"
)

In [None]:
# Create our retriever.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=mistral)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)

# Fetch the top 5 most relevant chunks.
retriever = index.as_retriever(similarity_top_k=5)

In [None]:
questions =[
    "What are the side effects of doxycycline?",
    "What are the side effects of spironolactone?",
    "What are the side effects of minocycline?",
    "What are the side effects of Accutane?",
    "What are the side effects of clindamycin?",
    "What are the side effects of Aldactone?",
    "What are the side effects of tretinoin?",
    "What are the side effects of isotretinoin?",
    "What are the side effects of Bactrim ?",
    "What are the side effects of Retin-A ?"]


In [None]:
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import IndexNode
from llama_index.node_parser import SentenceSplitter

responses = []
for question in questions:
  #query = "What are the side effects of drugs?"
  nodes = retriever.retrieve(question)

  doc_text = "\n\n".join([d.get_content() for d in nodes])

  docs= [Document(text=doc_text)]

  node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

  base_nodes = node_parser.get_nodes_from_documents(docs)

  for idx, node in enumerate(base_nodes):
    node.id_ = f"node-{idx}"

  base_nodes = node_parser.get_nodes_from_documents(docs)

  base_index = VectorStoreIndex(base_nodes, service_context=service_context)
  base_retriever = base_index.as_retriever(similarity_top_k=2)

  retrievals = base_retriever.retrieve(
    question
  )

  query_engine_base = RetrieverQueryEngine.from_args(
    base_retriever, service_context=service_context
  )



  #responses.append(str(response))
  # print(str(response))
  sub_chunk_sizes = [250, 256, 512]
  sub_node_parsers = [
    SimpleNodeParser.from_defaults(chunk_size=c) for c in sub_chunk_sizes
  ]

  all_nodes = []
  for base_node in base_nodes:
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)

    # also add original node to node
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)

  all_nodes_dict = {n.node_id: n for n in all_nodes}

  vector_index_chunk = VectorStoreIndex(
    all_nodes, service_context=service_context
  )
  vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)

  retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
  )

  query_engine = RetrieverQueryEngine.from_args(
    retriever_chunk
  )

  response = query_engine.query(
    question
  )

  responses.append(str(response))

  # nodes = retriever_chunk.retrieve(
  #   question

  # )
  # for node in nodes:
  #   display_source_node(node, source_length=2000)

  # response = query_engine_base.query(
  #   question
  # )



# print(docs)


[1;3;34mRetrieving with query id None: What are the side effects of doxycycline?
[0m[1;3;38;5;200mRetrieved node with id, entering: ceb9e608-bd70-441a-9d81-33d6c874640c
[0m[1;3;34mRetrieving with query id ceb9e608-bd70-441a-9d81-33d6c874640c: What are the side effects of doxycycline?
[0m[1;3;34mRetrieving with query id None: What are the side effects of spironolactone?
[0m[1;3;38;5;200mRetrieved node with id, entering: d94975eb-c0a9-41fa-bb9f-2dd36abcdc96
[0m[1;3;34mRetrieving with query id d94975eb-c0a9-41fa-bb9f-2dd36abcdc96: What are the side effects of spironolactone?
[0m[1;3;34mRetrieving with query id None: What are the side effects of minocycline?
[0m[1;3;38;5;200mRetrieved node with id, entering: d22268d4-1c7a-43ac-a5e5-6c57f5e1ecf2
[0m[1;3;34mRetrieving with query id d22268d4-1c7a-43ac-a5e5-6c57f5e1ecf2: What are the side effects of minocycline?
[0m[1;3;34mRetrieving with query id None: What are the side effects of Accutane?
[0m[1;3;38;5;200mRetrieved node

In [None]:
# for i in responses:
#   print(i)
#   print(len(i))
for index, r in enumerate(responses):
    print(f"Response {index + 1}: {r}\n")

Response 1: The side effects of doxycycline may include nausea and vomiting, upset stomach, loss of appetite, mild diarrhea, skin rash or itching, darkened skin color, vaginal itching or discharge. In rare cases, serious side effects may occur, such as severe stomach pain, diarrhea that is watery or bloody, throat irritation, trouble swallowing, chest pain, irregular heart rhythm, feeling short of breath, little or no urination, low white blood cell counts, severe headaches, ringing in the ears, dizziness, nausea, vision problems, pain behind the eyes, loss of appetite, upper stomach pain, tiredness, nausea or vomiting, fast heart rate, dark urine, jaundice.

Response 2: The side effects of spironolactone may include breast swelling or tenderness, drowsiness, dizziness, lack of energy, leg cramps, weakness, feeling like you might pass out, severe pain in your upper stomach spreading to your back, nausea and vomiting, electrolyte imbalance, and high or low potassium levels.

Response 3:

### Evaluation

In [None]:
import pinecone
api_key = "3da0e6b6-40a1-4094-9ab1-ca22a2a98621"
pinecone.init(api_key=api_key, environment="gcp-starter")

In [None]:
pinecone.describe_index("langchain-rag")
pinecone_index = pinecone.Index("langchain-rag")

In [None]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.vector_stores import PineconeVectorStore

vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    add_sparse_vector=True,
)

In [None]:
import json 
with open("../datasets/child-recursive-responses.json", 'r') as json_file:
    rag_response_str = json.load(json_file)
with open("../datasets/golden-responses.json", 'r') as json_file:
    golden_responses = json.load(json_file)

#### Using LlamaIndex Correctness Evaluator on Golden Responses Dataset

In [None]:
from llama_index.evaluation import CorrectnessEvaluator

In [None]:
from llama_index.llms import OpenAI
from llama_index import VectorStoreIndex, ServiceContext
eval_llm = OpenAI("gpt-3.5-turbo", temperature=0.0)
service_context = ServiceContext.from_defaults(llm=eval_llm)
evaluator = CorrectnessEvaluator(service_context=service_context)

In [None]:
eval_results = []
from tqdm import tqdm
for rag_response, golden_response in tqdm(list(zip(rag_response_str, golden_responses))):
    query = golden_response["question"]
    golden_answer = golden_response["response"]
    generated_answer = rag_response["response"]
    
    eval_result = evaluator.evaluate(query=query, reference=golden_answer, response=generated_answer)
    eval_results.append(eval_result)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [12:07<00:00, 72.76s/it]


In [None]:
[r.score for r in eval_results]

[4.5, 3.5, 4.5, 5.0, 3.0, 3.5, 5.0, 5.0, 5.0, 5.0]

In [None]:
scores = [
    {"question": golden_response["question"],
     "golden_response": golden_response["response"],
     "generated_response": eval_result.response,
     "score": eval_result.score,
     "reasoning": eval_result.feedback,
    }
    for eval_result, golden_response in zip(eval_results, golden_responses)
]

In [None]:
with open("child-recursive-scores-mistral-goldeneval.json", "w") as file:
    json.dump(scores, file, indent=4)

In [None]:
average_scores = sum(score["score"] for score in scores) / len(scores)
average_scores

4.4

#### Using LlamaIndex Correctness Evaluator on User Responses Dataset

In [None]:
import json
with open("../datasets/child-recursive-scores-mistral-goldeneval.json", "r") as file:
    pred_responses = json.load(file)

In [None]:
eval_results = []
from tqdm import tqdm
for pred_response, golden_response in tqdm(list(zip(pred_responses, golden_responses))):
    query = golden_response["question"]
    golden_answer = golden_response["response"]
    response = pred_response["generated_response"]
    
    eval_result = evaluator.evaluate(query=query, reference=golden_answer, response=response)
    eval_results.append(eval_result)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:54<00:00, 17.46s/it]


In [None]:
[r.score for r in eval_results]

[4.0, 3.5, 3.5, 3.5, 3.0, 3.0, 4.5, 3.5, 4.5, 4.5]

In [None]:
scores = [
    {"question": golden_response["question"],
     "golden_response": golden_response["response"],
     "generated_response": eval_result.response,
     "score": eval_result.score,
     "reasoning": eval_result.feedback,
    }
    for eval_result, golden_response in zip(eval_results, golden_responses)
]
with open("mistralrecursivevshuman.json", "w") as file:
    json.dump(scores, file, indent=4)
average_scores = sum(score["score"] for score in scores) / len(scores)
average_scores

3.75

#### Industry Metrics on Golden Responses Dataset

In [None]:
import json
with open("../datasets/eval-scores-bare-mistral.json", "r") as file:
    bare_llm = json.load(file)
with open("../datasets/eval-scores-rag-mistral.json", "r") as file:
    rag = json.load(file)
with open("../datasets/child-recursive-scores-mistral-goldeneval.json", "r") as file:
    child = json.load(file) 
with open("../datasets/golden-responses.json", "r") as file:
    golden = json.load(file)

rag_responses = []
child_responses = []
bare_responses = []
golden_responses = []
for i in range(0, 10):
    rag_responses.append(rag[i]["generated_response"])
    child_responses.append(child[i]["generated_response"])
    bare_responses.append(bare_llm[i]["generated_response"])
    golden_responses.append(golden[i]["response"])
    
predictions_dict = {
    "Bare Mistral LLM": bare_responses,
    "Mistral + RAG": rag_responses,
    "Mistral + RAG + Recursive-Retrieval ": child_responses,
}
from eval import generate_metrics_summary
result = generate_metrics_summary(golden_responses, predictions_dict)

Calculating ROUGE Score...
Calculating BLEU Score...
Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating METEOR Score...


[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /

In [None]:
result["meteor"].to_csv('r.csv', index=False)

In [None]:
print(result['rouge'])

                                 System    rouge1    rouge2    rougeL  \
0                      Bare Mistral LLM  0.252120  0.070203  0.175378   
1                         Mistral + RAG  0.625968  0.502921  0.473645   
2  Mistral + RAG + Recursive-Retrieval   0.774692  0.750064  0.743726   

   rougeLsum  
0   0.196235  
1   0.490782  
2   0.743960  


In [None]:
print(result['bleu'])

                                 System      bleu
0                      Bare Mistral LLM  0.039006
1                         Mistral + RAG  0.365213
2  Mistral + RAG + Recursive-Retrieval   0.624797


In [None]:
print(result['meteor'])

                                 System    meteor
0                      Bare Mistral LLM  0.239231
1                         Mistral + RAG  0.609384
2  Mistral + RAG + Recursive-Retrieval   0.816886


#### Industry Metrics on User Responses Dataset

In [1]:
import json
with open("../datasets/human1_responses.json", "r") as file:
    human1 = json.load(file)
with open("../datasets/human2_responses.json", "r") as file:
    human2 = json.load(file)
with open("../datasets/human3_responses.json", "r") as file:
    human3 = json.load(file)
with open("../datasets/child-recursive-scores-mistral.json", "r") as file:
    eval = json.load(file)

human1_responses = []
human2_responses = []
human3_responses = []
eval_responses = []

for i in range(0, 10):
    human1_responses.append(human1[i]["response"])
    human2_responses.append(human2[i]["response"])
    human3_responses.append(human3[i]["response"])
    eval_responses.append(eval[i]["generated_response"])
    
references_dict = {
    "human_1": human1_responses,
    "human_2": human2_responses,
    "human_3": human3_responses,
}

In [11]:
from eval import generate_human_eval_summary
recursive_vs_humans_result = generate_human_eval_summary(references_dict, eval_responses, "Mistral + RAG + Recursive-Retrieval")

Calculating ROUGE Score...
Calculating BLEU Score...
Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating METEOR Score...


[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /

In [12]:
print(recursive_vs_humans_result["rouge"])

  Mistral + RAG + Recursive-Retrieval    rouge1    rouge2    rougeL  rougeLsum
0                             human_1  0.584116  0.401521  0.404429   0.401688
1                             human_2  0.570525  0.371188  0.382870   0.381524
2                             human_3  0.530661  0.386513  0.442905   0.440329


In [17]:
recursive_vs_humans_result["meteor"].to_csv('r.csv', index=False)

In [14]:
print(recursive_vs_humans_result["bleu"])

  Mistral + RAG + Recursive-Retrieval      bleu
0                             human_1  0.328418
1                             human_2  0.328418
2                             human_3  0.328418


In [15]:
print(recursive_vs_humans_result["meteor"])

  Mistral + RAG + Recursive-Retrieval    meteor
0                             human_1  0.400536
1                             human_2  0.400536
2                             human_3  0.400536
