### Installations

In [None]:
!pip install openai
!pip install langchain
!pip install langchain_community
!pip install faiss-cpu
!pip install python-dotenv
!pip install langchain pypdf
!pip install tiktoken
!pip install pinecone

#### Imports

In [10]:
import openai
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

### model loading

In [14]:
import os
from dotenv import load_dotenv

# Load the .env file
load_dotenv(".env")

# Access variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [16]:
from langchain.llms import OpenAI
import os

llm = OpenAI(openai_api_key=OPENAI_API_KEY)


In [21]:
llm.invoke("Explain ML in 1 line.")

'\n\nML stands for Machine Learning and it is a subset of Artificial Intelligence that allows machines to learn and improve from data without being explicitly programmed.'

### upload a pdf

In [22]:
from google.colab import files

# upload a file
uploaded_files = files.upload()

# get uploaded file path
pdf_file_path = list(uploaded_files.keys())[0]

# check file extension
if not pdf_file_path.lower().endswith(".pdf"):
    raise ValueError("Please upload a PDF file only!")

print(f"Uploaded PDF file: {pdf_file_path}")

Saving rag_paper.pdf to rag_paper.pdf
Uploaded PDF file: rag_paper.pdf


In [23]:
from langchain.document_loaders import PyPDFLoader

def load_document(pdf_file_path):
  # Load PDF into Document objects
  loader = PyPDFLoader(pdf_file_path)
  documents = loader.load()  # returns a list of Document objects

  # Check number of pages loaded
  print(f"Number of pages loaded: {len(documents)}")

  # Optional: preview first page
  print(documents[0].page_content[:500])  # first 500 chars'
  return documents

In [24]:
documents = load_document(pdf_file_path)

Number of pages loaded: 21
1
Retrieval-Augmented Generation for Large
Language Models: A Survey
Yunfan Gaoa, Yun Xiongb, Xinyu Gao b, Kangxiang Jia b, Jinliu Pan b, Yuxi Bic, Yi Dai a, Jiawei Sun a, Meng
Wangc, and Haofen Wang a,c
aShanghai Research Institute for Intelligent Autonomous Systems, Tongji University
bShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University
cCollege of Design and Innovation, Tongji University
Abstract—Large Language Models (LLMs) showcase impres-
sive capabilities 


### Splitting Text to Chunks

**Note:**
I am using RecursiveCharacterTextSplitter here to capture more context from pdfs.

**Info**
RecursiveCharacterTextSplitter splits text into chunks that avoids loosing context over pages by keeping paragraphs together.

In [25]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)
chunks = text_splitter.split_documents(documents)

In [26]:
chunks[:2]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-03-28T00:54:45+00:00', 'author': '', 'keywords': '', 'moddate': '2024-03-28T00:54:45+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'rag_paper.pdf', 'total_pages': 21, 'page': 0, 'page_label': '1'}, page_content='1\nRetrieval-Augmented Generation for Large\nLanguage Models: A Survey\nYunfan Gaoa, Yun Xiongb, Xinyu Gao b, Kangxiang Jia b, Jinliu Pan b, Yuxi Bic, Yi Dai a, Jiawei Sun a, Meng\nWangc, and Haofen Wang a,c\naShanghai Research Institute for Intelligent Autonomous Systems, Tongji University\nbShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\ncCollege of Design and Innovation, Tongji University\nAbstract—Large Language Models (LLMs) showcase impres-'),
 Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator'

### Vector Index/DB

FAISS: Facebook AI Similarity Search --> a powerful library for similarity search and clustering of dense vectors.

In [27]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings  # many others are availabe for other models too.

# creating embeddings
embeddings = OpenAIEmbeddings(api_key = OPENAI_API_KEY)
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

  embeddings = OpenAIEmbeddings(api_key = OPENAI_API_KEY)


In [30]:
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(
    """
      Given the following conversation and follow up question, get insights about the question and give short
      precise answer.
      {chat_history}
      Follow up Input: {question}
      Insights:
    """
)

qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=db.as_retriever(), condense_question_prompt= CONDENSE_QUESTION_PROMPT,
                                           return_source_documents=True, verbose=False)

In [31]:
chat_history= []
query = "What is RAG?."
result = qa.invoke(input =
          {
              "question":query,
              "chat_history": chat_history
          })

print(result["answer"])

 RAG is a research paradigm that has evolved over time and is now categorized into three stages: Naive RAG, Advanced RAG, and Modular RAG. It has shown to be cost-effective and outperform native LLMs, but also has limitations. The development of Advanced RAG and Modular RAG aim to address these limitations. RAG is continuously expanding its application scope into multimodal domains and has gained interest from academic and industrial sectors.


In [32]:
chat_history= []
query = "What can rag be used for?."
result = qa.invoke(input =
          {
              "question":query,
              "chat_history": chat_history
          })

print(result["answer"])

 RAG can be used for addressing complex problems and integrative or summary questions that require reading a large amount of material to answer. It can also be used in multimodal domains, such as interpreting and processing images, videos, and code, and for improving precision and flexibility in queries. Additionally, RAG can be used for decision-making and autonomous judgment capabilities in generating accurate responses. 


In [33]:
chat_history= []
query = "Can you tell more about the workflow of rag?."
result = qa.invoke(input =
          {
              "question":query,
              "chat_history": chat_history
          })

print(result["answer"])



The workflow of RAG involves a flexible orchestration that showcases the benefits of adaptive retrieval through techniques such as FLARE and Self-RAG. This approach evaluates the necessity of retrieval based on different scenarios and allows for integration with other technologies like fine-tuning or reinforcement learning. RAG's application scope is also expanding into multimodal domains, making it applicable to diverse data forms like images, videos, and code. The evaluation of RAG models is a primary objective in the field of NLP, with a focus on optimizing performance across various application scenarios. The core task of RAG is Question Answering, including traditional single-hop/multi-hop QA, multiple-choice QA, and open-domain QA. 


### Setup Pinecone

In [34]:
from pinecone import Pinecone
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

In [39]:
if False:
  index_name = "pdf-ragbot"

  if not pc.has_index(index_name):
      pc.create_index_for_model(
          name=index_name,
          cloud="aws",
          region="us-east-1",
          embed={
              "model": "llama-text-embed-v2",  # works for OpenAI embeddings but is much bigger
              "field_map": {"text": "chunk_text"}
          }
      )

In [46]:
index = pc.Index("pdf-ragbot")
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [40]:
# Create with exact dimensions
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud,region=region)

In [43]:
index_name = 'rag-chatbot'

In [44]:
if index_name not in pc.list_indexes().names():
  pc.create_index(index_name, dimension=1536,metric='cosine',spec=spec)

index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

### Authenticate Pinecone

In [None]:
!pip install pinecone_notebooks

In [48]:
from pinecone_notebooks.colab import Authenticate
Authenticate()

In [None]:
!pip install langchain_pinecone

In [50]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(chunks,embeddings, index_name = index_name)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [51]:
docsearch.as_retriever()

VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x78593e8f6f60>, search_kwargs={})

In [52]:
qa = ConversationalRetrievalChain.from_llm(llm=llm,
                                           retriever=docsearch.as_retriever(),
                                           condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                                           return_source_documents=True,
                                           verbose=False)

In [72]:
def append_history(chat_history,query, answer):
  """
    Append new prompt query and answer to chat history.
  """
  chat_history.append(("user", query))
  chat_history.append(("assistant", result["answer"]))
  return chat_history

In [74]:
chat_history = []
query = """What is RAG?"""
result = qa({"question": query, "chat_history": chat_history})
print(result["answer"])

 RAG stands for Retrieval-Augmented Generation and is a research paradigm that has three stages: Naive, Advanced, and Modular RAG. RAG is continuously evolving and has applications in various domains such as images, videos, and code. It is cost-effective and has shown to surpass the performance of native LLMs, but also has limitations that have led to the development of Advanced RAG and Modular RAG. As RAG's application scope expands, there is a need for refining evaluation methodologies to accurately assess its performance and contributions to the AI research and development community.


In [75]:
chat_history = append_history(chat_history,query, result["answer"])
chat_history

[('user', 'What is RAG?'),
 ('assistant',
  " RAG stands for Retrieval-Augmented Generation and is a research paradigm that has three stages: Naive, Advanced, and Modular RAG. RAG is continuously evolving and has applications in various domains such as images, videos, and code. It is cost-effective and has shown to surpass the performance of native LLMs, but also has limitations that have led to the development of Advanced RAG and Modular RAG. As RAG's application scope expands, there is a need for refining evaluation methodologies to accurately assess its performance and contributions to the AI research and development community.")]

In [76]:
query = """Why use RAG?."""
result = qa({"question": query, "chat_history": chat_history})
print(result["answer"])

 "What are the reasons for the growing adoption of RAG in the field of NLP and what advantages does it offer? How does the development of Advanced RAG and Modular RAG address the limitations of Naive RAG? How can evaluation methodologies be refined to accurately assess RAG's contributions to the AI research and development community?"


In [77]:
chat_history = append_history(chat_history,query, result["answer"])
chat_history

[('user', 'What is RAG?'),
 ('assistant',
  " RAG stands for Retrieval-Augmented Generation and is a research paradigm that has three stages: Naive, Advanced, and Modular RAG. RAG is continuously evolving and has applications in various domains such as images, videos, and code. It is cost-effective and has shown to surpass the performance of native LLMs, but also has limitations that have led to the development of Advanced RAG and Modular RAG. As RAG's application scope expands, there is a need for refining evaluation methodologies to accurately assess its performance and contributions to the AI research and development community."),
 ('user', 'Why use RAG?.'),
 ('assistant',
  ' "What are the reasons for the growing adoption of RAG in the field of NLP and what advantages does it offer? How does the development of Advanced RAG and Modular RAG address the limitations of Naive RAG? How can evaluation methodologies be refined to accurately assess RAG\'s contributions to the AI research an