# Retrieval Augmented Generation (RAG) - Data Retrieval

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Load, split, embed, and store a PDF document

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Load a PDF document and split it into chunks
file_path = "data/Dale_Carnegie_Golden_Book-Se.pdf"  # Path of the document to be loaded
loader = PyPDFLoader(file_path)                      # Initialize the pdf loader
documents = loader.load()                            # Load the pdf document

# Initialize the recursive character text splitter.
# `separators` is a list of strings that are tried in order, from coarsest
# (paragraph break) to finest (single character). Passing an empty string
# here is falsy and would silently fall back to this same default list, so
# the list is spelled out to make the actual splitting behaviour visible.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=100,     # upper bound on the length of each chunk
    chunk_overlap=20,   # length shared between neighbouring chunks
)

# Split the documents into chunks
chunks = text_splitter.split_documents(documents)

# Initialize the Hugging Face embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Embed every chunk and store the resulting vectors in a FAISS vector store
vector_store = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings,
)

### Data Retrieval - Vector Store as Retriever

#### Similarity Search Retrieval

In [11]:
# Wrap the vector store as a retriever; with no arguments it falls back to
# similarity search
retriever = vector_store.as_retriever()
docs = retriever.invoke("Who is Dale Carnegie?")
print(f"{len(docs)} documents have been retrieved.")

4 documents have been retrieved.


In [10]:
# Display the retrieved documents (metadata and page content)
docs

[Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Dale Carnegie \n1888-1955 \nFounder \nBiography'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Founder \nBiography \nDale Carnegie was born in 1888 in Missouri, USA and was educated at'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 0}, page_content='DALE CARNEGIE’S \nGOLDEN BOOK \nwww.dalecarnegie.com'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content="Founded in 1912, Dale Carnegie Training has evolved from one man's belief in")]

#### Maximum Marginal Relevance (MMR) Retrieval

In [13]:
# Same query, but the retriever is configured for maximum marginal
# relevance (MMR) search instead of the default
retriever = vector_store.as_retriever(
    search_type="mmr",
)
docs = retriever.invoke("Who is Dale Carnegie?")
print(f"{len(docs)} documents have been retrieved.")

4 documents have been retrieved.


In [14]:
# Display the retrieved documents (metadata and page content)
docs

[Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Dale Carnegie \n1888-1955 \nFounder \nBiography'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Warrensburg State Teachers College. As a salesman and aspiring actor, he'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='world leaders. He wrote newspaper columns and had his own daily radio show.'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 1}, page_content='– – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – – –')]

#### Similarity Score Threshold Retrieval

In [21]:
# Configure the retriever with a similarity score threshold of 0.5
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=0.5),
)
docs = retriever.invoke("Who is Dale Carnegie?")
print(f"{len(docs)} documents have been retrieved.")

2 documents have been retrieved.


In [22]:
# Display the retrieved documents (metadata and page content)
docs

[Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Dale Carnegie \n1888-1955 \nFounder \nBiography'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Founder \nBiography \nDale Carnegie was born in 1888 in Missouri, USA and was educated at')]

#### Specifying Top k Documents

In [27]:
# Cap the number of returned documents by passing k through search_kwargs
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=3),
)
docs = retriever.invoke("Who is Dale Carnegie?")
print(f"{len(docs)} documents have been retrieved.")

3 documents have been retrieved.


In [28]:
# Display the retrieved documents (metadata and page content)
docs

[Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Dale Carnegie \n1888-1955 \nFounder \nBiography'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 6}, page_content='Founder \nBiography \nDale Carnegie was born in 1888 in Missouri, USA and was educated at'),
 Document(metadata={'source': 'data/Dale_Carnegie_Golden_Book-Se.pdf', 'page': 0}, page_content='DALE CARNEGIE’S \nGOLDEN BOOK \nwww.dalecarnegie.com')]