RAG TWO SINGLE DOCUMENT - Load and Split

In [120]:
# Import Dependencies
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.documents import Document
from huggingface_hub import login
from typing import List
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [31]:
load_dotenv()

True

In [36]:
HUGGINGFACE_API_KEY = os.getenv("H_API_KEY")
# print("Hugging Face API Key:", HUGGINGFACE_API_KEY)

In [37]:
# Login to Hugging Face
login(token=HUGGINGFACE_API_KEY)

In [76]:
# Get the GROQ API key from the environment variable
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
print(GROQ_API_KEY)

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


In [77]:
# Set the GROQ API Key in the environment
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [78]:
# Initialize the ChatGroq model
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [79]:
# Define output parser
output_parser = StrOutputParser()

In [18]:
# Function to load documents from a directory
# The pages of the pdf files are merged into a single document
# This is a buggy behavior, but it is the default behavior of the PyPDFLoader
# The docx files are loaded as separate documents
def load_documents_from_directory(directory: str):
    documents = []
    for filename in os.listdir(directory):
        if filename.startswith('~$') or filename.startswith('.'):
            continue
        filepath = os.path.join(directory, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(filepath)
            pages = loader.load()
            merged_text = "\n\n".join([p.page_content for p in pages])
            metadata = pages[0].metadata
            metadata["source_file"] = filename
            documents.append(Document(page_content=merged_text, metadata=metadata))
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(filepath)
            docs = loader.load()
            for doc in docs:
                doc.metadata["source_file"] = filename
            documents.extend(docs)
    return documents

In [19]:
# Load Documents from the specified directory
directory = r'D:\\lrag\\docs'  # Replace with your directory path
documents = load_documents_from_directory(directory)
# Print the number of documents loaded
print(f"Loaded {len(documents)} documents from {directory}")
# list the loaded documents
for doc in documents:
    print(doc.metadata['source'])

Loaded 5 documents from D:\\lrag\\docs
D:\\lrag\\docs\AeroVance_Systems_Extended.pdf
D:\\lrag\\docs\FinNova_Capital_Extended.pdf
D:\\lrag\\docs\GreenEarth_Biotech_Extended.docx
D:\\lrag\\docs\MediSphere_Diagnostics_Extended.pdf
D:\\lrag\\docs\NovaTech_Solutions_Extended.docx


In [20]:
# Text Splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [21]:
# split the documents into smaller chunks
splits = text_splitter.split_documents(documents)
# Print the number of splits
print(f"Split into {len(splits)} chunks")

Split into 14 chunks


Embeddings

In [51]:
# Initialize embeddings compatible with llama-3.1-8b-instant
embed_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # Use small version for CPU
    model_kwargs={"device": "cpu"},  # Force CPU
    encode_kwargs={"normalize_embeddings": True}
)

In [52]:
document_embeddings = embed_model.embed_documents([split.page_content for split in splits])
# Print the number of document embeddings
print(f"Created embeddings for {len(document_embeddings)} document chunks")

Created embeddings for 14 document chunks


In [53]:
document_embeddings[0]  # Print the first document embedding

[0.029238184913992882,
 0.0397220179438591,
 0.02330380119383335,
 -0.03731292486190796,
 0.09539669007062912,
 -0.03166435286402702,
 0.06643205881118774,
 0.02781878598034382,
 0.009808049537241459,
 -0.05688025429844856,
 -0.007452498190104961,
 -0.03390991687774658,
 0.01916222833096981,
 0.006790407467633486,
 0.040818534791469574,
 0.0011405308032408357,
 0.06132574751973152,
 -0.05142804980278015,
 -8.124792657326907e-06,
 -0.018773933872580528,
 0.01912602409720421,
 0.028326373547315598,
 0.03149903565645218,
 0.006104258820414543,
 -0.020294059067964554,
 0.044972460716962814,
 -0.0347469262778759,
 -0.042890939861536026,
 0.02132396213710308,
 -0.09751825034618378,
 0.006312664598226547,
 -0.042831555008888245,
 -0.006815862841904163,
 -0.015227827243506908,
 -0.07265114039182663,
 -0.0024735976476222277,
 -0.0411909855902195,
 -0.011107610538601875,
 0.015323769301176071,
 -0.0031717068050056696,
 0.03797410801053047,
 0.017169885337352753,
 -0.061464354395866394,
 -0.02926

In [55]:
# Store the embeddings in a Chroma database
embedding_function = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # Use small version for CPU
    model_kwargs={"device": "cpu"},  # Force CPU
    encode_kwargs={"normalize_embeddings": True}
)
collection_name = "my_collection"
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    persist_directory="./chroma_db",
    collection_name=collection_name
)

In [57]:
# Similarity Search - We are just getting the top 2 most relevant chunks for a query
query = "Where is AeroVance Systems located?"

search_results = vectorstore.similarity_search(query, k=2)

# Print the search results
print(f"\nTop 2 most relevant chunks for query: '{query}':\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'unknown')}")
    print(f"Page Content: {result.page_content}")
    print(f"Metadata: {result.metadata}")
    print("-" * 80)


Top 2 most relevant chunks for query: 'Where is AeroVance Systems located?':

Result 1:
Source: D:\\lrag\\docs\AeroVance_Systems_Extended.pdf
Page Content: AeroVance Systems
Industry: Aerospace Engineering & Defense Technology
Location: Toulouse, France
About Us
AeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and
defense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail
level 1)
AeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and
defense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail
level 2)
AeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and
defense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail
level 3)
AeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and
defense sectors with R&D in avionics, propul

In [67]:
# We cannot use the invoke method with the vectorstore object directly
# We need to convert the vectorstore to a retriever first
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
# Check the data type of the retriever
print(f"Retriever type: {type(retriever)}")
retriever.invoke(query)

Retriever type: <class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


[Document(id='fa1cb206-2e33-4111-b35a-d0af26eca58b', metadata={'creationdate': 'D:20250430140620', 'creator': 'PyPDF', 'page': 0, 'page_label': '1', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'source': 'D:\\\\lrag\\\\docs\\AeroVance_Systems_Extended.pdf', 'source_file': 'AeroVance_Systems_Extended.pdf', 'total_pages': 2}, page_content='AeroVance Systems\nIndustry: Aerospace Engineering & Defense Technology\nLocation: Toulouse, France\nAbout Us\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 1)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 2)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics

In [60]:
# Create a prompt template for the LLM
template = """
Answer the question based only on the following context. If the answer is not in the context, say "I don't know".
{context}

Question: {question}

Answer: """
prompt = ChatPromptTemplate.from_template(template)

In [62]:
# Define the chain to use the retriever and the prompt
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | prompt
)

In [64]:
# Invoke the chain with a query
query = "Where is AeroVance Systems located?"
rag_chain.invoke(query)

ChatPromptValue(messages=[HumanMessage(content='\nAnswer the question based only on the following context. If the answer is not in the context, say "I don\'t know".\n[Document(id=\'fa1cb206-2e33-4111-b35a-d0af26eca58b\', metadata={\'creationdate\': \'D:20250430140620\', \'creator\': \'PyPDF\', \'page\': 0, \'page_label\': \'1\', \'producer\': \'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/\', \'source\': \'D:\\\\\\\\lrag\\\\\\\\docs\\\\AeroVance_Systems_Extended.pdf\', \'source_file\': \'AeroVance_Systems_Extended.pdf\', \'total_pages\': 2}, page_content=\'AeroVance Systems\\nIndustry: Aerospace Engineering & Defense Technology\\nLocation: Toulouse, France\\nAbout Us\\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\\nlevel 1)\\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\\ndefense sectors with R&

In [71]:
# Function to filter the page content from the document object
def docs2str(docs: List) -> str:
    return "\n\n".join(doc.page_content for doc in docs)

In [73]:
# Modify the chain to use the docs2str function
rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()} | prompt
)

In [74]:
# Invoke the modified chain with a query
query = "Where is AeroVance Systems located?"
rag_chain.invoke(query)

ChatPromptValue(messages=[HumanMessage(content='\nAnswer the question based only on the following context. If the answer is not in the context, say "I don\'t know".\nAeroVance Systems\nIndustry: Aerospace Engineering & Defense Technology\nLocation: Toulouse, France\nAbout Us\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 1)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 2)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 3)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with 

In [102]:
# Modify the chain again to use the LLM ad the output parser
rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()} | prompt | llm | output_parser
)

In [112]:
# Invoke the remodified chain with a query
question = "Where is AeroVance Systems located?"
response = rag_chain.invoke(question)
print(f"Response: {response}")

Response: Toulouse, France.


Conversational RAG - Answer follow up questions

In [113]:
# Initialize the chat history
chat_history = []

In [114]:
# Insert the question and response into the chat history
chat_history.extend([
    HumanMessage(content=question),
    AIMessage(content=response)
])

In [115]:
chat_history  # Print the chat history

[HumanMessage(content='Where is AeroVance Systems located?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Toulouse, France.', additional_kwargs={}, response_metadata={})]

Contextualize the question

In [116]:
contextualize_q_system_prompt = """Your ONLY task is to reformulate questions to be standalone. NEVER answer.
Follow these rules STRICTLY:
1. If the question is already standalone, return it unchanged
2. If it references chat history, rewrite it to include necessary context
3. UNDER NO CIRCUMSTANCES provide an answer
4. Only output the reformulated question, nothing else

Current question to reformulate:"""

In [117]:
# Messages placeholder can take a list of messages (chat history)
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
contextualize_chain = contextualize_q_prompt | llm | StrOutputParser()
contextualize_chain.invoke({"input": "Which sectors does it support?", "chat_history": chat_history})

'What sectors does AeroVance Systems support?'

In [119]:
# Create and use history aware retriever
history_aware_retriever = create_history_aware_retriever(
    llm,
    retriever,
    contextualize_q_prompt
)
history_aware_retriever.invoke({"input": "Which sectors does it support?", "chat_history": chat_history})

[Document(id='fa1cb206-2e33-4111-b35a-d0af26eca58b', metadata={'creationdate': 'D:20250430140620', 'creator': 'PyPDF', 'page': 0, 'page_label': '1', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'source': 'D:\\\\lrag\\\\docs\\AeroVance_Systems_Extended.pdf', 'source_file': 'AeroVance_Systems_Extended.pdf', 'total_pages': 2}, page_content='AeroVance Systems\nIndustry: Aerospace Engineering & Defense Technology\nLocation: Toulouse, France\nAbout Us\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 1)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 2)\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics

In [121]:
# Create another prompt
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant. Use the following context to answer the user's question."),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [122]:
rag_chain.invoke({"input": "Which sectors does it support?", "chat_history": chat_history})

{'input': 'Which sectors does it support?',
 'chat_history': [HumanMessage(content='Where is AeroVance Systems located?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Toulouse, France.', additional_kwargs={}, response_metadata={})],
 'context': [Document(id='fa1cb206-2e33-4111-b35a-d0af26eca58b', metadata={'creationdate': 'D:20250430140620', 'creator': 'PyPDF', 'page': 0, 'page_label': '1', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'source': 'D:\\\\lrag\\\\docs\\AeroVance_Systems_Extended.pdf', 'source_file': 'AeroVance_Systems_Extended.pdf', 'total_pages': 2}, page_content='AeroVance Systems\nIndustry: Aerospace Engineering & Defense Technology\nLocation: Toulouse, France\nAbout Us\nAeroVance Systems is a leader in next-generation aerospace technologies. It supports civil and\ndefense sectors with R&D in avionics, propulsion systems, and aerospace cybersecurity. (Detail\nlevel 1)\nAeroVance Systems is a leader in next-generation aerospace technolog