In [2]:
!pip install -q \
    langchain \
    langchain-community \
    langchain-groq \
    faiss-cpu \
    pypdf \
    sentence-transformers \
    transformers \
    torch


In [2]:
# Core Python Libraries
import os
from getpass import getpass

# LangChain Core
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# PDF Loader
from langchain_community.document_loaders import PyPDFLoader

# Embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

# Vector Database
from langchain_community.vectorstores import FAISS

# Groq LLM
from langchain_groq import ChatGroq


In [3]:
# Provide the Groq API key without writing it into the notebook.
# A GROQ_API_KEY already present in the environment is reused as-is; otherwise
# the key is requested interactively with hidden input, so it never appears in
# the saved source or in cell output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")


In [16]:
# --- Load the source PDF ---

# Location of the PDF to read  <-- replace the placeholder with your file name
pdf_path = "YOUR_PDF"

# Build the loader for that path, then load it into a list of Document objects
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Report how many entries the loader returned
page_count = len(documents)
print(f"Total pages loaded: {page_count}")

# Sanity check: show the first 500 characters of the first entry
first_page_text = documents[0].page_content
print(first_page_text[:500])


Total pages loaded: 2


In [17]:
# --- Split the loaded documents into chunks ---

# Splitter settings, passed through unchanged to RecursiveCharacterTextSplitter
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Run the splitter over every loaded document
chunks = text_splitter.split_documents(documents)

# Report how many chunks came out
chunk_count = len(chunks)
print(f"Total chunks created: {chunk_count}")

# Show the text of the first chunk as a quick visual check
first_chunk = chunks[0]
print(first_chunk.page_content)


Total chunks created: 5


In [19]:
# --- Embedding model and FAISS index ---

# Name of the Hugging Face model used to embed the chunks
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the FAISS vector store from the chunks using that embedding model
vectorstore = FAISS.from_documents(chunks, embeddings)

print("FAISS vector database created successfully!")


FAISS vector database created successfully!


In [7]:
# --- Groq chat model ---

# Constructor settings for the chat model (model id and temperature)
groq_settings = dict(model="llama-3.1-8b-instant", temperature=0)
llm = ChatGroq(**groq_settings)

# Smoke test: send one fixed question and print the reply text
smoke_test_question = "Explain machine learning in one sentence."
response = llm.invoke(smoke_test_question)
print(response.content)


Machine learning is a subset of artificial intelligence that enables computers to learn from data, identify patterns, and make predictions or decisions without being explicitly programmed for a specific task.


In [8]:
# Wrap the FAISS vector store in a retriever.
# Options: similarity search with k=3 (presumably the number of chunks
# returned per query).
retriever_options = {"search_type": "similarity", "search_kwargs": {"k": 3}}
retriever = vectorstore.as_retriever(**retriever_options)

# RAG Prompt Template
#
# The template has two placeholders, {context} and {question}, which are
# filled in when the prompt is formatted. It instructs the model to answer
# only from the supplied context and gives a fixed fallback sentence for
# questions the context does not cover.
# (PromptTemplate is imported in the notebook's import cell.)

prompt_template = """
You are an AI assistant answering questions based ONLY on the provided context.
If the answer is not present in the context, say:
"I don't know based on the provided document."

Context:
{context}

Question:
{question}

Answer:
"""

# Wrap the raw template text, declaring the two variables it expects
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

# RAG Function

def ask_pdf(question: str):
    """Answer a question from the indexed PDF using retrieve-then-generate.

    Relies on the notebook-level ``retriever``, ``prompt`` and ``llm`` objects
    created in earlier cells.

    Args:
        question: The question text. It is used as the retrieval query and is
            also inserted into the prompt sent to the model.

    Returns:
        The ``content`` attribute of the model's response.
    """
    # 1. Retrieve relevant chunks. `invoke` is the current retriever entry
    #    point and replaces the deprecated `get_relevant_documents`.
    docs = retriever.invoke(question)

    # 2. Combine the retrieved text, one blank line between chunks
    context = "\n\n".join(doc.page_content for doc in docs)

    # 3. Fill the template's {context} and {question} placeholders
    final_prompt = prompt.format(context=context, question=question)

    # 4. Get the answer from Groq
    response = llm.invoke(final_prompt)

    return response.content


In [14]:
# Question to ask about the PDF  <-- replace the placeholder with your own question
user_question = "ASK YOUR QUESTIONE"
ask_pdf(user_question)


"I don't know based on the provided document."