In [None]:
# LangChain for pipelines
!pip install langchain --quiet

# Install langchain-community for document loaders
!pip install langchain-community --quiet

# FAISS for vector database
!pip install faiss-cpu --quiet

# Hugging Face Transformers & Embeddings
!pip install sentence-transformers transformers huggingface-hub --quiet

# PDF reading
!pip install PyMuPDF --quiet
!pip install pypdf --quiet

# Streamlit (for later deployment)
!pip install streamlit --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/323.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m256.0/323.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Sanity check: each import below fails immediately if the corresponding
# package is missing or broken in the current environment.
import langchain
import faiss
import sentence_transformers
import fitz  # PyMuPDF is imported under the module name `fitz`
import transformers

# Only reached when every import above succeeded.
print("All libraries installed and working ✅")


All libraries installed and working ✅


In [None]:
# Document loaders live in langchain-community; the old
# `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import PyPDFLoader

# Load your PDF
pdf_path = "/content/sample-local-pdf.pdf"  # change this if your file has a different name
loader = PyPDFLoader(pdf_path)

# Extract pages as documents (one document per PDF page)
documents = loader.load()

# Fail early with a clear message instead of an IndexError on documents[0].
if not documents:
    raise ValueError(f"No pages could be extracted from {pdf_path!r}")

print(f"Total pages extracted: {len(documents)}")

# Preview first page
print("------ Page 1 Text ------")
print(documents[0].page_content[:500])  # prints first 500 characters




Total pages extracted: 3
------ Page 1 Text ------
1	
Sample PDF  Created for testing PDFObject  This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?  I digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ant


In [None]:
# Prefer the standalone splitter package; fall back to the legacy import path
# for LangChain releases that still bundle the splitter.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # max characters per chunk
    chunk_overlap=200     # overlap for context
)

# Split documents into chunks
chunks = text_splitter.split_documents(documents)

print(f"Total chunks created: {len(chunks)}")
print("------ Sample Chunk ------")
if chunks:
    # Preview the second chunk when there is one: in the recorded run the
    # first chunk appears to hold only the page-number header.
    sample_chunk = chunks[1] if len(chunks) > 1 else chunks[0]
    print(sample_chunk.page_content[:500])  # first 500 characters of the sample chunk
else:
    print("(no chunks were produced)")


Total chunks created: 16
------ Sample Chunk ------
Sample PDF  Created for testing PDFObject  This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?  I digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante d


In [None]:
# Inspect the raw start of every page; repr() makes tabs, newlines and other
# invisible characters visible.
for page_number, page in enumerate(documents, start=1):
    preview = page.page_content[:500]
    print(f"--- Page {page_number} ---")
    print(repr(preview))


--- Page 1 ---
'1\t\nSample PDF  Created for testing PDFObject  This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?  I digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ant'
--- Page 2 ---
'2\t\nipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris.   Fusce nec tellus sed augue semper porta. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur sodales ligula in libero. Sed dignissim lacinia nunc. Cu

In [None]:

# Third-party integrations are provided by langchain-community; the
# `langchain.embeddings` / `langchain.vectorstores` paths are deprecated.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [None]:
# Embedding model: a sentence-transformers checkpoint loaded through LangChain
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:

# Embed every chunk and index the resulting vectors in a FAISS store
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding_model)

print("FAISS vector store created ✅")


FAISS vector store created ✅


In [None]:
# Drop near-empty chunks: keep only chunks whose stripped text is longer than
# MIN_CHUNK_CHARS characters.
MIN_CHUNK_CHARS = 20
clean_chunks = [
    chunk for chunk in chunks
    if len(chunk.page_content.strip()) > MIN_CHUNK_CHARS
]

# With nothing left there is nothing to index; fail with a clear message
# instead of an obscure error from inside the vector store.
if not clean_chunks:
    raise ValueError(
        f"No chunks longer than {MIN_CHUNK_CHARS} characters remain after cleaning"
    )

# Re-create FAISS vector store from the cleaned chunks only
vectorstore = FAISS.from_documents(clean_chunks, embedding_model)
print(f"Total chunks after cleaning: {len(clean_chunks)}")


Total chunks after cleaning: 13


In [None]:
# Retrieval smoke test: fetch the two chunks most similar to a sample question
query = "What is this PDF about?"
docs = vectorstore.similarity_search(query, k=2)

for rank, hit in enumerate(docs, start=1):
    excerpt = hit.page_content[:500]
    print(f"--- Retrieved Chunk {rank} ---")
    print(excerpt)


--- Retrieved Chunk 1 ---
Sample PDF  Created for testing PDFObject  This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all minutes are the same duration, and one cannot possibly be longer than the other? If these pages are all the same size, can one possibly be longer than the other?  I digress. Here’s some Latin. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante d
--- Retrieved Chunk 2 ---
ligula in libero.   Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh.   Quisque volutpat condimentum veli

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  # or any LLM you prefer


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing when it is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "declare-lab/flan-alpaca-large"  # or any Flan-T5 variant

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wrap in pipeline
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

# Smoke test: give the model real text to summarize. A prompt that only
# mentions "the PDF" without including any of its content makes the model
# invent an answer, because it never sees the document.
sample_text = clean_chunks[0].page_content
output = pipe(f"Summarize the following text:\n\n{sample_text}")
print(output[0]['generated_text'])


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Device set to use cpu


The PDF contains an overview of the history of the United States, including its founding in 1776, the Civil War, the American Revolution, and the Civil Rights Movement. It includes a timeline of the Civil Rights Movement, a timeline of the American Revolution, a timeline of the Civil Rights Movement, and a timeline of the Civil Rights Movement. It also includes a timeline of the Civil Rights Movement, a timeline of the Civil Rights Movement, and a timeline of the Civil Rights Movement.


In [None]:
# HuggingFacePipeline is provided by langchain-community; the `langchain.llms`
# path is deprecated.
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Expose the transformers pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)

# "stuff" puts all retrieved chunks into a single prompt.
# NOTE(review): the recorded run warned that this prompt was 1767 tokens
# against a 512-token model limit; consider fewer or smaller retrieved chunks.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

# Test
query = "What is this PDF about?"
# Calling the chain object directly is deprecated; use .invoke instead.
result = qa_chain.invoke({"query": query})
print("Answer:\n", result['result'])


  llm = HuggingFacePipeline(pipeline=pipe)
Token indices sequence length is longer than the specified maximum sequence length for this model (1767 > 512). Running this sequence through the model will result in indexing errors


Answer:
 This PDF is about testing.
