<a href="https://colab.research.google.com/github/abdurrahmanrussel/rag-pdf-qa/blob/main/RAG_PDF_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install --quiet langchain sentence-transformers faiss-cpu PyPDF2 pytesseract pdf2image transformers peft trl bitsandbytes
!apt-get install -y -qq tesseract-ocr poppler-utils

In [None]:
#  Hugging Face token

from getpass import getpass
import os

# Strip whitespace/newlines that tend to come along with a pasted token.
# An empty entry becomes None so the later `token=hf_token` calls do not
# send a blank credential.
hf_token = getpass("Enter your Hugging Face token: ").strip() or None
if hf_token:
    # HF_TOKEN is the variable huggingface_hub documents for authentication.
    os.environ["HF_TOKEN"] = hf_token
    # The originally exported name is kept in case anything else reads it.
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

In [None]:
#  Upload PDFs

from google.colab import files

# Open the Colab upload dialog; the keys of the returned mapping are used
# as the paths of the PDFs to process.
uploaded_files = files.upload()
pdf_paths = [file_name for file_name in uploaded_files]
print("Uploaded PDFs:", pdf_paths)


In [None]:
#  Extract text (PDF + OCR)

from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract

def extract_text_from_pdf(file_path, use_ocr=True):
    """Return the text of a PDF, taken from its text layer and/or OCR.

    Args:
        file_path: Path of the PDF to read.
        use_ocr: When True, every page is rendered to an image and OCR'd,
            and that output is appended after the text-layer output.
            When False, OCR only runs as a fallback if the text layer
            yielded nothing but whitespace.

    Returns:
        A single string in which each page's text (and each OCR'd page
        image) is followed by a newline.

    Raises:
        Errors from the OCR stage propagate to the caller. Errors from the
        text-layer stage are reported and extraction continues.
    """
    parts = []
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text)
    except Exception as exc:
        # Still best effort (pages read before the failure are kept), but the
        # failure is now visible and KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the previous bare `except`.
        print(f"Text-layer extraction failed for {file_path}: {exc!r}")

    text_layer = "".join(part + "\n" for part in parts)

    # NOTE(review): with the default use_ocr=True, pages that already have a
    # text layer are OCR'd as well, so their content can appear twice in the
    # result -- confirm this is intended.
    if use_ocr or not text_layer.strip():
        for page_image in convert_from_path(file_path):
            parts.append(pytesseract.image_to_string(page_image))

    return "".join(part + "\n" for part in parts)

# One extracted text per uploaded file, in the same order as pdf_paths.
all_texts = list(map(extract_text_from_pdf, pdf_paths))

In [None]:
#  Chunk text

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=500 and an overlap of 50 between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Flatten the per-document splits into one list, preserving document order.
chunks = [
    piece
    for document_text in all_texts
    for piece in text_splitter.split_text(document_text)
]
print(f"Total chunks created: {len(chunks)}")

In [None]:
#  Create embeddings + FAISS index

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

if not chunks:
    # Without this guard an empty chunk list only fails further down, at
    # `embeddings.shape[1]`, with an opaque index error.
    raise ValueError(
        "No text chunks to index: the uploaded PDFs produced no extractable text."
    )

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(chunks, convert_to_numpy=True)
# FAISS operates on C-contiguous float32 matrices; this does not copy when
# encode() already returned one.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Flat (exhaustive) L2 index sized to the embedding width.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print(f"FAISS index contains {index.ntotal} vectors")

In [None]:
#  Retrieval function

def retrieve(query, index, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Question text; embedded with the module-level `embed_model`.
        index: Object exposing `search(vectors, k)` that returns
            `(distances, ids)`, as a FAISS index does.
        chunks: Sequence of texts; position `i` must correspond to vector
            id `i` in `index`.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most `top_k` chunks in the order the index returned
        them. It is shorter than `top_k` when the index holds fewer vectors.
    """
    query_embedding = embed_model.encode([query])
    _, neighbor_ids = index.search(query_embedding, top_k)
    # FAISS pads the id row with -1 when it has fewer than top_k results.
    # Indexing chunks[-1] would silently return the last chunk as a bogus
    # hit, so ids outside the valid range are dropped.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Base checkpoint id and the path the LoRA adapter is loaded from.
model_name = "NousResearch/Llama-2-7b-chat-hf"
adapter_name = "/content/llama2-7b-qlora-adapter"

# Quantization settings: 4-bit NF4 weights, float16 compute,
# double quantization disabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Load base model with token
# Quantized base model. trust_remote_code is deliberately left at its default
# (off): this checkpoint uses an architecture shipped with transformers, so
# there is no need to allow repository-provided Python code to execute.
base = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token,
)

In [None]:
# Load LoRA adapter if exists
try:
    model = PeftModel.from_pretrained(base, adapter_name)
    print("LoRA adapter loaded successfully!")
except Exception as exc:
    # Falling back to the base model is still the intended best effort, but
    # the real reason is reported (a missing path is only one possible cause)
    # and KeyboardInterrupt/SystemExit are no longer swallowed.
    print(f"Adapter could not be loaded ({type(exc).__name__}: {exc}); using base model.")
    model = base

# trust_remote_code is left at its default (off); the tokenizer for this
# checkpoint ships with transformers and needs no repository code.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model.config.use_cache = True

In [None]:
#  Define RAG query function

def answer_query(query, top_k=3, max_new_tokens=300, include_prompt=False):
    """Answer a question using retrieved PDF chunks as context.

    Relies on the module-level `index`, `chunks`, `tokenizer`, `model`
    and `retrieve`.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve as context.
        max_new_tokens: Upper bound on the number of generated tokens.
        include_prompt: When True, return the decoded full sequence
            (prompt followed by the continuation), which is what this
            function returned before the prompt was trimmed off.

    Returns:
        The generated answer text. By default only the newly generated
        part is returned, stripped of surrounding whitespace.
    """
    retrieved_chunks = retrieve(query, index, chunks, top_k=top_k)
    context = "\n".join(retrieved_chunks)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    if include_prompt:
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    # For a causal LM the generated sequence starts with the prompt tokens;
    # skip them so the caller gets the answer rather than an echo of the
    # context and question.
    prompt_length = inputs["input_ids"].shape[-1]
    answer_ids = outputs[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

In [None]:
#  Run queries

# The three demo questions, asked in order.
query1 = "Summarize the PDF."
query2 = "Explain the main topics in the PDF."
query3 = "Write a few example questions from this PDF."

for question in (query1, query2, query3):
    print(answer_query(question))