Install dependencies

In [None]:
!pip install pymupdf loguru faiss-cpu sentence-transformers

Collecting pymupdf
  Using cached pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3

Set up the logger

In [None]:
from loguru import logger

# Route all log records to a file instead of the notebook output.
# remove() with no argument drops every handler registered so far, so
# re-running this cell does not stack duplicate file handlers.
logger.remove()
logger.add(sink="faiss_log.log", level="INFO")

# First record written to the file; confirms the sink is working.
logger.info("Logger initialized using loguru.")

Upload a PDF file

In [None]:
from google.colab import files

# Open Colab's upload widget; the result maps each uploaded file name to its
# content.
uploaded = files.upload()

# Guard against an empty result (presumably what a cancelled dialog yields):
# next(iter(...)) on an empty mapping would raise a bare StopIteration, which
# tells the user nothing about what went wrong.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a PDF.")

# Only the first uploaded file is processed by the rest of the notebook.
if len(uploaded) > 1:
    logger.warning(
        f"{len(uploaded)} files uploaded; only the first one will be processed."
    )

pdf_path = next(iter(uploaded))  # Get the uploaded file name
logger.info(f"PDF uploaded: {pdf_path}")

Saving Untitled document.pdf to Untitled document (3).pdf


Extract Text from PDF

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of dicts, one per page that contains text, each with:
          - "page": the 1-based page number within the document
          - "text": the page's text with surrounding whitespace stripped
        Pages whose stripped text is empty are omitted, so the list may be
        shorter than the document and page numbers may have gaps.
    """
    texts = []
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text().strip()
            if text:
                texts.append({
                    "page": page_number,
                    "text": text
                })
    logger.info(f"Extracted text from {len(texts)} pages.")
    return texts

texts = extract_text_from_pdf(pdf_path)
print(f" Extracted {len(texts)} pages with text.")

 Extracted 3 pages with text.


Store embeddings in FAISS

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by indexing (below) and by query embedding
# in the search cell.
model = SentenceTransformer("all-MiniLM-L6-v2")

def store_texts_in_faiss(texts, index_path="faiss_colab.index"):
    """Embed page texts, add them to a flat L2 FAISS index and save it.

    Args:
        texts: Non-empty sequence of dicts that each have a "text" key. The
            position of an entry in this sequence is the id FAISS returns
            for it at search time.
        index_path: File the index is written to. Only the vectors are
            persisted; the page texts themselves are not saved.

    Returns:
        A tuple (index, vectors_np): the populated faiss.IndexFlatL2 and the
        float32 array of embeddings, one row per entry of `texts`.

    Raises:
        ValueError: If `texts` is empty, since the embedding dimension cannot
            be determined and an empty index would be useless.
    """
    if not texts:
        raise ValueError("No text entries to index.")

    # Encode all pages in a single batched call instead of one call per page;
    # the result is a 2-D array with one row per text.
    embeddings = model.encode([entry["text"] for entry in texts])
    # FAISS requires C-contiguous float32 input.
    vectors_np = np.ascontiguousarray(embeddings, dtype="float32")
    dimension = vectors_np.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(vectors_np)

    faiss.write_index(index, index_path)
    logger.info(f"Stored {len(vectors_np)} vectors in FAISS index.")
    return index, vectors_np

index, vectors_np = store_texts_in_faiss(texts)
print(f" Stored {len(vectors_np)} vectors in FAISS.")

 Stored 3 vectors in FAISS.


Semantic search with word counts

In [None]:
import re

# STEP 6: Custom Query Search + Word Count
query_input = input(" Enter a search phrase: ").strip()

# Embed the query; passing a one-element list makes encode() return a 2-D
# array (one row), which is the shape index.search() expects.
query_vector = model.encode([query_input]).astype("float32")

# Ask for at most 3 neighbours, but never more than the index contains:
# FAISS pads missing results with label -1, which would otherwise be used
# as a (valid, but wrong) negative list index further down.
top_k = min(3, index.ntotal)

# D holds the distances, I the positions of the matching vectors in the index.
D, I = index.search(query_vector, k=top_k)

print(f"\n Top {top_k} results for: '{query_input}'\n")

def count_word_occurrences(text, word):
    """Count case-insensitive whole-word occurrences of `word` in `text`.

    A match must not be directly preceded or followed by a word character, so
    "AI" matches "AI." and "(ai)" but not "pain" or "AIs". `word` is matched
    literally (regex metacharacters are escaped) and may be a multi-word
    phrase.

    Args:
        text: The text to search.
        word: The word or phrase to count.

    Returns:
        The number of non-overlapping matches; 0 if `word` is empty.
    """
    # An empty pattern would match at every position, which is meaningless.
    if not word:
        return 0
    # Lookarounds instead of \b: \b only works when the phrase itself starts
    # and ends with a word character, so a query such as "C++" never matched.
    pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
    return len(re.findall(pattern, text, flags=re.IGNORECASE))

# Report each hit: the page it came from and how often the literal query
# phrase occurs on that page.
for rank, i in enumerate(I[0], start=1):
    # FAISS fills unused result slots with -1 when the index holds fewer
    # vectors than were requested; texts[-1] would silently report the last
    # page as a hit, so skip those placeholders.
    if i < 0:
        continue

    page_num = texts[i]["page"]
    page_text = texts[i]["text"]
    word_count = count_word_occurrences(page_text, query_input)

    print(f" Result {rank}: Page {page_num}")
    print(f" '{query_input}' appears {word_count} time(s).")



 Enter a search phrase: AI

 Top 3 results for: 'AI'

 Result 1: Page 1
 'AI' appears 8 time(s).
 Result 2: Page 2
 'AI' appears 2 time(s).
 Result 3: Page 3
 'AI' appears 1 time(s).
