In [2]:
!pip install sentence-transformers faiss-cpu fitz

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting fitz
  Using cached fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting httplib2 (from fitz)
  Using cached httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nipype (from fitz)
  Using cached nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Using cached prov-2.0.2-py3-none-any.whl.metadata (3.7 kB)
Collecting etelemetry>=0.3.1 (from nipype->fitz)
  Using cached etelemetry-0.3.1-py3-none-any.whl.metadata (3.2 kB)
Using cached sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Using cached fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Using cached httplib2-0.22.0-py3-none-any.whl (96 kB)
Using cached nipype-1.10.0-py3-none-any.whl (3.2 MB)
Using cached etelemetry-0.3.1-py3-none-any.whl (6.4 kB)
Using cached prov-2.0.2-py3-none-any.whl (421 kB)
Installing collected packages: httplib2, prov, etel

In [3]:
import json

# Load the previously extracted page records. The file is decoded explicitly as
# UTF-8 so non-ASCII page text is read the same way on every platform instead of
# depending on the locale's default encoding.
with open("extracted_pages.json", "r", encoding="utf-8") as f:
    pages = json.load(f)

print(f"Loaded {len(pages)} pages.")

Loaded 128 pages.


In [4]:
def chunk_text(pages, chunk_size=150):
    """Split each page's text into consecutive, non-overlapping word chunks.

    Args:
        pages: Iterable of dicts, each providing a ``'text'`` string and a
            ``'page_num'`` value.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            Must be at least 1.

    Returns:
        A list of ``{'page_num': ..., 'chunk': ...}`` dicts, in page order and
        then in word order within each page. Pages whose text is empty or
        whitespace-only contribute no chunks. Chunks never span two pages.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently drop every
    # page; a zero step raises an unhelpful range() error. Fail clearly instead.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    for page in pages:
        words = page['text'].split()
        for start in range(0, len(words), chunk_size):
            # split() never yields empty tokens, so each slice joins to
            # non-blank text and needs no further emptiness check.
            chunks.append({
                'page_num': page['page_num'],
                'chunk': ' '.join(words[start:start + chunk_size]),
            })
    return chunks

# Chunk every loaded page using chunk_text's default chunk size, then report
# how many chunks were produced in total.
chunks = chunk_text(pages)
print(f"Generated {len(chunks)} chunks.")

Generated 378 chunks.


In [5]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by its model name.
model_embed = SentenceTransformer("all-MiniLM-L6-v2")

# One input string per chunk record, kept in the same order as `chunks`.
texts = [record['chunk'] for record in chunks]

# Encode all chunk texts in one call; the result is requested as a NumPy array.
embeddings = model_embed.encode(
    texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 12/12 [00:06<00:00,  1.82it/s]


In [8]:
import faiss
import numpy as np

# Build a flat L2 index whose vector width matches the embedding matrix.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# FAISS operates on C-contiguous float32 matrices; coerce explicitly so the cell
# does not depend on the encoder's output dtype (no copy is made when the array
# already has that layout).
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

# Vectors are added in chunk order, so a row's position in the index is
# presumably what later code uses to look up the saved chunk metadata.
# Fail loudly if the two ever get out of sync.
assert index.ntotal == len(chunks), (
    f"index holds {index.ntotal} vectors but there are {len(chunks)} chunks"
)

faiss.write_index(index, "vector_index.faiss")

import json
# Persist the chunk records as JSON so they can be reloaded alongside the
# saved FAISS index.
with open("chunks_metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(chunks))

print("FAISS index and metadata saved.")

FAISS index and metadata saved.
