In [12]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-6.6.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.6.0-py3-none-any.whl (328 kB)
Installing collected packages: pypdf
Successfully installed pypdf-6.6.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
import json
from pathlib import Path
from pypdf import PdfReader

# ---- Paths ----
# Everything is resolved relative to the current working directory.
BASE = Path(".")
DATA_DIR, ART_DIR = BASE / "data", BASE / "artifacts"

# Create both output folders; exist_ok makes re-running this cell harmless.
for out_dir in (DATA_DIR, ART_DIR):
    out_dir.mkdir(exist_ok=True)

# ---- PDF path ----
PDF_PATH = BASE / "40 CFR 63.184 (up to date as of 1-08-2026) (1).pdf"

# ---- Read PDF ----
reader = PdfReader(str(PDF_PATH))

# Pull the text out of each page, keeping only pages that yielded
# something non-empty.
extracted = (page.extract_text() for page in reader.pages)
pages_text = [page_text for page_text in extracted if page_text]

# Single string for the whole document, pages separated by a newline.
full_text = "\n".join(pages_text)

# ---- Create docs ----
# A single record holding the full extracted text plus a human-readable
# source label.
docs = [{
    # "§" (section sign) replaces the mojibake "ยง", which is what the
    # UTF-8 bytes of "§" look like when decoded with a Thai code page.
    "source": "40 CFR §63.184 (PDF)",
    "text": full_text
}]

# ---- Write output ----
# json.dumps escapes non-ASCII characters by default, so the section sign
# is stored as \u00a7 and round-trips through json.loads unchanged.
(DATA_DIR / "raw_docs.json").write_text(
    json.dumps(docs, indent=2),
    encoding="utf-8"
)

# Sanity report: the page count includes pages that produced no text.
print("Pages read:", len(reader.pages))
print("Characters extracted:", len(full_text))


Pages read: 14
Characters extracted: 48348


In [14]:
def chunk_text(text, size=800, overlap=150):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: String to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < size``.

    Returns:
        List of chunk strings in document order. Empty when ``text`` is
        empty. The final chunk may be shorter than ``size``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. Without this check the window would never
            advance (endless loop) or would skip text between chunks.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    step = size - overlap  # how far the window advances each iteration
    chunks = []
    start = 0
    while start < len(text):
        end = start + size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window
        # would only repeat a suffix of this chunk, so stop here.
        if end >= len(text):
            break
        start += step
    return chunks

# Flatten every document into chunk records. chunk_id is the position of
# the chunk within its own document, so it restarts at 0 for each document.
chunks = [
    {"source": doc["source"], "chunk_id": idx, "text": piece}
    for doc in docs
    for idx, piece in enumerate(chunk_text(doc["text"]))
]

# Persist the chunk records as pretty-printed JSON.
chunks_path = ART_DIR / "chunks.json"
chunks_path.write_text(json.dumps(chunks, indent=2), encoding="utf-8")

len(chunks)


75

In [15]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Encode the text of every chunk, asking the model for normalized vectors.
# With unit-length vectors, the inner product used by the index below is
# the cosine similarity.
texts = [chunk["text"] for chunk in chunks]
embeddings = model.encode(texts, normalize_embeddings=True)

# Flat inner-product index whose dimensionality is the embedding width.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings.astype(np.float32))

faiss.write_index(index, str(ART_DIR / "faiss.index"))

# Record which model produced the vectors and how many chunks were indexed.
meta = {"model": model_name, "count": len(chunks)}
(ART_DIR / "meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

embeddings.shape


(75, 384)

In [16]:
import os
# Print each entry of the artifacts folder with its size in bytes.
# Reuses ART_DIR rather than a second hard-coded "artifacts" path, and sorts
# the entries so the listing order is stable between runs.
for artifact in sorted(ART_DIR.iterdir()):
    print(artifact.name, artifact.stat().st_size)


chunks.json 67607
faiss.index 115245
meta.json 73
