In [4]:
#!/usr/bin/env python3
"""
retriever_embeddings.py

1) Load JSON‑extracted paper texts from data/text_json/
2) Split into ~500‑token chunks via TokenTextSplitter(encoding_name="cl100k_base")
3) Embed chunks with sentence-transformers/all-mpnet-base-v2 (CPU)
4) Build & save a FAISS index under retriever/faiss_index/
5) Reload the index and run a sample similarity search
"""

import os
import json
import time
from transformers import AutoTokenizer
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# ─── CONFIG ────────────────────────────────────────────────────────────────────

TEXT_JSON_DIR      = r"C:\Users\offic\AGENT\data\text_json"
INDEX_DIR          = r"C:\Users\offic\AGENT\retriever\faiss_index"
EMBED_MODEL        = "sentence-transformers/all-mpnet-base-v2"

TOKEN_CHUNK_SIZE    = 500
TOKEN_CHUNK_OVERLAP = 50

EMBED_BATCH_SIZE    = 64

# ─── PREPARE ───────────────────────────────────────────────────────────────────

os.makedirs(INDEX_DIR, exist_ok=True)
print(f"Reading JSON from: {TEXT_JSON_DIR}")
print(f"Index will be saved to: {INDEX_DIR}\n")

# ─── STEP 1: LOAD DOCUMENTS ────────────────────────────────────────────────────

docs = []
for fname in sorted(os.listdir(TEXT_JSON_DIR)):
    if fname.endswith(".json"):
        path = os.path.join(TEXT_JSON_DIR, fname)
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
            docs.append(data.get("full_text", ""))
print(f"Loaded {len(docs)} documents.\n")

# ─── STEP 2: TOKEN‑BASED SPLITTING ──────────────────────────────────────────────

print(f"Splitting into ~{TOKEN_CHUNK_SIZE}-token chunks (+{TOKEN_CHUNK_OVERLAP} overlap)")
splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=TOKEN_CHUNK_SIZE,
    chunk_overlap=TOKEN_CHUNK_OVERLAP,
)

chunks = []
for d in docs:
    chunks.extend(splitter.split_text(d))
print(f"Created {len(chunks)} token chunks.")

# (Optional) inspect chunk sizes
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
lengths = [len(tokenizer.encode(c)) for c in chunks]
print(f"Max tokens/chunk: {max(lengths)}, mean: {sum(lengths)/len(lengths):.1f}\n")

# ─── STEP 3: EMBEDDINGS & FAISS INDEX ──────────────────────────────────────────

print("Initializing embeddings model (CPU):", EMBED_MODEL)
emb = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    encode_kwargs={"batch_size": EMBED_BATCH_SIZE},
)

print("Building FAISS index (this may take a few minutes)...")
t0 = time.time()
vectorstore = FAISS.from_texts(chunks, emb)
vectorstore.save_local(INDEX_DIR)
print(f"✅ Index built & saved in {time.time() - t0:.1f}s\n")

# ─── STEP 4: LOAD & TEST RETRIEVAL ─────────────────────────────────────────────

print("Reloading FAISS index with pickle deserialization enabled")
vs = FAISS.load_local(
    INDEX_DIR,
    emb,
    allow_dangerous_deserialization=True
)
print("Index loaded. Running sample query...")

query = "Which architectures used ResNet?"
results = vs.similarity_search(query, k=5)

print("\nTop 5 chunks for query:")
for i, doc in enumerate(results, 1):
    snippet = doc.page_content.replace("\n", " ")
    print(f"{i}. {snippet[:200]}…\n")


Reading JSON from: C:\Users\offic\AGENT\data\text_json
Index will be saved to: C:\Users\offic\AGENT\retriever\faiss_index

Loaded 67 documents.

Splitting into ~500-token chunks (+50 overlap)
Created 2454 token chunks.


Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors


Max tokens/chunk: 635, mean: 463.3

Initializing embeddings model (CPU): sentence-transformers/all-mpnet-base-v2
Building FAISS index (this may take a few minutes)...
✅ Index built & saved in 1138.7s

Reloading FAISS index with pickle deserialization enabled
Index loaded. Running sample query...

Top 5 chunks for query:
1. 44 ± 0.77 60.60 ± 0.71 Prototypical Networks (Snell et al., 2017) conv (64)×4 49.42 ± 0.78 68.20 ± 0.66 MAML (Finn et al., 2017) conv (32)×4 48.70 ± 1.84 63.11 ± 0.92 R2D2 (Bertinetto et al., 2018) co…

2. V  Output Input Input Output Output + Path A Path B Figure 1: The architecture of ResNet-50. The convolution kernel size, output channel size and stride size (default is 1) are illustrated, simila…

3.  82.32±0.14 71.66±0.23 85.50±0.15 DMF [66] ResNet-12 12.4 M 67.76±0.46 82.71±0.31 71.89±0.52 85.96±0.35 InfoPatch [39] ResNet-12 12.4 M 67.67±0.45 82.44±0.31 - - BML [77] ResNet-12 12.4 M 67.04±0.63 8…

4. ×1, 1024  ×36 conv5 x 7×7  3×3, 512 3×3, 512  ×2  3×