In [2]:
#!/usr/bin/env python3
"""
retriever_rerank_rag.py

1) Load JSON‑extracted texts
2) Token‑chunk to ≤480 tokens
3) Embed & build FAISS
4) Reload FAISS
5) Semantic top‑50 → keyword filter → cross‑encoder rerank → top‑5
6) Build RetrievalQA chain
7) Run CIFAR‑10 query
"""

import os
import json
import time

from transformers import AutoTokenizer, pipeline as hf_pipeline
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers.cross_encoder import CrossEncoder
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.schema import Document, BaseRetriever

# ─── CONFIG ────────────────────────────────────────────────────────────────────
TEXT_JSON_DIR       = r"C:\Users\offic\AGENT\data\text_json"
INDEX_DIR           = r"C:\Users\offic\AGENT\retriever\faiss_index"

EMBED_MODEL         = "sentence-transformers/msmarco-distilbert-base-v4"
EMBED_BATCH_SIZE    = 64

TOKEN_CHUNK_SIZE    = 480
TOKEN_CHUNK_OVERLAP = 50

SEMANTIC_K          = 50
FINAL_K             = 5
KEYWORD_FILTER      = "CIFAR-10"

RERANK_MODEL        = "cross-encoder/ms-marco-MiniLM-L-6-v2"
READER_MODEL        = "google/flan-t5-large"

# ─── PREPARE ───────────────────────────────────────────────────────────────────
os.makedirs(INDEX_DIR, exist_ok=True)
print("JSON dir:", TEXT_JSON_DIR)
print("Index dir:", INDEX_DIR)

# ─── STEP 1: LOAD DOCUMENTS ────────────────────────────────────────────────────
docs = []
for fn in sorted(os.listdir(TEXT_JSON_DIR)):
    if fn.endswith(".json"):
        with open(os.path.join(TEXT_JSON_DIR, fn), encoding="utf-8") as f:
            docs.append(json.load(f).get("full_text", ""))
print(f"Loaded {len(docs)} documents\n")

# ─── STEP 2: TOKEN‑BASED SPLITTING ──────────────────────────────────────────────
print(f"Chunking to ~{TOKEN_CHUNK_SIZE} tokens (+{TOKEN_CHUNK_OVERLAP} overlap)")
splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=TOKEN_CHUNK_SIZE,
    chunk_overlap=TOKEN_CHUNK_OVERLAP,
)
chunks = [chunk for doc in docs for chunk in splitter.split_text(doc)]
print(f"→ {len(chunks)} chunks")

# sanity check
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
lengths = [len(tokenizer.encode(c)) for c in chunks]
print("Max tokens/chunk:", max(lengths), "Mean:", sum(lengths)/len(lengths), "\n")

# ─── STEP 3: EMBEDDING & FAISS BUILD ────────────────────────────────────────────
print("Embedding and building FAISS index")
emb = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    encode_kwargs={"batch_size": EMBED_BATCH_SIZE, "truncation": True, "max_length": 512},
)
t0 = time.time()
vs = FAISS.from_texts(chunks, emb)
vs.save_local(INDEX_DIR)
print(f"Index built in {time.time() - t0:.1f}s\n")

# ─── STEP 4: RELOAD INDEX ───────────────────────────────────────────────────────
print("Reloading FAISS index")
vs = FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)

# ─── STEP 5: SEMANTIC → KEYWORD → RERANK ────────────────────────────────────────
print(f"Loading cross-encoder: {RERANK_MODEL}")
reranker = CrossEncoder(RERANK_MODEL)

def retrieve_and_rerank(query: str):
    # 1) semantic top‑K
    sem_docs = vs.similarity_search(query, k=SEMANTIC_K)
    # 2) keyword filter
    filtered = [d for d in sem_docs if KEYWORD_FILTER.lower() in d.page_content.lower()]
    candidates = filtered if len(filtered) >= FINAL_K else sem_docs
    # 3) rerank via cross‑encoder
    pairs = [[query, d.page_content] for d in candidates]
    scores = reranker.predict(pairs)
    ranked = [candidates[i] for i in sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)]
    return ranked[:FINAL_K]

# quick test of reranker
q = "Which approaches used ResNet backbone?"
top_docs = retrieve_and_rerank(q)
print(f"Top‑{FINAL_K} reranked passages:")
for i, d in enumerate(top_docs, 1):
    print(f"{i}. {d.page_content[:200].replace(chr(10),' ')}…\n")

# ─── STEP 6: BUILD & RUN RETRIEVALQA ────────────────────────────────────────────
print("Setting up RetrievalQA with FLAN-T5")
hf_pipe = hf_pipeline("text2text-generation", model=READER_MODEL, max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=hf_pipe)

class RerankRetriever(BaseRetriever):
    """A LangChain Retriever that wraps our reranking function."""
    def get_relevant_documents(self, query: str) -> list[Document]:
        return retrieve_and_rerank(query)

    async def aget_relevant_documents(self, query: str) -> list[Document]:
        return self.get_relevant_documents(query)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=RerankRetriever(),
    chain_type="stuff",
    return_source_documents=True
)

print("\n=== Running RAG Query ===")
res = qa({"query": q})
print("\nAnswer:\n", res["result"])
print("\nSources:")
for i, doc in enumerate(res["source_documents"], 1):
    snippet = doc.page_content.replace("\n", " ")
    print(f"{i}. {snippet[:200]}…")


JSON dir: C:\Users\offic\AGENT\data\text_json
Index dir: C:\Users\offic\AGENT\retriever\faiss_index
Loaded 67 documents

Chunking to ~480 tokens (+50 overlap)
→ 2570 chunks


Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Max tokens/chunk: 626 Mean: 444.6225680933852 

Embedding and building FAISS index
Index built in 642.1s

Reloading FAISS index
Loading cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
Top‑5 reranked passages:
1.  The predominant methodology of the baselines: Ind.: inductive inference, TF: transductive feature extraction methods, TI: transductive inference methods. Conv: convolutional blocks, RN: ResNet backbo…

2.  ﬁnal output feature di- mension is 64. The ResNet-12 backbone is used in most of the state-of-the-art models (Zhang et al. 2020; Ye et al. 2020; Liu et al. 2020). It consists of four residual blocks,…

3. /a 85.0 ± n/a APN ResNet-12 69.87 ±0.32 86.35 ±0.41 FRN ResNet-12 71.16 ±0.22 86.01 ±0.15 HGNN ResNet-12 72.05 ±0.23 86.49 ± 0.15 Table 2: Results on TieredImageNet Methods Backbone 1-shot 5-shot Prot…

4. -18, and ­DenseNet42. EfficientNetV2 was selected for its state-of-the-art performance on mod- ern, image classification datasets. ResNet-18 and DenseNet were selecte

Device set to use cpu
  class RerankRetriever(BaseRetriever):
  class RerankRetriever(BaseRetriever):



=== Running RAG Query ===


Token indices sequence length is longer than the specified maximum sequence length for this model (2481 > 512). Running this sequence through the model will result in indexing errors



Answer:
 most of the state-of-the-art models (Zhang et al. 2020; Ye et al. 2020; Liu et al., and ­DenseNet42.

Sources:
1.  The predominant methodology of the baselines: Ind.: inductive inference, TF: transductive feature extraction methods, TI: transductive inference methods. Conv: convolutional blocks, RN: ResNet backbo…
2.  ﬁnal output feature di- mension is 64. The ResNet-12 backbone is used in most of the state-of-the-art models (Zhang et al. 2020; Ye et al. 2020; Liu et al. 2020). It consists of four residual blocks,…
3. /a 85.0 ± n/a APN ResNet-12 69.87 ±0.32 86.35 ±0.41 FRN ResNet-12 71.16 ±0.22 86.01 ±0.15 HGNN ResNet-12 72.05 ±0.23 86.49 ± 0.15 Table 2: Results on TieredImageNet Methods Backbone 1-shot 5-shot Prot…
4. -18, and ­DenseNet42. EfficientNetV2 was selected for its state-of-the-art performance on mod- ern, image classification datasets. ResNet-18 and DenseNet were selected for their state-of-the-art perfo…
5. ﬁcult to tune for larger architectures. We speculate tha