<a href="https://colab.research.google.com/github/ashfaq-polit/Large_language_models/blob/master/Reranking_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Re-run setup after Colab reset
!pip install -qU \
    datasets==2.14.5 \
    faiss-cpu \
    sentence-transformers \
    langchain \
    langchain-community

import os
from datasets import load_dataset
from tqdm.auto import tqdm
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import torch

# Load the chunked arXiv dataset and reshape each row into id / text / metadata
def _to_record(row):
    """Build the combined id, the chunk text and a metadata dict for one row."""
    chunk_text = row["chunk"]
    metadata = {
        "title": row["title"],
        "url": row["source"],
        "primary_category": row["primary_category"],
        "published": row["published"],
        "updated": row["updated"],
        "text": chunk_text,
    }
    return {
        "id": f'{row["id"]}-{row["chunk-id"]}',
        "text": chunk_text,
        "metadata": metadata,
    }


# Source columns dropped once the mapped id / text / metadata columns exist
_SOURCE_COLUMNS = [
    "title", "summary", "source", "authors", "categories", "comment",
    "journal_ref", "primary_category", "published", "updated", "references",
    "doi", "chunk-id", "chunk",
]

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data = data.map(_to_record)
data = data.remove_columns(_SOURCE_COLUMNS)

# Embedding model for the vector store (no API key needed); uses the GPU when one is available
if torch.cuda.is_available():
    _embed_device = "cuda"
else:
    _embed_device = "cpu"

embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": _embed_device},
)

# Create the FAISS index, or reuse the copy saved by an earlier run.
# NOTE(review): the saved copy is reused whenever the directory exists, so it is
# not rebuilt when MAX_INDEXED_DOCS or the embedding model changes — delete the
# directory to force a rebuild.
index_path = "rerankers_faiss_index"
MAX_INDEXED_DOCS = 500  # limit for demo
if os.path.exists(index_path):
    print("📂 Loading existing FAISS index...")
    # allow_dangerous_deserialization opts in to deserializing the saved index
    # files (pickle-based per the LangChain docs); only load a directory that
    # this notebook wrote itself.
    index = FAISS.load_local(index_path, embed_model, allow_dangerous_deserialization=True)
else:
    print("🆕 Creating new FAISS index...")
    # Materialise each row once instead of indexing the dataset twice per document
    _rows = (data[i] for i in range(min(len(data), MAX_INDEXED_DOCS)))
    documents = [
        Document(page_content=row["text"], metadata=row["metadata"])
        for row in _rows
    ]
    index = FAISS.from_documents(documents, embed_model)
    index.save_local(index_path)

# Function to retrieve similar documents
def get_docs(query: str, top_k: int) -> list[str]:
    """Return the page content of the ``top_k`` hits from a similarity search for ``query``."""
    texts = []
    for hit in index.similarity_search(query, k=top_k):
        texts.append(hit.page_content)
    return texts

# Example: retrieve ten chunks for a sample question and preview the first three
query = "can you explain why we would want to do rlhf?"
docs = get_docs(query=query, top_k=10)
docs[:3]

📂 Loading existing FAISS index...


['Human I really need the books and at least one other object.\nRL+ROLLOUTS Ok, you can have one book and one ball\nHuman If I can have all the books, I can leave you the rest.\nRL+ROLLOUTS Deal\nOutput Reward\nRL+ROLLOUTS 3xhat1xball 7/10\nHuman 3x book 6/10\nFigure 6: Example of model compromising.\nutterances produced by our model, and found that\nthe overwhelming majority were ﬂuent English\nsentences in isolation—showing that the model\nhas learnt a good language model for the domain\n(in addition to results that show it uses language\neffectively to achieve its goals). These results suggest that although neural models are prone to the\nsafer option of repeating sentences from training\ndata, they are capable of generalising when necessary. Future work should choose domains that\nforce a higher degree of diversity in utterances.\nMaintaining multi-sentence coherence is challenging. One common linguistic error we see\nRL+ROLLOUTS make is to start a message by indicating agreement (

# *ReRanking*

In [6]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from typing import List, Dict

# 1. Load the local embedding model used by rerank() (no API key required)
# NOTE(review): this is the same checkpoint name that is passed to
# HuggingFaceEmbeddings for retrieval, so the reranked order may simply mirror
# the retrieval order — confirm whether a different reranking model is intended.
model = SentenceTransformer(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")

# 2. Define rerank function
def rerank(query: str, documents: List[str], top_n: int = 3) -> List[Dict]:
    """Score ``documents`` against ``query`` by cosine similarity and keep the best.

    Args:
        query: Text to compare the documents against.
        documents: Candidate texts to score.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` dicts ordered by descending score, each with ``index``
        (position in ``documents``, a plain ``int``), ``score`` (cosine
        similarity as a ``float``) and ``text``. The list is empty when
        ``documents`` is empty or ``top_n`` is not positive.
    """
    # Nothing to rank: return before touching the encoder
    if not documents or top_n <= 0:
        return []

    # Embed query and documents with the module-level sentence-transformers model
    query_embedding = model.encode(query, convert_to_tensor=True)
    doc_embeddings = model.encode(documents, convert_to_tensor=True)

    # Row 0 holds the similarity of the single query to every document
    scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0].cpu().numpy()

    # Negate so that argsort's ascending order yields the highest similarity first
    top_results = np.argsort(-scores)[:top_n]

    # Convert numpy scalars to built-in types so the result is plain Python data
    return [
        {"index": int(i), "score": float(scores[i]), "text": documents[i]}
        for i in top_results
    ]


In [11]:
def compare(query: str, top_k: int, top_n: int):
    """Print the reranked results alongside their original retrieval positions.

    Args:
        query: Search text passed to both retrieval and reranking.
        top_k: Number of documents fetched with ``get_docs``.
        top_n: Number of reranked results to print.
    """
    # Vector search results: a list of document texts in retrieval order
    docs = get_docs(query, top_k=top_k)

    # Rerank
    reranked = rerank(query, docs, top_n)

    # Compare order
    print(f"\nQuery: {query}\n")
    for i, result in enumerate(reranked):
        # rerank() reports each hit's position in the list it was given. Using
        # it avoids list.index(), which returns the first match and therefore
        # misreports the position when two retrieved chunks have identical text.
        orig_pos = result["index"]
        print(f"[{i}] Reranked from original index {orig_pos} with score {result['score']:.4f}")
        print(result["text"])
        print("\n---\n")



In [12]:
compare(query="what is red teaming?", top_k=25, top_n=3)



Query: what is red teaming?

[0] Reranked from original index 0 with score 0.2547
(q,a)pairs as output. Checkpoints are selected
by Exact Match score on a development set. We
also include a much more powerful T5-11B model
from Roberts et al. (2020 ). We use the T511B model which has been pretrained with a special “Salient Span Masking” objective ( Guu et al. ,
2020 ), designed to improve downstream ODQA
ModelOpen Natural Questions TriviaQA WebQuestions
TotalQuestion
OverlapAnswer
Overlap
OnlyNo
OverlapTotalQuestion
OverlapAnswer
Overlap
OnlyNo
OverlapTotalQuestion
OverlapAnswer
Overlap
OnlyNo
Overlap
Open
bookRAG 44.5 70.7 34.9 24.8 56.8 82.7 54.7 29.2 45.5 81.0 45.8 21.1
DPR 41.3 69.4 34.6 19.3 57.9 80.4 59.6 31.6 42.4 74.1 39.8 22.2
FID 51.4 71.3 48.3 34.5 67.6 87.5 66.9 42.8 - - - Closed
bookT5-11B+SSM 36.6 77.2 22.2 9.4 - - - - 44.7 82.1 44.5 22.0

---

[1] Reranked from original index 1 with score 0.2372
Centipede 3496.5 8904.8 4386.4
ChopperCommand 1171.7 5287.7 3516.3
CrazyClim