# RAG Prep

### Loading & Dependencies

In [1]:
import json
import re
from typing import List, Dict
import spacy

In [2]:
# Load the scraped blog records. The file is read as JSON Lines: every
# non-empty line is parsed as one standalone JSON object.
data = []
with open("naukri_blogs.json", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # record; json.loads("") would raise JSONDecodeError.
            continue
        data.append(json.loads(line))

### Named-Entity (NER) Extraction

In [3]:
# Shared spaCy English pipeline; extract_ners and bm25_keywords both call it.
nlp = spacy.load("en_core_web_sm")

In [4]:
def extract_ners(text: str) -> List[Dict]:
    """Extract de-duplicated ORG and PERSON entities from *text*.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    kept entity has punctuation and digits removed and its whitespace
    collapsed; entities that end up empty are dropped. Duplicates are
    detected case-insensitively per label and the first spelling is kept.

    Args:
        text: Raw text to scan.

    Returns:
        A list of ``{"word": str, "label": str, "score": 1.0}`` dicts in
        order of first appearance. ``score`` is always ``1.0``.
    """
    wanted_labels = {"ORG", "PERSON"}
    ners = []
    seen = set()
    for ent in nlp(text).ents:
        if ent.label_ not in wanted_labels:
            continue
        cleaned = re.sub(r'[^\w\s]', '', ent.text)  # remove punctuation
        cleaned = re.sub(r'\d+', '', cleaned)  # remove digits
        # Collapse whitespace only AFTER the removals: deleting a token such
        # as "&" from "Ernst & Young" would otherwise leave a double space
        # inside the stored entity.
        cleaned = " ".join(cleaned.split())
        if not cleaned:
            continue
        entity = (cleaned.lower(), ent.label_)
        if entity in seen:  # already recorded under this label
            continue
        seen.add(entity)
        ners.append({
            "word": cleaned,
            "label": ent.label_,
            "score": 1.0,
        })
    return ners

In [5]:
def add_ners(data: List[Dict]):
    """Attach a ``'ner'`` list to every record in *data*, in place.

    For each record the entities of ``item['body']`` and ``item['title']``
    are extracted with ``extract_ners`` and merged (body first). An entity
    found in both is kept once, using the same case-insensitive
    ``(word, label)`` key that ``extract_ners`` uses within a single text.
    The record's ``item['topic']`` is appended last as an extra entry.

    Args:
        data: Records with at least ``'body'``, ``'title'`` and ``'topic'``.

    Returns:
        The same *data* list, with ``item['ner']`` set on every record.
    """
    for item in data:
        body_text = item['body']
        title_text = item['title']
        merged = []
        seen = set()
        for ner in extract_ners(body_text) + extract_ners(title_text):
            key = (ner['word'].lower(), ner['label'])
            if key in seen:  # title repeated an entity already found in body
                continue
            seen.add(key)
            merged.append(ner)
        # Give the topic entry the same keys as every other entry so
        # consumers can read ner['label'] without a KeyError.
        merged.append({'word': item['topic'], 'label': 'TOPIC', 'score': 1.0})
        item['ner'] = merged
    return data

In [6]:
# add_ners mutates the records in place and returns the same list.
data = add_ners(data)

### BM25 Keyword Extraction (terms for the inverted index)

In [7]:
def bm25_keywords(data: List[Dict])-> List[Dict]:
    """Store a lemmatised keyword list under ``item['bm25']`` for each record.

    Body and title go through the same steps: lowercase, delete punctuation,
    delete digits, tokenise with the module-level spaCy pipeline ``nlp``,
    then keep the lemma of every alphabetic, non-stop-word token longer than
    one character. Body keywords come first, title keywords after them.

    Args:
        data: Records with at least ``'body'`` and ``'title'``.

    Returns:
        The same *data* list; each record is updated in place.
    """
    punctuation = re.compile(r'[^\w\s]')
    digits = re.compile(r'\d+')

    def normalise(raw):
        # lower -> remove punctuation -> remove numbers
        return digits.sub('', punctuation.sub('', raw.lower()))

    for item in data:
        body_clean = normalise(item['body'])
        title_clean = normalise(item['title'])

        # Body document first, then title, so keyword order is body + title.
        parsed = (nlp(body_clean), nlp(title_clean))
        item['bm25'] = [
            tok.lemma_
            for doc in parsed
            for tok in doc
            if len(tok.text) > 1 and tok.is_alpha and not tok.is_stop
        ]
    return data

In [8]:
# bm25_keywords adds item['bm25'] to every record in place and returns the list.
data = bm25_keywords(data)

### SPLADE

In [9]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [10]:
# Tokenizer and masked-LM model loaded from the same checkpoint; both are
# used as module-level globals by splade_vectors.
splade_tokenizer = AutoTokenizer.from_pretrained("naver/splade-v3-distilbert")
splade_model = AutoModelForMaskedLM.from_pretrained("naver/splade-v3-distilbert")
splade_model.eval() # set to eval for inference

2025-06-22 21:05:53.453455: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-22 21:05:53.465533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750606553.478426 2047716 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750606553.482653 2047716 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750606553.494566 2047716 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [11]:
def splade_vectors(data: List[Dict], chunk_size= 128, overlap = 32) -> List[Dict]:
    """Store per-chunk sparse SPLADE vectors under ``item['splade']``.

    Each record's ``'body'`` is tokenised with the module-level
    ``splade_tokenizer`` and cut into windows of ``chunk_size`` tokens that
    advance by ``chunk_size - overlap`` tokens. Every window is decoded back
    to text, re-tokenised (truncated to the tokenizer's ``model_max_length``)
    and passed through the module-level ``splade_model``. The chunk weight
    for each index of the logits' last dimension is the maximum over token
    positions of ``log(1 + relu(logit))``, masked by the attention mask.
    Only non-zero weights are stored.

    Args:
        data: Records with at least a ``'body'`` string.
        chunk_size: Window length in tokens; must be positive.
        overlap: Tokens shared by consecutive windows; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The same *data* list; ``item['splade']`` is a list with one
        ``{index: weight}`` dict per chunk (empty list for an empty body).

    Raises:
        ValueError: If ``chunk_size`` / ``overlap`` would give a
            non-positive stride.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and 0 <= overlap < chunk_size, "
            f"got chunk_size={chunk_size}, overlap={overlap}"
        )
    stride = chunk_size - overlap

    for item in data:
        # The full body is tokenised only to slice it into windows; it is
        # never fed to the model in one piece, so the tokenizer's "sequence
        # length is longer than the maximum" warning is harmless here.
        tokens = splade_tokenizer.encode(item['body'], add_special_tokens=False)

        chunk_texts: List[str] = []
        for start in range(0, len(tokens), stride):
            window = tokens[start : start + chunk_size]
            chunk_texts.append(
                splade_tokenizer.decode(window, skip_special_tokens=True)
            )
            if start + chunk_size >= len(tokens):
                # This window already reaches the last token; any further
                # window would only repeat a suffix of it.
                break

        chunk_vectors: List[Dict[int, float]] = []  # one sparse vector per chunk
        for chunk_text in chunk_texts:
            inputs = splade_tokenizer(
                chunk_text,
                return_tensors="pt",
                max_length=splade_tokenizer.model_max_length,
                truncation=True)

            with torch.no_grad():
                output = splade_model(**inputs)
                weights = torch.max(  # splade formula
                    torch.log(1 + torch.relu(output.logits))
                    * inputs.attention_mask.unsqueeze(-1),
                    dim=1
                ).values.squeeze(0)

            # Keep only the non-zero entries; the vector is mostly zeros.
            nonzero_ids = weights.nonzero(as_tuple=True)[0]
            chunk_vectors.append(
                dict(zip(nonzero_ids.tolist(), weights[nonzero_ids].tolist()))
            )
        item['splade'] = chunk_vectors
    return data

In [12]:
# splade_vectors adds item['splade'] to every record in place and returns the list.
data = splade_vectors(data)

Token indices sequence length is longer than the specified maximum sequence length for this model (2097 > 512). Running this sequence through the model will result in indexing errors


In [13]:
# Sanity check: number of non-zero entries stored for the second chunk of the first record.
len(data[0]['splade'][1])

133

In [14]:
# Inspect which fields the first record carries at this point.
data[0].keys()

dict_keys(['topic', 'title', 'url', 'body', 'ner', 'bm25', 'splade'])

### Dense Indexes (FAISS) & Metadata formation

In [15]:
from sentence_transformers import SentenceTransformer

In [16]:
# Dense embedding model; passed to process() to embed each document.
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

In [17]:
import torch
import os
import numpy as np
import faiss
import pickle as pkl

In [18]:
def normal_split(text, sz= 512, ov = 64) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``sz - ov`` words apart. Splitting stops with
    the first chunk that reaches the last word, so no chunk is merely a
    suffix of the previous one.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        sz: Maximum number of words per chunk; must be positive.
        ov: Number of words shared by consecutive chunks; must satisfy
            ``0 <= ov < sz``.

    Returns:
        The chunks, each re-joined with single spaces. Empty or
        whitespace-only *text* yields ``[]``.

    Raises:
        ValueError: If ``sz <= 0`` or ``ov`` is outside ``[0, sz)``. Without
            this check ``ov >= sz`` never advances (infinite loop) and a
            negative ``ov`` silently skips words.
    """
    if sz <= 0 or not 0 <= ov < sz:
        raise ValueError(
            f"need sz > 0 and 0 <= ov < sz, got sz={sz}, ov={ov}"
        )
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), sz - ov):
        chunks.append(" ".join(tokens[start:start + sz]))
        if start + sz >= len(tokens):
            break  # this chunk already contains the final word
    return chunks

In [19]:
def encode(model, chunks, batch_size=32) -> List[List[float]]:
    """Embed *chunks* in slices of ``batch_size`` and return one flat list.

    Each slice is passed to ``model.encode(..., convert_to_tensor=False)``
    and the rows it yields are appended in input order. The CUDA cache is
    released with ``torch.cuda.empty_cache()`` before and after every slice.

    Args:
        model: Object exposing ``encode(batch, convert_to_tensor=...)``.
        chunks: Sequence of texts to embed.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        One entry per input text, in order; ``[]`` when *chunks* is empty.
    """
    collected = []
    for start in range(0, len(chunks), batch_size):
        torch.cuda.empty_cache()
        stop = start + batch_size
        vectors = model.encode(chunks[start:stop], convert_to_tensor=False)
        for vector in vectors:
            collected.append(vector)
        torch.cuda.empty_cache()
    return collected

In [20]:
def process(model, data: List[Dict]) -> List[Dict]:
    """Embed every document and bundle it with its retrieval fields.

    One embedding is computed per record from its title and body, encoded
    one document per ``encode`` call. The text is not chunked here.
    NOTE(review): any truncation of long documents is left to the model's
    own limits; confirm that is intended.

    Args:
        model: Embedding model forwarded to ``encode``.
        data: Records carrying ``'topic'``, ``'title'``, ``'url'``,
            ``'body'``, ``'ner'``, ``'bm25'`` and ``'splade'``.

    Returns:
        A new list of dicts with those fields plus ``'doc_idx'`` (the
        record's position in *data*) and ``'embedding'``.
    """
    output = []
    for doc_idx, item in enumerate(data):
        # Separate title and body with a newline: plain concatenation fused
        # the title's last word with the body's first word.
        total_text = item['title'] + "\n" + item['body']

        doc_embedding = encode(model, [total_text])[0]

        output.append({
            'topic': item['topic'],
            'title': item['title'],
            'url': item['url'],
            'body': item['body'],  # whole text
            'doc_idx': doc_idx,    # unique identifier for the original document
            'ner': item['ner'],
            'bm25': item['bm25'],
            'splade': item['splade'],
            'embedding': doc_embedding,  # single embedding for the whole document
        })
    return output

In [21]:
def extract(data):
    """Separate processed records into an embedding matrix and metadata.

    Args:
        data: Records as produced by ``process``; each must carry
            ``'embedding'`` plus the metadata fields listed below.

    Returns:
        ``(embeddings, meta)`` where ``embeddings`` is ``np.vstack`` of the
        per-record embeddings (row ``i`` belongs to ``data[i]``) and
        ``meta`` is a parallel list of dicts without the embedding.
        ``doc_idx`` is kept so results can be merged per document later
        (e.g. during RRF).
    """
    embeddings = np.vstack([record['embedding'] for record in data])
    meta_fields = (
        'topic', 'title', 'doc_idx', 'url',
        'body', 'ner', 'bm25', 'splade',
    )
    meta = [
        {field: record[field] for field in meta_fields}
        for record in data
    ]
    return embeddings, meta

In [22]:
def save_pickle(path, obj):
    """Serialise *obj* with pickle into the file at *path*.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(path, 'wb') as sink:
        pkl.dump(obj, sink)

In [23]:
# One output record per document: the original fields plus doc_idx and embedding.
processed = process(model, data)

In [24]:
# Stacked embedding matrix and the parallel list of metadata dicts.
embeddings, metadata = extract(processed)

In [25]:
# Inspect the metadata fields kept for the first document.
metadata[0].keys()

dict_keys(['topic', 'title', 'doc_idx', 'url', 'body', 'ner', 'bm25', 'splade'])

In [26]:
# Build an inner-product FAISS index over the document embeddings.
dim = embeddings.shape[1]  # embedding width = number of columns
index = faiss.IndexFlatIP(dim)
# normalize_L2's result is not assigned: it is relied on to rescale the rows
# of `embeddings` in place. With unit-length rows the inner product equals
# cosine similarity. NOTE(review): FAISS expects a float32 array here; confirm
# the model's output dtype.
faiss.normalize_L2(embeddings)
index.add(embeddings)

In [27]:
# Persist everything under ./indexes: the FAISS index, the embedding matrix
# (as it is after the normalize_L2 call) and the per-document metadata.
os.makedirs('indexes', exist_ok=True)
faiss.write_index(index, os.path.join('indexes', 'faiss.index'))
save_pickle(os.path.join('indexes', 'embeddings.pkl'), embeddings)
save_pickle(os.path.join('indexes', 'metadata.pkl'), metadata)