# RAG Recommendations & Evaluation

### Loading dependencies & data, Helper functions

In [1]:
import faiss
import pickle
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

2025-06-22 21:42:34.987156: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-22 21:42:34.999012: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750608755.012380 2079463 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750608755.016040 2079463 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750608755.026755 2079463 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Read one query per line, dropping surrounding whitespace and blank lines.
with open('queries_naukri.txt', encoding="utf-8") as f:
    queries_f = [query for query in map(str.strip, f) if query]

In [3]:
queries_f[0]

'Essential entrepreneur skills to develop'

In [4]:
# Read one query per line, dropping surrounding whitespace and blank lines.
with open('exact_queries.txt', encoding="utf-8") as f:
    queries_e = [query for query in map(str.strip, f) if query]

In [5]:
queries_e[450]

'Popular Artificial Intelligence & Machine Learning Careers'

In [6]:
# Sentence-embedding model that query_embed() uses to encode query text.
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

In [7]:
# Load the per-entry records (a list of dicts) used by every retriever below.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a metadata.pkl that comes from a trusted source.
with open("metadata.pkl", 'rb') as f:
    metadata = pickle.load(f)

In [8]:
metadata[0].keys()

dict_keys(['topic', 'title', 'doc_idx', 'url', 'body', 'ner', 'bm25', 'splade'])

In [9]:
type(metadata)

list

In [10]:
type(metadata[0])

dict

In [11]:
# FAISS index queried by dense_search(); the ids it returns are used as
# positions into `metadata`.
index = faiss.read_index("faiss.index")

In [12]:
def print_recomm(recom):
    """Print a ranked list of ``(score, doc_dict)`` recommendations.

    For every entry the 1-based rank, the score (four decimals) and the
    ``doc_idx``, ``title`` and ``url`` fields of ``doc_dict`` are printed,
    followed by a 30-character dashed separator. Missing fields are shown
    as ``N/A``.
    """
    separator = "-" * 30
    for i, entry in enumerate(recom):
        score, doc_dict = entry
        doc_idx, title, url = (
            doc_dict.get(field, 'N/A') for field in ('doc_idx', 'title', 'url')
        )

        print(f"Rank {i+1}:")
        print(f"  Score: {score:.4f}")
        print(f"  Doc Index: {doc_idx}")
        print(f"  Title: {title}")
        print(f"  URL: {url}")
        print(separator)

## Recommendations Flow

### FAISS Recommendations

In [13]:
import numpy as np

In [14]:
def query_embed(text):
    """Encode ``text`` into an L2-normalised float32 query embedding.

    The text is encoded as a one-element batch with the module-level
    ``model``, copied into a float32 NumPy array and normalised in place
    with ``faiss.normalize_L2``. The normalised array is returned.
    """
    vectors = np.array(model.encode([text], convert_to_tensor=False), dtype='float32')
    faiss.normalize_L2(vectors)  # in-place normalisation
    return vectors

In [15]:
def dense_search(query, topk = 10, oversample = 10):
    """Return up to ``topk`` FAISS hits for ``query``, one per distinct URL.

    Args:
        query: Query text; embedded with ``query_embed``.
        topk: Maximum number of results to return. Values ``<= 0`` yield an
            empty list without querying the index.
        oversample: Multiplier applied to ``topk`` when querying the index, so
            that enough candidates remain after URL de-duplication. Expected
            to be >= 1; the default of 10 matches the previously hard-coded
            factor.

    Returns:
        A list of ``(score, metadata_entry)`` tuples in the order returned by
        the index. Hits whose id is not a valid position in ``metadata``, or
        whose entry has no ``'url'``, are skipped.
    """
    if topk <= 0:
        # Nothing requested: avoid issuing a k=0 search against the index.
        return []

    qemb = query_embed(query)
    D, I = index.search(qemb.reshape(1, -1), topk * oversample)

    res = []
    seen_urls = set()
    for score, idx in zip(D[0], I[0]):
        # Ids outside `metadata` (e.g. negative padding ids) are ignored.
        if not (0 <= idx < len(metadata)):
            continue

        doc_info = metadata[idx]
        curr_url = doc_info.get('url')
        if curr_url is None or curr_url in seen_urls:
            continue

        # Keep only the first (best-ranked) hit for each URL.
        seen_urls.add(curr_url)
        res.append((score, doc_info))
        if len(res) >= topk:
            break

    return res


In [16]:
# Dense (FAISS) top-10 results for the first query; reused by the fusion step.
dense_recomm = dense_search(queries_f[0], topk = 10)

In [17]:
print_recomm(dense_recomm)

Rank 1:
  Score: 0.6653
  Doc Index: 392
  Title: 10 Essential Entrepreneurial Skills & How to Develop Them
  URL: https://www.naukri.com/blog/entrepreneurial-skills/
------------------------------
Rank 2:
  Score: 0.6274
  Doc Index: 420
  Title: 8 Most Important Qualities of an Entrepreneur and How to Develop Them
  URL: https://www.naukri.com/blog/qualities-of-an-entrepreneur/
------------------------------
Rank 3:
  Score: 0.5456
  Doc Index: 626
  Title: Upgrade Yourself for These Top 20 Work-From-Home Jobs
  URL: https://www.naukri.com/blog/upgrade-yourself-for-these-top-20-work-from-home-jobs-covid-article3/
------------------------------
Rank 4:
  Score: 0.5265
  Doc Index: 640
  Title: Top 100 MOOCs for Lifelong Learners: Pick a New Skill and Make the Most of This Lockdown
  URL: https://www.naukri.com/blog/top-100-moocs-for-lifelong-learners-pick-a-new-skill-and-make-the-most-of-this-lockdown-covid-article3/
------------------------------
Rank 5:
  Score: 0.5175
  Doc Index: 

### SPLADE Recommendations

In [18]:
import torch
from typing import List, Dict, Tuple, Union
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [19]:
# Tokenizer and masked-LM that splade_query_embed() uses to build sparse
# query vectors.
splade_tokenizer = AutoTokenizer.from_pretrained("naver/splade-v3-distilbert")
splade_model = AutoModelForMaskedLM.from_pretrained("naver/splade-v3-distilbert")
splade_model.eval() # set to eval for inference

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [20]:
# Use the GPU when one is available; splade_query_embed() moves its inputs to
# this same device before calling the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
splade_model.to(device)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [21]:
def splade_query_embed(text: str) -> Dict[int, float]:
    """Encode ``text`` into a sparse SPLADE vector.

    The text is tokenised (truncated to the tokenizer's maximum length), run
    through ``splade_model`` without gradients, and the logits are turned into
    weights with ``log(1 + relu(logit))``. Positions masked out by the
    attention mask contribute zero, and the maximum over the sequence axis is
    taken for every index of the last logits axis.

    Args:
        text: The query string to encode.

    Returns:
        A dict mapping each index with a strictly positive weight to that
        weight as a Python float, in ascending index order.
    """
    inputs = splade_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        max_length=splade_tokenizer.model_max_length,
        truncation=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = splade_model(**inputs).logits

    saturated = torch.log(1 + torch.relu(logits))
    mask = inputs["attention_mask"].unsqueeze(-1)
    # Only the first (and only) sequence of the batch is used.
    pooled = torch.max(saturated * mask, dim=1).values[0]

    # Select the positive entries in one tensor op and transfer them with a
    # single tolist() call each, instead of comparing and calling .item() on
    # every element in a Python loop.
    active = torch.nonzero(pooled > 0, as_tuple=True)[0]
    return dict(zip(active.tolist(), pooled[active].tolist()))

In [22]:
def dot_product(vec1: Dict[int, float], vec2: Dict[int, float]) -> float:
    """Return the dot product of two sparse vectors stored as dicts.

    Only keys present in both ``vec1`` and ``vec2`` contribute. Terms are
    accumulated in ``vec1``'s iteration order, starting from ``0.0``.
    """
    total = 0.0
    for token_id in vec1:
        if token_id not in vec2:
            continue
        total += vec1[token_id] * vec2[token_id]
    return total

In [23]:
def norm(vec: Union[Dict[int, float], List[Dict[int, float]], List[float]]) -> float:
    """Return the Euclidean (L2) norm of ``vec``.

    Supported inputs:
      * a dict: the norm of its values;
      * a list: each element contributes if it is a single-entry dict (its
        only value) or an int/float (the number itself); any other element is
        ignored.

    Any other input type yields ``0.0``. The result comes from ``np.sqrt``.
    """
    if isinstance(vec, dict):
        components = list(vec.values())
    elif isinstance(vec, list):
        components = []
        for element in vec:
            if isinstance(element, dict) and len(element) == 1:
                components.extend(element.values())
            elif isinstance(element, (int, float)):
                components.append(element)
    else:
        components = []

    squared_total = 0.0
    for value in components:
        squared_total += value * value
    return np.sqrt(squared_total)

In [24]:
def splade_search(query: str, meta: List[Dict], topk: int = 10) -> List[Tuple[float, Dict]]:
    """Rank the entries of ``meta`` against ``query`` using SPLADE vectors.

    The query is encoded with ``splade_query_embed``. Each entry's ``'splade'``
    field is expected to be a list of sparse chunk vectors (dicts); the entry's
    score is the highest cosine similarity between the query vector and any of
    its chunk vectors. Entries without a usable ``'splade'`` list, and chunks
    that are not dicts or have zero norm, score ``0.0``.

    Args:
        query: Query text.
        meta: Records to rank; each is a dict read via ``'splade'`` and ``'url'``.
        topk: Maximum number of results. Values ``<= 0`` yield an empty list.

    Returns:
        Up to ``topk`` ``(score, entry)`` tuples sorted by descending score,
        with at most one entry per URL and entries lacking a ``'url'`` skipped.
        An empty list is returned when the query vector has zero norm.
    """
    if topk <= 0:
        return []

    q_vec = splade_query_embed(query)
    q_norm = norm(q_vec)
    if q_norm == 0:
        return []

    doc_scores = []
    for d_idx, d_item in enumerate(meta):
        chunk_vecs = d_item.get('splade')
        max_sim = 0.0
        if isinstance(chunk_vecs, list):
            for c_vec in chunk_vecs:
                if not isinstance(c_vec, dict):
                    continue
                c_norm = norm(c_vec)
                if c_norm > 0:
                    sim = dot_product(q_vec, c_vec) / (q_norm * c_norm)
                    if sim > max_sim:
                        max_sim = sim
        doc_scores.append((max_sim, d_idx))

    # Stable sort: entries with equal scores keep their original `meta` order.
    doc_scores.sort(key=lambda x: x[0], reverse=True)

    res = []
    seen_urls = set()
    # d_idx comes from enumerate(meta), so it is always a valid position.
    for score, d_idx in doc_scores:
        doc_info = meta[d_idx]
        current_url = doc_info.get('url')
        if current_url is None or current_url in seen_urls:
            continue

        seen_urls.add(current_url)
        res.append((score, doc_info))
        if len(res) >= topk:
            break
    return res

In [25]:
# SPLADE top-10 results for the first query; reused by the fusion step.
splade_recomm = splade_search(queries_f[0], metadata, 10)

In [26]:
print_recomm(splade_recomm)

Rank 1:
  Score: 0.5046
  Doc Index: 392
  Title: 10 Essential Entrepreneurial Skills & How to Develop Them
  URL: https://www.naukri.com/blog/entrepreneurial-skills/
------------------------------
Rank 2:
  Score: 0.4258
  Doc Index: 1
  Title: 8 Most Important Qualities of an Entrepreneur and How to Develop Them
  URL: https://www.naukri.com/blog/qualities-of-an-entrepreneur/
------------------------------
Rank 3:
  Score: 0.3292
  Doc Index: 4
  Title: What is Management? Definition, Functions, Skills, and Job Roles
  URL: https://www.naukri.com/blog/what-is-management/
------------------------------
Rank 4:
  Score: 0.3249
  Doc Index: 675
  Title: What Are the Crucial Aerospace Engineer Skills?
  URL: https://www.naukri.com/blog/what-are-the-crucial-aerospace-engineer-skills/
------------------------------
Rank 5:
  Score: 0.3178
  Doc Index: 400
  Title: What To Do After BCA?: Scope and Career Options
  URL: https://www.naukri.com/blog/what-to-do-after-bca/
----------------------

### BM25 Recommendations

In [27]:
import re
from rank_bm25 import BM25Okapi

In [28]:
# Build the BM25 corpus: one token list per metadata entry, in metadata order.
# Entries whose 'bm25' field is not a list of strings get an empty token list
# so that corpus positions stay aligned with metadata positions.
corpus, indices_map = [], []
for idx, doc in enumerate(metadata):
    keywords = doc['bm25']
    if not isinstance(keywords, list) or not all(isinstance(k, str) for k in keywords):
        corpus.append([])
    else:
        corpus.append(keywords)
    # indices_map[i] is the metadata position of corpus[i].
    indices_map.append(idx)

In [29]:
# BM25 ranker over the keyword lists; None when the corpus is empty, in which
# case bm25_search() raises ValueError.
bm25_ranker = BM25Okapi(corpus) if corpus else None

In [30]:
def tokenize(text: str) -> List[str]:
    """Split ``text`` into lower-cased word tokens.

    Tokens are the runs of word characters matched by the regular expression;
    each matched token is lower-cased after extraction.
    """
    return list(map(str.lower, re.findall(r'\b\w+\b', text)))

In [31]:
def bm25_search(query: str, meta: List[Dict], topk: int = 10, bm25: BM25Okapi = None, doc_map: List[int] = None) -> List[Tuple[float, Dict]]:
    """Rank the entries of ``meta`` against ``query`` with a BM25 ranker.

    Args:
        query: Query text; tokenised with ``tokenize`` before scoring.
        meta: Records to return; each is a dict read via ``'url'``.
        topk: Maximum number of results. Values ``<= 0`` yield an empty list.
        bm25: Ranker exposing ``get_scores(tokens)``, returning one score per
            corpus document.
        doc_map: ``doc_map[i]`` is the position in ``meta`` of corpus
            document ``i``.

    Returns:
        Up to ``topk`` ``(score, entry)`` tuples sorted by descending score,
        with at most one entry per URL and entries lacking a ``'url'`` skipped.

    Raises:
        ValueError: If ``bm25`` or ``doc_map`` is ``None``.
    """
    if bm25 is None or doc_map is None:
        raise ValueError("BM25 ranker and document map must be initialized and passed.")
    if topk <= 0:
        return []

    # Keep the raw query string and the token list under separate names.
    query_tokens = tokenize(query)
    doc_scores = bm25.get_scores(query_tokens)

    scored_docs = []
    for i, score in enumerate(doc_scores):
        orig_doc_idx = doc_map[i]
        # Ignore map entries that do not point inside `meta`.
        if 0 <= orig_doc_idx < len(meta):
            scored_docs.append((score, meta[orig_doc_idx]))

    # Stable sort: equal scores keep corpus order.
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    res = []
    seen_urls = set()
    for score, doc_info in scored_docs:
        current_url = doc_info.get('url')
        if current_url is None or current_url in seen_urls:
            continue

        seen_urls.add(current_url)
        res.append((score, doc_info))
        if len(res) >= topk:
            break
    return res

In [32]:
# BM25 top-10 results for the first query; reused by the fusion step.
bm25_recomm = bm25_search(queries_f[0], metadata, 10, bm25_ranker,indices_map)

In [33]:
print_recomm(bm25_recomm)

Rank 1:
  Score: 8.9482
  Doc Index: 1
  Title: 8 Most Important Qualities of an Entrepreneur and How to Develop Them
  URL: https://www.naukri.com/blog/qualities-of-an-entrepreneur/
------------------------------
Rank 2:
  Score: 8.7505
  Doc Index: 0
  Title: 10 Essential Entrepreneurial Skills & How to Develop Them
  URL: https://www.naukri.com/blog/entrepreneurial-skills/
------------------------------
Rank 3:
  Score: 8.0806
  Doc Index: 880
  Title: Polish Your Resume On Naukri
  URL: https://www.naukri.com/blog/time-to-revisit-your-profile-on-naukri/
------------------------------
Rank 4:
  Score: 7.3472
  Doc Index: 269
  Title: Campus Placement & Role of Career Counselling in Colleges.
  URL: https://www.naukri.com/blog/campus-placement-role-of-career-counselling-in-colleges/
------------------------------
Rank 5:
  Score: 7.1546
  Doc Index: 187
  Title: Career Opportunities for Freshers
  URL: https://www.naukri.com/blog/career-opportunities-for-freshers/
-------------------

## RRF (Reciprocal Rank Fusion)

In [34]:
from typing import List, Dict, Tuple

In [35]:
def RRF( ranked_lists: List[List[Tuple[float, Dict]]], weights: List[float] = None, k_const: int = 60, top_n: int = 15 ) -> List[Tuple[float, Dict]]:
    """Fuse several ranked lists with weighted Reciprocal Rank Fusion.

    Every document is identified by its ``'url'``. A document at 1-based
    position ``r`` in a list with weight ``w`` contributes ``w / (k_const + r)``
    to its fused score; contributions from all lists are summed. The original
    retriever scores in the tuples are ignored, only positions matter.
    Entries without a ``'url'`` are skipped but still occupy their position.

    Args:
        ranked_lists: One list of ``(score, doc_dict)`` tuples per retriever,
            best first.
        weights: One weight per ranked list; defaults to ``1.0`` for each.
        k_const: Rank-smoothing constant added to every rank.
        top_n: Maximum number of fused results. Values ``<= 0`` yield an
            empty list.

    Returns:
        Up to ``top_n`` ``(fused_score, doc_dict)`` tuples sorted by descending
        fused score. ``doc_dict`` is the first dict seen for that URL. Ties
        keep first-seen order.

    Raises:
        ValueError: If ``weights`` is given and its length differs from
            ``ranked_lists``.
    """
    if weights is None:
        weights = [1.0] * len(ranked_lists)
    elif len(weights) != len(ranked_lists):
        raise ValueError("Number of weights must match number of ranked lists.")

    if top_n <= 0:
        return []

    fused_scores = {}
    url_to_doc = {}
    for current_weight, r_list in zip(weights, ranked_lists):
        for rank, (_score, chunk_dict) in enumerate(r_list, start=1):
            doc_url = chunk_dict.get('url')
            if doc_url is None:
                continue

            # Remember the first dict seen for each URL.
            url_to_doc.setdefault(doc_url, chunk_dict)
            fused_scores[doc_url] = fused_scores.get(doc_url, 0.0) + (current_weight * (1.0 / (k_const + rank)))

    # Dict keys are already unique per URL, so no further de-duplication is
    # needed; sorted() is stable, which keeps ties in first-seen order.
    sorted_fused_scores = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(rrf_score, url_to_doc[doc_url]) for doc_url, rrf_score in sorted_fused_scores[:top_n]]

In [36]:
def rrf_recomm( bm25_recom: List[Tuple[float, Dict]], splade_recom: List[Tuple[float, Dict]], faiss_recom: List[Tuple[float, Dict]], w1: float, w2: float, w3: float, topk: int = 10) -> List[Tuple[float, Dict]]:
    """Fuse BM25, SPLADE and FAISS result lists with ``RRF``.

    ``w1``, ``w2`` and ``w3`` weight the BM25, SPLADE and FAISS lists
    respectively. The rank constant passed to ``RRF`` is 60 and at most
    ``topk`` fused results are requested.
    """
    return RRF(
        [bm25_recom, splade_recom, faiss_recom],
        [w1, w2, w3],
        60,
        topk,
    )

In [39]:
# Fuse the three result lists for the first query. Weights: BM25 = 2,
# SPLADE = 1.5, dense (FAISS) = 1; keep the top 10 fused results.
final_recomm = rrf_recomm(bm25_recomm, splade_recomm, dense_recomm, 2, 1.5, 1, 10)

## Observing Final Recommendations

In [42]:
print(queries_f[0]) # query

Essential entrepreneur skills to develop


In [40]:
print_recomm(final_recomm)

Rank 1:
  Score: 0.0732
  Doc Index: 0
  Title: 10 Essential Entrepreneurial Skills & How to Develop Them
  URL: https://www.naukri.com/blog/entrepreneurial-skills/
------------------------------
Rank 2:
  Score: 0.0731
  Doc Index: 1
  Title: 8 Most Important Qualities of an Entrepreneur and How to Develop Them
  URL: https://www.naukri.com/blog/qualities-of-an-entrepreneur/
------------------------------
Rank 3:
  Score: 0.0450
  Doc Index: 276
  Title: 4 Leaders Whose Leadership Qualities & Skills Changed Everything
  URL: https://www.naukri.com/blog/4-leaders-whose-leadership-qualities-skills-changed-everything/
------------------------------
Rank 4:
  Score: 0.0317
  Doc Index: 880
  Title: Polish Your Resume On Naukri
  URL: https://www.naukri.com/blog/time-to-revisit-your-profile-on-naukri/
------------------------------
Rank 5:
  Score: 0.0312
  Doc Index: 269
  Title: Campus Placement & Role of Career Counselling in Colleges.
  URL: https://www.naukri.com/blog/campus-placement

### Evaluation