In [None]:
!pip install sentence-transformers torch faiss-cpu rank_bm25 scikit-learn


In [9]:
# 1) Upgrade pip
!pip install --upgrade pip

# 2) Uninstall any CPU‑only faiss
!pip uninstall -y faiss faiss-cpu

# 3) Install the GPU build of Faiss
#    Replace 'cu118' and 'torch2.0' with your CUDA & PyTorch versions if different.
!pip install faiss-gpu \
  -f https://dl.fbaipublicfiles.com/faiss/wheels/cu118/torch2.0/index.html

# 4) Install the rest
!pip install sentence-transformers torch rank_bm25 scikit-learn


Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
[0mLooking in links: https://dl.fbaipublicfiles.com/faiss/wheels/cu118/torch2.0/index.html
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m


In [11]:
# train_embeddings_gpu_encode.py

import os
import pandas as pd
import numpy as np
import pickle
import torch

# CPU Faiss
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics import ndcg_score

# 1) Paths & dirs
DATA_PATH  = "/content/arxiv_data_210930-054931.csv"
MODELS_DIR = "models_gpu_encode"
os.makedirs(MODELS_DIR, exist_ok=True)

# 2) Device for SBERT
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INFO] SBERT will run on: {device}")

# 3) Load & clean
df = pd.read_csv(DATA_PATH)
for col in ["terms", "titles", "abstracts"]:
    df[col] = df[col].fillna("").astype(str)

# 4) Build the “combo” field
df["combo"] = (
    df["terms"].str.replace(",", " ")
  + "  ||  "
  + df["titles"]
  + "  ||  "
  + df["abstracts"]
)

# 5) BM25 index (CPU)
tokenized = [doc.lower().split() for doc in df["combo"]]
bm25      = BM25Okapi(tokenized)
with open(os.path.join(MODELS_DIR, "bm25.pkl"), "wb") as f:
    pickle.dump(bm25, f)

# 6) Load SBERT on GPU (if available)
model = SentenceTransformer("all-mpnet-base-v2", device=device)

# 7) Encode on GPU
emb_tensor = model.encode(
    df["combo"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    device=device
)

# 8) Normalize (so inner‑product == cosine)
emb_tensor = torch.nn.functional.normalize(emb_tensor, p=2, dim=1)

# 9) Move back to CPU & save embeddings
embeddings = emb_tensor.cpu().numpy()
np.save(os.path.join(MODELS_DIR, "embeddings.npy"), embeddings)

# 10) Build FAISS index (CPU)
d     = embeddings.shape[1]
index = faiss.IndexFlatIP(d)  # exact inner‑product index
index.add(embeddings)
faiss.write_index(index, os.path.join(MODELS_DIR, "faiss_index.bin"))

# 11) Save minimal DataFrame
df[["terms", "titles", "abstracts"]].to_pickle(os.path.join(MODELS_DIR, "dataset.pkl"))

# 12) Offline evaluation helpers
def precision_at_k(ret, rel, k):
    return len(set(ret[:k]) & set(rel)) / k

def recall_at_k(ret, rel, k):
    return len(set(ret[:k]) & set(rel)) / len(rel)

def get_gt(i, k=5):
    return bm25.get_top_n(tokenized[i], list(range(len(df))), n=k)

# 13) Evaluate on first 100 docs
results = []
for idx in range(100):
    gt       = get_gt(idx, k=5)
    bm_cands = bm25.get_top_n(tokenized[idx], list(range(len(df))), n=100)
    sims     = (embeddings[bm_cands] @ embeddings[idx]).flatten()
    ranked   = [bm_cands[i] for i in sims.argsort()[::-1]]
    top10    = ranked[:10]
    results.append({
        "P@5":    precision_at_k(top10, gt, 5),
        "R@5":    recall_at_k(top10, gt, 5),
        "nDCG@5": ndcg_score([[1 if i in gt else 0 for i in top10]], k=5)
    })

eval_df = pd.DataFrame(results)
print("\nOffline evaluation (first 100 docs):")
print(eval_df.mean())


[INFO] SBERT will run on: cuda


Batches:   0%|          | 0/1756 [00:00<?, ?it/s]

TypeError: missing a required argument: 'y_score'

In [13]:
# evaluate.py

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import ndcg_score

# ─── Adjust this to the folder you actually generated ───
MODELS_DIR = "models_gpu_encode"  # or "models_gpu"

# 1) Load the minimal DataFrame
df = pd.read_pickle(os.path.join(MODELS_DIR, "dataset.pkl"))

# 2) Reconstruct the "combo" text and tokenize for BM25
df["combo"] = (
    df["terms"].str.replace(",", " ")
  + "  ||  "
  + df["titles"]
  + "  ||  "
  + df["abstracts"]
)
tokenized = [doc.lower().split() for doc in df["combo"]]

# 3) Load the BM25 index
with open(os.path.join(MODELS_DIR, "bm25.pkl"), "rb") as f:
    bm25 = pickle.load(f)

# 4) Load the embeddings
embeddings = np.load(os.path.join(MODELS_DIR, "embeddings.npy"))

# 5) Metric helper functions
def precision_at_k(retrieved, relevant, k):
    if k == 0: return 0.0
    return len(set(retrieved[:k]) & set(relevant)) / k

def recall_at_k(retrieved, relevant, k):
    if not relevant: return 0.0
    return len(set(retrieved[:k]) & set(relevant)) / len(relevant)

def get_gt(idx, k=5):
    # Treat top‑k BM25 hits as “ground truth”
    return bm25.get_top_n(tokenized[idx], list(range(len(df))), n=k)

# 6) Run evaluation on first 100 docs
results = []
for idx in range(100):
    # 6a) Ground truth via BM25
    gt = get_gt(idx, k=5)

    # 6b) BM25 prune to top 100 candidates
    bm_cands = bm25.get_top_n(tokenized[idx], list(range(len(df))), n=100)

    # 6c) Semantic scores (cosine via inner‑product on normalized vectors)
    sims = (embeddings[bm_cands] @ embeddings[idx]).flatten()

    # 6d) Pick top 10
    top_idx   = sims.argsort()[::-1][:10]           # positions in bm_cands
    top10_ids = [bm_cands[i] for i in top_idx]      # actual doc IDs
    top_scores= sims[top_idx]                       # their similarity scores

    # 6e) Build binary relevance vector
    relevance = [1 if doc_id in gt else 0 for doc_id in top10_ids]

    # 6f) Compute metrics
    p5   = precision_at_k(top10_ids, gt, 5)
    r5   = recall_at_k(top10_ids, gt, 5)
    ndcg = ndcg_score([relevance], [top_scores], k=5)

    results.append({"P@5": p5, "R@5": r5, "nDCG@5": ndcg})

# 7) Aggregate & print
eval_df = pd.DataFrame(results)
print("Offline evaluation (first 100 docs):")
print(eval_df.mean())


Offline evaluation (first 100 docs):
P@5       0.604000
R@5       0.604000
nDCG@5    0.904901
dtype: float64


# Document Recommender Evaluation

## Pipeline Overview
1. **Data Loading & Preprocessing**  
   - Load `dataset.pkl`, which contains the original `terms`, `titles`, and `abstracts` for each document.  
   - Reconstruct a single “combo” string per document:  
     ```
     combo = terms + " || " + title + " || " + abstract
     ```
   - Tokenize `combo` for BM25.

2. **Index Building**  
   - **BM25**: build an inverted‐index (`rank_bm25`) on the tokenized combos for fast lexical retrieval.  
   - **SBERT Embeddings**: encode each `combo` into a 768‑dim vector using `all‑mpnet‑base‑v2`, L2‑normalize them, and store as `embeddings.npy`.  
   - **FAISS**: build an exact inner‑product (cosine) index (`IndexFlatIP`) over those embeddings for millisecond‑scale ANN search.

3. **Offline Evaluation** (first 100 documents)  
   For each query document _i_ (0–99):
   1. **Ground‑Truth**: take the top 5 BM25 hits as “relevant” items.  
   2. **Candidate Generation**: retrieve top 100 BM25 candidates.  
   3. **Semantic Reranking**: compute cosine similarities between the query embedding and each candidate, then sort.  
   4. **Top‑10 Selection**: pick the 10 highest‑scoring documents.  
   5. **Metrics Computation**:  
      - **Precision@5**: fraction of the top 5 recommendations that are in the ground‑truth set.  
      - **Recall@5**: fraction of the ground‑truth set that appears in the top 5 recommendations.  
      - **nDCG@5**: position‐sensitive measure that rewards putting relevant docs earlier; uses the actual similarity scores as the ranking confidence.

## Results
Offline evaluation (first 100 docs):
P@5       0.604000
R@5       0.604000
nDCG@5    0.904901



- **Precision@5 = 0.60**  
  On average, 3 out of the top 5 recommendations are “relevant” (BM25‑derived).  
- **Recall@5 = 0.60**  
  We recover about 60% of the BM25 top‑5 items within our own top 5.  
- **nDCG@5 = 0.90**  
  The relevant items we do find are ranked very highly in the list.

### Are these good?
- A Precision/Recall of ~0.60 indicates a solid semantic reranker: it recovers the majority of BM25 hits.  
- An nDCG near 0.9 shows that when it does find relevant items, they tend to appear at the very top of the recommendation list.  
- **Caveat**: our “ground truth” is itself BM25’s top 5, so these numbers measure how well the semantic model **agrees** with BM25. If you have real human‑labeled relevance, you could get a more meaningful evaluation.

---

*This pipeline demonstrates a hybrid content‑based recommender combining **lexical** (BM25) recall, **semantic** (SBERT+FAISS) reranking, and an offline evaluation suite (P@K, R@K, nDCG@K).