BM25

In [None]:
import pandas as pd

# Read the hotel catalogue as Latin-1 and strip surrounding whitespace from the header names.
df_corpus = pd.read_csv("hotels.csv", encoding="latin1")
df_corpus.columns = df_corpus.columns.str.strip()

# Each (label, column) pair becomes a "Label: value; " segment; segments are
# concatenated in this order to form the searchable text of a hotel.
_labelled_fields = [
    ("Country", "countyName"),
    ("City", "cityName"),
    ("Hotel", "HotelName"),
    ("Description", "Description"),
]
combined = ""
for label, column in _labelled_fields:
    # Missing values contribute an empty string rather than "nan".
    combined = combined + f"{label}: " + df_corpus[column].fillna("").astype(str) + "; "
df_corpus["combined"] = combined

In [None]:
# --- switch to bm25s ---
# pip install "bm25s[full]"
import bm25s
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import time

# ---- load your data (same as before) ----
# df_corpus: DataFrame with column "combined" (built in the previous cell)
# queries_df: benchmark CSV whose query text is in a "query" or "prompt" column
queries_df = pd.read_csv("final_benchmark.csv")

# ---- tokenize & index corpus with bm25s ----
# English stopwords are removed from the corpus; drop stopwords="en" if undesired.
corpus_texts = df_corpus["combined"].astype(str).tolist()
corpus_tokens = bm25s.tokenize(corpus_texts, stopwords="en")  # tokenize once, reusing the list above
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# ---- pick the query column ----
# Accept either column name, matching the review-corpus BM25 cell later in the notebook.
if "query" in queries_df.columns:
    query_texts = queries_df["query"].astype(str).tolist()
elif "prompt" in queries_df.columns:
    query_texts = queries_df["prompt"].astype(str).tolist()
else:
    raise ValueError("final_benchmark.csv must have a 'query' or 'prompt' column.")
query_tokens = bm25s.tokenize(query_texts)

# ---- retrieve top-3 for the whole batch (timed as one call) ----
start_all = time.time()
doc_ids, scores = retriever.retrieve(query_tokens, k=3)  # shapes: (N, 3) each
elapsed_all = time.time() - start_all

# ---- build one recommendation string per query ----
rows = []
for i, prompt_text in enumerate(query_texts):
    ids = doc_ids[i].tolist()
    top_docs = df_corpus.iloc[ids]["combined"].astype(str).tolist()
    # Pad so the template below always has three slots to fill.
    while len(top_docs) < 3:
        top_docs.append("")
    rec_text = f"I have 3 recommendations: 1. {top_docs[0]} 2. {top_docs[1]} 3. {top_docs[2]}"
    rows.append({"prompt": prompt_text, "recommendations": rec_text})

# ---- latency stats ----
# The batch is timed as a whole, so every query gets the same average latency and
# p50 equals p99. For true per-query latencies, time inside a loop or in chunks.
# max(1, ...) avoids a ZeroDivisionError when the benchmark file has no rows.
avg_latency = elapsed_all / max(1, len(query_texts))
latencies = np.full(len(query_texts), avg_latency, dtype=float)
p50 = np.percentile(latencies, 50)
p99 = np.percentile(latencies, 99)
print(f"Latency p50 (median): {p50:.4f} sec")
print(f"Latency p99: {p99:.4f} sec")


In [None]:
# ---- persist the BM25 description-run results ----
# One row per query, with the latency value attached as an extra column.
results_df = pd.DataFrame(rows).assign(latency_sec=latencies)
print(results_df.head())

# Write the table to CSV without the positional index.
results_df.to_csv("exp1_bm25s_desc_results.csv", index=False)

In [None]:
from datasets import load_dataset
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
from tqdm.auto import tqdm

def norm(ex):
    """Flatten one raw record into a dict with fixed keys and simple value types.

    Args:
        ex: Mapping-like record supporting ``.get``; missing keys are tolerated.

    Returns:
        dict with keys ``date``, ``rating``, ``title``, ``text``,
        ``property_dict``, ``Name``, ``City`` and ``County``. ``rating`` is a
        float or ``None``; every other value is a string (``""`` when the
        source value is ``None``). ``property_dict`` is JSON-encoded when the
        source value is a dict or list.
    """
    def S(x):  # safe string: None -> "", anything else -> str(x)
        return "" if x is None else str(x)

    def F(x):  # safe float: None / "" / unparseable -> None
        if x in (None, ""):
            return None
        try:
            return float(x)
        # Only swallow conversion failures; a bare `except:` would also hide
        # KeyboardInterrupt / SystemExit during the long streaming loop.
        except (TypeError, ValueError, OverflowError):
            return None

    def J(x):  # json -> string; non-container values fall back to S()
        if isinstance(x, (dict, list)):
            return json.dumps(x, ensure_ascii=False)
        return S(x)

    return {
        "date": S(ex.get("date")),
        "rating": F(ex.get("rating")),
        "title": S(ex.get("title")),
        "text": S(ex.get("text")),
        "property_dict": J(ex.get("property_dict", {})),
        "Name": S(ex.get("Name")),
        "City": S(ex.get("City")),
        "County": S(ex.get("County")),
    }

# Arrow schema for the normalized rows; field names match the keys returned by norm().
schema = pa.schema([
    ("date", pa.string()),
    ("rating", pa.float64()),
    ("title", pa.string()),
    ("text", pa.string()),
    ("property_dict", pa.string()),
    ("Name", pa.string()),
    ("City", pa.string()),
    ("County", pa.string()),
])

# Stream the JSONL file record by record instead of loading it all at once.
stream = load_dataset("json", data_files="HotelRec_V5.jsonl", split="train", streaming=True)

out_path = "HotelRec_V5.parquet"
buf, B = [], 200_000  # buffer of normalized rows, flushed every B records

# Use the writer as a context manager so the Parquet file is closed even when
# normalization or a write raises part-way through the stream.
with pq.ParquetWriter(out_path, schema) as writer:
    for ex in tqdm(stream, desc="Normalizing + writing"):
        buf.append(norm(ex))
        if len(buf) >= B:
            writer.write_table(pa.Table.from_pandas(pd.DataFrame(buf), schema=schema, preserve_index=False))
            buf.clear()

    # Flush whatever is left after the last full batch.
    if buf:
        writer.write_table(pa.Table.from_pandas(pd.DataFrame(buf), schema=schema, preserve_index=False))

# Later: super fast loads
# df = pd.read_parquet("HotelRec_V5.parquet")


In [None]:
# Load the normalized review corpus. The conversion cell above writes
# "HotelRec_V5.parquet", so read that file rather than "HotelRec.parquet".
df = pd.read_parquet("HotelRec_V5.parquet")
df

In [None]:
# 1) Build the 'combined' column on your existing df
import pandas as pd
import numpy as np
import time

def _s(sr):
    return (
        sr.fillna("")
          .astype(str)
          .str.replace(r"\s+", " ", regex=True)
          .str.strip()
    )

# Cleaned text fields; the review body is truncated to its first 2000 characters.
name = _s(df.get("Name"))
city = _s(df.get("City"))
county = _s(df.get("County"))  # the data names this column 'County'
title = _s(df.get("title"))
text = _s(df.get("text")).str[:2000]

# Rating: coerce to numeric, round to one decimal, then render as text ('' if NA).
rating_num = pd.to_numeric(df.get("rating"), errors="coerce")
rating_rounded = rating_num.round(1).astype("Float64")
rating_str = rating_rounded.astype(str).replace("<NA>", "")

# Assemble the searchable text in two halves, then join them around the rating.
location_part = "Hotel: " + name + "; City: " + city + "; Country: " + county
review_part = ". Title: " + title + ". Review: " + text
df["combined"] = location_part + "; Rating: " + rating_str + review_part

# Single-column frame used as the retrieval corpus.
df_corpus = df[["combined"]]
print("Corpus size:", df_corpus.shape)


In [None]:
# 2) BM25S retrieval over the 'combined' column (your block)
# pip install "bm25s[full]"
import bm25s

# Tokenize the corpus (stopwords="en" optional) and build the BM25 index.
corpus_texts = df_corpus["combined"].astype(str).tolist()
corpus_tokens = bm25s.tokenize(corpus_texts, stopwords="en")
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Load the benchmark; the query text may be stored under 'query' or 'prompt'.
queries_df = pd.read_csv("final_benchmark.csv")
query_col = next((c for c in ("query", "prompt") if c in queries_df.columns), None)
if query_col is None:
    raise ValueError("final_benchmark.csv must have a 'query' or 'prompt' column.")
query_texts = queries_df[query_col].astype(str).tolist()
query_tokens = bm25s.tokenize(query_texts)

# Time the whole batch retrieval; the per-query figure is the batch average.
start_all = time.time()
doc_ids, scores = retriever.retrieve(query_tokens, k=3)  # shapes: (N, 3)
elapsed_all = time.time() - start_all
avg_latency = elapsed_all / max(1, len(query_texts))

# One output row per query: the prompt and a three-item recommendation string.
rows = []
for q, hit_ids in zip(query_texts, doc_ids):
    top_docs = df_corpus.iloc[hit_ids.tolist()]["combined"].astype(str).tolist()
    top_docs += [""] * (3 - len(top_docs))  # pad to three slots
    first, second, third = top_docs[0], top_docs[1], top_docs[2]
    rows.append({
        "prompt": q,
        "recommendations": f"I have 3 recommendations: 1. {first} 2. {second} 3. {third}",
    })

# Latency stats (same average repeated per query; for true per-query, time inside loop).
latencies = np.full(len(rows), avg_latency, dtype=float)
p50, p99 = np.percentile(latencies, 50), np.percentile(latencies, 99)
print(f"Latency p50 (median): {p50:.4f} sec")
print(f"Latency p99: {p99:.4f} sec")

# Save
results_df = pd.DataFrame(rows)
results_df["latency_sec"] = latencies
print(results_df.head(3))
results_df.to_csv("exp1_bm25s_rev_results.csv", index=False)
print("[OK] Saved exp1_bm25s_rev_results.csv")


# Dense retrieval

## Descriptions

In [None]:
import pandas as pd

# Read the hotel catalogue as Latin-1 and strip surrounding whitespace from the header names.
df_corpus = pd.read_csv("hotels.csv", encoding="latin1")
df_corpus.columns = df_corpus.columns.str.strip()


def _segment(label, column):
    """Return the "<label>: <value>; " text segment for one df_corpus column (missing -> "")."""
    return f"{label}: " + df_corpus[column].fillna("").astype(str) + "; "


# Searchable text per hotel: country, city, hotel name and description, in that order.
df_corpus["combined"] = (
    _segment("Country", "countyName")
    + _segment("City", "cityName")
    + _segment("Hotel", "HotelName")
    + _segment("Description", "Description")
)


import time
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import torch

# ---- load your data ----
# df_corpus: DataFrame with column "combined"
# queries_df: benchmark CSV whose query text is in a "query" or "prompt" column
queries_df = pd.read_csv("final_benchmark.csv")

# ---- embed corpus with google/embeddinggemma-300m (weights loaded as bfloat16) ----
# model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
model = SentenceTransformer("google/embeddinggemma-300m", model_kwargs = {"torch_dtype" : torch.bfloat16},
                             )
corpus_texts = df_corpus["combined"].astype(str).tolist()

# NOTE: this timer spans corpus embedding AND index.add, so the "Index build time"
# printed below includes the encoding cost.
start_index = time.time()
corpus_emb = model.encode_document(
    corpus_texts, batch_size=128, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True
).astype("float32")

# cosine via inner product on unit-normalized vectors
index = faiss.IndexFlatIP(corpus_emb.shape[1])
index.add(corpus_emb)
index_build_secs = time.time() - start_index

# ---- embed queries & retrieve top-3 ----
# Accept either column name, matching the other retrieval cells in this notebook.
if "query" in queries_df.columns:
    query_texts = queries_df["query"].astype(str).tolist()
elif "prompt" in queries_df.columns:
    query_texts = queries_df["prompt"].astype(str).tolist()
else:
    raise ValueError("final_benchmark.csv must have a 'query' or 'prompt' column.")

start_all = time.time()
query_emb = model.encode_query(
    query_texts, batch_size=256, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
).astype("float32")

scores, ids = index.search(query_emb, k=3)  # shapes: (N, 3)
elapsed_all = time.time() - start_all

# ---- build recommendations per query ----
rows = []
for i, q in enumerate(query_texts):
    top_ids = ids[i].tolist()
    top_docs = df_corpus.iloc[top_ids]["combined"].astype(str).tolist()
    # Pad so the template below always has three slots to fill.
    while len(top_docs) < 3:
        top_docs.append("")
    rec_text = f"I have 3 recommendations: 1. {top_docs[0]} 2. {top_docs[1]} 3. {top_docs[2]}"
    rows.append({"prompt": q, "recommendations": rec_text})

# ---- latency stats (overall, + per-query avg like before) ----
# Every query gets the same batch-average value, so p50 and p99 coincide.
latencies = np.array([elapsed_all / max(len(query_texts), 1)] * len(query_texts), dtype=float)
p50 = np.percentile(latencies, 50)
p99 = np.percentile(latencies, 99)

print(f"Index build time: {index_build_secs:.3f} sec")
print(f"Query batch time: {elapsed_all:.3f} sec for {len(query_texts)} queries")
print(f"Latency p50 (median): {p50:.4f} sec")
print(f"Latency p99: {p99:.4f} sec")

# Results table: one row per query plus the latency column.
results_df = pd.DataFrame(rows)
results_df["latency_sec"] = latencies  # optional: add latency column
print(results_df.head())

# Save results to CSV and persist the FAISS index for reuse.
results_df.to_csv("exp1_300m_desc_results_g.csv", index=False)
faiss.write_index(index, "exp1_300m_desc_index_g.faiss")


In [None]:
# Load the review corpus from the Parquet file; the bare `df` on the last line
# makes the frame the cell's displayed output.
df = pd.read_parquet("HotelRec_V5.parquet")
df

In [None]:
# 1) Build the 'combined' column on your existing df
import pandas as pd
import numpy as np
import time

def _s(sr):
    return (
        sr.fillna("")
          .astype(str)
          .str.replace(r"\s+", " ", regex=True)
          .str.strip()
    )

# Clean the plain text columns in one pass ('County' is the column name used by the data).
name, city, county, title = (
    _s(df.get(col)) for col in ("Name", "City", "County", "title")
)
# Review body: cleaned, then cut to the first 2000 characters.
text = _s(df.get("text")).str.slice(0, 2000)

# rating: numeric -> one-decimal string ('' if NA)
rating_num = pd.to_numeric(df.get("rating"), errors="coerce")
rating_str = rating_num.round(1).astype("Float64").astype(str).replace("<NA>", "")

# Alternate literal labels and column values, concatenating left to right.
pieces = [
    "Hotel: ", name,
    "; City: ", city,
    "; Country: ", county,
    "; Rating: ", rating_str,
    ". Title: ", title,
    ". Review: ", text,
]
combined = pieces[0] + pieces[1]
for piece in pieces[2:]:
    combined = combined + piece
df["combined"] = combined

# Lighter single-column object for retrieval.
df_corpus = df[["combined"]]
print("Corpus size:", df_corpus.shape)


In [None]:
# Persist the single-column corpus to Parquet (without the index) so the next
# cell can load it from disk as a Hugging Face Dataset.
df_corpus.to_parquet("exp1_rev.parquet", index=False)  # default engine=pyarrow if installed


In [None]:
# pip install datasets pyarrow
from datasets import load_dataset

# Read the saved corpus Parquet file as a Hugging Face Dataset, asking for the
# "train" split directly instead of indexing the returned DatasetDict.
dataset = load_dataset("parquet", data_files="exp1_rev.parquet", split="train")

# Quick look: dataset summary, then the first record.
print(dataset)
print(dataset[0])


In [None]:
# pip install -U sentence-transformers faiss-cpu datasets tqdm

import os, time, math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import faiss

# ----------------------------
# Config
# ----------------------------
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
BATCH_SIZE = 1024
MEMMAP_PATH = "corpus_emb.f16.memmap"   # stored as float16 to save disk, convert to f32 when adding to FAISS
USE_FLOAT16_DISK = True

# HNSW params (tune for speed/accuracy tradeoff)
HNSW_M = 32                       # graph degree (16–64 is common)
HNSW_EF_CONSTRUCTION = 200        # build-time accuracy/speed
HNSW_EF_SEARCH = 128              # query-time accuracy/speed

K = 3                             # top-k to retrieve

model = SentenceTransformer(MODEL_NAME)

# Take the embedding width from the loaded model instead of hard-coding it, so the
# memmap and the FAISS index stay consistent if MODEL_NAME changes
# (all-MiniLM-L6-v2 reports 384).
EMB_DIM = model.get_sentence_embedding_dimension()
if EMB_DIM is None:
    raise ValueError(f"Could not determine the embedding dimension of {MODEL_NAME!r}.")

N = len(dataset)
print(f"Corpus size: {N:,}")

# Disk-backed embedding matrix of shape (N, EMB_DIM); float16 when USE_FLOAT16_DISK is set.
dtype_disk = np.float16 if USE_FLOAT16_DISK else np.float32
embs_mm = np.memmap(MEMMAP_PATH, dtype=dtype_disk, mode="w+", shape=(N, EMB_DIM))

encode_kwargs = dict(
    batch_size=BATCH_SIZE,
    convert_to_numpy=True,
    normalize_embeddings=True,  # unit norm -> cosine == dot == monotonic to L2
    show_progress_bar=False,
)

t0 = time.time()
batch_starts = range(0, N, BATCH_SIZE)
for lo in tqdm(batch_starts, desc="Encoding corpus"):
    hi = min(lo + BATCH_SIZE, N)
    batch_texts = dataset[lo:hi]["combined"]  # pulls only this batch
    vectors = model.encode(batch_texts, **encode_kwargs).astype("float32")

    # Write the batch into its rows on disk, downcasting first when float16 storage is on.
    embs_mm[lo:hi] = vectors.astype("float16") if USE_FLOAT16_DISK else vectors
embs_mm.flush()
encode_secs = time.time() - t0
print(f"[OK] Encoded corpus in {encode_secs/60:.2f} min. Saved memmap: {MEMMAP_PATH}")

# ----------------------------
# 2) Build HNSW index incrementally (L2 on unit-norm vectors gives the same ranking as cosine)
# ----------------------------
index = faiss.IndexHNSWFlat(EMB_DIM, HNSW_M)  # default metric: L2
index.hnsw.efConstruction = HNSW_EF_CONSTRUCTION
index.hnsw.efSearch = HNSW_EF_SEARCH

# Add vectors shard-by-shard to avoid RAM spikes
ADD_SHARD = 200_000
t0 = time.time()
for start in tqdm(range(0, N, ADD_SHARD), desc="Building HNSW (add)"):
    end = min(start + ADD_SHARD, N)
    shard = np.array(embs_mm[start:end], dtype="float32")  # FAISS expects float32
    # Vectors were normalized before being written, but a float16 round-trip
    # leaves them only approximately unit-length. Re-normalize (in place) so the
    # L2 ordering matches the cosine ordering exactly.
    if USE_FLOAT16_DISK:
        faiss.normalize_L2(shard)
    index.add(shard)
build_secs = time.time() - t0
print(f"[OK] HNSW built in {build_secs/60:.2f} min. ntotal={index.ntotal:,}")

faiss.write_index(index, "hnsw_cosine.faiss")
print("[OK] Saved index -> hnsw_cosine.faiss")

# ----------------------------
# 3) Load queries, encode, and search
# ----------------------------
queries_df = pd.read_csv("final_benchmark.csv")
# Use the first of 'query' / 'prompt' that exists; fail if neither is present.
for candidate in ("query", "prompt"):
    if candidate in queries_df.columns:
        query_texts = queries_df[candidate].astype(str).tolist()
        break
else:
    raise ValueError("final_benchmark.csv must have a 'query' or 'prompt' column.")

tqdm.write(f"Num queries: {len(query_texts):,}")

# The timed region covers query encoding plus the index search.
t0 = time.time()
q_emb = model.encode(
    query_texts,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
).astype("float32")
scores, ids = index.search(q_emb, k=K)
elapsed_all = time.time() - t0

# Per-query latency is approximated by the batch average, repeated once per query.
n_queries = len(query_texts)
avg_latency = elapsed_all / max(1, n_queries)
per_query = [avg_latency] * n_queries
p50 = float(np.percentile(per_query, 50))
p99 = float(np.percentile(per_query, 99))
print(f"Query batch time: {elapsed_all:.3f} sec for {len(query_texts)} queries")
print(f"Latency P50: {p50:.4f} s | P99: {p99:.4f} s (per-query avg proxy)")

# ----------------------------
# 4) Fetch top-k documents efficiently & build results
# ----------------------------
# Collect every non-negative corpus id that appears in the hits, fetch those rows
# once via select, and remember where each id landed in the fetched subset.
unique_ids = sorted({int(x) for row in ids for x in row if x >= 0})
id_to_pos = dict(zip(unique_ids, range(len(unique_ids))))
subset = dataset.select(unique_ids)  # keeps order of unique_ids
subset_texts = subset["combined"]


def get_doc(doc_id):
    """Return the 'combined' text of corpus row `doc_id` from the pre-fetched subset."""
    position = id_to_pos[int(doc_id)]
    return subset_texts[position]


rows = []
for q, hit_ids in zip(query_texts, ids):
    # Negative ids yield an empty string instead of a lookup.
    t_docs = ["" if did < 0 else get_doc(did) for did in hit_ids.tolist()]
    t_docs.extend([""] * (K - len(t_docs)))  # pad to K entries
    numbered = " ".join(f"{rank}. {doc}" for rank, doc in enumerate(t_docs[:K], start=1))
    rows.append({"prompt": q, "recommendations": f"I have {K} recommendations: " + numbered})

results_df = pd.DataFrame(rows).assign(latency_sec=avg_latency)
print(results_df.head(3))
results_df.to_csv("exp1_22m_hnsw_results.csv", index=False)
print("[OK] Saved -> exp1_22m_hnsw_results.csv")

