In [None]:
import boardgame_rag.index_build as index_build

In [11]:
import sys, pathlib
sys.path.append(str(pathlib.Path().resolve() / "src"))


In [2]:
import os, pathlib, sys
print("cwd:", os.getcwd())              # ensure you’re at the project root
print("python:", sys.executable)   

cwd: /Users/ayman/Desktop/boardgame_rag/sandbox
python: /Users/ayman/Desktop/boardgame_rag/.venv/bin/python


In [10]:
import pickle, pathlib, numpy as np

abs_path = pathlib.Path("/Users/ayman/Desktop/boardgame_rag/indices/bm25.pkl")
obj = pickle.loads(abs_path.read_bytes())

In [6]:
p = pathlib.Path("indices")
print(p/"bm25.pkl")

indices/bm25.pkl


In [None]:
import pickle, pathlib, numpy as np
# p = pathlib.Path("indices") (notebook directory doesnt quite work like the .py example)
# obj = pickle.loads((p/"bm25.pkl").read_bytes())
abs_path = pathlib.Path("/Users/ayman/Desktop/boardgame_rag/indices/bm25.pkl")
obj = pickle.loads(abs_path.read_bytes())
bm25 = obj["bm25"]
doc_ids = obj["doc_ids"]

query = "dice trading"
q_toks = query.lower().split()
scores = bm25.get_scores(q_toks)
topk = np.argsort(scores)[-5:][::-1]
print("BM25 top-5 doc_ids:", [doc_ids[i] for i in topk])
print("BM25 top-5 scores:", [float(scores[i]) for i in topk])


BM25 top-5 doc_ids: ['G496', 'G492', 'G281', 'G043', 'G141']
BM25 top-5 scores: [4.0601331742826785, 4.0601331742826785, 4.0601331742826785, 3.997655703103537, 2.590171820281613]


_______________________

# exploring cbm25 vs faiss values

In [23]:
import pickle
from pathlib import Path

with open("../indices/bm25.pkl", "rb") as f:
    obj = pickle.load(f)
bm25 = obj["bm25"]

idf = bm25.idf 
# sort by how many docs each token appears in
by_freq = sorted(bm25.doc_freqs[1].items(), key=lambda x: x[1])
print("10 rarest tokens (appear in fewest docs):")
for term, freq in by_freq[:10]:
    print(f"{term:<15} → {freq} docs  (IDF={idf[term]:.2f})")

print("\n10 most common tokens (appear in most docs):")
for term, freq in by_freq[-10:]:
    print(f"{term:<15} → {freq} docs  (IDF={idf[term]:.2f})")


10 rarest tokens (appear in fewest docs):
pandemic        → 1 docs  (IDF=2.85)
1               → 1 docs  (IDF=1.78)
blends          → 1 docs  (IDF=0.44)
worker          → 1 docs  (IDF=2.06)
placement,      → 1 docs  (IDF=2.46)
drafting        → 1 docs  (IDF=2.89)
within          → 1 docs  (IDF=0.44)
a               → 1 docs  (IDF=0.44)
cooperative     → 1 docs  (IDF=2.33)
frame.          → 1 docs  (IDF=0.44)

10 most common tokens (appear in most docs):
around          → 1 docs  (IDF=0.44)
45              → 1 docs  (IDF=1.62)
minutes.        → 1 docs  (IDF=0.44)
actions         → 1 docs  (IDF=1.83)
are             → 1 docs  (IDF=1.83)
scarce;         → 1 docs  (IDF=1.83)
turn            → 1 docs  (IDF=1.83)
order           → 1 docs  (IDF=1.83)
tension         → 1 docs  (IDF=1.83)
matters         → 1 docs  (IDF=1.83)


In [None]:
import pickle, faiss, numpy as np
from sentence_transformers import SentenceTransformer

# BM25
with open("../indices/bm25.pkl", "rb") as f:
    obj = pickle.load(f)
bm25, doc_ids = obj["bm25"], obj["doc_ids"]

# FAISS
index = faiss.read_index("../indices/faiss.index")
with open("../indices/faiss_meta.pkl", "rb") as f:
    meta = pickle.load(f)
embedder = SentenceTransformer(meta["model"])

query = "deck building games under 45 minutes"

bm_scores = bm25.get_scores(query.lower().split())
top_bm = np.argsort(bm_scores)[::-1][:5]

query_vec = embedder.encode([query])
faiss.normalize_L2(query_vec)
sims, idxs = index.search(query_vec.astype(np.float32), 5)
top_vec = idxs[0]; sims = sims[0]

print("BM25 top docs:")
for i in top_bm:
    print(f"{doc_ids[i]}  score={bm_scores[i]:.3f}")

print("\nFAISS (vector) top docs:")
for i, s in zip(top_vec, sims):
    print(f"{doc_ids[i]}  sim={s:.3f}")



BM25 top docs:
G447  score=4.946
G063  score=4.859
G244  score=4.775
G424  score=4.328
G250  score=4.328

FAISS (vector) top docs:
G422  sim=0.634
G061  sim=0.630
G463  sim=0.611
G278  sim=0.608
G073  sim=0.590


# peaking into faiss

In [41]:
import pickle
with open("../indices/faiss_meta.pkl", "rb") as f:
    meta = pickle.load(f)
print(meta.keys())
print("Model:", meta["model"])
print("First 5 doc_ids:", meta["doc_ids"][:5])
print("Total docs:", len(meta["doc_ids"]))

dict_keys(['doc_ids', 'model'])
Model: sentence-transformers/all-MiniLM-L6-v2
First 5 doc_ids: ['G000', 'G001', 'G002', 'G003', 'G004']
Total docs: 500


In [44]:
import faiss
import numpy as np

index = faiss.read_index("../indices/faiss.index")
print("Index type:", type(index))
print("Vectors stored:", index.ntotal)

# First vector:
v0 = index.reconstruct(0)         # shape (d,)
print("dim:", v0.shape, "sample:", v0[:10])

# First 5 vectors:
v5 = index.reconstruct_n(0, 5)    # shape (5, d)
print("batch shape:", v5.shape)





Index type: <class 'faiss.swigfaiss.IndexFlatIP'>
Vectors stored: 500
dim: (384,) sample: [-0.01303484 -0.00710506  0.01056533  0.02435474 -0.05928582  0.02253141
 -0.06277668  0.03914481 -0.08805666 -0.02986085]
batch shape: (5, 384)


In [47]:
v5

array([[-0.01303484, -0.00710506,  0.01056533, ...,  0.02319394,
        -0.03680485,  0.07563792],
       [ 0.00870815, -0.04122691,  0.0200754 , ..., -0.08355469,
        -0.08820572,  0.07077812],
       [ 0.02140957, -0.01016314,  0.02196506, ..., -0.01133927,
        -0.06008958,  0.05124512],
       [-0.0026161 , -0.00440962, -0.04676261, ...,  0.00977282,
         0.02119232,  0.05478996],
       [ 0.0088317 ,  0.0677369 ,  0.00347244, ..., -0.08001143,
        -0.11992447,  0.03989131]], shape=(5, 384), dtype=float32)