In [16]:
import pandas as pd
import numpy as np

ALL = r"C:\Users\jvlas\source\repos\TrioLearn\backend\data\topics\all_topic_vectors.parquet"
df = pd.read_parquet(ALL)
print(df.shape)
print(df.columns.tolist())

# Try option A: vectors stored in one list/array column
vec_col = None
for c in df.columns:
    s = df[c].dropna().head(10)
    if len(s) and s.apply(lambda x: isinstance(x, (list, tuple, np.ndarray))).all():
        vec_col = c
        break

if vec_col is not None:
    X = np.vstack(df[vec_col].apply(lambda v: np.asarray(v, dtype=np.float32)).values)
else:
    # Option B: vectors spread across many numeric columns (e.g., v0..v383)
    num = df.select_dtypes(include=[np.number])
    if num.shape[1] < 5:
        raise ValueError("Could not find vector column(s). Inspect df.head().")
    X = num.astype(np.float32).to_numpy()

# Normalize for cosine
Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)

# Integrity checks
assert np.isfinite(Xn).all(), "Non-finite values in normalized vectors."
print("Vector matrix:", Xn.shape, "| finite:", np.isfinite(Xn).all())
print(df[['modality']].value_counts().to_string() if 'modality' in df.columns else "No modality column")


(4267, 5)
['modality', 'external_id', 'title', 'topic_vector', 'num_topics']
Vector matrix: (4267, 30) | finite: True
modality
book        2823
video        821
course       623


In [None]:
# --- Prep: ensure df["text_for_embed"] exists and is non-empty ---

import pandas as pd

if "text_for_embed" not in df.columns:
    # Try to build from any reasonable text columns found in this df
    TEXT_CAND = [c for c in [
        "title","course_title","book_title","video_title","name",
        "description","skills","tags","summary","text"
    ] if c in df.columns]

    if not TEXT_CAND:
        raise ValueError(
            "No text columns found to build 'text_for_embed'. "
            "It looks like you loaded a vectors-only table (e.g., all_topic_vectors.parquet). "
            "Load a metadata CSV (courses/books/videos) that has titles/descriptions, "
            "or switch to the topic-vector retrieval path."
            f"\nAvailable columns in current df: {list(df.columns)}"
        )

    # Coerce to pandas StringDtype and build text_for_embed
    for c in TEXT_CAND:
        df[c] = df[c].astype("string")

    def clean_join(row):
        parts = []
        for c in TEXT_CAND:
            v = row[c]
            if v is not pd.NA and v is not None:
                s = str(v).strip()
                if s:
                    parts.append(s)
        return " ".join(parts)

    df["text_for_embed"] = df.apply(clean_join, axis=1)

# Drop empty/very short texts
df = df[df["text_for_embed"].apply(lambda x: isinstance(x, str) and len(x) >= 5)].reset_index(drop=True)
print("Docs ready for SBERT:", len(df))



# Recommender (SBERT) + smoke test

In [20]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# --- 0) Coerce candidate text columns to safe string dtype ---
TEXT_CAND = [c for c in ["title","description","skills","tags","summary","text"] if c in df.columns]
assert TEXT_CAND, "description"

for c in TEXT_CAND:
    # Pandas StringDtype handles NA cleanly
    df[c] = df[c].astype("string")

# --- 1) Build robust text_for_embed without using .str on the whole Series ---
def clean_join(row):
    parts = []
    for c in TEXT_CAND:
        v = row[c]
        if v is not pd.NA and v is not None:
            s = str(v).strip()
            if s:
                parts.append(s)
    return " ".join(parts)

df["text_for_embed"] = df.apply(clean_join, axis=1)

# Drop empty/very short docs (no .str; use len() on Python strings)
df = df[df["text_for_embed"].apply(lambda x: isinstance(x, str) and len(x) >= 5)].reset_index(drop=True)
print("Kept docs:", len(df), "| dropped empties/very short texts.")

# --- 2) Embed corpus with SBERT ---
_sbert = SentenceTransformer("all-MiniLM-L6-v2")
Xn = _sbert.encode(df["text_for_embed"].tolist(), normalize_embeddings=True)
Xn = np.asarray(Xn, dtype=np.float32)
print("Corpus SBERT shape:", Xn.shape)


# --- 3) Safe top-k from query text ---
def topk_from_query_text(query_text, Xn, model=_sbert, k=10):
    q = (query_text or "").strip()
    if len(q) < 3:
        raise ValueError(f"Query too short/empty: {repr(query_text)}")
    qv = model.encode([q], normalize_embeddings=True)[0].astype(np.float32)
    sims = Xn @ qv
    k = min(k, len(sims))
    idx = np.argpartition(-sims, k-1)[:k]
    idx = idx[np.argsort(-sims[idx])]
    return idx, sims[idx]

# --- 4) Build a non-empty query pool WITHOUT .str accessor ---
TITLE_CANDS = [c for c in ["title","name","course_title","book_title","video_title"] if c in df.columns]
title_col = TITLE_CANDS[0] if TITLE_CANDS else None

if title_col:
    mask = df[title_col].apply(lambda x: isinstance(x, str) and len(x.strip()) >= 3)
    qpool = df.loc[mask, title_col].tolist()
else:
    qpool = df["text_for_embed"].tolist()

if not qpool:
    qpool = ["deep learning","bayesian inference","computer vision","SQL databases","transformers for NLP"]

queries = pd.Series(qpool).sample(min(5, len(qpool)), random_state=42).tolist()

# --- 5) Smoke test ---
for q in queries:
    idx, sims = topk_from_query_text(q, Xn, k=10)
    print("\nQuery:", q[:80])
    for r, (i, s) in enumerate(zip(idx, sims), 1):
        title_disp = ""
        if title_col:
            val = df.iloc[i][title_col]
            title_disp = (val if isinstance(val, str) else str(val))[:70]
        mod = df.iloc[i]["modality"] if "modality" in df.columns else "?"
        print(f"{r:2d}. [{mod}] {title_disp} | cos={s:.3f}")

# --- 6) Sanity: embeddings not collapsed ---
print("\nStd of corpus embeddings (should be > 0):", f"{float(np.std(Xn)):.6f}")


AssertionError: description

# Self-retrieval (Recall@1) sanity

Pick random items and check whether each item retrieves itself (or a near-duplicate) at rank 1 when using its own text as the query. This catches indexing or alignment bugs. 
Pass criteria: high Recall@1 (often >0.8) indicates vectors and indexing are consistent.

In [3]:
rng = np.random.default_rng(0)
sample_idx = rng.choice(len(df), size=min(100, len(df)), replace=False)

# Build queries from existing items
texts_for_queries = []
for i in sample_idx:
    if title_col:
        texts_for_queries.append(str(df[title_col].iloc[i]))
    else:
        # fallback: if you have a 'text' or 'description' column
        txt_col = next((c for c in ["text","description","summary"] if c in df.columns), None)
        texts_for_queries.append(str(df[txt_col].iloc[i]) if txt_col else f"doc_{i}")

Q = embed_text_minilm(texts_for_queries)

hits = 0
for j, (i, qv) in enumerate(zip(sample_idx, Q)):
    idx, sims = topk_from_query_vec(qv, Xn, k=1)
    # consider “self” if exact index OR very high cosine (>=0.99)
    if idx[0] == i or float(sims[0]) >= 0.99:
        hits += 1

recall1 = hits / len(sample_idx)
print(f"Self-retrieval Recall@1 on n={len(sample_idx)}: {recall1:.3f}")


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 384 is different from 30)

In [15]:
# Build text_for_embed from whatever columns exist
TEXT_CAND = [c for c in [
    "title","course_title","book_title","video_title","name",
    "description","skills","tags","summary","text"
] if c in df.columns]
if not TEXT_CAND:
    raise ValueError(f"No text columns found. Available: {list(df.columns)}")

for c in TEXT_CAND:
    df[c] = df[c].astype("string")

def clean_join(row):
    parts = []
    for c in TEXT_CAND:
        v = row[c]
        if v is not pd.NA and v is not None:
            s = str(v).strip()
            if s:
                parts.append(s)
    return " ".join(parts)

df["text_for_embed"] = df.apply(clean_join, axis=1)
df = df[df["text_for_embed"].apply(lambda x: isinstance(x,str) and len(x) >= 5)].reset_index(drop=True)
print("Docs ready for SBERT:", len(df))


Docs ready for SBERT: 0
