In [2]:
from pathlib import Path
import pandas as pd
import re

# 1) Load metadata
META_PATH = Path("..")/".." / "backend" / "data" / "interim" / "books_metadata.csv"
meta = pd.read_csv(META_PATH)

print("[meta] rows:", len(meta))
print("[meta] columns:", list(meta.columns))

# 2) Flexible ID picker (case-insensitive, supports common variants)
def pick_id_flexible(df: pd.DataFrame):
    cols = [c for c in df.columns if isinstance(c, str)]
    lut = {c.lower(): c for c in cols}

    # exact matches first (in priority order)
    for key in ["item_id","external_id","global_id","id","book_id","isbn","isbn13","isbn_13"]:
        if key in lut:
            return lut[key]

    # substring matches next
    for c in cols:
        lc = c.lower()
        if any(tok in lc for tok in ["itemid","externalid","globalid","bookid","isbn","asin","doi"]):
            return c

    # as a last resort: a unique URL column
    for c in cols:
        if c.lower() in {"url","link"} and df[c].nunique() == len(df):
            return c

    return None

id_col = pick_id_flexible(meta)

if id_col:
    print(f"[meta] using ID column: {id_col!r}")
    meta = meta.rename(columns={id_col: "item_id"})
else:
    print("[warn] No obvious ID column in metadata — we'll continue WITHOUT topics and create a temporary item_id.")
    meta = meta.reset_index().rename(columns={"index":"item_id"})

# 3) (Optional) Try to join topics ONLY if we have a real item_id
K = 10
df = meta.copy()  # start from metadata; recommender works with this alone

if id_col:
    try:
        TOPICS_PATH = Path("csv") / f"books_dom_topics_K{K}_CLEAN.csv"
        dom = pd.read_csv(TOPICS_PATH)

        # pick ID in topics file
        def pick_id_topics(df):
            for c in ["item_id","external_id","global_id","id","book_id","isbn","isbn13","isbn_13"]:
                if c in df.columns: return c
            return None

        dom_id = pick_id_topics(dom)
        if dom_id:
            dom = dom.rename(columns={dom_id: "item_id"})
            # normalize dominant_topic name if needed
            if "dominant_topic" not in dom.columns:
                for alt in ["dom_topic","topic","topic_id"]:
                    if alt in dom.columns:
                        dom = dom.rename(columns={alt:"dominant_topic"})
                        break
            df = df.merge(dom[["item_id","dominant_topic"]], on="item_id", how="left")
            print("[ok] topics joined on item_id")
        else:
            print("[info] Could not find an ID column in topics CSV; skipping topics join.")
    except Exception as e:
        print("[info] Skipping topics join:", e)

# 4) Build a clean text field from whatever text columns exist
TEXT_CANDIDATES = ["title","description","summary","subtitle","authors","categories","tags"]
TEXT_COLS = [c for c in TEXT_CANDIDATES if c in df.columns]
assert TEXT_COLS, "No usable text columns; need one of title/description/summary/subtitle/authors/categories/tags."

def clean(s):
    s = re.sub(r"http\S+|www\.\S+", " ", str(s).lower())
    return " ".join(s.split())

df["text"] = df[TEXT_COLS].astype(str).agg(" ".join, axis=1).map(clean)

print(f"[ready] rows={len(df)}, text_cols={TEXT_COLS}")
df.head(2)


[meta] rows: 3507
[meta] columns: ['keyword', 'volume_id', 'title', 'authors', 'description', 'categories', 'publishedDate', 'pageCount', 'language', 'averageRating', 'ratingsCount', 'previewLink', 'infoLink', 'isbn13', 'isbn10']
[meta] using ID column: 'isbn13'
[ok] topics joined on item_id
[ready] rows=3507, text_cols=['title', 'description', 'authors', 'categories']


Unnamed: 0,keyword,volume_id,title,authors,description,categories,publishedDate,pageCount,language,averageRating,ratingsCount,previewLink,infoLink,item_id,isbn10,dominant_topic,text
0,machine learning,u8OWDwAAQBAJ,An Introduction to Machine Learning,"Gopinath Rebala, Ajay Ravi, Sanjay Churiwala","Just like electricity, Machine Learning will r...",Technology & Engineering,2019-05-07,275.0,en,,,http://books.google.co.uk/books?id=u8OWDwAAQBA...,https://play.google.com/store/books/details?id...,9783030000000.0,3030157296,,an introduction to machine learning just like ...
1,machine learning,i8hQhp1a62UC,Encyclopedia of Machine Learning,"Claude Sammut, Geoffrey I. Webb","This comprehensive encyclopedia, in A-Z format...",Computers,2011-03-28,1061.0,en,,,http://books.google.co.uk/books?id=i8hQhp1a62U...,http://books.google.co.uk/books?id=i8hQhp1a62U...,9780387000000.0,387307680,,encyclopedia of machine learning this comprehe...


In [4]:
# ===  TF-IDF recommender + sanity check (using .toarray().ravel()) ===
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd

# Build TF-IDF index
tfidf = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2))
X = tfidf.fit_transform(df["text"])
X = normalize(X)

def recommend_books(query: str, k: int = 10):
    qv = normalize(tfidf.transform([clean(query)]))
    sims = (qv @ X.T).toarray().ravel()  # <- Option 2: dense vector
    idx = np.argsort(-sims)[:k]
    cols = ["item_id"] + [c for c in ["title","authors","categories","level","url"] if c in df.columns]
    out = df.iloc[idx][cols].copy()
    out["similarity"] = sims[idx]
    return out.reset_index(drop=True)

# Title-Echo@3 sanity check
def title_echo_at3(sample=50, seed=42):
    rng = np.random.default_rng(seed)
    n = min(sample, len(df))
    idxs = rng.choice(len(df), size=n, replace=False)
    titles = df["title"] if "title" in df.columns else df["text"].str.slice(0,120)
    hits = 0
    for i in idxs:
        qv = normalize(tfidf.transform([clean(str(titles.iloc[i]))]))
        sims = (qv @ X.T).toarray().ravel()  # <- Option 2 here as well
        top3 = np.argpartition(-sims, 2)[:3]
        hits += int(i in top3)
    rate = hits / n
    print(f"[metric] Title-Echo@3 = {rate:.3f} on n={n}")
    return rate

# Run sanity + sample query
title_echo_at3()
recommend_books("transformers and attention for nlp", k=10)


[metric] Title-Echo@3 = 0.720 on n=50


Unnamed: 0,item_id,title,authors,categories,similarity
0,9781098000000.0,"Natural Language Processing with Transformers,...","Lewis Tunstall, Leandro von Werra, Thomas Wolf",Computers,0.189424
1,9781492000000.0,Practical Natural Language Processing,"Sowmya Vajjala, Bodhisattwa Majumder, Anuj Gup...",Computers,0.18267
2,9781098000000.0,Natural Language Processing with Transformers,"Lewis Tunstall, Leandro von Werra, Thomas Wolf",,0.179438
3,9781803000000.0,Transformers for Natural Language Processing,Denis Rothman,Computers,0.178685
4,9789820000000.0,Natural Language Processing,Raymond Lee,Computers,0.176154
5,9781839000000.0,Hands-On Python Natural Language Processing,"Aman Kedia, Mayank Rasu",Computers,0.146739
6,9781638000000.0,Transfer Learning for Natural Language Processing,Paul Azunre,Computers,0.13883
7,9781638000000.0,Real-World Natural Language Processing,Masato Hagiwara,Computers,0.130691
8,9783032000000.0,Handbook on Natural Language Processing for Re...,"Alessio Ferrari, Gouri Ginde",Computers,0.125995
9,9789820000000.0,Natural Language Processing,Raymond S. T. Lee,Computers,0.121881


In [6]:
import numpy as np

def intra_list_diversity_from_query(query: str, k: int = 10) -> float:
    qv = normalize(tfidf.transform([clean(query)]))
    sims = (qv @ X.T).toarray().ravel()
    idx = np.argsort(-sims)[:k]
    Xk = X[idx]                         # normalized TF-IDF rows for top-K
    S = (Xk @ Xk.T).toarray()           # cosine similarity matrix
    iu = np.triu_indices(k, 1)
    return 1.0 - float(S[iu].mean()) if k > 1 else 0.0

# example usage:
print("ILD@10:", round(intra_list_diversity_from_query("transformers and attention for nlp", k=10), 3))


ILD@10: 0.777
