<a href="https://colab.research.google.com/github/busraguven/book-recommender/blob/main/book_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📚 Book Recommender Project — Starter Notebook
# Author: [Ulviye Busra Guven]
# Goal: Build a multimodal book recommender (text + metadata)

!pip install pandas numpy scikit-learn sentence-transformers faiss-cpu -q
import pandas as pd
import numpy as np


In [None]:
# Toy catalogue, written as one (title, author, genre, description) tuple per book.
_BOOK_FIELDS = ("title", "author", "genre", "description")
_BOOK_ROWS = [
    ("The Great Gatsby", "F. Scott Fitzgerald", "Classic",
     "A tragic story of wealth, love and the American dream."),
    ("Pride and Prejudice", "Jane Austen", "Romance",
     "A witty exploration of manners, marriage and social standing."),
    ("Dune", "Frank Herbert", "Sci-Fi",
     "An epic saga of politics, prophecy, and survival on the desert planet Arrakis."),
    ("1984", "George Orwell", "Dystopian",
     "A chilling vision of a totalitarian future and the loss of freedom."),
    ("The Hobbit", "J.R.R. Tolkien", "Fantasy",
     "A hobbit embarks on a perilous adventure with dwarves and dragons."),
]

# Expand every tuple into the dict-per-book records the DataFrame is built from.
data = [dict(zip(_BOOK_FIELDS, row)) for row in _BOOK_ROWS]

df = pd.DataFrame(data)
df


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Pretrained sentence-embedding model used to vectorise the book descriptions.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all descriptions in one batched call instead of issuing a separate
# model.encode() call per row through .apply; this avoids per-call overhead
# as the catalogue grows.
description_vectors = model.encode(df["description"].tolist())

# Keep the same storage layout as before: one 1-D vector per row, so later
# cells can continue to read df["embeddings"] unchanged.
df["embeddings"] = list(description_vectors)
df.head()


In [None]:
import faiss

# Create the index
# Stack the per-book vectors into one (n_books, dim) matrix. FAISS works on
# C-contiguous float32 arrays, so enforce that explicitly instead of relying
# on whatever dtype the encoder produced.
emb_matrix = np.ascontiguousarray(
    np.vstack(df["embeddings"].tolist()), dtype=np.float32
)

# Read the dimensionality from the matrix itself: df["embeddings"][0] is a
# label lookup and fails when the DataFrame index has no label 0.
dim = emb_matrix.shape[1]

# Inner product over L2-normalised vectors is cosine similarity.
faiss.normalize_L2(emb_matrix)  # normalises emb_matrix in place
index = faiss.IndexFlatIP(dim)
index.add(emb_matrix)

# Function to get top-N similar books
def recommend(title, top_n=3):
    """Return up to ``top_n`` books whose description embedding is closest to ``title``.

    The query embedding is L2-normalised and searched against the module-level
    FAISS ``index``; the query book itself is removed from the hits.

    Parameters
    ----------
    title : str
        Title of the query book; must occur in ``df["title"]``.
    top_n : int, default 3
        Maximum number of recommendations. Fewer rows come back when the
        index holds fewer than ``top_n`` other books.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``author``, ``genre`` and ``score`` in the order
        FAISS returned the hits (best match first). Original row labels of
        ``df`` are kept.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``df["title"]``.
    """
    matches = np.flatnonzero((df["title"] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Title not found in catalogue: {title!r}")
    query_pos = int(matches[0])  # positional row, matches the order added to FAISS

    # Copy before normalising: faiss.normalize_L2 works in place and a bare
    # reshape() is only a view onto the array stored in df["embeddings"].
    query = np.array(df["embeddings"].iloc[query_pos], dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query)

    # Request one extra hit to make room for the query itself, but never more
    # than the index holds: FAISS pads missing results with label -1, which
    # iloc would silently interpret as "last row".
    wanted = max(int(top_n), 0)
    n_hits = min(wanted + 1, index.ntotal)
    scores, indices = index.search(query, n_hits)

    # Remove the query by position rather than assuming it is always hit #0,
    # and discard any -1 padding.
    keep = (indices[0] != query_pos) & (indices[0] >= 0)
    hit_pos = indices[0][keep][:wanted]
    hit_scores = scores[0][keep][:wanted]

    # .copy() so that adding the score column does not write into a slice of df.
    results = df.iloc[hit_pos][["title", "author", "genre"]].copy()
    results["score"] = hit_scores
    return results

# Example: text-only nearest neighbours of "The Hobbit" (default top_n=3).
recommend("The Hobbit")


In [None]:
# ---- Add metadata features ----

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Encode genres numerically (one integer code per distinct genre label).
genre_encoder = LabelEncoder()
df["genre_encoded"] = genre_encoder.fit_transform(df["genre"])

# Placeholder "popularity" values for demonstration; replace with real data
# later. They are keyed by title so each value stays attached to its book
# whatever the row order is, instead of depending on a positional list.
popularity_by_title = {
    "The Great Gatsby": 1200,
    "Pride and Prejudice": 1800,
    "Dune": 1500,
    "1984": 2200,
    "The Hobbit": 2600,
}
df["popularity"] = df["title"].map(popularity_by_title)

# Fail loudly if a book has no popularity value rather than scaling NaNs.
missing_popularity = df.loc[df["popularity"].isna(), "title"].tolist()
if missing_popularity:
    raise ValueError(f"No popularity value for: {missing_popularity}")

# Normalize popularity (so it scales 0–1). fit_transform returns an (n, 1)
# array; flatten it so a plain 1-D column is assigned.
scaler = MinMaxScaler()
df["popularity_scaled"] = scaler.fit_transform(df[["popularity"]]).ravel()

df[["title", "genre", "genre_encoded", "popularity_scaled"]]


Unnamed: 0,title,genre,genre_encoded,popularity_scaled
0,The Great Gatsby,Classic,0,0.0
1,Pride and Prejudice,Romance,3,0.428571
2,Dune,Sci-Fi,4,0.214286
3,1984,Dystopian,1,0.714286
4,The Hobbit,Fantasy,2,1.0


In [None]:
import numpy as np

def recommend_hybrid(title, top_n=3, w_text=0.8, w_meta=0.2):
    """Recommend books for ``title`` using a blend of text and metadata similarity.

    The text signal is the FAISS inner-product score between the L2-normalised
    query embedding and every indexed book. The metadata signal is
    ``0.7 * same_genre + 0.3 * popularity_scaled``. The final score is
    ``w_text * text + w_meta * metadata``.

    Parameters
    ----------
    title : str
        Title of the query book; must occur in ``df["title"]``.
    top_n : int, default 3
        Maximum number of recommendations. The result is capped at
        ``len(df) - 1`` rows so the query book is never returned.
    w_text : float, default 0.8
        Weight of the text-similarity score.
    w_meta : float, default 0.2
        Weight of the metadata signal.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``author``, ``genre`` and ``hybrid_score``, sorted
        by descending score, with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``df["title"]``.

    Notes
    -----
    Prints a one-line "Because you liked ..." header as a side effect.
    """
    n_books = len(df)

    # ---- Find the query row (by position, so it is valid for NumPy arrays) ----
    matches = np.flatnonzero((df["title"] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Title not found in catalogue: {title!r}")
    query_pos = int(matches[0])

    # ---- Get FAISS similarities ----
    # Copy before normalising: faiss.normalize_L2 works in place and a bare
    # reshape() is only a view onto the array stored in df["embeddings"].
    query = np.array(df["embeddings"].iloc[query_pos], dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query)
    scores, indices = index.search(query, n_books)

    # ---- Realign FAISS scores to DataFrame rows ----
    # FAISS returns hits ordered by score; scatter them back to row order.
    # Label -1 marks padding when the index holds fewer vectors than requested.
    aligned_text = np.zeros(n_books, dtype=float)
    found = indices[0] >= 0
    aligned_text[indices[0][found]] = scores[0][found]

    # ---- Metadata signals ----
    query_genre = df["genre"].iloc[query_pos]
    genre_match = (df["genre"] == query_genre).astype(float).to_numpy()
    pop_scaled = df["popularity_scaled"].to_numpy()
    meta_signal = 0.7 * genre_match + 0.3 * pop_scaled

    # ---- Combine into hybrid score ----
    hybrid = w_text * aligned_text + w_meta * meta_signal

    # ---- Exclude the query book ----
    hybrid[query_pos] = -np.inf

    # ---- Retrieve top_n results ----
    # Cap at n_books - 1: asking for more would either return the query book
    # itself (score -inf) or exceed the array bounds.
    n_recs = min(max(int(top_n), 0), n_books - 1)
    top_idx = np.argsort(-hybrid, kind="stable")[:n_recs]

    recs = df.iloc[top_idx][["title", "author", "genre"]].copy()
    recs["hybrid_score"] = hybrid[top_idx]

    print(f"\n📖 Because you liked **{title}**, you might enjoy:")
    return recs.reset_index(drop=True)

# Example: hybrid (text + metadata) recommendations seeded with "The Hobbit".
recommend_hybrid("The Hobbit", top_n=3)



📖 Because you liked **The Hobbit**, you might enjoy:


Unnamed: 0,title,author,genre,hybrid_score
0,Dune,Frank Herbert,Sci-Fi,0.196122
1,1984,George Orwell,Dystopian,0.17368
2,Pride and Prejudice,Jane Austen,Romance,0.098678


In [None]:
# Example: request 4 hybrid recommendations seeded with "Pride and Prejudice".
recommend_hybrid("Pride and Prejudice", top_n=4)


📖 Because you liked **Pride and Prejudice**, you might enjoy:


Unnamed: 0,title,author,genre,hybrid_score
0,1984,George Orwell,Dystopian,0.242643
1,The Great Gatsby,F. Scott Fitzgerald,Classic,0.20232
2,Dune,Frank Herbert,Sci-Fi,0.182828
3,The Hobbit,J.R.R. Tolkien,Fantasy,0.132964


In [None]:
import pandas as pd

# Toy interaction log grouped by user: each user maps to the titles they liked.
_likes_by_user = {
    "u1": ["The Hobbit", "The Great Gatsby"],
    "u2": ["1984", "Dune"],
    "u3": ["Pride and Prejudice"],
    "u4": ["Dune"],
    "u5": ["1984"],
}

# Flatten into (user, title) pairs, keeping users and their titles in listed order.
interactions = [
    (user, liked_title)
    for user, liked_titles in _likes_by_user.items()
    for liked_title in liked_titles
]

# One row per (user, title) interaction.
ui = pd.DataFrame(interactions, columns=["user", "title"])


In [None]:
# Build per-user relevant sets (proxy: titles sharing a genre with something
# the user liked). Every title the user already liked is removed from their
# relevant set - not only the one currently being processed - because an
# already-liked title is not a useful recommendation target.
# NOTE: in the demo catalogue each genre appears on exactly one book, so with
# that data every relevant set is empty and NDCG evaluates to 0.
catalogue_titles = set(df["title"])
user_rel = {}
for u, liked_titles in ui.groupby("user", sort=False)["title"]:
    liked_set = set(liked_titles)

    # Keep failing on interactions that reference a title missing from df.
    unknown_titles = liked_set - catalogue_titles
    if unknown_titles:
        raise ValueError(f"Liked titles not in catalogue for {u}: {sorted(unknown_titles)}")

    liked_genres = set(df.loc[df["title"].isin(liked_set), "genre"])
    same_genre_titles = set(df.loc[df["genre"].isin(liked_genres), "title"])
    user_rel[u] = same_genre_titles - liked_set


In [None]:
# Titles ranked from most to least popular, based on the scaled popularity
# column (counting interactions per title would be an alternative source).
pop = (
    df.loc[:, ["title", "popularity_scaled"]]
    .sort_values(by="popularity_scaled", ascending=False)
)

def baseline_popular(user, k=10):
    """Popularity baseline: the first ``k`` titles of ``pop`` that ``user`` has not liked.

    ``pop`` supplies the ranking order; titles found in the user's rows of
    ``ui`` are skipped.
    """
    already_liked = set(ui.loc[ui["user"] == user, "title"])

    ranked = []
    for candidate in pop["title"].tolist():
        if candidate in already_liked:
            continue
        ranked.append(candidate)
    return ranked[:k]


In [None]:
def model_recs(user, k=10):
    """Return up to ``k`` hybrid recommendations for ``user``.

    The last title listed for the user in ``ui`` is used as the seed for
    ``recommend_hybrid``. Titles the user already liked (the seed included)
    are removed, matching what ``baseline_popular`` does, so both systems are
    evaluated on the same candidate pool.

    Returns an empty list when the user has no interactions or ``k <= 0``.
    """
    liked = ui.loc[ui["user"] == user, "title"].tolist()
    if not liked or k <= 0:
        return []
    seed = liked[-1]  # last listed interaction for this user
    already_liked = set(liked)

    # Over-fetch so k titles can remain after dropping liked ones, but never
    # request more rows than the catalogue contains.
    n_candidates = min(k + len(already_liked), len(df))
    candidates = recommend_hybrid(seed, top_n=n_candidates)["title"].tolist()
    return [t for t in candidates if t not in already_liked][:k]

In [None]:
import math

def dcg_at_k(recommended, relevant, k=10):
    """Discounted cumulative gain of the first ``k`` recommended items.

    Relevance is binary: an item found in ``relevant`` contributes
    ``1 / log2(rank + 1)`` (ranks start at 1); any other item contributes 0.
    """
    total = 0.0
    for rank, candidate in enumerate(recommended[:k], start=1):
        if candidate in relevant:
            # With binary relevance the gain 2**rel - 1 is exactly 1.
            total += 1 / math.log2(rank + 1)
    return total

def ndcg_at_k(recommended, relevant, k=10):
    """Normalised DCG@k: DCG of ``recommended`` divided by the ideal DCG.

    Returns 0.0 when the ideal DCG is not positive (for example when
    ``relevant`` is empty).
    """
    achieved = dcg_at_k(recommended, relevant, k)
    # Every member of `relevant` is relevant, so listing them in any order
    # already forms an ideal ranking.
    best_possible = dcg_at_k(list(relevant), relevant, k)
    if best_possible > 0:
        return achieved / best_possible
    return 0.0


In [None]:
def eval_system(get_recs, k=10):
    """Mean NDCG@k of a recommender over every distinct user in ``ui``.

    ``get_recs`` is called as ``get_recs(user, k)`` and must return a list of
    titles. Users without an entry in ``user_rel`` are scored against an empty
    relevant set. Returns 0.0 when there are no users.
    """
    per_user_ndcg = [
        ndcg_at_k(get_recs(user, k), user_rel.get(user, set()), k)
        for user in ui["user"].unique()
    ]
    if not per_user_ndcg:
        return 0.0
    return sum(per_user_ndcg) / len(per_user_ndcg)

# Score both systems at the same cutoff: 10, or the catalogue size if smaller.
eval_cutoff = min(10, len(df))
ndcg_pop = eval_system(baseline_popular, k=eval_cutoff)
ndcg_model = eval_system(model_recs, k=eval_cutoff)

# Relative improvement over the baseline; the 1e-9 term keeps the division
# defined when the baseline NDCG is exactly 0.
ndcg_gain = ndcg_model - ndcg_pop
lift = ndcg_gain / (ndcg_pop + 1e-9)

print(f"Baseline NDCG@{min(10, len(df))}: {ndcg_pop:.3f}")
print(f"Model    NDCG@{min(10, len(df))}: {ndcg_model:.3f}")
print(f"Relative lift: {100*lift:.1f}%")


📖 Because you liked **The Great Gatsby**, you might enjoy:

📖 Because you liked **Dune**, you might enjoy:

📖 Because you liked **Pride and Prejudice**, you might enjoy:

📖 Because you liked **Dune**, you might enjoy:

📖 Because you liked **1984**, you might enjoy:
Baseline NDCG@5: 0.000
Model    NDCG@5: 0.000
Relative lift: 0.0%
