In [1]:
# Print the path of the Python interpreter running this kernel, to confirm
# which environment the notebook is using.
import sys
print(sys.executable)

C:\Users\ghaza\Desktop\Ironhack_Data_Analytics\week_10\project\Book_Recomendation_System\.venv\Scripts\python.exe


In [2]:
# Import smoke test: the message below is only printed if the
# sentence_transformers package can be imported in this environment.
from sentence_transformers import SentenceTransformer
print("SBERT is ready ✅")

SBERT is ready ✅


## Loading CSV & sanity checks

In [3]:
import pandas as pd

In [4]:
# Load the cleaned, merged book catalogue. The path is relative, so it is
# resolved against the directory the kernel was started in.
books_csv_path = r"../data/clean/books_merged_clean.csv"
df = pd.read_csv(books_csv_path)

In [5]:
# Report the frame's (rows, columns), then preview the first rows.
row_col_counts = df.shape
print("Shape:", row_col_counts)
df.head()

Shape: (1038, 6)


Unnamed: 0,title,author,published_year,language,subjects,cover
0,21st century houses: riba award-winning homes,dominic bradbury,2022,english,"domestic architecture, architecture, awards, m...",https://openlibrary.org/images/icons/avatar_bo...
1,architecture china 2020 building with nature j...,l. xiangning,2022,english,"architecture, awards, sustainable architecture...",https://openlibrary.org/images/icons/avatar_bo...
2,cyberarts 2021: international compendium prix ...,markus jandl,2022,english,"computer art, awards, computer animation, prix...",https://covers.openlibrary.org/b/id/13794706-M...
3,"deutsche bank ""artists of the year"" 2021: maxw...",maxwell alexandre,2022,english,"modern art, exhibitions, art, awards, maxwell ...",https://openlibrary.org/images/icons/avatar_bo...
4,in the shadow of trees,belgium) photobrussels festival (6th 202...,2022,french,"artistic photography, exhibitions, awards, pho...",https://openlibrary.org/images/icons/avatar_bo...


In [6]:
# Column dtypes and non-null counts for every column.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   title           1038 non-null   str  
 1   author          1038 non-null   str  
 2   published_year  1038 non-null   int64
 3   language        1038 non-null   str  
 4   subjects        1038 non-null   str  
 5   cover           1038 non-null   str  
dtypes: int64(1), str(5)
memory usage: 48.8 KB


In [7]:
# Number of missing values per column.
df.isna().sum()

title             0
author            0
published_year    0
language          0
subjects          0
cover             0
dtype: int64

In [8]:
# Count rows that repeat an earlier (title, author) pair.
# NOTE(review): this only counts; no cell shown in this notebook drops the
# duplicate, so a repeated book can appear twice in recommendations — confirm
# whether that is intended.
df.duplicated(subset=["title", "author"]).sum()

np.int64(1)

## Creating a text-cleaning function

In [9]:
import re

def clean_text(s):
    """Normalize a text field before it is embedded.

    Parameters
    ----------
    s : Any
        Value to clean. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        Lowercased text with commas replaced by spaces, every run of
        whitespace collapsed to a single space, and no leading or trailing
        whitespace. Missing values yield ``""``.
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).lower()
    s = s.replace(",", " ")      # remove comma noise
    # Collapse whitespace AFTER removing commas: doing it before (as the
    # previous version did) left double spaces, e.g. "a, b" -> "a  b".
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [10]:
# Build the text that gets embedded: cleaned title, author and subjects,
# joined in that order with a " [SEP] " marker between the fields.
title_text = df["title"].apply(clean_text)
author_text = df["author"].apply(clean_text)
subjects_text = df["subjects"].apply(clean_text)
field_separator = " [SEP] "
df["content"] = title_text + field_separator + author_text + field_separator + subjects_text

In [11]:
# Sanity check: show the raw text columns next to the combined `content` string.
df[["title", "author", "subjects", "content"]].head(3)

Unnamed: 0,title,author,subjects,content
0,21st century houses: riba award-winning homes,dominic bradbury,"domestic architecture, architecture, awards, m...",21st century houses: riba award-winning homes ...
1,architecture china 2020 building with nature j...,l. xiangning,"architecture, awards, sustainable architecture...",architecture china 2020 building with nature j...
2,cyberarts 2021: international compendium prix ...,markus jandl,"computer art, awards, computer animation, prix...",cyberarts 2021: international compendium prix ...


In [12]:
# Preview only the combined `content` column for the first two rows.
df[["content"]].head(2)

Unnamed: 0,content
0,21st century houses: riba award-winning homes ...
1,architecture china 2020 building with nature j...


## Generating SBERT embeddings

#### 1. Loading the SBERT model

In [13]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained SBERT checkpoint; the smoke test in the next cell
# shows it produces 384-dimensional vectors.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [14]:
# Confirm the model works: encode a single sentence and print the shape of
# the result (one row per input sentence).
test_vec = model.encode(["hello world"], normalize_embeddings=True)
print(test_vec.shape)

(1, 384)


#### 2. Encoding all books into embeddings

In [None]:
# Encode every book's `content` string; row i of `embeddings` corresponds to
# row i of `df`.
book_texts = df["content"].tolist()
embeddings = model.encode(
    book_texts,
    # Unit-length vectors: not required by sklearn's cosine_similarity, but it
    # makes a plain dot product equal to the cosine similarity.
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

print("Embeddings shape:", embeddings.shape)

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

#### 3. Sanity Check

In [None]:
# Peek at the first two embedding rows rather than dumping the whole array.
embeddings[:2]

## Similarity search & recommendations

#### 1. Importing the cosine similarity function

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

#### 2. Basic semantic-search recommender

In [None]:
def recommend(query, top_k=10):
    """Return the books most semantically similar to a free-text query.

    Parameters
    ----------
    query : str
        Free-text search string; it is passed through ``clean_text`` before
        being encoded.
    top_k : int, default 10
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``author``, ``published_year``, ``language``,
        ``subjects``, ``cover`` and ``similarity``, ordered from most to least
        similar, with a 1-based rank as the index.

    Raises
    ------
    ValueError
        If ``df`` and ``embeddings`` do not have the same number of rows.
    """
    # Scores are matched to books by row position, so the two must line up.
    if len(embeddings) != len(df):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)}; "
            "re-run the encoding cell so they match."
        )

    query = clean_text(query)

    # encode the query with the same SBERT model used for the book embeddings
    q_vec = model.encode([query], normalize_embeddings=True)

    # cosine similarity between the query and every book embedding
    sims = cosine_similarity(q_vec, embeddings).flatten()

    # row positions of the highest scores, best first
    top_idx = sims.argsort()[::-1][:top_k]

    # argsort yields row POSITIONS, so select with .iloc. Using .loc would
    # treat them as index labels and pick wrong rows (or raise) whenever df
    # does not carry a default 0..n-1 index, e.g. after filtering rows.
    columns = ["title", "author", "published_year", "language", "subjects", "cover"]
    results = df.iloc[top_idx][columns].copy()
    results["similarity"] = sims[top_idx]

    # present a 1-based rank instead of the original row labels
    results = results.reset_index(drop=True)
    results.index = results.index + 1
    return results


In [None]:
# Testing with a single keyword rather than a full title.
# NOTE(review): "vegeterian" is a misspelling of "vegetarian" — confirm
# whether the typo is deliberate (e.g. to probe typo tolerance).
recommend("vegeterian", top_k=10)

In [None]:
# Testing with a short two-keyword query.
# NOTE(review): not verified here that this matches an exact title in the data.
recommend("portugal design", top_k=10)

In [None]:
# Testing with a free-text, descriptive search phrase.
recommend("award-winning sustainable architecture", top_k=10)

#### 3. Adding filters (language + year)

In [None]:
def recommend_filtered(query, top_k=10, language=None, min_year=None, max_year=None):
    """Rank books by semantic similarity to ``query``, with optional filters.

    Parameters
    ----------
    query : str
        Free-text search string; cleaned with ``clean_text`` before encoding.
    top_k : int, default 10
        Maximum number of books to return.
    language : str, optional
        Keep only books whose ``language`` equals this value, ignoring case.
        A falsy value (``None`` or ``""``) disables the filter.
    min_year, max_year : int, optional
        Inclusive bounds on ``published_year``; ``None`` disables a bound.

    Returns
    -------
    pandas.DataFrame
        The best-matching books that pass every filter, most similar first,
        with a ``similarity`` column and a 1-based rank as the index.
    """
    output_columns = [
        "title", "author", "published_year", "language",
        "subjects", "cover", "similarity",
    ]

    # Score every book against the query before any filtering is applied.
    query_embedding = model.encode([clean_text(query)], normalize_embeddings=True)
    scores = cosine_similarity(query_embedding, embeddings).flatten()
    candidates = df.assign(similarity=scores)

    # Narrow the candidates one optional filter at a time.
    if language:
        wanted_language = language.lower()
        candidates = candidates[candidates["language"].str.lower() == wanted_language]
    if min_year is not None:
        candidates = candidates[candidates["published_year"] >= min_year]
    if max_year is not None:
        candidates = candidates[candidates["published_year"] <= max_year]

    ranked = candidates.sort_values("similarity", ascending=False).head(top_k)
    ranked = ranked[output_columns].reset_index(drop=True)

    # Show a 1-based rank instead of the original row labels.
    ranked.index = ranked.index + 1
    return ranked

In [None]:
# Testing with filters: English-language books published in 2017 or later.
recommend_filtered(
    "cooking",
    language="english",
    min_year=2017,
    top_k=10
)


In [None]:
# Testing with filters: English-language books published in 2015 or later.
recommend_filtered(
    "modern architecture houses",
    language="english",
    min_year=2015,
    top_k=10
)


#### Saving embeddings & data

In [None]:
import numpy as np

# Persist the embedding matrix and the frame (including the `content` column).
# Both paths are bare filenames, so the files land in the current working
# directory rather than next to the input data under ../data.
np.save("book_embeddings.npy", embeddings)
# index=False keeps the row index out of the CSV.
df.to_csv("books_with_content.csv", index=False)


In [None]:
# Reload both artifacts from disk, replacing the in-memory `embeddings` and `df`.
# The recommenders match scores to books by row position, so these two files
# must come from the same save and keep the same row order.
embeddings = np.load("book_embeddings.npy")
df = pd.read_csv("books_with_content.csv")