Embedding Book Metadata with all-MiniLM-L6-v2

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
# Go up to the project root, but only when we are not already there.
# An unconditional os.chdir("..") climbs one more directory every time this
# cell is re-run, which breaks the relative "data/..." paths used below.
# NOTE(review): assumes the project root is the directory that contains the
# "data" folder and that the notebook's own folder does not - confirm.
if not os.path.isdir("data"):
    os.chdir("..")

In [4]:
# Load the interim book metadata using a path relative to the current working
# directory (the project root after the chdir cell) rather than a
# machine-specific absolute path, so the notebook runs on any checkout.
df_books = pd.read_csv(os.path.join("data", "interim", "books_metadata.csv"))

In [5]:
# Fill missing values only in the text fields that are joined into the
# embedding input, so the string concatenation never meets a NaN.
# Filling the whole frame with "" would turn numeric columns into mixed
# object columns; restricting the fill keeps the other dtypes intact, and
# dropping inplace=True keeps the cell safe to re-run.
text_columns = ["title", "description", "categories"]
df_books[text_columns] = df_books[text_columns].fillna("")

  df_books.fillna("", inplace=True)


In [6]:
# Build the single string fed to the encoder: title, description and
# categories joined with one space between each field.
other_text_fields = df_books[["description", "categories"]]
df_books["text_for_embedding"] = df_books["title"].str.cat(other_text_fields, sep=" ")

In [7]:
# Instantiate the pre-trained Sentence-BERT checkpoint used for the embeddings
from sentence_transformers import SentenceTransformer

model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
print(" Model loaded")



  from .autonotebook import tqdm as notebook_tqdm
Downloading .gitattributes: 1.23kB [00:00, 615kB/s]
Downloading config.json: 100%|██████████| 190/190 [00:00<00:00, 189kB/s]
Downloading README.md: 10.5kB [00:00, 10.3MB/s]
Downloading config.json: 100%|██████████| 612/612 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 76.6kB/s]
Downloading data_config.json: 39.3kB [00:00, 19.5MB/s]
Downloading model.safetensors: 100%|██████████| 90.9M/90.9M [00:02<00:00, 39.0MB/s]
Downloading model.onnx: 100%|██████████| 90.4M/90.4M [00:01<00:00, 51.6MB/s]
Downloading model_O1.onnx: 100%|██████████| 90.4M/90.4M [00:01<00:00, 54.0MB/s]
Downloading model_O2.onnx: 100%|██████████| 90.3M/90.3M [00:01<00:00, 54.0MB/s]
Downloading model_O3.onnx: 100%|██████████| 90.3M/90.3M [00:01<00:00, 53.5MB/s]
Downloading model_O4.onnx: 100%|██████████| 45.2M/45.2M [00:00<00:00, 56.7MB/s]
Downloading model_qint8_arm64.onnx: 100%|██████████| 23.0M/23.0M [00:00<00:00, 52.4MB/s]

 Model loaded


In [8]:
# Encode every combined text string in one call, with a progress bar shown
texts_to_embed = df_books["text_for_embedding"].to_list()
embeddings = model.encode(texts_to_embed, show_progress_bar=True)


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


In [9]:
# Store each embedding dimension as its own column (emb_0, emb_1, ...).
# The frame reuses df_books' index because the following concat(axis=1)
# aligns rows by index label; with a default RangeIndex the rows would be
# mismatched (and padded with NaN) if df_books were ever filtered or
# re-indexed before this point.
embedding_columns = [f"emb_{i}" for i in range(embeddings.shape[1])]
embedding_df = pd.DataFrame(
    embeddings,
    index=df_books.index,
    columns=embedding_columns,
)


In [10]:
# Put the original metadata and its embedding columns side by side
frames_to_merge = [df_books, embedding_df]
df_books_embedded = pd.concat(frames_to_merge, axis="columns")

In [11]:
# Resolve the output location under data/processed
processed_dir = os.path.join("data", "processed")
save_path = os.path.join(processed_dir, "books_with_embeddings.csv")

# Create the folder if needed, then write the enriched table without the index
os.makedirs(processed_dir, exist_ok=True)
df_books_embedded.to_csv(save_path, index=False)

# Report how many rows were written and where
print(f"Saved {len(df_books_embedded)} books with embeddings to:", save_path)

Saved 20 books with embeddings to: data\processed\books_with_embeddings.csv
