Dataset: https://www.kaggle.com/datasets/dylanjcastillo/7k-books-with-metadata

In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
from supabase import create_client, Client
from dotenv import load_dotenv
import os

In [3]:
# Load the raw books dataset. Lines pandas cannot parse are skipped
# (on_bad_lines="skip") rather than aborting the read.
BOOKS_CSV_PATH = "./data/books.csv"
books = pd.read_csv(BOOKS_CSV_PATH, sep=",", on_bad_lines="skip")
books.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [4]:
# Narrow `books` down to the columns used by the rest of the notebook.
KEPT_COLUMNS = [
    "isbn13",
    "isbn10",
    "title",
    "authors",
    "categories",
    "thumbnail",
    "description",
    "num_pages",
]
books = books[KEPT_COLUMNS]
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,num_pages
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,247.0
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,241.0
2,9780006163831,6163831,The One Tree,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,479.0
3,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",512.0
4,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,170.0


In [7]:
# Remove every row that has a missing value in any column, then print the
# remaining null count per column as a sanity check.
books = books.dropna(axis="index", how="any")
null_counts = books.isna().sum()
print(null_counts)

isbn13         0
isbn10         0
title          0
authors        0
categories     0
thumbnail      0
description    0
num_pages      0
dtype: int64


In [8]:
from sentence_transformers import SentenceTransformer

# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Single shared model instance; get_embedding() below reads it as a global.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the notebook-level ``embedding_model``.

    Args:
        text: The text to embed. A value that is not a string (for example
            ``None`` or a float ``NaN`` standing in for a missing
            description) is handled the same way as empty text.

    Returns:
        The result of ``embedding_model.encode(text)`` converted with
        ``.tolist()``, or an empty list when there is nothing to embed.
    """
    # Check the type before calling .strip(): a missing value is not a str
    # and would otherwise raise AttributeError instead of being reported.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)
    # Convert to a plain Python list so the value can be stored in a
    # DataFrame cell and serialized later.
    return embedding.tolist()

# Embed every description once and keep the vectors next to their rows.
description_embeddings = books["description"].apply(get_embedding)
books["embedding"] = description_embeddings
books.head()

  from .autonotebook import tqdm as notebook_tqdm
2025-05-06 17:58:28.857159: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 17:58:29.411809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746568709.673903    8318 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746568709.757891    8318 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746568710.435163    8318 computation_placer.cc:177] computation placer already r

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,num_pages,embedding
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,247.0,"[-0.04012787342071533, -0.002791573293507099, ..."
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,241.0,"[-0.04499633237719536, -0.06780105084180832, -..."
2,9780006163831,6163831,The One Tree,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,479.0,"[-0.08650575578212738, -0.0319964699447155, 0...."
3,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",512.0,"[-0.04028574749827385, 0.03217523545026779, -0..."
4,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,170.0,"[-0.03791794180870056, 0.0736776664853096, -0...."


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Text splitter used to break the data into smaller documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [12]:
# Show the dtype pandas assigned to each column of `books`.
books.dtypes

isbn13           int64
isbn10          object
title           object
authors         object
categories      object
thumbnail       object
description     object
num_pages      float64
embedding       object
dtype: object

In [15]:
load_dotenv()

# Supabase credentials are read from the environment (populated from .env
# by load_dotenv above). Fail early with a message that names the variables
# instead of passing None into create_client.
url = os.getenv("VITE_SUPABASE_URL")
key = os.getenv("VITE_SUPABASE_API_KEY")
if not url or not key:
    raise RuntimeError(
        "VITE_SUPABASE_URL and VITE_SUPABASE_API_KEY must be set "
        "(for example in a .env file) before uploading."
    )
supabase: Client = create_client(url, key)

# Columns copied from each `books` row into the "recommendations" table.
UPLOAD_COLUMNS = [
    "isbn13",
    "isbn10",
    "title",
    "authors",
    "categories",
    "thumbnail",
    "description",
    "num_pages",
    "embedding",
]

# Number of rows sent per insert request. Batching avoids one HTTP
# round-trip per book; note that a rejected row fails its whole batch.
UPLOAD_BATCH_SIZE = 100

# Build the payloads from the values already stored in `books`. The
# "embedding" column was computed earlier, so the model is not run again
# here (the previous per-row get_embedding call produced a value that was
# never used).
records = [
    {column: row[column] for column in UPLOAD_COLUMNS}
    for _, row in books.iterrows()
]

for start in range(0, len(records), UPLOAD_BATCH_SIZE):
    batch = records[start : start + UPLOAD_BATCH_SIZE]
    # insert() is given a list of row dicts so the batch goes in one request.
    supabase.table("recommendations").insert(batch).execute()