In [2]:
!pip install -q python-dotenv langchain langchain_community langchain_openai langchain_chroma

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00

In [3]:
import torch

from langchain.schema import Document
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [None]:
# Mount Google Drive at /content/drive; the paths used in the following cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

# Read the books table (books_with_emotions.csv) from the mounted Drive folder.
books_csv_path = "/content/drive/MyDrive/Colab Notebooks/Projects/Book Recommendation/books/books_with_emotions.csv"
books = pd.read_csv(books_csv_path)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
books.head()

In [6]:
import numpy as np

# Placeholder cover used for rows that have no thumbnail.
cover_not_found = "/content/drive/MyDrive/Colab Notebooks/Projects/Book Recommendation/images/cover_not_found_resize.png"

# Build the larger-cover URL column. A missing thumbnail stays NaN through the
# string concatenation, so one fillna at the end swaps in the placeholder.
# NOTE(review): if the thumbnails are query-string URLs the suffix probably
# needs to be "&fife=w800" rather than "/fife=w800" — confirm against the data.
books["large_thumbnail"] = (books["thumbnail"] + "/fife=w800").fillna(cover_not_found)

In [None]:
# Check the new column next to its source instead of rendering the whole frame.
books[["thumbnail", "large_thumbnail"]].head()

In [7]:
import os

from dotenv import load_dotenv
from openai import OpenAI

# Read the project's .env file on Drive into the process environment.
load_dotenv("/content/drive/MyDrive/Colab Notebooks/Projects/Book Recommendation/.env")

# OpenAI client configured with the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [8]:
# Directory on Drive where the Chroma vector database is stored.
persist_directory = "/content/drive/MyDrive/Colab Notebooks/Projects/Book Recommendation/databases/chroma_books_db"

# Embedding function used when loading/storing the vector database.
embedding = OpenAIEmbeddings()

In [None]:
# One-time build of the Chroma vector store, kept commented out for reference.
# The store is written to `persist_directory`, so it only needs to be rebuilt
# when the tagged descriptions change.
# NOTE(review): the text file is written without an explicit encoding but read
# back as UTF-8 — pass encoding="utf-8" to the write as well if this is re-enabled.

# tagged_description_file = "/content/drive/MyDrive/Colab Notebooks/Projects/Book Recommendation/texts/tagged_description.txt"

# # Create tagged_description text file

# with open(tagged_description_file, "w") as f:
#   for description in books["tagged_description"]:
#     f.write(description + "\n")

# # Load tagged_description text file
# with open(tagged_description_file, "r", encoding="utf-8") as f:
#     lines = f.readlines()

# raw_documents = [Document(page_content=line.strip()) for line in lines if line.strip()]

# # Find the largest document length, size=5800
# largest_len = max(len(doc.page_content) for doc in raw_documents)
# print("Largest document length:", largest_len)

# # Split the raw descriptions into documents
# text_splitter = CharacterTextSplitter(separator="\n", chunk_size=6000, chunk_overlap=0)
# documents = text_splitter.split_documents(raw_documents)

# # Create and save vector db
# db_books = Chroma.from_documents(documents, embedding, persist_directory=persist_directory)

In [9]:
# Reopen the vector database stored under `persist_directory`, using the same
# embedding function for queries.
db_books = Chroma(
  embedding_function=embedding,
  persist_directory=persist_directory,
)

In [10]:
# Sanity check: the loaded store is expected to report 5197 entries.
# `_collection` is an underscore-prefixed (non-public) attribute of the wrapper,
# so this access may need updating if the library changes.
collection = db_books._collection  # underlying chromadb collection object
print("📌 Collection name:", collection.name)
print("📊 Number of entries:", collection.count())

📌 Collection name: langchain
📊 Number of entries: 5197


In [11]:
def retrieve_semantic_recommendation(
  db_books: Chroma,
  books: pd.DataFrame,
  query: str,
  category: str = "All",
  tone: str = "All",
  initial_top_k: int = 50,
  final_top_k: int = 16
) -> pd.DataFrame:
  """Return up to ``final_top_k`` rows of ``books`` that best match ``query``.

  The vector store is asked for the ``initial_top_k`` most similar documents.
  Each document's text is expected to start with the book's ISBN-13, which is
  used to look the book up in ``books``. Candidates are kept in similarity
  order, optionally narrowed to ``category``, cut to ``final_top_k`` and then
  optionally re-ordered by the emotion score that corresponds to ``tone``.

  Args:
    db_books: Vector store exposing ``similarity_search(query, k=...)``.
    books: Book table with at least ``isbn13`` and ``simple_categories``
      columns, plus the emotion column for the requested tone.
    query: Free-text description of the desired book.
    category: Value of ``simple_categories`` to keep, or ``"All"``.
    tone: One of ``"Happy"``, ``"Surprising"``, ``"Angry"``,
      ``"Suspenseful"``, ``"Sad"``; any other value (including ``"All"``)
      leaves the similarity order untouched.
    initial_top_k: Number of documents requested from the vector store.
    final_top_k: Maximum number of rows returned.

  Returns:
    A DataFrame with a fresh 0..n-1 index; empty (with the columns of
    ``books``) when nothing matches.
  """
  # Tone label shown to the user -> emotion score column in ``books``.
  tone_col_map = {
    "Happy": "joy",
    "Surprising": "surprise",
    "Angry": "anger",
    "Suspenseful": "fear",
    "Sad": "sadness"
  }

  # Empty result that still carries the schema of ``books``.
  no_results = books.iloc[0:0].reset_index(drop=True)

  # Step 1: Retrieve similar documents
  documents = db_books.similarity_search(query, k=initial_top_k)
  if not documents:
    return no_results

  # Step 2: Extract ISBNs, remembering each one's position in the similarity
  # ranking. A malformed document is skipped instead of discarding every hit.
  isbn_rank = {}
  for doc in documents:
    try:
      isbn = int(doc.page_content.strip('"').split()[0])
    except (AttributeError, IndexError, ValueError) as e:
      print(f"Error extracting ISBNs: {e}")
      continue
    isbn_rank.setdefault(isbn, len(isbn_rank))

  if not isbn_rank:
    return no_results

  # Step 3: Look the books up and restore the similarity order. ``isin``
  # returns rows in DataFrame order, so without this the closest matches
  # could be dropped by the truncation below.
  book_list = books[books["isbn13"].isin(list(isbn_rank))]
  if book_list.empty:
    return no_results
  similarity_order = book_list["isbn13"].map(isbn_rank).to_numpy().argsort(kind="stable")
  book_list = book_list.iloc[similarity_order]

  # Step 4: Optional category filter, applied to the full candidate pool so
  # that a filtered query can still fill ``final_top_k`` slots.
  if category != "All":
    book_list = book_list[book_list["simple_categories"] == category]

  # Step 5: Keep only the best matches.
  book_list = book_list.head(final_top_k)

  # Step 6: Optional tone-based sorting of the kept matches.
  if tone in tone_col_map:
    book_list = book_list.sort_values(by=tone_col_map[tone], ascending=False)

  return book_list.reset_index(drop=True)



In [12]:
def generate_recommendations(
  db_books: Chroma,
  books: pd.DataFrame,
  query: str,
  category: str = "All",
  tone: str = "All"
) -> list[tuple[str, str]]:
  """Build gallery entries for the books recommended for ``query``.

  Args:
    db_books: Vector store passed through to
      ``retrieve_semantic_recommendation``.
    books: Book table; the recommended rows must provide ``title``,
      ``authors``, ``description`` and ``large_thumbnail``.
    query: Free-text description of the desired book.
    category: Category filter, or ``"All"``.
    tone: Emotional tone used for ordering, or ``"All"``.

  Returns:
    One ``(large_thumbnail, caption)`` tuple per recommended book, where the
    caption is ``"<title> by <authors>: <first 30 words of description>..."``.
  """
  recommendations = retrieve_semantic_recommendation(db_books, books, query, category, tone)
  results = []

  for row in recommendations.itertuples(index=False):
    # A missing value reaches us as NaN (a float), which has no .split().
    description = row.description if isinstance(row.description, str) else ""
    truncated_desc_str = " ".join(description.split()[:30]) + "..."

    # Authors are stored as one ';'-separated string.
    authors = row.authors if isinstance(row.authors, str) else ""
    author_split = [name.strip() for name in authors.split(";") if name.strip()]

    if not author_split:
      author_str = "Unknown author"
    elif len(author_split) == 1:
      author_str = author_split[0]
    elif len(author_split) == 2:
      author_str = f"{author_split[0]} and {author_split[1]}"
    else:
      author_str = f"{', '.join(author_split[:-1])}, and {author_split[-1]}"

    caption = f"{row.title} by {author_str}: {truncated_desc_str}"
    results.append((row.large_thumbnail, caption))

  return results



In [13]:
# Dropdown choices for the dashboard. Missing categories are dropped first:
# sorted() cannot compare NaN (a float) with the category strings.
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())
tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

In [14]:
import gradio as gr

In [15]:
# Dashboard layout: one row of inputs, with the results gallery underneath.
with gr.Blocks(theme="glass") as dashboard:
  gr.Markdown("# Semantic book recommender")

  # Query box, filters and the submit button share a single row.
  with gr.Row():
    user_query = gr.Textbox(label="Please enter a description of a book:",
                            placeholder="e.g. A story about forgiveness")

    category_dropdown = gr.Dropdown(choices=categories,
                                    label="Select a category:",
                                    value="All")

    tone_dropdown = gr.Dropdown(choices=tones,
                                label="Select an emotional tone",
                                value="All")

    submit_button = gr.Button("Find Recommendation")

  # The heading and gallery sit outside the row so they are laid out below the
  # inputs instead of being squeezed in as extra columns next to them.
  gr.Markdown("## Recommendation")
  output = gr.Gallery(label="Recommended books", columns=8, rows=2)

  # The lambda binds the vector store and book table; Gradio supplies the
  # three widget values.
  submit_button.click(
    fn=lambda query, category, tone: generate_recommendations(db_books, books, query, category, tone),
    inputs=[user_query, category_dropdown, tone_dropdown],
    outputs=output
  )

In [16]:
# Start the Gradio app for the dashboard defined in the previous cell.
dashboard.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://42af5b82e9b60ab8e7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


