In [1]:
from dotenv import load_dotenv
import pandas as pd
load_dotenv()
import os
dataset_path = os.getenv("DATASET_PATH")

In [2]:
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Title,Authors,Description,Category,Publisher,Price Starting With ($),Publish Date (Month),Publish Date (Year)
0,Goat Brothers,"By Colton, Larry",,"History , General",Doubleday,8.79,January,1993
1,The Missing Person,"By Grumbach, Doris",,"Fiction , General",Putnam Pub Group,4.99,March,1981
2,Don't Eat Your Heart Out Cookbook,"By Piscatella, Joseph C.",,"Cooking , Reference",Workman Pub Co,4.99,September,1983
3,When Your Corporate Umbrella Begins to Leak: A...,"By Davis, Paul D.",,,Natl Pr Books,4.99,April,1991
4,Amy Spangler's Breastfeeding : A Parent's Guide,"By Spangler, Amy",,,Amy Spangler,5.32,February,1997


# EDA

In [3]:
if df.Authors.str.startswith("By ").any():
    df.Authors = df.Authors.str[3:]

df.head()

Unnamed: 0,Title,Authors,Description,Category,Publisher,Price Starting With ($),Publish Date (Month),Publish Date (Year)
0,Goat Brothers,"Colton, Larry",,"History , General",Doubleday,8.79,January,1993
1,The Missing Person,"Grumbach, Doris",,"Fiction , General",Putnam Pub Group,4.99,March,1981
2,Don't Eat Your Heart Out Cookbook,"Piscatella, Joseph C.",,"Cooking , Reference",Workman Pub Co,4.99,September,1983
3,When Your Corporate Umbrella Begins to Leak: A...,"Davis, Paul D.",,,Natl Pr Books,4.99,April,1991
4,Amy Spangler's Breastfeeding : A Parent's Guide,"Spangler, Amy",,,Amy Spangler,5.32,February,1997


## Category

In [27]:
df.Category.dropna().str.split(" , ").head()

0                           [ History, General]
1                           [ Fiction, General]
2                         [ Cooking, Reference]
6    [ Self-help, Personal Growth, Self-Esteem]
7                            [ Poetry, General]
Name: Category, dtype: object

# Vectorize book descriptions

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")


In [4]:
from langchain_chroma import Chroma

# Check if the vector database already exists
db_path = "./chroma_langchain_db"

# Create directory if it doesn't exist
if not os.path.exists(db_path):
    os.makedirs(db_path)

vector_store = Chroma(
    collection_name="books",
    embedding_function=embeddings,
    persist_directory=db_path
)

## Populate vectordb

In [None]:
from langchain_core.documents import Document
from tqdm import tqdm

# Filter out rows without descriptions since we need text to embed
df_with_desc = df[df.Description.notna()].copy()

# Create Document objects with book descriptions and metadata
documents = []
for _, row in tqdm(df_with_desc.iterrows(), total=len(df_with_desc)):
    doc = Document(
        page_content=f"Title: {row['Title']} - Description: {row['Description']} - Category: {row['Category'] if pd.notna(row['Category']) else 'N/A'}",
        metadata={
            'title': row['Title'],
            'authors': row['Authors'],
            'category': row['Category'] if pd.notna(row['Category']) else '',
            'publisher': row['Publisher'] if pd.notna(row['Publisher']) else '',
            'price': row['Price Starting With ($)'],
            'publish_month': row['Publish Date (Month)'],
            'publish_year': row['Publish Date (Year)']
        }
    )

    # Add documents to the vector store
    vector_store.add_documents([doc])
print(f"Added {len(documents)} books to the vector store")

In [18]:
# Calculate 20% of the dataframe
sample_size = int(len(df_with_desc) * 0.2)

# Sort by multiple columns to get diverse sample
# Sort by category, year, and price to maximize diversity
df_sorted = df_with_desc.sort_values(
    by=['Category', 'Publish Date (Year)', 'Price Starting With ($)'],
    na_position='first'
)

# Take evenly spaced samples to maximize diversity
step = len(df_sorted) // sample_size
df_sample = df_sorted.iloc[::step][:sample_size]

print(f"Sampling {len(df_sample)} books out of {len(df_with_desc)} (20%)")

# Add sampled documents to vector store
for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    doc = Document(
        page_content=f"Title: {row['Title']} - Description: {row['Description']} - Category: {row['Category'] if pd.notna(row['Category']) else 'N/A'}",
        metadata={
            'title': row['Title'],
            'authors': row['Authors'],
            'category': row['Category'] if pd.notna(row['Category']) else '',
            'publisher': row['Publisher'] if pd.notna(row['Publisher']) else '',
            'price': row['Price Starting With ($)'],
            'publish_year': row['Publish Date (Year)']
        }
    )
    vector_store.add_documents([doc])

print(f"Successfully added {len(df_sample)} diverse books to the vector store")

Sampling 14040 books out of 70204 (20%)


 57%|█████▋    | 7977/14040 [38:55<29:34,  3.42it/s]  


KeyboardInterrupt: 

In [5]:
vector_store._collection.count()

8279

# Search

## Similary search

In [9]:
results = vector_store.similarity_search("galaxy", k=5)

for result in results:
    print('title:',result.metadata['title'])
    print('authors:',result.metadata['authors'])
    print('category:',result.metadata['category'])
    print("-----")

title: Cycles of Fire: Stars, Galaxies and the Wonder of Deep Space
authors: Hartmann, William K.
category: 
-----
title: Exultant (Destiny's Children)
authors: Baxter, Stephen
category:  Fiction , Science Fiction , Hard Science Fiction
-----
title: The World and Its Wonders (Child Horizons)
authors: Blackwood, Paul Everett
category: 
-----
title: The Whole Shebang: A State-of-the-Universe(s) Report
authors: Ferris, Timothy
category:  Science , Cosmology
-----
title: Stargate
authors: Emmerich, Roland and Devlin, Dean
category: 
-----


## MRR Search

In [10]:
vector_store.max_marginal_relevance_search("galaxy", k=5, fetch_k=20)

[Document(id='4f007f88-dcb5-463e-9c41-d89c2b8f7f69', metadata={'authors': 'Hartmann, William K.', 'category': '', 'publish_year': 1987, 'title': 'Cycles of Fire: Stars, Galaxies and the Wonder of Deep Space', 'price': 6.38, 'publisher': 'Workman Pub Co', 'publish_month': 'October'}, page_content='Title: Cycles of Fire: Stars, Galaxies and the Wonder of Deep Space - Description: Uses paintings to illustrate possible stars and planets, describes the evolution of stars, and discusses nebulae, galaxies, and the Big Bang - Category: N/A'),
 Document(id='048bc987-0bf6-4789-8ea0-38065e731789', metadata={'publish_month': 'January', 'price': 8.29, 'publisher': 'Simon & Schuster, Inc.', 'publish_year': 1997, 'authors': 'Ferris, Timothy', 'title': 'The Whole Shebang: A State-of-the-Universe(s) Report', 'category': ' Science , Cosmology'}, page_content='Title: The Whole Shebang: A State-of-the-Universe(s) Report - Description: From the world-acclaimed author of Coming of Age in the Milky Way comes