In [1]:
from dotenv import load_dotenv, find_dotenv
import os
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())
import openai
# The openai package reads its credential from the module-level `api_key`
# attribute; assigning to any other attribute name has no effect on it.
# NOTE(review): the langchain OpenAI wrappers used below look for the
# OPENAI_API_KEY environment variable — confirm the .env file defines it too.
openai.api_key = os.getenv("OPEN_AI_KEY")
import pandas as pd
import numpy as np
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import csv
from icecream import ic
from rich import print as rprint

In [2]:
# Columns kept as metadata on every document, and the subset of columns whose
# text is joined together and embedded.
columns_to_metadata = [
    "Product Name",
    "Price",
    "Rating",
    "Description",
    "Features",
]
columns_to_embed = [
    "Description",
    "Features",
]

In [3]:
# Read the product listing CSV into a DataFrame and preview its first two rows.
filepath = "/teamspace/studios/Data_Studio/product_listing/product_listing.csv"
data = pd.read_csv(filepath_or_buffer=filepath)
data.head(n=2)

Unnamed: 0,Product Name,Price,Rating,Description,Features
0,Ultimate Wireless Bluetooth Earbuds,79.99,4.4,Elevate your music experience with our Ultimat...,- High-fidelity sound with deep bass and clear...
1,SmartHome Security Camera System,199.99,4.6,Keep your home safe and secure with our SmartH...,- 1080p HD cameras for crystal-clear video qua...


In [4]:
# Build one Document per CSV row: the columns listed in `columns_to_embed`
# become the page content ("<column>: <value>" lines), and the columns listed
# in `columns_to_metadata` are attached as metadata.
docs = []
with open(filepath, newline="", encoding="utf-8-sig") as csvfile:
    for row in csv.DictReader(csvfile):
        # Columns absent from the CSV header are skipped rather than raising.
        to_metadata = {col: row[col] for col in columns_to_metadata if col in row}
        # DictReader fills fields missing from a short row with None, so fall
        # back to "" before stripping instead of raising AttributeError.
        to_embed = "\n".join(
            f"{col.strip()}: {(row[col] or '').strip()}"
            for col in columns_to_embed
            if col in row
        )
        docs.append(Document(page_content=to_embed, metadata=to_metadata))

In [5]:
# Split every document on newlines, targeting chunks of 500 characters
# (measured with len) and no overlap between neighbouring chunks.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)
documents = splitter.split_documents(docs)

Now that we have the chunks, we will generate the embeddings and insert the values into Chroma. Each vector inserted will have both the vector representation that will be used for similarity search as well as the metadata values we added.

In [10]:
# Generate embeddings from the chunked documents and store them in Chroma.
embeddings_model = OpenAIEmbeddings()
# Reuse the model created above instead of constructing a second,
# identical OpenAIEmbeddings instance just for this call.
db = Chroma.from_documents(documents, embeddings_model)

In [17]:
# Query the vector DB for information
query = "Heart rate monitor"
# Keep the hits under their own name so the `docs` list built from the CSV is
# not overwritten; otherwise re-running the splitting cell after this one
# would split the search hits instead of the source documents.
search_results = db.similarity_search(query)
rprint(search_results[0].page_content)
rprint(search_results[0].metadata)

# Advanced Querying
To really take advantage of the metadata we generated, we can go a step further and use the LangChain SelfQueryRetriever. We define a schema for the metadata, and an LLM uses that schema to generate filters for each query.



In [12]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [16]:
# Describe each metadata field so the LLM can build filters over it.
# Every field is declared as "string" because the values are attached to the
# documents exactly as read from the CSV, i.e. as text.
metadata_field_info = [
    AttributeInfo(
        name="Product Name",
        description="Name of the Product",
        type="string",
    ),
    AttributeInfo(
        name="Price",
        description="The price of the product as a number. Ex. 149.99",
        type="string",
    ),
    AttributeInfo(
        name="Rating",
        description="The rating of the product as a number from 0 to 5. Ex. 4.5",
        type="string",
    ),
    AttributeInfo(
        name="Description",
        description="Description of the product",
        type="string",
    ),
    AttributeInfo(
        name="Features",
        description="Features of the product",
        type="string",
    ),
]

document_content_description = "Product listing"

# Now configure the retriever
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, db, document_content_description, metadata_field_info, verbose=True
)

# Run a query through the retriever. `invoke` is the supported entry point;
# `get_relevant_documents` is deprecated in current LangChain releases.
rprint(retriever.invoke(" heart rate monitor"))


In [10]:
# Show the column labels of the `data` DataFrame loaded from the CSV.
data.columns

Index(['Product Name', 'Price', 'Rating', 'Description', 'Features'], dtype='object')