In [1]:
import os
import csv
import pickle
import numpy as np
import pandas as pd
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.retrievers import ParentDocumentRetriever


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain.chains.llm import LLMChain
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'


In [2]:
# Sentence-transformers checkpoint used as the embedding function for the Chroma collections below.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Load the product catalogue; every CSV row becomes one Document whose
# metadata records the source file and row index.
PRODUCT_CSV = "product_info.csv"
# Disabled option, kept for reference:
# metadata_columns=["title", "average_rating", "rating_number", "price", "store", "categories"]
loader = CSVLoader(file_path=PRODUCT_CSV, encoding="utf8")
documents = loader.load()

In [4]:
# Preview only the first few documents; dumping the whole list floods the
# notebook output with every product description.
documents[:3]

[Document(metadata={'source': 'product_info.csv', 'row': 0}, page_content='title: Braveman USB Wired Mouse, 3 Buttons, 4 LED Backlit and 4 Adjustable DPI Level(800/1200/1600/2400) Compatible with Desktop/Laptop/PC - Red\naverage_rating: 4.2\nrating_number: 8\nfeatures: [\'【Buy one get one mouse pad by adding both to cart】Adjustable DPI & LED backlit - 4-level adjustable DPI(800/1200/1600/2400). The optical sensor works precisely with different tracking speed. Ideal for web surfing, competitive gaming, and photo editing. The breathing light will be switched to red/blue/green/purple\', \'Compact, Ergonomic & Symmetrical Design - Low-friction base smoothly glides over work surfaces. Ergo-design naturally fits your hand and relieves fatigue for long-term use. High-precise optical sensor and sturdy construction deliver reliable performance\', \'Simple to Use - 3 buttons and 1 scroll wheel for improving efficiency, just "plug and play". No additional software, drive or firmware required.\', 

In [5]:
# print() renders the page_content with real line breaks, which is easier to
# read than the escaped "\n" sequences in the list repr.
print(documents[0])

page_content='title: Braveman USB Wired Mouse, 3 Buttons, 4 LED Backlit and 4 Adjustable DPI Level(800/1200/1600/2400) Compatible with Desktop/Laptop/PC - Red
average_rating: 4.2
rating_number: 8
features: ['【Buy one get one mouse pad by adding both to cart】Adjustable DPI & LED backlit - 4-level adjustable DPI(800/1200/1600/2400). The optical sensor works precisely with different tracking speed. Ideal for web surfing, competitive gaming, and photo editing. The breathing light will be switched to red/blue/green/purple', 'Compact, Ergonomic & Symmetrical Design - Low-friction base smoothly glides over work surfaces. Ergo-design naturally fits your hand and relieves fatigue for long-term use. High-precise optical sensor and sturdy construction deliver reliable performance', 'Simple to Use - 3 buttons and 1 scroll wheel for improving efficiency, just "plug and play". No additional software, drive or firmware required.', 'Support System - Works with Windows 2000/7/8/10/XP/Vista/ME Mac OS Sy

In [6]:
def save_to_pickle(obj, filename):
    """Serialize ``obj`` into the file at ``filename``.

    The file is opened in binary write mode (created or overwritten) and the
    object is written with ``pickle.HIGHEST_PROTOCOL``.

    Args:
        obj: Any picklable Python object.
        filename: Path of the pickle file to write.
    """
    with open(filename, mode="wb") as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

def load_from_pickle(filename):
    """Read back and return the object stored in the pickle file ``filename``.

    SECURITY: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source (e.g. this notebook's own
    ``save_to_pickle``).

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The deserialized Python object.
    """
    with open(filename, mode="rb") as source:
        restored = pickle.Unpickler(source).load()
    return restored

# Example usage: save_to_pickle(retriever.docstore.store, "docstore.pkl")

In [7]:
# Splitter that cuts each parent document into child chunks (chunk_size=600).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=600)

# Chroma collection persisted on disk under ./parentDB; it receives the
# embedded child chunks.
vector_store = Chroma(
    persist_directory="./parentDB",
    collection_name="product",
    embedding_function=embeddings,
)

In [8]:
# The parent documents live in an in-memory store (pickled separately later on),
# while their child chunks are indexed in the Chroma vector store.
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=vector_store,
)

In [10]:
# Keep this cell idempotent. The Chroma collection is persisted in ./parentDB,
# but the in-memory docstore is rebuilt on every run and, with ids=None, the
# parents get newly generated ids. Without clearing first, each re-run would
# append another copy of every chunk, and the old copies would point at parent
# ids that no longer exist in the docstore.
stale_chunk_ids = vector_store.get()["ids"]
if stale_chunk_ids:
    vector_store.delete(ids=stale_chunk_ids)

retriever.add_documents(documents, ids=None)

In [11]:
# Number of parent documents currently held in the docstore.
sum(1 for _ in store.yield_keys())

20

In [12]:
# Number of child chunks indexed in the Chroma collection.
indexed_entries = vector_store.get()
len(indexed_entries["documents"])

125

### Save the docstore locally so the retriever can be rebuilt later

In [13]:
# Only the docstore's underlying mapping is pickled; the child chunks already
# live on disk in the persisted Chroma directory.
docstore_contents = retriever.docstore.store
save_to_pickle(docstore_contents, "docstore.pkl")

### Load the docstore and reconstruct the retriever

In [14]:
# Reopen the persisted Chroma collection that holds the child chunks.
db = Chroma(
    persist_directory="./parentDB",
    collection_name="product",
    embedding_function=embeddings,
)

# Restore the pickled docstore mapping and copy its (key, value) pairs into a
# fresh in-memory store.
store_dict = load_from_pickle("docstore.pkl")
store = InMemoryStore()
store.mset([*store_dict.items()])

In [26]:
# Rebuild the retriever on top of the reloaded vector store and docstore;
# each query asks the vector store for k=5 results.
retriever = ParentDocumentRetriever(
    search_kwargs=dict(k=5),
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=db,
)

In [27]:
# Sample query to check the reconstructed retriever end to end.
query = "comfort of wear"
retrieved_docs = retriever.invoke(query)

In [28]:
# The search matches 5 child chunks (k=5), but they belong to only 3 distinct
# parent documents, so 3 parent Documents are returned.
retrieved_docs

[Document(metadata={'source': 'product_info.csv', 'row': 0}, page_content='title: Braveman USB Wired Mouse, 3 Buttons, 4 LED Backlit and 4 Adjustable DPI Level(800/1200/1600/2400) Compatible with Desktop/Laptop/PC - Red\naverage_rating: 4.2\nrating_number: 8\nfeatures: [\'【Buy one get one mouse pad by adding both to cart】Adjustable DPI & LED backlit - 4-level adjustable DPI(800/1200/1600/2400). The optical sensor works precisely with different tracking speed. Ideal for web surfing, competitive gaming, and photo editing. The breathing light will be switched to red/blue/green/purple\', \'Compact, Ergonomic & Symmetrical Design - Low-friction base smoothly glides over work surfaces. Ergo-design naturally fits your hand and relieves fatigue for long-term use. High-precise optical sensor and sturdy construction deliver reliable performance\', \'Simple to Use - 3 buttons and 1 scroll wheel for improving efficiency, just "plug and play". No additional software, drive or firmware required.\', 