## Import Libraries and Setup

In [1]:
import json
import os
from getpass import getpass

import pandas as pd
from dotenv import load_dotenv
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
from pymongo import MongoClient

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (if present) into the process environment.
# The MongoDB connection cell further down reads MONGO_CONNECTION_STRING from os.environ.
load_dotenv()

True

## Create Sample Datasets

### Products Dataset

In [3]:
# Sample product catalogue. Each record carries the same seven fields, in this order.
_PRODUCT_FIELDS = ("title", "description", "price", "material", "category", "brand", "asin")

# One tuple per product, positionally aligned with _PRODUCT_FIELDS. Prices are in IDR.
_PRODUCT_ROWS = [
    (
        "Samsung Galaxy S24",
        "Latest Samsung smartphone with advanced camera and AI features. Perfect for photography enthusiasts.",
        15000000,
        "Glass",
        "Electronics",
        "Samsung",
        "SAMS24001",
    ),
    (
        "Nike Air Max 270",
        "Comfortable running shoes with excellent cushioning and modern design.",
        2500000,
        "Synthetic",
        "Shoes",
        "Nike",
        "NIKE270001",
    ),
    (
        "Leather Office Chair",
        "Ergonomic office chair with genuine leather upholstery for maximum comfort.",
        3500000,
        "Leather",
        "Furniture",
        "ErgoMax",
        "ERGO001",
    ),
    (
        "Cotton T-Shirt",
        "100% organic cotton t-shirt, comfortable and breathable for daily wear.",
        250000,
        "Cotton",
        "Clothing",
        "EcoWear",
        "ECO001",
    ),
    (
        "Wooden Dining Table",
        "Solid wood dining table that seats 6 people, perfect for family meals.",
        8000000,
        "Wood",
        "Furniture",
        "WoodCraft",
        "WOOD001",
    ),
]

# Expand the rows into the list-of-dicts shape the rest of the notebook consumes.
products_data = [dict(zip(_PRODUCT_FIELDS, product_row)) for product_row in _PRODUCT_ROWS]


In [4]:
# Tabular view of the product records (one row per product dict).
products_df = pd.DataFrame(products_data)

# One print call: heading and preview separated by a newline.
print("Products Dataset:", products_df.head(), sep="\n")

Products Dataset:
                  title                                        description  \
0    Samsung Galaxy S24  Latest Samsung smartphone with advanced camera...   
1      Nike Air Max 270  Comfortable running shoes with excellent cushi...   
2  Leather Office Chair  Ergonomic office chair with genuine leather up...   
3        Cotton T-Shirt  100% organic cotton t-shirt, comfortable and b...   
4   Wooden Dining Table  Solid wood dining table that seats 6 people, p...   

      price   material     category      brand        asin  
0  15000000      Glass  Electronics    Samsung   SAMS24001  
1   2500000  Synthetic        Shoes       Nike  NIKE270001  
2   3500000    Leather    Furniture    ErgoMax     ERGO001  
3    250000     Cotton     Clothing    EcoWear      ECO001  
4   8000000       Wood    Furniture  WoodCraft     WOOD001  


### Common Information Dataset

In [5]:
# Store-wide FAQ / policy snippets. Each record has a title, the policy text, and a category tag.
_COMMON_INFO_FIELDS = ("title", "content", "category")

# One tuple per entry, positionally aligned with _COMMON_INFO_FIELDS.
_COMMON_INFO_ROWS = [
    (
        "Shipping Information",
        "We offer free shipping for orders above Rp 500,000. Standard delivery takes 2-3 business days within Jakarta and 3-5 days for other cities. Express delivery is available for an additional fee of Rp 25,000.",
        "shipping",
    ),
    (
        "Payment Methods",
        "We accept various payment methods including credit cards (Visa, MasterCard), bank transfers, e-wallets (GoPay, OVO, DANA), and cash on delivery (COD) for selected areas.",
        "payment",
    ),
    (
        "Return Policy",
        "You can return items within 30 days of purchase. Items must be in original condition with tags attached. Return shipping is free for defective items, but customer pays for returns due to change of mind.",
        "return",
    ),
    (
        "Refund Process",
        "Refunds are processed within 5-7 business days after we receive your returned item. The refund will be credited to your original payment method. For e-wallet payments, refund may take up to 14 days.",
        "refund",
    ),
    (
        "Size Guide",
        "Please check our size guide before ordering clothing items. Measurements are in centimeters. For shoes, we recommend ordering half a size up if you're between sizes.",
        "sizing",
    ),
    (
        "Customer Support",
        "Our customer support is available Monday-Friday 9AM-6PM WIB. You can reach us via WhatsApp, email, or live chat on our website. Average response time is 2-4 hours.",
        "support",
    ),
]

# Expand the rows into the list-of-dicts shape the rest of the notebook consumes.
common_info_data = [dict(zip(_COMMON_INFO_FIELDS, info_row)) for info_row in _COMMON_INFO_ROWS]

In [6]:
# Tabular view of the common-information records (one row per entry).
common_info_df = pd.DataFrame(common_info_data)

# One print call: heading (preceded by a blank line) and preview separated by a newline.
print("\nCommon Information Dataset:", common_info_df.head(), sep="\n")


Common Information Dataset:
                  title                                            content  \
0  Shipping Information  We offer free shipping for orders above Rp 500...   
1       Payment Methods  We accept various payment methods including cr...   
2         Return Policy  You can return items within 30 days of purchas...   
3        Refund Process  Refunds are processed within 5-7 business days...   
4            Size Guide  Please check our size guide before ordering cl...   

   category  
0  shipping  
1   payment  
2    return  
3    refund  
4    sizing  


## MongoDB Atlas Connection Setup

In [7]:
# Resolve the MongoDB Atlas connection string: use the environment value when it is
# set, otherwise prompt for it (getpass keeps the secret out of the notebook output)
# and store it in the environment for the rest of the session.
if not os.environ.get("MONGO_CONNECTION_STRING"):
    mongo_string = getpass("Enter MongoDB Atlas Connection String: ")
    os.environ["MONGO_CONNECTION_STRING"] = mongo_string

# Test MongoDB connection.
# MongoClient connects lazily, so constructing it does not prove the server is
# reachable. A ping forces a round trip and raises if the cluster cannot be reached
# or the credentials are rejected, so the message below is only printed on success.
client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
client.admin.command("ping")
db = client.smartshopper_store
print(f"Connected to MongoDB. Database: {db.name}")

Connected to MongoDB. Database: smartshopper_store


## Create Document Stores

### Products Document Store

In [8]:
# Settings for the products document store.
# NOTE(review): no connection string is passed here; presumably the store picks up
# MONGO_CONNECTION_STRING from the environment — confirm against the integration docs.
# NOTE(review): the two index names are assumed to already exist on the Atlas
# collection — verify before querying.
products_store_settings = {
    "database_name": "smartshopper_store",
    "collection_name": "products",
    "vector_search_index": "vector_index",
    "full_text_search_index": "search_index",
}
products_document_store = MongoDBAtlasDocumentStore(**products_store_settings)

### Common Information Document Store 

In [9]:
# Settings for the common-information document store (separate collection and
# separate search indexes from the products store).
# NOTE(review): no connection string is passed here; presumably the store picks up
# MONGO_CONNECTION_STRING from the environment — confirm against the integration docs.
common_info_store_settings = {
    "database_name": "smartshopper_store",
    "collection_name": "common_info",
    "vector_search_index": "common_info_vector_index",
    "full_text_search_index": "common_info_search_index",
}
common_info_document_store = MongoDBAtlasDocumentStore(**common_info_store_settings)

## Build Documents for Storage

### Products Documents

In [10]:
# Turn every product row into a Haystack Document: the text content is the title
# followed by the description; the remaining columns are carried along as metadata.
PRODUCT_META_FIELDS = ("asin", "title", "brand", "price", "material", "category")

products_documents = []
for _, product in products_df.iterrows():
    # str.strip treats its argument as a set of characters: drop surrounding
    # square brackets first, then surrounding double quotes.
    cleaned_description = product["description"].strip("[]").strip('""')

    products_documents.append(
        Document(
            content=f"{product['title']}\n{cleaned_description}",
            meta={field: product[field] for field in PRODUCT_META_FIELDS},
        )
    )

print(f"Created {len(products_documents)} product documents")
print("Sample product document:")
print(products_documents[0])

Created 5 product documents
Sample product document:
Document(id=f91f54a6ab6bea13fe88ed4c039f92a8a14e876cd8d1596f7be04a5d6bc2562a, content: 'Samsung Galaxy S24
Latest Samsung smartphone with advanced camera and AI features. Perfect for photo...', meta: {'asin': 'SAMS24001', 'title': 'Samsung Galaxy S24', 'brand': 'Samsung', 'price': 15000000, 'material': 'Glass', 'category': 'Electronics'})


### Common Information Documents

In [11]:
# Turn every common-information row into a Haystack Document: the policy text is the
# content; title and category are kept as metadata.
common_info_documents = [
    Document(
        content=info["content"],
        meta={"title": info["title"], "category": info["category"]},
    )
    for _, info in common_info_df.iterrows()
]

print(f"\nCreated {len(common_info_documents)} common info documents")
print("Sample common info document:")
print(common_info_documents[0])


Created 6 common info documents
Sample common info document:
Document(id=042fc1fedf1e48d12015d8c49c6ec307f1d9234d33d7a67f2a902cb688b598af, content: 'We offer free shipping for orders above Rp 500,000. Standard delivery takes 2-3 business days within...', meta: {'title': 'Shipping Information', 'category': 'shipping'})


## Create Storage Pipelines

### Products Storage Pipeline

In [12]:
# Indexing pipeline for products: embed each Document, then write it to the
# products document store.
products_pipeline = Pipeline()
products_pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder())
products_pipeline.add_component(
    "writer",
    # Pass the DuplicatePolicy enum member rather than the bare string "OVERWRITE":
    # it matches the type DocumentWriter declares for `policy`, so the previous
    # `# type: ignore` is no longer needed.
    DocumentWriter(document_store=products_document_store, policy=DuplicatePolicy.OVERWRITE),
)
# Feed the embedder's documents into the writer. Kept as the last expression so the
# pipeline summary is still displayed as the cell output.
products_pipeline.connect("embedder", "writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000023EEDB33EE0>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

### Common Info Storage Pipeline

In [13]:
# Indexing pipeline for common information: embed each Document, then write it to
# the common-info document store.
common_info_pipeline = Pipeline()
common_info_pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder())
common_info_pipeline.add_component(
    "writer",
    # Pass the DuplicatePolicy enum member rather than the bare string "OVERWRITE":
    # it matches the type DocumentWriter declares for `policy`, so the previous
    # `# type: ignore` is no longer needed.
    DocumentWriter(document_store=common_info_document_store, policy=DuplicatePolicy.OVERWRITE),
)
# Feed the embedder's documents into the writer. Kept as the last expression so the
# pipeline summary is still displayed as the cell output.
common_info_pipeline.connect("embedder", "writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000023EEDB33580>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

## Store Data

### Store Products

In [14]:
# Run both indexing pipelines. The success messages report the count returned by the
# writer component ("documents_written") instead of the size of the input list, so
# the printed number reflects what the document store actually accepted.
print("Storing products data...")
products_result = products_pipeline.run({
    "embedder": {
        "documents": products_documents
    }
})
products_written = products_result["writer"]["documents_written"]
print(f"Stored {products_written} product documents successfully!")

# Store common information
print("Storing common information data...")
common_info_result = common_info_pipeline.run({
    "embedder": {
        "documents": common_info_documents
    }
})
common_info_written = common_info_result["writer"]["documents_written"]
print(f"Stored {common_info_written} common info documents successfully!")

Storing products data...


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]


Stored 5 product documents successfully!
Storing common information data...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s]


Stored 6 common info documents successfully!


## Create Reference Collections

### Materials Collection


In [15]:
# Rebuild the `materials` reference collection from the distinct materials that
# appear in the products table (order of first appearance).
materials = products_df["material"].unique().tolist()
materials_collection = db.materials
materials_collection.drop()  # Clear existing data so re-running the cell does not accumulate duplicates
if materials:
    # One bulk insert instead of one round trip per item; insert_many rejects an
    # empty list, hence the guard.
    materials_collection.insert_many([{"name": material} for material in materials])
print(f"Created materials collection with {len(materials)} items: {materials}")

Created materials collection with 5 items: ['Glass', 'Synthetic', 'Leather', 'Cotton', 'Wood']


### Categories Collection


In [16]:
# Rebuild the `categories` reference collection from the distinct categories that
# appear in the products table (order of first appearance).
categories = products_df["category"].unique().tolist()
categories_collection = db.categories
categories_collection.drop()  # Clear existing data so re-running the cell does not accumulate duplicates
if categories:
    # One bulk insert instead of one round trip per item; insert_many rejects an
    # empty list, hence the guard.
    categories_collection.insert_many([{"name": category} for category in categories])
print(f"Created categories collection with {len(categories)} items: {categories}")

Created categories collection with 4 items: ['Electronics', 'Shoes', 'Furniture', 'Clothing']


## Verify Storage


In [17]:
# Report how many records each store / collection currently holds.
# Each entry pairs a label with a zero-argument callable so the count is fetched
# right before its line is printed, in the listed order.
storage_counters = [
    ("Products in document store", products_document_store.count_documents),
    ("Common info in document store", common_info_document_store.count_documents),
    ("Materials in collection", lambda: materials_collection.count_documents({})),
    ("Categories in collection", lambda: categories_collection.count_documents({})),
]

print("\nVerification:")
for label, fetch_count in storage_counters:
    print(f"{label}: {fetch_count()}")

print("Data storage completed successfully!")


Verification:
Products in document store: 5
Common info in document store: 6
Materials in collection: 5
Categories in collection: 4
Data storage completed successfully!
