# Storing Data pada MongoDB Atlas

## Load Dataset

In [None]:
import pandas as pd
# Load the dataset that the later cells turn into documents (they read the
# columns description, title, asin, brand, price, gender, material, category).
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code -- only load pickle files from a trusted source.
df = pd.read_pickle('../data/datasets.pkl')

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

## Building Dataset

In [None]:
from haystack import Document

# Columns copied verbatim from each row into the document metadata.
_META_FIELDS = ('asin', 'title', 'brand', 'price', 'gender', 'material', 'category')


def _row_to_document(row):
    """Build one haystack ``Document`` from a single row of ``df``.

    The content is the title followed by the description; the description has
    its leading/trailing square brackets and then single quotes trimmed.
    """
    cleaned_description = row["description"].strip("[]").strip("''")
    return Document(
        content=f"{row['title']}\n {cleaned_description}",
        meta={field: row[field] for field in _META_FIELDS},
    )


# One Document per dataset row, in row order.
documents = [_row_to_document(row) for _, row in df.iterrows()]

In [None]:
# Inspect the first five built documents as a quick sanity check.
documents[:5]

## Membuat Storing Pipeline

In [None]:
import os
from getpass import getpass

# Ask for the connection string with getpass so it is not echoed while typing,
# then expose it to this process through the environment (read again further
# down when the MongoClient is created).
os.environ.update(
    {"MONGO_CONNECTION_STRING": getpass("Masukkan MongoDB Connection String Anda: ")}
)

In [None]:
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
# NOTE(review): `pipeline_storing` is not referenced anywhere else in the
# visible cells -- the pipeline that receives components and is run is the
# separate `pipeline` object created further down. This looks like a leftover;
# confirm nothing else uses it before removing.
pipeline_storing = Pipeline()

In [None]:
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore

# Target database/collection and the names of the two search indexes.
# NOTE(review): no connection string is passed here, so the store presumably
# picks it up from the MONGO_CONNECTION_STRING environment variable set
# earlier, and presumably both indexes must already exist in Atlas -- confirm.
store_settings = {
    "database_name": "depato_store",
    "collection_name": "products",
    "vector_search_index": "vector_index",
    "full_text_search_index": "search_index",
}
document_store = MongoDBAtlasDocumentStore(**store_settings)

In [None]:
# Indexing pipeline: embed the documents, then write them to the document store.
pipeline = Pipeline()

document_embedder = SentenceTransformersDocumentEmbedder()
# OVERWRITE policy is passed so that duplicates are overwritten on write.
document_writer = DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.OVERWRITE,
)

for component_name, component in (
    ("embedder", document_embedder),
    ("writer", document_writer),
):
    pipeline.add_component(component_name, component)

# Feed the embedder's output into the writer.
pipeline.connect("embedder", "writer")

In [None]:
# Run the indexing pipeline: all documents enter through the "embedder"
# component and flow on to the connected "writer".
embedder_inputs = {"documents": documents}
pipeline.run({"embedder": embedder_inputs})

## Menyimpan Category dan Material ke Koleksi Terpisah

In [None]:
import os
from pymongo import MongoClient

# Plain pymongo client on the same connection string, used for the two small
# lookup collections that sit next to the products collection.
client = MongoClient(os.environ['MONGO_CONNECTION_STRING'])
db = client["depato_store"]
material_collection = db["materials"]
category_collection = db["categories"]

In [None]:
# Distinct values of the material and category columns, as plain Python lists.
materials, categories = (
    df[column].unique().tolist() for column in ('material', 'category')
)

In [None]:
# Wrap every distinct value in a {"name": value} document ready for insertion.
documents_material = [dict(name=material) for material in materials]
documents_category = [dict(name=category) for category in categories]

In [None]:
from pymongo import UpdateOne


def _upsert_by_name(collection, name_documents):
    """Store each ``{"name": ...}`` document unless that name already exists.

    A plain ``insert_many`` is not safe to re-run from a notebook: a second
    run duplicates every name, re-running with the same dict objects fails
    because the first insert added ``_id`` to them in place, and an empty
    list raises. Upserting keyed on ``name`` makes the cell idempotent.

    Args:
        collection: pymongo collection to write to.
        name_documents: list of dicts that each carry a ``"name"`` key.

    Returns:
        The ``BulkWriteResult`` of the write, or ``None`` when there was
        nothing to write.
    """
    operations = [
        UpdateOne(
            {"name": entry["name"]},
            # $setOnInsert only applies when the upsert creates a new document,
            # so existing entries are left untouched.
            {"$setOnInsert": {"name": entry["name"]}},
            upsert=True,
        )
        for entry in name_documents
    ]
    if not operations:
        # bulk_write rejects an empty request list.
        return None
    # The operations are independent of each other, so ordering is not needed.
    return collection.bulk_write(operations, ordered=False)


_upsert_by_name(material_collection, documents_material)
_upsert_by_name(category_collection, documents_category)