In [1]:
import pandas as pd
from api.core.mongo import db
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
import logging

logging.basicConfig(level=logging.DEBUG)

In [2]:
rows = pd.DataFrame(list(db["products"].find({
    "product_type": {"$in": ["ski", "phone"]},
}))).drop(columns=["_id"]).to_dict(orient="records")

DEBUG:pymongo.serverSelection:{"message": "Server selection started", "selector": "Primary()", "operation": "find", "topologyDescription": "<TopologyDescription id: 66f9679e22cac55c20cec60e, topology_type: ReplicaSetWithPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.6le3g.mongodb.net', 27017) server_type: RSSecondary, rtt: 0.0630000000000166>, <ServerDescription ('cluster0-shard-00-01.6le3g.mongodb.net', 27017) server_type: RSPrimary, rtt: 0.046999999999997044>, <ServerDescription ('cluster0-shard-00-02.6le3g.mongodb.net', 27017) server_type: RSSecondary, rtt: 0.0630000000000166>]>", "clientId": {"$oid": "66f9679e22cac55c20cec60e"}}
DEBUG:pymongo.serverSelection:{"message": "Server selection succeeded", "selector": "Primary()", "operation": "find", "topologyDescription": "<TopologyDescription id: 66f9679e22cac55c20cec60e, topology_type: ReplicaSetWithPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.6le3g.mongodb.net', 27017) server_type: RSSecondary, rtt: 0.06

In [3]:
splitter = RecursiveJsonSplitter(max_chunk_size=100)
docs = splitter.create_documents(texts=rows)

In [4]:
# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1")
model = HuggingFaceEmbeddings(model_name="nomic-ai/nomic-embed-text-v1", model_kwargs={ "trust_remote_code": True })

  from tqdm.autonotebook import tqdm, trange
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: nomic-ai/nomic-embed-text-v1
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nomic-ai/nomic-embed-text-v1/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nomic-ai/nomic-embed-text-v1/resolve/main/config_sentence_transformers.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nomic-ai/nomic-embed-text-v1/resolve/main/README.md HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nomic-ai/nomic-embed-text-v1/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /nomic-ai/nomic-embed-text-v1/resolve/main/sentence_bert_config.json HT

In [None]:
# Store the data as vector embeddings in Atlas

#: import os
#: os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents = docs, 
    embedding = model,
    collection = db["embeddings"],
    index_name = "vector_index"
)