In [2]:
import os

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from tqdm import tqdm

from langchain_rag.embedding.summarizer import Summarizer

load_dotenv()



True

In [3]:
prod_mongo_client = MongoClient(os.environ["PROD_MONGO_CONNECTION_STRING"])
dev_mongo_client = MongoClient(os.environ["DEV_MONGO_CONNECTION_STRING"])
summarizer = Summarizer()

In [4]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1536)

vector_store = MongoDBAtlasVectorSearch(
    collection=dev_mongo_client["lab_dev"]["langchain_embedding"],
    index_name="test_vector_store_index",
    relevance_score_fn="cosine",
    embedding=embeddings,
)

In [6]:
cursor = prod_mongo_client["prod"]["post"].find({}, {}).sort("createdAt", -1).limit(100)

for mongo_doc in tqdm(cursor, desc="Embedding documents"):
    content_id = str(mongo_doc["_id"])
    
    existing_doc = dev_mongo_client["lab_dev"]["langchain_embedding"].find_one({"content_id": content_id}, {"_id": 1})
    if existing_doc is not None:
        continue
        
    try:
        summary = summarizer.summarize_content(content_id, embed_image_count=3)
        doc = Document(page_content=summary, metadata={"content_id": content_id})
        vector_store.add_documents(documents=[doc])
    except Exception as e:
        print(f"Error occurred while embedding documents [{content_id}]: {e}")

Embedding documents: 9it [00:00, 39.98it/s]

Error occurred while embedding documents [6750c46344b8a54a7e5e86a0]: not enough values to unpack (expected 2, got 1)


Embedding documents: 29it [04:05,  8.40s/it]

Error occurred while embedding documents [67500705b8b7920da63a3a35]: not enough values to unpack (expected 2, got 1)


Embedding documents: 75it [15:18, 12.45s/it]

Error occurred while embedding documents [674dcbd28809430f63299847]: not enough values to unpack (expected 2, got 1)


Embedding documents: 84it [17:08,  8.96s/it]

Error occurred while embedding documents [674d98ed5383694484d09afe]: not enough values to unpack (expected 2, got 1)


Embedding documents: 87it [17:39,  8.50s/it]

Error occurred while embedding documents [674d95245383694484d09a21]: not enough values to unpack (expected 2, got 1)


Embedding documents: 100it [21:39, 12.99s/it]
