In [1]:
import os

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from tqdm import tqdm

from langchain_rag.embedding.summarizer import Summarizer

load_dotenv()



True

In [2]:
# Open the MongoDB connections and the summarizer used by the cells below.
# NOTE(review): both clients are built from the same MONGO_CONNECTION_STRING
# variable, so "prod" and "dev" currently connect to the same deployment.
# Presumably the prod client was meant to read a different variable — confirm.
# prod_mongo_client is not referenced by any other cell visible here.
prod_mongo_client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
dev_mongo_client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
# Summarizer is project-local (langchain_rag.embedding.summarizer); it is
# constructed with no arguments here.
summarizer = Summarizer()

In [3]:
# Embedding model used for every document written to the vector store.
# NOTE(review): the Atlas vector index presumably has to be defined with the
# same dimensionality (1536) — confirm against the index definition.
embeddings = OpenAIEmbeddings(
    dimensions=1536,
    model="text-embedding-3-small",
)

# Collection in the dev database that backs the vector store.
embedding_collection = dev_mongo_client["lab_dev"]["langchain_embedding"]

# Vector store wrapper over that collection, scored with cosine relevance.
vector_store = MongoDBAtlasVectorSearch(
    embedding=embeddings,
    collection=embedding_collection,
    relevance_score_fn="cosine",
    index_name="test_vector_store_index",
)

In [4]:
# Re-sync the embedding collection with the source collection:
#   1. mark every stored embedding as hidden,
#   2. un-hide the embedding of each content_id still present in the source,
#   3. summarize + embed any content_id that has no stored embedding yet.
# Embeddings whose content_id is no longer in the source stay hidden.
source_collection = dev_mongo_client["lab_dev"]["langchain_existing_embedding"]
embedding_collection = dev_mongo_client["lab_dev"]["langchain_embedding"]

# Request `content_id` explicitly. An empty projection ({}) means different
# things across PyMongo major versions (3.x returned only `_id`, 4.x returns
# the whole document), and only `content_id` is needed here anyway.
cursor = source_collection.find({}, {"content_id": 1})

# Drain the cursor before doing any slow work. Each loop iteration below may
# call the summarizer and the embedding API, and a server-side cursor left
# idle between batches can expire. Reading the ids first also surfaces a
# source row without `content_id` (KeyError) before anything is hidden, and
# lets tqdm show a total.
content_ids = [str(mongo_doc["content_id"]) for mongo_doc in cursor]

embedding_collection.update_many({}, {"$set": {"isHidden": True}})

for content_id in tqdm(content_ids, desc="Embedding documents"):
    # A single update replaces the former find_one + update_one pair:
    # matched_count is non-zero exactly when an embedding already exists.
    unhide_result = embedding_collection.update_one(
        {"content_id": content_id},
        {"$set": {"isHidden": False}},
    )
    if unhide_result.matched_count:
        continue

    try:
        summary = summarizer.summarize_content(content_id, embed_image_count=3)
        doc = Document(page_content=summary, metadata={"content_id": content_id, "isHidden": False})
        vector_store.add_documents(documents=[doc])
    except Exception as e:
        # Best effort: one failing document must not abort the whole sync.
        print(f"Error occurred while embedding documents [{content_id}]: {e}")

Embedding documents: 100it [00:29,  3.36it/s]
