In [4]:
import json
import os

from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm import tqdm

# Load variables from a local .env file (python-dotenv) so that the connection
# strings read from os.environ in the next cell are available.
load_dotenv()

True

In [5]:
# Two clients: prod is only read from in this notebook, dev is the write target.
# Both connection strings must be set in the environment (KeyError otherwise).
prod_mongo_client = MongoClient(host=os.environ["PROD_MONGO_CONNECTION_STRING"])
dev_mongo_client = MongoClient(host=os.environ["DEV_MONGO_CONNECTION_STRING"])

In [11]:
def get_content_embedding_doc(content_id: str):
    """Assemble the embedding record for a single post.

    Reads the post from ``prod.post`` and joins it with its space (``prod.space``),
    the space's types (``prod.space_type``), its areas (``daytrip_area.node``) and
    the metadata of the post's images (``prod.resource_meta``). The English
    fields are collected into a payload that is serialised to a JSON string.

    Args:
        content_id: ``_id`` of the post document.

    Returns:
        A dict with three keys: ``content_id``, ``embedding`` (the post's
        ``embedding.search`` value) and ``text`` (the JSON-encoded payload).

    Raises:
        LookupError: if the post, or the space it references, does not exist.
        KeyError: if a field required by the payload is missing from a document.
    """
    prod_db = prod_mongo_client["prod"]

    daylog_doc = prod_db["post"].find_one({"_id": content_id})
    if daylog_doc is None:
        # find_one returns None for a missing document; fail with a clear message
        # instead of an opaque "'NoneType' object is not subscriptable".
        raise LookupError(f"post {content_id!r} not found")

    space_id = daylog_doc["spaceId"]
    space_doc = prod_db["space"].find_one({"_id": space_id})
    if space_doc is None:
        raise LookupError(
            f"space {space_id!r} (referenced by post {content_id!r}) not found"
        )

    space_type_docs = prod_db["space_type"].find(
        {"_id": {"$in": space_doc["spaceTypeIds"]}}
    )
    area_docs = prod_mongo_client["daytrip_area"]["node"].find(
        {"_id": {"$in": [area["ref"] for area in space_doc["areas"]]}}
    )
    resource_meta_docs = prod_db["resource_meta"].find(
        {"_id": {"$in": [image["ref"] for image in daylog_doc["images"]]}}
    )

    # Keywords are taken from the first returned image-metadata document that has
    # a "prompting" field; the payload carries None when no document has one.
    prompted_meta = next(
        (meta for meta in resource_meta_docs if "prompting" in meta), None
    )
    keyword = None
    if prompted_meta is not None:
        image_keyword = prompted_meta["prompting"]["keyword"]
        keyword = {
            field: image_keyword[field] for field in ("summary", "contents", "moods")
        }

    helpful_tip = daylog_doc["prompting"]["helpfulTip"]
    helpful_tip_fields = (
        "operatingHours",
        "operatingDates",
        "menu",
        "price",
        "tipsOnVisiting",
        "parkingAvailability",
    )

    # Key order is kept stable because the payload is stored as serialised JSON text.
    payload = {
        "name": space_doc["names"]["en"],
        "spaceTypeNames": [
            space_type["names"]["en"] for space_type in space_type_docs
        ],
        "location": space_doc["locationDescriptions"]["en"],
        "areaNames": [area["name"]["en"] for area in area_docs],
        "summary": daylog_doc["prompting"]["summary"]["en"],
        "helpfulTip": {field: helpful_tip[field] for field in helpful_tip_fields},
        "keyword": keyword,
    }

    return {
        "content_id": content_id,
        "embedding": daylog_doc["embedding"]["search"],
        "text": json.dumps(payload),
    }

In [12]:
# Select posts that already carry both an embedding and prompting data, are
# public (post and owner) and have countryCode "KR"; newest first by createdAt.
POST_FILTER = {
    "embedding": {"$exists": True},
    "prompting": {"$exists": True},
    "public": True,
    "ownerPublic": True,
    "countryCode": "KR",
}
MAX_POSTS = 100

cursor = (
    prod_mongo_client["prod"]["post"]
    # Only the id is used below; get_content_embedding_doc re-reads the full post.
    .find(POST_FILTER, {"_id": 1})
    .sort("createdAt", -1)
    .limit(MAX_POSTS)
)
target_collection = dev_mongo_client["lab_dev"]["langchain_existing_embedding"]
failed_ids = []

for doc in tqdm(cursor, desc="Processing documents"):
    content_id = doc["_id"]
    try:
        embedding_doc = get_content_embedding_doc(content_id)
    except Exception as e:
        # Best-effort export: one malformed post must not abort the whole run.
        # tqdm.write keeps the message from garbling the progress bar.
        tqdm.write(f"Error processing {content_id}: {e}")
        failed_ids.append(content_id)
        continue

    # Upsert keyed on content_id so re-running the cell does not create duplicates.
    target_collection.replace_one(
        {"content_id": embedding_doc["content_id"]},
        embedding_doc,
        upsert=True,
    )

if failed_ids:
    print(f"Skipped {len(failed_ids)} document(s) due to errors: {failed_ids}")

Processing documents: 100it [00:10,  9.92it/s]
