In [2]:
import json
import os

from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm import tqdm

load_dotenv()

True

In [3]:
prod_mongo_client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
dev_mongo_client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])

In [4]:
def get_content_embedding_doc(content_id: str):
    daylog_doc = prod_mongo_client["prod"]["post"].find_one({"_id": content_id})
        
    space_doc = prod_mongo_client["prod"]["space"].find_one({"_id": daylog_doc["spaceId"]})
    space_type_docs = prod_mongo_client["prod"]["space_type"].find({"_id": {"$in": space_doc["spaceTypeIds"]}})
    area_docs = prod_mongo_client["daytrip_area"]["node"].find({"_id": {"$in": list(map(lambda x: x["ref"], space_doc["areas"]))}})
    resource_meta_docs = prod_mongo_client["prod"]["resource_meta"].find({"_id": {"$in": list(map(lambda x: x["ref"], daylog_doc["images"]))}})

    keyword = None
    for resource_meta_doc in resource_meta_docs:
        if "prompting" in resource_meta_doc:
            keyword = {
                "summary": resource_meta_doc["prompting"]["keyword"]["summary"],
                "contents": resource_meta_doc["prompting"]["keyword"]["contents"],
                "moods": resource_meta_doc["prompting"]["keyword"]["moods"],
            }
            break
    
    payload = {
        "name": space_doc["names"]["en"],
        "spaceTypeNames": list(map(lambda x: x["names"]["en"], space_type_docs)),
        "location": space_doc["locationDescriptions"]["en"],
        "areaNames": list(map(lambda x: x["name"]["en"], area_docs)),
        "summary": daylog_doc["prompting"]["summary"]["en"],
        "helpfulTip": {
            "operatingHours": daylog_doc["prompting"]["helpfulTip"]["operatingHours"],
            "operatingDates": daylog_doc["prompting"]["helpfulTip"]["operatingDates"],
            "menu": daylog_doc["prompting"]["helpfulTip"]["menu"],
            "price": daylog_doc["prompting"]["helpfulTip"]["price"],
            "tipsOnVisiting": daylog_doc["prompting"]["helpfulTip"]["tipsOnVisiting"],
            "parkingAvailability": daylog_doc["prompting"]["helpfulTip"]["parkingAvailability"],
        } if "helpfulTip" in daylog_doc["prompting"] else None,
        "keyword": keyword,
    }

    return {
        "content_id": content_id,
        "embedding": daylog_doc["embedding"]["search"],
        "text": json.dumps(payload),
    }

In [8]:
cursor = prod_mongo_client["prod"]["post"]\
    .find({
        "embedding.search": {"$exists": True},
        "prompting": {"$exists": True},
        "public": True,
        "ownerPublic": True,
        "countryCode": {"$ne": "KR"},
    }, {})\
    .sort("createdAt", -1)\
    # .limit(100)

for doc in tqdm(cursor, desc="Processing documents"):
    content_id = doc["_id"]
    
    doc = dev_mongo_client["lab_dev"]["langchain_existing_embedding"].find_one({"content_id": content_id})
    if doc is not None:
        continue
    
    try:
        embedding_doc = get_content_embedding_doc(content_id)
    except Exception as e:
        print(f"Error processing {content_id}: {e}")
        continue
    
    dev_mongo_client["lab_dev"]["langchain_existing_embedding"].replace_one(
        {"content_id": embedding_doc["content_id"]},
        embedding_doc,
        upsert=True,
    )

Processing documents: 4665it [26:50,  3.58it/s]

Error processing 64fc8fd43eab3f4daedef436: 'operatingHours'


Processing documents: 5065it [28:55,  4.79it/s]

Error processing 64ed870b9f8fcb6648a8262a: 'summary'


Processing documents: 5567it [33:04,  2.15it/s]

Error processing 64e1df8e2a1ec3ee39c862b9: 'spaceTypeIds'


Processing documents: 7245it [44:23,  1.88it/s]

Error processing 64acb03f25e4b1b26cf32e50: 'operatingHours'


Processing documents: 7300it [44:51,  3.79it/s]

Error processing 64abb18f25e4b1b26cebed77: 'summary'


Processing documents: 8933it [59:45,  3.34it/s]

Error processing 644c6fa8ada45ed7638913f2: 'keyword'


Processing documents: 9777it [1:05:01,  3.95it/s]

Error processing 640167b6c8121901f97629ad: 'keyword'


Processing documents: 10524it [1:12:50,  1.10it/s]

Error processing 63ec981801e1afe5eb78816e: 'operatingDates'


Processing documents: 11325it [1:26:50,  1.13it/s]

Error processing 63d2f743d03d41c2da2849ec: 'operatingHours'


Processing documents: 13952it [2:10:47, 11.42it/s]

Error processing 62c39649ede58929fb17d42c: 'operatingDates'


Processing documents: 14988it [2:19:09,  2.05it/s]

Error processing 6235d08d84c5df035db77e6a: 'ref'


Processing documents: 15232it [2:21:31,  1.80it/s]

Error processing 620ddb9d55737b17e0207037: 'operatingHours'


Processing documents: 15326it [2:22:21,  2.29it/s]

Error processing 620b561121b4677256997e61: 'operatingHours'


Processing documents: 15336it [2:22:26,  1.93it/s]

Error processing 620b49544d0d2e14856dac71: 'ref'


Processing documents: 15432it [2:23:19,  1.67it/s]

Error processing 6205821b451d8a727aa16739: 'en'


Processing documents: 15481it [2:23:47,  1.94it/s]

Error processing 62032265fe92446d1f23f7ff: 'ref'


Processing documents: 15532it [2:24:18,  1.72it/s]

Error processing 61fde04979583809ef042707: 'operatingDates'


Processing documents: 15553it [2:24:31,  1.76it/s]

Error processing 61da0f6fb878b1002940f909: 'ref'


Processing documents: 15588it [2:24:45,  1.77it/s]

Error processing 61c3c50998a36d5318b3e3bf: 'ref'


Processing documents: 15640it [2:25:15,  1.64it/s]

Error processing 619c6ddc1443c14e18a9b590: 'ref'


Processing documents: 15643it [2:25:16,  2.12it/s]

Error processing 619c69861443c14e18a98552: 'ref'


Processing documents: 15664it [2:25:27,  1.69it/s]

Error processing 6196ffdf07f20a52db927e1a: 'ref'


Processing documents: 15671it [2:25:31,  1.79it/s]
