In [None]:
# Initialize
import os
import re
from string import Template

import nbformat
import requests
from anthropic import Anthropic
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo import MongoClient

# Load settings via python-dotenv before any environment variable is read below.
# NOTE(review): presumably this reads a local .env file — confirm where it lives.
load_dotenv()

# MongoDB connection string; os.getenv returns None when MONGO_URI is not set.
uri = os.getenv("MONGO_URI")
mongodb_client = MongoClient(uri)

# Anthropic client created without explicit arguments.
# NOTE(review): presumably the API key comes from the environment — confirm.
anthropic_client = Anthropic()
# Chat messages shared by every prompt; filled with documentation pages below.
base_context = []


# MongoDB context
def webpage_as_context(url, timeout=30):
    """Download a documentation page and wrap its text in a chat message.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A ``{"role": "user", "content": ...}`` dict whose content is the text
        of the page's first ``div.body`` element, prefixed with the source URL.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        IndexError: If the page contains no ``div.body`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of handing an error page to the model as documentation.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.select_one('div.body')
    if body is None:
        # IndexError is what the former ``select(...)[0]`` lookup raised; the
        # type is kept for compatibility, but the message now names the page.
        raise IndexError(f"No 'div.body' element found in the page at {url}")

    text = body.get_text(separator=" ", strip=True)
    return {
        "role": "user",
        "content": f"MongoDB documentation from {url}\n{text}",
    }


# Documentation pages whose text is placed in front of every prompt.
docs = [
    "https://www.mongodb.com/docs/manual/reference/operator/aggregation/queryStats/",
    "https://www.mongodb.com/docs/manual/core/query-shapes/"
]

# Fetch each page (one HTTP request per URL, issued when this cell runs) and
# append the resulting message to the shared context.
for url in docs:
    context_message = webpage_as_context(url)
    base_context.append(context_message)

# Notebook template read by create_notebooks_from_query_stats(); the path is
# relative, so it is resolved against the current working directory.
TEMPLATE_PATH = "mongodb-advisor-template.ipynb"  # Your existing template


def create_notebooks_from_query_stats(ns, queries):
    """Render the notebook template for one namespace and write it to disk.

    The template at ``TEMPLATE_PATH`` must contain a header cell (index 0), a
    setup cell (index 1) and a cell whose source contains ``"Query List"``.
    The two cells right after that marker are the per-query description and
    code templates; they are replaced by one rendered pair per query, in the
    order the queries are given. ``$placeholders`` are filled with
    ``string.Template.safe_substitute``, so unknown placeholders stay as-is.

    Args:
        ns: Mapping with ``cluster_name``, ``database_name`` and
            ``collection_name``; used for substitution in the header/setup
            cells and to build the output path.
        queries: Iterable of mappings used to fill the per-query templates.

    Raises:
        ValueError: If the template has no ``"Query List"`` cell or fewer than
            two cells after it. Previously a missing marker silently consumed
            the header and setup cells instead.
    """
    # Notebooks are UTF-8 JSON; do not depend on the platform default encoding.
    with open(TEMPLATE_PATH, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=nbformat.NO_CONVERT)

    header_cell = nb.cells[0]
    header_cell.source = Template(header_cell.source).safe_substitute(ns)

    setup_cell = nb.cells[1]
    setup_cell.source = Template(setup_cell.source).safe_substitute(ns)

    # If several cells mention "Query List", the last one is the marker
    # (unchanged from the previous behaviour).
    marker_index = None
    for index, cell in enumerate(nb.cells):
        if "Query List" in cell.source:
            marker_index = index
    if marker_index is None:
        raise ValueError(f'No cell containing "Query List" found in {TEMPLATE_PATH}')

    append_index = marker_index + 1
    placeholder_cells = nb.cells[append_index:append_index + 2]
    if len(placeholder_cells) < 2:
        raise ValueError(
            f'{TEMPLATE_PATH} needs a description cell and a code cell after the "Query List" cell'
        )
    query_description_cell = placeholder_cells[0].source
    query_code_cell = placeholder_cells[1].source

    rendered_cells = []
    for query in queries:
        description = Template(query_description_cell).safe_substitute(query)
        code = Template(query_code_cell).safe_substitute(query)
        rendered_cells.append(nbformat.v4.new_markdown_cell(description))
        rendered_cells.append(nbformat.v4.new_code_cell(code))

    # Swap the two placeholder cells for the rendered pairs in a single step so
    # the queries keep their input order (repeated insert() reversed them).
    nb.cells[append_index:append_index + 2] = rendered_cells

    path = f'generated/{ns["cluster_name"]}/{ns["database_name"]}/{ns["collection_name"]}.ipynb'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        nbformat.write(nb, f)


def get_query_description(query_shape):
    """Ask the model for a 3-to-10-word label describing a query shape.

    Args:
        query_shape: Query shape from ``$queryStats`` output; its string form
            is interpolated into the prompt.

    Returns:
        The text of the first content block of the model's reply.
    """
    instruction = {
        "role": "user",
        "content": "Here is an example Query Shape from the $queryStats output. Generate a 3 to 10 word description for the query purpose. Don't wrap in quotes",
    }
    subject = {
        "role": "user",
        "content": f"Query Shape: {query_shape}",
    }
    # A new list is built so the shared documentation context is not modified.
    messages = [*base_context, instruction, subject]

    reply = anthropic_client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=messages,
    )
    first_block = reply.content[0]
    return first_block.text


def get_query_details(query_stats):
    """Ask the model for a short markdown summary of one ``$queryStats`` entry.

    Args:
        query_stats: A ``$queryStats`` output entry; its string form is
            interpolated into the prompt.

    Returns:
        The text of the first content block of the model's reply.
    """
    prompt_messages = [
        {
            "role": "user",
            "content": "Here is an example output from the $queryStats. Generate a short summary for the query purpose and metrics. Use markdown, Use only level 5 headers.",
        },
        {
            "role": "user",
            "content": f"Query stats: {query_stats}",
        },
    ]
    # Concatenation yields a new list; the shared context itself stays as it was.
    messages = base_context + prompt_messages

    reply = anthropic_client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=messages,
    )
    first_block = reply.content[0]
    return first_block.text


def get_find_command(query_shape):
    """Ask the model to write a pymongo ``db.find()`` call for a find query shape.

    The prompt consists of the shared documentation context, an instruction,
    the shape itself (labelled "Query stats:" in the prompt) and one example
    of the desired output format.

    Args:
        query_shape: Query shape of a ``find`` command from ``$queryStats``
            output; its string form is interpolated into the prompt.

    Returns:
        The text of the first content block of the model's reply.
    """
    instruction = "Here is an example Query Shape from the $queryStats output. It is for the find command. Compose a db.find() pymongo command from the Query Shape. Use projection or sorting if needed. Use plaintext, return only the python code. Make the query nicely formatted, as in example."
    # Sample of the expected formatting, spelled out line by line.
    example_output = (
        "Example output:\n"
        "db.find(\n"
        "    {\n"
        '        "genres": {"$in": ["Animation", "Short"]},\n'
        '        "poster": None\n'
        "    },\n"
        '    {"_id": 1, "title": 1}\n'
        ")\n"
    )

    prompt_texts = (instruction, f"Query stats: {query_shape}", example_output)
    # Start from a copy so the shared documentation context is not modified.
    messages = list(base_context)
    for text in prompt_texts:
        messages.append({"role": "user", "content": text})

    reply = anthropic_client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=messages,
    )
    first_block = reply.content[0]
    return first_block.text


In [None]:
# Collect query stats

# The cluster name is the part of the replica set primary's host string that
# precedes "-shard".
# NOTE(review): presumably relies on Atlas-style host names, and on the
# deployment being a replica set ("repl" must be present) — confirm.
server_status = mongodb_client.admin.command("serverStatus")
cluster_name = server_status["repl"]["primary"].split("-shard")[0]

# Query statistics
# https://www.mongodb.com/docs/manual/reference/operator/aggregation/queryStats/
# The result is a list with one document per (database, collection) pair:
#   {"ns": {"cluster_name", "database_name", "collection_name"}, "stats": [...]}
query_stats = list(mongodb_client.get_database("admin").aggregate([
    {"$queryStats": {}},
    # Skip the admin/local/config databases and keep only entries whose
    # metrics.totalExecMicros.sum is above 10000.
    {"$match": {
        "key.queryShape.cmdNs.db": {"$not": {"$in": ["admin", "local", "config"]}},
        "metrics.totalExecMicros.sum": {"$gt": 10000}
    }},
    # Flatten the fields used later; "raw" keeps the complete original entry.
    {"$project": {
        "db": "$key.queryShape.cmdNs.db",
        "coll": "$key.queryShape.cmdNs.coll",
        # "#" followed by the first 6 characters of queryShapeHash, or
        # "#000000" when that field is null or missing.
        "hash": {"$concat": ["#", {"$substr": [{"$ifNull": ["$queryShapeHash", "000000"]}, 0, 6]}]},
        "shape": "$key.queryShape",
        "sum": "$metrics.totalExecMicros.sum",
        "count": "$metrics.execCount",
        "min": "$metrics.totalExecMicros.min",
        "max": "$metrics.totalExecMicros.max",
        "timestamp": "$asOf",
        "raw": "$$ROOT"
    }},
    # Remove the namespace from the shape; it is already available as db/coll.
    {"$project": {
        "shape.cmdNs": 0,
    }},
    # One group per namespace, collecting its entries into a set.
    {"$group": {
        "_id": {"db": "$db", "coll": "$coll"},
        "stats": {"$addToSet": "$$ROOT"}
    }},
    # Final output shape; cluster_name is the Python value computed above.
    {"$project": {
        "_id": 0,
        "ns.cluster_name": cluster_name,
        "ns.database_name": "$_id.db",
        "ns.collection_name": "$_id.coll",
        "stats": "$stats"
    }}
]))


In [None]:
# Create notebooks from a template, based on data from query statistics
# One notebook is written per namespace that has at least one usable query.
# Each kept query triggers two model calls (three for "find" commands).
for qs in query_stats:
    queries = []
    for stats in qs["stats"]:
        shape = stats["shape"]

        # Skip shapes whose string form mentions one of these stage/command names.
        is_internal_command = bool(re.search("collStats|queryStats|indexStats|listSearchIndexes", f'{shape}'))
        if is_internal_command:
            continue

        # Placeholder used for commands other than "aggregate" and "find".
        query_example = "[] #TODO"
        query_example_ai = None

        if shape["command"] == "aggregate":
            query_example = shape["pipeline"]
        elif shape["command"] == "find":
            # Model-written command, plus a plain one built from the filter alone.
            query_example_ai = get_find_command(shape)
            query_example = f'db.find({shape["filter"]})'

        query_description = get_query_description(shape)
        query_details = get_query_details(stats["raw"])

        # The keys below are the values passed to the template's $placeholder substitution.
        queries.append({
            "query_hash": stats["hash"],
            "query_example": query_example,
            # Use the plain example when no (non-empty) model-written one exists.
            "query_example_ai": query_example_ai if query_example_ai else query_example,
            "query_index": "{'_id': 1} #TODO: design an index for this query",
            "query_short_description": query_description,
            "query_details": query_details,
        })

    if len(queries) > 0:
        create_notebooks_from_query_stats(qs["ns"], queries)
        # Debugging aid: uncomment to stop after the first generated notebook.
        # break

