In [157]:
# Target of the analysis: the Atlas cluster plus the database/collection
# whose indexes and queries are inspected in the rest of the notebook.
cluster_name = "echo"  # compared with the "name" field of the `atlas clusters list` results
database_name = "sample_mflix"  # database half of the "<database>.<collection>" namespace
collection_name = "movies"  # collection half of the namespace

In [158]:
# Init
import json
import os

import requests
from IPython.core.display import Markdown
# https://github.com/anthropics/anthropic-sdk-python
from anthropic import Anthropic
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient

# Load env variables for the MongoDB Atlas CLI, the MongoDB database client and the Anthropic client
load_dotenv()
uri = os.getenv("MONGO_URI")
if not uri:
    # MongoClient(None) does not fail: it falls back to its default localhost
    # address, so a missing variable would only surface later as a confusing
    # connection error against the wrong server.
    raise RuntimeError("MONGO_URI is not set; define it in the environment or in a .env file")

# Create MongoDB client, point to a specific database/collection
mongodb_client = MongoClient(uri)
namespace = database_name + "." + collection_name
# NOTE: despite its name, `db` is the collection handle (database -> collection)
db = mongodb_client[database_name][collection_name]

# Create Anthropic client
anthropic_client = Anthropic()
# Messages shared by every conversation; filled with documentation pages later
base_context = []

## MongoDB Context

In [159]:
# MongoDB documentation pages used as background context: each URL is fetched
# and added to `base_context` as one user message by the cell below.
docs = [
    "https://www.mongodb.com/docs/manual/core/query-optimization/",
    "https://www.mongodb.com/docs/manual/tutorial/equality-sort-range-rule/",
    "https://www.mongodb.com/docs/manual/core/index-partial/",
    "https://www.mongodb.com/docs/manual/reference/explain-results/",
    "https://www.mongodb.com/docs/manual/tutorial/analyze-query-plan/",
    "https://www.mongodb.com/docs/manual/tutorial/optimize-query-performance-with-indexes-and-projections/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/index-ranking/",
    "https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/",
    "https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/"
]

In [160]:
# Add to context
def webpage_as_context(url, timeout=30):
    """Download a documentation page and wrap its text as a chat message.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A dict with ``"role": "user"`` and a ``"content"`` string made of the
        source URL followed by the text of the page's ``div.body`` element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        IndexError: If the page contains no ``div.body`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of passing an error page to the model as "documentation".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.select_one('div.body')
    if body is None:
        # Same exception type as the former `select(...)[0]`, but with a message
        # that names the page and the selector that failed.
        raise IndexError(f"No 'div.body' element found in {url}")
    text = body.get_text(separator=" ", strip=True)

    return {
        "role": "user",
        "content": f"MongoDB documentation from {url}\n{text}",
    }


# Fetch every documentation page once; the resulting messages are copied into
# the start of each conversation sent to the model.
base_context = [webpage_as_context(page_url) for page_url in docs]


## Indexes

In [161]:
# Indexes the collection is designed to have, written as index key documents.
# The next cell converts them to strings and compares them with the indexes
# found on the collection and with the suggested indexes.
designed_indexes = [
    {'_id': 1},
    {'type': 1, 'imdb.rating': -1, 'year': 1},
    {"genres": 1, "poster": 1}
]

In [162]:
# Organize indexes

# Designed indexes, converted to their string form so they can be compared
# with the other index lists by simple membership tests
designed_indexes = list(map(str, designed_indexes))

# Current indexes in database
# https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/
index_stats = list(db.aggregate([{"$indexStats": {}}]))
current_indexes = [str(stats_entry["key"]) for stats_entry in index_stats]

# Suggested indexes
# API details: https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/
suggested_indexes = set()
suggested_indexes_details = []

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-clusters-list/
clusters_raw = !atlas clusters list -o json
clusters = json.loads(clusters_raw.s)["results"]
cluster = None

for c in clusters:
    if cluster_name == c["name"]:
        cluster = c
        break
cluster_hosts = cluster["connectionStrings"]["standard"].split("/")[2].split(",")

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-performanceAdvisor-suggestedIndexes/
for host in cluster_hosts:
    index_suggestions_raw = !atlas performanceAdvisor suggestedIndexes list --processName {host} -o json
    index_suggestions = json.loads(index_suggestions_raw.s)

    for suggestion in index_suggestions["suggestedIndexes"]:
        if namespace not in suggestion["namespace"]: continue

        index_definition = {}
        for record in suggestion["index"]:
            index_definition.update(record)

        if str(index_definition) in suggested_indexes:
            continue

        suggested_indexes.add(str(index_definition))

        sample_queries = []
        for sample_query in index_suggestions["shapes"]:
            if sample_query["id"] in suggestion["impact"]:
                del sample_query["operations"][0]["stats"]["ts"]
                suggested_indexes_details.append({
                    "index_definition": index_definition,
                    "sample_query": json.dumps(sample_query["operations"][0]["predicates"]),
                    "query_count": sample_query["count"],
                    "stats": json.dumps(sample_query["operations"][0]["stats"])
                })
                # Extract a single query example in index recommendation
                break

# Compose index summary
# The report is collected line by line and joined once at the end.
summary_lines = ["Designed indexes:"]

for designed in designed_indexes:
    # Precedence: "suggested" overrides "exists", which overrides "missing"
    if designed in suggested_indexes:
        marker = "\033[91m+\033[0m "
    elif designed in current_indexes:
        marker = "\033[92m√\033[0m "
    else:
        marker = "\033[91mx\033[0m "
    summary_lines.append(marker + str(designed))

summary_lines.extend(["", "Current indexes:"])

for existing in current_indexes:
    # Precedence: "suggested" overrides "designed", which overrides "unknown"
    if existing in suggested_indexes:
        marker = "\033[92m+\033[0m "
    elif existing in designed_indexes:
        marker = "\033[92m√\033[0m "
    else:
        marker = "\033[93m?\033[0m "
    summary_lines.append(marker + str(existing))

summary_lines.extend(["", "Suggested indexes:"])

for details in suggested_indexes_details:
    summary_lines.append("\033[92m+\033[0m " + str(details["index_definition"]))
    summary_lines.append(
        "    query: " + str(details["query_count"]) + " x " + str(details["sample_query"])
    )
    summary_lines.append("    stats: " + str(details["stats"]))
    # Blank line after every suggestion
    summary_lines.append("")

message = "\n".join(summary_lines) + "\n"

print(message)

Designed indexes:
[92m√[0m {'_id': 1}
[91m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
[92m√[0m {'genres': 1, 'poster': 1}

Current indexes:
[92m√[0m {'genres': 1, 'poster': 1}
[93m?[0m {'plot': 1}
[92m√[0m {'_id': 1}

Suggested indexes:
[92m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
    query: 48 x [{"aggregate": [{"$match": {"type": "movie", "year": {"$in": [{"$numberInt": "2010"}, {"$numberInt": "2011"}, {"$numberInt": "2012"}, {"$numberInt": "2013"}, {"$numberInt": "2014"}, {"$numberInt": "2015"}]}}}, {"$sort": {"imdb.rating": {"$numberInt": "-1"}}}, {"$limit": {"$numberInt": "10000"}}]}]
    stats: {"ms": 155, "nReturned": 101, "nScanned": 106745}




## Queries

#### 1. Find movies by type, year, sort by imdb.rating
Index `{'type': 1, 'imdb.rating': -1, 'year': 1}`

In [163]:
# Query 1: movies from 2010 through 2015, highest imdb.rating first, at most
# 10000 documents.
# NOTE: every query cell rebinds `pipeline`; the explain and summary cells
# analyse whichever of them was executed last.
pipeline = [
    {"$match": {"type": "movie", "year": {"$in": list(range(2010, 2016))}}},
    {"$sort": {"imdb.rating": -1}},
    {"$limit": 10000},
]

#### 2. Find movies with missing poster by genre

Index `{'genres': 1, 'poster': 1}`



In [164]:
# Query 2: Animation or Short titles filtered on `poster: None`; only _id and
# title are returned.
# NOTE: every query cell rebinds `pipeline`; the explain and summary cells
# analyse whichever of them was executed last.
pipeline = [
    {"$match": {"genres": {"$in": ["Animation", "Short"]}, "poster": None}},
    {"$project": {"_id": 1, "title": 1}},
]

#### 3. Aggregation with an $or operator
Indexes: `{'year': 1}`, `{'genres': 1}`

In [165]:
# Query 3: titles from the year 2000 OR in the Animation genre; only _id and
# title are returned.
# NOTE: this is the last cell that binds `pipeline`, so it is the query that
# gets analysed when the notebook is run from top to bottom.
pipeline = [
    {"$match": {"$or": [{"year": 2000}, {"genres": "Animation"}]}},
    {"$project": {"_id": 1, "title": 1}},
]

## Query performance

In [166]:
# Explain query
# https://www.mongodb.com/docs/manual/reference/explain-results/
# Run the current `pipeline` through the `explain` command with the
# 'allPlansExecution' verbosity.
explain = mongodb_client.get_database(database_name).command(
    "explain",
    {"aggregate": collection_name, "pipeline": pipeline, "cursor": {}},
    verbosity="allPlansExecution",
)

# Conversation: documentation context, then the formatting instruction, the
# output template and the explain document itself.
context = [
    *base_context,
    {
        "role": "user",
        "content": "Format a json explain plan using a template. Use markdown format for output. Do not provide text summary in the end ",
    },
    {
        "role": "user",
        "content": """Template:
### Query Overview
```
nReturned: 7345
executionTimeMillis: 16
totalKeysExamined: 7465
totalDocsExamined: 7345
```

### Indexes Used
```
- genres_1: {genres: 1}
- year_1: {year: 1}
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── FETCH
        └── OR
            ├── IXSCAN (genres_1)
            │   nReturned: 4560
            │   keysExamined: 4560
            │
            └── IXSCAN (year_1)
                nReturned: 2905
                keysExamined: 2905
```""",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
]
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
    messages=context,
)

# Render the model's answer as markdown in the cell output
display(Markdown(response.content[0].text))

### Query Overview
```
nReturned: 7345
executionTimeMillis: 125
totalKeysExamined: 0
totalDocsExamined: 106745
```

### Indexes Used
```
No indexes used - COLLSCAN performed
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── COLLSCAN
        nReturned: 7345
        docsExamined: 106745
        filter: $or: [
          {genres: {$eq: "Animation"}},
          {year: {$eq: 2000}}
        ]
```

In [167]:
# Measure end-to-end execution time
# `-o` makes %timeit return its result object; it is bound to `time` and later
# interpolated into the summary prompt as the client-side execution time.
time = %timeit -o list(db.aggregate(pipeline=pipeline))

357 ms ± 8.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [168]:
# Generate summary
# Conversation: documentation context, then the advisor instructions and the
# facts collected earlier (indexes, index stats, query, explain plan, timing).
context = base_context.copy()
context.extend([
    {
        "role": "user",
        "content": "Act as a MongoDB advisor. You will be provided with information on indexes, example query that is being evaluated and a query explain plan. Provide a brief summary of explain plan, recommendations for query performance and indexes. Use markdown format for output, escape $ $ character sequence in markdown if it's not in the code block. Include original query in the beginning for a reference",
    },
    {
        "role": "user",
        "content": f"Current indexes: {current_indexes}, designed indexes: {designed_indexes}, suggested indexes: {suggested_indexes_details}",
    },
    {
        "role": "user",
        "content": f"Index stats: {index_stats}",
    },
    {
        "role": "user",
        "content": f"Query: {pipeline}",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
    {
        "role": "user",
        # Prompt typo fixed: the original text read "on on the client side"
        "content": f"Execution time on the client side: {time}",
    }
])
response = anthropic_client.messages.create(
    max_tokens=1024,
    messages=context,
    model="claude-3-5-sonnet-latest",
)

# Extract the reply once; it is both displayed and stored in the conversation
summary_text = response.content[0].text
display(Markdown(summary_text))

# Keep the answer in the conversation so follow-up questions can refer to it
context.append({
    "role": "assistant",
    "content": summary_text,
})

Here's my analysis and recommendations:

**Original Query:**
```javascript
[
  {
    '$match': {
      '$or': [
        {'year': 2000}, 
        {'genres': 'Animation'}
      ]
    }
  },
  {
    '$project': {
      '_id': 1,
      'title': 1
    }
  }
]
```

**Explain Plan Summary:**
- Query uses a COLLSCAN (full collection scan)
- Examined 106,745 documents to return 7,345 results
- Execution time: 125ms on server side, 357ms on client side
- No indexes were used despite having indexes on `genres` and `_id`
- The query uses an `$or` condition which makes index selection more complex

**Performance Issues:**
1. Full collection scan is being performed instead of using indexes
2. High number of documents examined vs returned (106,745:7,345 ratio)
3. The `$or` condition makes it challenging to effectively use existing indexes

**Recommendations:**

1. **Create Compound Index:**
```javascript
{
  "year": 1,
  "genres": 1
}
```
This index would support both conditions in the `$or` clause and potentially allow for index intersection.

2. **Alternative Query Structure:**
Consider splitting into two queries and using `$unionWith` if possible:
```javascript
[
  {
    '$facet': {
      'byYear': [{'$match': {'year': 2000}}],
      'byGenre': [{'$match': {'genres': 'Animation'}}]
    }
  },
  {
    '$project': {
      'combined': {
        '$setUnion': ['$byYear', '$byGenre']
      }
    }
  }
]
```

3. **Index Usage:**
- The existing `{'genres': 1, 'poster': 1}` index is not optimal for this query since it includes the unused `poster` field
- Consider removing unused indexes like `{'plot': 1}` which shows 0 operations in index stats

4. **Query Optimization:**
- Add a limit if you don't need all results
- Consider whether both conditions in the `$or` are necessary
- If possible, restructure the query to use `$in` instead of `$or` for better index utilization

The suggested index by the system `{'type': 1, 'imdb.rating': -1, 'year': 1}` appears to be for a different query pattern and won't help with this specific query.

In [169]:
# Follow-up question for the model; the chat cell below appends it to `context`
# as the next user message.
follow_up_question = """
Why genres_1_poster_1 index was not selected?
"""

In [170]:
# Chat
# Send the follow-up question as the next user turn of the running conversation
context.append({"role": "user", "content": follow_up_question})
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
    messages=context,
)

display(Markdown(response.content[0].text))

# Record the reply so that further questions build on it
context.append({"role": "assistant", "content": response.content[0].text})

Let me explain why the `genres_1_poster_1` index wasn't selected:

1. **\$or Operator Behavior**
- When MongoDB processes an `$or` query, it needs to satisfy both conditions: `year: 2000` OR `genres: 'Animation'`
- For the `year: 2000` condition, the `genres_1_poster_1` index is not helpful at all since it doesn't contain the `year` field
- While the index could potentially help with the `genres: 'Animation'` part, MongoDB would still need to do a collection scan for the `year: 2000` part

2. **Cost-Based Decision**
- MongoDB's query optimizer determined that using two different execution strategies (index scan for genres + collection scan for year) and then combining results would be more expensive than:
  - A single collection scan filtering for both conditions
  - This is especially true since the query needs to return 7,345 documents (about 6.9% of the collection)
  - When a high percentage of documents need to be returned, collection scans can be more efficient than index lookups

3. **Index Structure Limitations**
- The `genres_1_poster_1` index includes the `poster` field which isn't used in this query
- While MongoDB can use index prefixes, in this case, the benefit of using just the `genres` prefix for only half of the `$or` conditions wasn't deemed worthwhile by the query optimizer

4. **Query Optimizer Statistics**
- The query planner uses statistics about the data distribution to make decisions
- If the planner estimates that a significant portion of documents will match either condition, it may choose a collection scan as the most efficient approach

To make better use of indexes for this query, you could:

1. Create a single index that covers both conditions:
```javascript
{ "year": 1, "genres": 1 }
```

2. Or split the query into two parts using separate indexes and union the results:
```javascript
// First query using year index
db.movies.find({"year": 2000})
// Second query using genres index
db.movies.find({"genres": "Animation"})
```

3. If you frequently query only by genres, consider creating a simpler index:
```javascript
{ "genres": 1 }
```
This would be more efficient than the current compound index with the unused `poster` field.