In [1]:
# Connect to a specific database
# Cluster name; matched against the "name" field of `atlas clusters list` results
cluster_name = "echo"
# Database and collection targeted by the index checks and queries in this notebook
database_name = "sample_mflix"
collection_name = "movies"

In [2]:
# Init
import json
import os

import requests
from IPython.core.display import Markdown
# https://github.com/anthropics/anthropic-sdk-python
from anthropic import Anthropic
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient

# Load env variables for MongoDB Atlas CLI, MongoDB database client and Anthropic client
load_dotenv()
uri = os.getenv("MONGO_URI")
if not uri:
    # Without a URI MongoClient falls back to its default host (localhost),
    # which would quietly analyse the wrong deployment.
    raise RuntimeError("MONGO_URI is not set; define it in the environment or in the .env file")

# Create MongoDB client, point to a specific database/collection
mongodb_client = MongoClient(uri)
# "<database>.<collection>", used to filter Performance Advisor suggestions
namespace = f"{database_name}.{collection_name}"
db = mongodb_client[database_name][collection_name]

# Create Anthropic client
anthropic_client = Anthropic()
# Messages shared by every prompt; populated from the documentation pages below
base_context = []

## MongoDB Context

In [3]:
# MongoDB documentation pages that are downloaded and sent to the model as context
docs = [
    "https://www.mongodb.com/docs/manual/tutorial/equality-sort-range-rule/",
    "https://www.mongodb.com/docs/manual/reference/explain-results/",
    "https://www.mongodb.com/docs/manual/tutorial/analyze-query-plan/",
    "https://www.mongodb.com/docs/manual/tutorial/optimize-query-performance-with-indexes-and-projections/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/index-ranking/",
    "https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/",
    "https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/"
]

In [4]:
# Add to context
def webpage_as_context(url, timeout=30):
    """Download a documentation page and wrap its text as a user message.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with "role" set to "user" and "content" holding the URL
        followed by the text of the page's first ``div.body`` element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        IndexError: If the page contains no ``div.body`` element.
    """
    # A timeout keeps a stalled connection from hanging the notebook
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx instead of passing an error page to the model as documentation
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    matches = soup.select('div.body')
    if not matches:
        # Same exception type the bare [0] lookup raised, but naming the page
        raise IndexError(f"No 'div.body' element found in {url}")
    text = matches[0].get_text(separator=" ", strip=True)

    return {
        "role": "user",
        "content": f"MongoDB documentation from {url}\n{text}",
    }


# One context message per documentation page, in the order listed in `docs`
base_context = [webpage_as_context(doc_url) for doc_url in docs]


## Indexes

In [5]:
# Index key patterns the collection is expected to have; the next cell compares
# them with the indexes present in the database and the ones Atlas suggests
designed_indexes = [
    {'_id': 1},
    {'_fts': 'text', '_ftsx': 1},
    {'genres': 1},
    {'text': 1},
    {'year': 1},
    {'type': 1, 'imdb.rating': -1, 'year': 1}
]

In [6]:
# Organize indexes

# Designed indexes, turned into strings so they can be compared with the other lists
designed_indexes = list(map(str, designed_indexes))

# Current indexes in database
# https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/
index_stats = list(db.aggregate([{"$indexStats": {}}]))
# Key pattern of every index reported by $indexStats, in string form
current_indexes = [str(stat["key"]) for stat in index_stats]

# Suggested indexes
# API details: https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/
suggested_indexes = set()
suggested_indexes_details = []

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-clusters-list/
clusters_raw = !atlas clusters list -o json
clusters = json.loads(clusters_raw.s)["results"]
cluster = None

for c in clusters:
    if cluster_name == c["name"]:
        cluster = c
        break
cluster_hosts = cluster["connectionStrings"]["standard"].split("/")[2].split(",")

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-performanceAdvisor-suggestedIndexes/
for host in cluster_hosts:
    index_suggestions_raw = !atlas performanceAdvisor suggestedIndexes list --processName {host} -o json
    index_suggestions = json.loads(index_suggestions_raw.s)

    for suggestion in index_suggestions["suggestedIndexes"]:
        if namespace not in suggestion["namespace"]: continue

        index_definition = {}
        for record in suggestion["index"]:
            index_definition.update(record)

        if str(index_definition) in suggested_indexes:
            continue

        suggested_indexes.add(str(index_definition))

        sample_queries = []
        for sample_query in index_suggestions["shapes"]:
            if sample_query["id"] in suggestion["impact"]:
                del sample_query["operations"][0]["stats"]["ts"]
                suggested_indexes_details.append({
                    "index_definition": index_definition,
                    "sample_query": json.dumps(sample_query["operations"][0]["predicates"]),
                    "query_count": sample_query["count"],
                    "stats": json.dumps(sample_query["operations"][0]["stats"])
                })
                # Extract a single query example in index recommendation
                break

# Compose index summary
summary_parts = ["Designed indexes:\n"]

# Designed indexes: "+" when suggested, else "√" when present in the database, else "x"
for idx in designed_indexes:
    if idx in suggested_indexes:
        status = "\033[92m+\033[0m "
    elif idx in current_indexes:
        status = "\033[92m√\033[0m "
    else:
        status = "\033[91mx\033[0m "
    summary_parts.append(f"{status}{idx}\n")

summary_parts.append("\nCurrent indexes:\n")

# Current indexes: "+" when suggested, else "√" when designed, else "?"
for idx in current_indexes:
    if idx in suggested_indexes:
        status = "\033[92m+\033[0m "
    elif idx in designed_indexes:
        status = "\033[92m√\033[0m "
    else:
        status = "\033[93m?\033[0m "
    summary_parts.append(f"{status}{idx}\n")

summary_parts.append("\nSuggested indexes:\n")

# Suggested indexes with the sample query and its stats indented underneath
for si in suggested_indexes_details:
    summary_parts.append(f"\033[92m+\033[0m {si['index_definition']}\n")
    summary_parts.append(f"    query: {si['query_count']} x {si['sample_query']}\n")
    summary_parts.append(f"    stats: {si['stats']}\n\n")

message = "".join(summary_parts)
print(message)

Designed indexes:
[92m√[0m {'_id': 1}
[92m√[0m {'_fts': 'text', '_ftsx': 1}
[92m√[0m {'genres': 1}
[91mx[0m {'text': 1}
[92m√[0m {'year': 1}
[91mx[0m {'type': 1, 'imdb.rating': -1, 'year': 1}

Current indexes:
[92m√[0m {'_id': 1}
[92m√[0m {'genres': 1}
[92m√[0m {'year': 1}
[92m√[0m {'_fts': 'text', '_ftsx': 1}
[93m?[0m {'genres': 1, 'poster': 1}

Suggested indexes:



## Queries

#### 1. Find movies by type, year, sort by imdb.rating
Index `{'type': 1, 'imdb.rating': -1, 'year': 1}`

In [7]:
# Movies released 2010-2015 (inclusive), best IMDb rating first, capped at 10000 results
pipeline = [
    {
        "$match": {
            "type": "movie",
            "year": {"$in": list(range(2010, 2016))},
        }
    },
    {"$sort": {"imdb.rating": -1}},
    {"$limit": 10000},
]

#### 2. Find movies with missing poster by genre

Index `{'genres': 1}`



In [None]:
# Animation or Short titles that have no poster field; only _id and title are returned
pipeline = [
    {"$match": {
        "genres": {"$in": ["Animation", "Short"]},
        # Must be the boolean False: the string "false" is truthy, so it
        # selected documents that DO have a poster
        "poster": {"$exists": False}
    }},
    {"$project": {"_id": 1, "title": 1}}
]

#### 3. Aggregation with an $or operator
Indexes: `{'year': 1}`, `{'genres': 1}`

In [None]:
pipeline = [
    # Either condition is enough: released in 2000, or tagged Animation
    {"$match": {"$or": [{"year": 2000}, {"genres": "Animation"}]}},
    # Return only the identifier and the title
    {"$project": {"_id": 1, "title": 1}},
]

## Query explain

In [8]:
# Explain query
# https://www.mongodb.com/docs/manual/reference/explain-results/
# The aggregate command to be explained, built from the currently defined pipeline
aggregate_command = {
    'aggregate': collection_name,
    'pipeline': pipeline,
    'cursor': {}
}
target_database = mongodb_client.get_database(database_name)
explain = target_database.command(
    'explain',
    aggregate_command,
    verbosity='allPlansExecution'
)

# Prompt pieces: the formatting instruction, a sample of the expected layout,
# and the raw explain output
format_instruction = "Format a json explain plan using a template. Use markdown format for output. Do not provide text summary in the end "
output_template = """Template:
### Query Overview
```
nReturned: 7345
executionTimeMillis: 16
totalKeysExamined: 7465
totalDocsExamined: 7345
```

### Indexes Used
```
- genres_1: {genres: 1}
- year_1: {year: 1}
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── FETCH
        └── OR
            ├── IXSCAN (genres_1)
            │   nReturned: 4560
            │   keysExamined: 4560
            │
            └── IXSCAN (year_1)
                nReturned: 2905
                keysExamined: 2905
```"""

# New list: documentation context first, then the three prompt messages
context = base_context + [
    {"role": "user", "content": format_instruction},
    {"role": "user", "content": output_template},
    {"role": "user", "content": f"Query explain: {explain}"},
]
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    messages=context,
    max_tokens=1024,
)

formatted_plan = response.content[0].text
display(Markdown(formatted_plan))

### Query Overview
```
nReturned: 10000
executionTimeMillis: 111
totalKeysExamined: 26861
totalDocsExamined: 26860
```

### Indexes Used
```
- year_1: {year: 1}
```

### Execution Stages
```
SORT (imdb.rating: -1)
└── FETCH
    └── filter: {type: "movie"}
    └── IXSCAN (year_1)
        indexBounds: {year: [[2010,2010], [2011,2011], [2012,2012], 
                            [2013,2013], [2014,2014], [2015,2015]]}
        nReturned: 26860
        keysExamined: 26861
```

In [9]:
# Measure end-to-end execution time
# -o makes %timeit return its result object, which the summary prompt reads later;
# list() consumes the whole cursor so fetching every result is included in the timing
time = %timeit -o list(db.aggregate(pipeline=pipeline))

1.37 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Generate summary
# Start from the documentation context, then add the task description and every
# piece of evidence collected above as separate user messages
context = base_context.copy()
context.extend([
    {
        "role": "user",
        "content": "Act as a MongoDB advisor. You will be provided with information on indexes, example query that is being evaluated and a query explain plan. Provide a brief summary of explain plan, recommendations for query performance and indexes. Use markdown format for output, escape $ $ character sequence in markdown if it's not in the code block.",
    },
    {
        "role": "user",
        "content": f"Current indexes: {current_indexes}, designed indexes: {designed_indexes}, suggested indexes: {suggested_indexes_details}",
    },
    {
        "role": "user",
        "content": f"Index stats: {index_stats}",
    },
    {
        "role": "user",
        "content": f"Query: {pipeline}",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
    {
        "role": "user",
        # Duplicated word ("on on") removed from the prompt text
        "content": f"Execution time on the client side: {time}",
    }
])
response = anthropic_client.messages.create(
    max_tokens=1024,
    messages=context,
    model="claude-3-5-sonnet-latest",
)

display(Markdown(response.content[0].text))
# Keep the answer in the conversation so follow-up questions can refer to it
context.append({
    "role": "assistant",
    "content": response.content[0].text,
})

# Query Analysis and Recommendations

## Explain Plan Summary
- Query uses IXSCAN on `year_1` index to find movies from 2010-2015
- Performs FETCH to get full documents and filter by `type: "movie"`
- Uses SORT stage for sorting by `imdb.rating` descending
- Examined 26,861 index keys and 26,860 documents
- Returned 10,000 documents
- Total execution time: 111ms on server side, ~1.37s on client side

## Performance Observations
1. Query scans more documents (26,860) than needed (10,000) because filtering on `type: "movie"` happens after index scan
2. In-memory sort required for `imdb.rating` which could be expensive for large result sets
3. Large difference between server (111ms) and client (1.37s) time indicates network transfer overhead

## Recommendations

### Index Recommendations
Create a compound index to better support the query:
```javascript
{ "type": 1, "year": 1, "imdb.rating": -1 }
```

Benefits:
- Supports filtering on both `type` and `year` fields
- Provides sorting order for `imdb.rating`
- Eliminates need for separate sort stage
- Reduces number of documents examined

### Query Optimization
1. Consider reducing limit if you don't need all 10,000 documents
2. Use projection to include only needed fields if full documents aren't required
3. Consider breaking large result sets into smaller batches

### Current Index Usage
Most accessed indexes:
- `genres_1`: 128 ops
- `year_1`: 16 ops
- Other indexes show minimal usage

Consider removing unused indexes like `genres_1_poster_1` (0 ops) to reduce write overhead unless needed for other queries.

## ESR (Equality, Sort, Range) Rule Application
The suggested compound index follows the ESR rule:
- Equality: `type`
- Sort: `imdb.rating`
- Range: `year`

This ordering will provide optimal query performance for this specific query pattern.

In [11]:
# Question sent as the next user turn by the chat cell below; edit and re-run to ask something else
follow_up_question = "Are there indexes that were considered but rejected for this query?"

In [12]:
# Chat
# Add the follow-up question as the next user turn of the running conversation
context.append({
    "role": "user",
    "content": follow_up_question,
})
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    messages=context,
    max_tokens=1024,
)

answer_text = response.content[0].text
display(Markdown(answer_text))

# Store the reply so the next question is asked with the full history
context.append({
    "role": "assistant",
    "content": answer_text,
})

Based on the explain plan output, there were no rejected plans - we can see this from the empty `rejectedPlans` array in the explain output:

```javascript
'rejectedPlans': []
```

This means the query optimizer only considered one plan using the `year_1` index and didn't evaluate any alternative index plans. This can happen for a few reasons:

1. No other existing indexes could provide a viable alternative plan for this query
2. The query planner determined early that other indexes would be less efficient

Looking at the available indexes:
```
Current indexes:
- {'_id': 1}
- {'genres': 1}
- {'year': 1}
- {'_fts': 'text', '_ftsx': 1}
- {'genres': 1, 'poster': 1}
```

None of these alternatives would be better than `year_1` because:
- `_id` index doesn't help with the query conditions
- `genres` indexes aren't relevant to the query predicates
- Text index can only be used for text search queries
- The compound index on `{genres: 1, poster: 1}` doesn't include any of the queried fields

In MongoDB, the query planner uses a process called "index intersection" where it can sometimes use multiple indexes for a single query. However, in this case:
1. The requirement to sort by `imdb.rating`
2. The range condition on `year`
3. The equality match on `type`

These requirements make index intersection less efficient than a single well-designed compound index, which is why I recommended the compound index `{ "type": 1, "year": 1, "imdb.rating": -1 }` in the previous analysis.

For future analysis, you can force MongoDB to consider specific indexes using `hint()` if you want to compare different index strategies. For example:
```javascript
db.movies.explain("executionStats").aggregate([
  {$match: {type: "movie", year: {$in: [2010,2011,2012,2013,2014,2015]}}},
  {$sort: {"imdb.rating": -1}},
  {$limit: 10000}
]).hint("year_1")
```