In [1]:
# Target of the analysis: the Atlas cluster and the database/collection inside it.
cluster_name = "echo"             # matched against the names returned by `atlas clusters list`
database_name = "sample_mflix"    # database that holds the collection
collection_name = "movies"        # collection whose indexes and queries are reviewed

In [2]:
# Init
import json
import os

import requests
from IPython.core.display import Markdown
# https://github.com/anthropics/anthropic-sdk-python
from anthropic import Anthropic
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient

# Load env variables for the MongoDB Atlas CLI, the MongoDB database client and the Anthropic client
load_dotenv()
uri = os.getenv("MONGO_URI")
if not uri:
    # MongoClient(None) would silently fall back to localhost instead of the
    # intended cluster, so fail early with an actionable message.
    raise RuntimeError("MONGO_URI is not set; define it in the environment or in the .env file")

# Create the MongoDB client and point `db` at the target collection
mongodb_client = MongoClient(uri)
namespace = f"{database_name}.{collection_name}"
db = mongodb_client[database_name][collection_name]

# Create Anthropic client
anthropic_client = Anthropic()
base_context = []

## MongoDB Context

In [3]:
# Documentation pages that are downloaded and handed to the model as background context.
docs = [
    # Query optimization and index design
    "https://www.mongodb.com/docs/manual/core/query-optimization/",
    "https://www.mongodb.com/docs/manual/tutorial/equality-sort-range-rule/",
    "https://www.mongodb.com/docs/manual/core/index-partial/",
    # Explain plans
    "https://www.mongodb.com/docs/manual/reference/explain-results/",
    "https://www.mongodb.com/docs/manual/tutorial/analyze-query-plan/",
    "https://www.mongodb.com/docs/manual/tutorial/optimize-query-performance-with-indexes-and-projections/",
    # Performance Advisor and index statistics
    "https://www.mongodb.com/docs/atlas/performance-advisor/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/index-ranking/",
    "https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/",
    "https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/",
]

In [4]:
# Add to context
def webpage_as_context(url, timeout=30):
    """Download a web page and wrap its text as a chat message.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``{"role": "user", "content": ...}`` dict whose content is the page
        text prefixed with the source URL.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of passing an error page to the model as documentation.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Prefer the `div.body` container; if the page has none, fall back to the
    # whole document rather than crashing with an IndexError.
    body = soup.select_one('div.body')
    if body is None:
        body = soup
    text = body.get_text(separator=" ", strip=True)

    return {
        "role": "user",
        "content": f"MongoDB documentation from {url}\n{text}",
    }


# Download every reference page once; the resulting messages prefix each prompt.
base_context = [webpage_as_context(page_url) for page_url in docs]


## Indexes

In [5]:
# Index key patterns from the design; compared below against what the database has.
designed_indexes = [
    {"_id": 1},
    {"type": 1, "imdb.rating": -1, "year": 1},
    {"genres": 1, "poster": 1},
]

In [6]:
# Organize indexes

# Designed indexes, converted to their string form so index sets can be compared as text
designed_indexes = list(map(str, designed_indexes))

# Current indexes in database
# https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/
index_stats = list(db.aggregate([{"$indexStats": {}}]))
current_indexes = [str(stat["key"]) for stat in index_stats]

# Suggested indexes
# API details: https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/
suggested_indexes = set()
suggested_indexes_details = []

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-clusters-list/
clusters_raw = !atlas clusters list -o json
clusters = json.loads(clusters_raw.s)["results"]
cluster = None

for c in clusters:
    if cluster_name == c["name"]:
        cluster = c
        break
cluster_hosts = cluster["connectionStrings"]["standard"].split("/")[2].split(",")

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-performanceAdvisor-suggestedIndexes/
for host in cluster_hosts:
    index_suggestions_raw = !atlas performanceAdvisor suggestedIndexes list --processName {host} -o json
    index_suggestions = json.loads(index_suggestions_raw.s)

    for suggestion in index_suggestions["suggestedIndexes"]:
        if namespace not in suggestion["namespace"]: continue

        index_definition = {}
        for record in suggestion["index"]:
            index_definition.update(record)

        if str(index_definition) in suggested_indexes:
            continue

        suggested_indexes.add(str(index_definition))

        sample_queries = []
        for sample_query in index_suggestions["shapes"]:
            if sample_query["id"] in suggestion["impact"]:
                del sample_query["operations"][0]["stats"]["ts"]
                suggested_indexes_details.append({
                    "index_definition": index_definition,
                    "sample_query": json.dumps(sample_query["operations"][0]["predicates"]),
                    "query_count": sample_query["count"],
                    "stats": json.dumps(sample_query["operations"][0]["stats"])
                })
                # Extract a single query example in index recommendation
                break

# Compose index summary
# Each line starts with an ANSI-coloured status marker (91 = red, 92 = green, 93 = yellow).
summary_parts = ["Designed indexes:\n"]

for designed in designed_indexes:
    # A suggestion takes precedence over "already present", which takes precedence over "missing".
    if designed in suggested_indexes:
        marker = "\033[91m+\033[0m "
    elif designed in current_indexes:
        marker = "\033[92m√\033[0m "
    else:
        marker = "\033[91mx\033[0m "
    summary_parts.append(f"{marker}{designed}\n")

summary_parts.append("\nCurrent indexes:\n")

for existing in current_indexes:
    # "?" flags an index that exists in the database but is neither designed nor suggested.
    if existing in suggested_indexes:
        marker = "\033[92m+\033[0m "
    elif existing in designed_indexes:
        marker = "\033[92m√\033[0m "
    else:
        marker = "\033[93m?\033[0m "
    summary_parts.append(f"{marker}{existing}\n")

summary_parts.append("\nSuggested indexes:\n")
for detail in suggested_indexes_details:
    summary_parts.append(f"\033[92m+\033[0m {detail['index_definition']}\n")
    summary_parts.append(f"    query: {detail['query_count']} x {detail['sample_query']}\n")
    summary_parts.append(f"    stats: {detail['stats']}\n\n")

message = "".join(summary_parts)
print(message)

Designed indexes:
[92m√[0m {'_id': 1}
[91m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
[92m√[0m {'genres': 1, 'poster': 1}

Current indexes:
[92m√[0m {'genres': 1, 'poster': 1}
[93m?[0m {'plot': 1}
[92m√[0m {'_id': 1}

Suggested indexes:
[92m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
    query: 48 x [{"aggregate": [{"$match": {"type": "movie", "year": {"$in": [{"$numberInt": "2010"}, {"$numberInt": "2011"}, {"$numberInt": "2012"}, {"$numberInt": "2013"}, {"$numberInt": "2014"}, {"$numberInt": "2015"}]}}}, {"$sort": {"imdb.rating": {"$numberInt": "-1"}}}, {"$limit": {"$numberInt": "10000"}}]}]
    stats: {"ms": 155, "nReturned": 101, "nScanned": 106745}




## Queries

#### 1. Find movies by type, year, sort by imdb.rating

The index is present in the design but is missing from the database.

Index `{'type': 1, 'imdb.rating': -1, 'year': 1}`

In [7]:
# Movies from 2010 through 2015, highest IMDb rating first, capped at 10,000 documents
pipeline = [
    {"$match": {"type": "movie", "year": {"$in": list(range(2010, 2016))}}},
    {"$sort": {"imdb.rating": -1}},
    {"$limit": 10000},
]
is_aggregation = True

#### 2. Find movies with missing poster by genre

Optimal index is present

Index `{'genres': 1, 'poster': 1}`



In [8]:
# Filter: genre is Animation or Short and `poster` equals None; only _id and title are projected
query_filter = {"genres": {"$in": ["Animation", "Short"]}, "poster": None}
query_projection = {"_id": 1, "title": 1}
query = db.find(query_filter, query_projection)
is_aggregation = False

#### 3. Aggregation with an $or operator
Collection scan - indexes are missing

Indexes: `{'year': 1}`, `{'genres': 1}`


In [9]:
# Either branch of the $or may match: year 2000, or the Animation genre
year_or_genre = {"$or": [{"year": 2000}, {"genres": "Animation"}]}
pipeline = [
    {"$match": year_or_genre},
    {"$project": {"_id": 1, "title": 1}},
]
is_aggregation = True

## Query performance

In [10]:
# Explain query
# https://www.mongodb.com/docs/manual/reference/explain-results/

# find() cursors expose explain() directly; aggregations are explained by wrapping
# the aggregate command in the `explain` database command.
if not is_aggregation:
    explain = query.clone().explain()
else:
    aggregate_command = {
        'aggregate': collection_name,
        'pipeline': pipeline,
        'cursor': {},
    }
    explain = mongodb_client[database_name].command(
        'explain',
        aggregate_command,
        verbosity='allPlansExecution',
    )

# Instruction and layout template that tell the model how to render the plan
format_instruction = "Format a json explain plan using a template. Use markdown format for output. Do not provide text summary in the end "
explain_template = """Template:
### Query Overview
```
nReturned: 7345
executionTimeMillis: 16
totalKeysExamined: 7465
totalDocsExamined: 7345
```

### Indexes Used
```
- genres_1: {genres: 1}
- year_1: {year: 1}
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── FETCH
        └── OR
            ├── IXSCAN (genres_1)
            │   nReturned: 4560
            │   keysExamined: 4560
            │
            └── IXSCAN (year_1)
                nReturned: 2905
                keysExamined: 2905
```"""

# Documentation context first, then the instruction, the template and the raw plan
context = [
    *base_context,
    {"role": "user", "content": format_instruction},
    {"role": "user", "content": explain_template},
    {"role": "user", "content": f"Query explain: {explain}"},
]
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
    messages=context,
)

display(Markdown(response.content[0].text))

### Query Overview
```
nReturned: 7345
executionTimeMillis: 123
totalKeysExamined: 0
totalDocsExamined: 106745
```

### Indexes Used
```
No indexes used - COLLSCAN performed
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── COLLSCAN
        nReturned: 7345
        docsExamined: 106745
        filter: {$or: [
          {genres: {$eq: 'Animation'}},
          {year: {$eq: 2000}}
        ]}
```

In [11]:
# Measure end-to-end (client-side) execution time. `-o` makes %timeit return its
# result object, which is kept in `time` for use in the summary prompt.
# Aggregations and queries are executed differently: a pipeline runs through
# db.aggregate(), while a find() query is re-run from a fresh clone of its cursor.
time = %timeit -o list(db.aggregate(pipeline=pipeline) if is_aggregation else query.clone())

372 ms ± 9.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Generate summary

# Describe the query that was actually explained. `pipeline` is only meaningful for
# aggregations; when a find() query was selected it may be stale or undefined, so
# take the description from the explain output instead.
if is_aggregation:
    evaluated_query = pipeline
else:
    # NOTE(review): relies on the explain output carrying `command`, with
    # `queryPlanner.parsedQuery` as the fallback — confirm for your server version.
    evaluated_query = explain.get("command") or explain.get("queryPlanner", {}).get("parsedQuery")

# Documentation context first, then the task description and all collected evidence
context = base_context.copy()
context.extend([
    {
        "role": "user",
        "content": "Act as a MongoDB advisor. You will be provided with information on indexes, example query that is being evaluated and a query explain plan. Provide a brief summary of explain plan, recommendations for query performance and indexes. Use markdown format for output, escape $ $ character sequence in markdown if it's not in the code block. Include original query in the beginning for a reference",
    },
    {
        "role": "user",
        "content": f"Current indexes: {current_indexes}, designed indexes: {designed_indexes}, suggested indexes: {suggested_indexes_details}",
    },
    {
        "role": "user",
        "content": f"Index stats: {index_stats}",
    },
    {
        "role": "user",
        "content": f"Query: {evaluated_query}",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
    {
        "role": "user",
        "content": f"Execution time on on the client side: {time}",
    }
])
response = anthropic_client.messages.create(
    max_tokens=1024,
    messages=context,
    model="claude-3-5-sonnet-latest",
)

display(Markdown(response.content[0].text))

# Keep the answer in the conversation so follow-up questions can refer to it.
context.extend([
    {
        "role": "assistant",
        "content": response.content[0].text,
    }
])

# Query Analysis and Recommendations

## Original Query
```javascript
[
  {
    '$match': {
      '$or': [
        {'year': 2000},
        {'genres': 'Animation'}
      ]
    }
  },
  {
    '$project': {
      '_id': 1,
      'title': 1
    }
  }
]
```

## Explain Plan Summary
- Collection scan (COLLSCAN) is being used, which is inefficient
- Examined 106,745 documents to return 7,345 results
- Query execution time: 123ms on server side, 372ms on client side
- No indexes were used (totalKeysExamined: 0)
- Query has an OR condition between year and genres fields

## Performance Issues
1. Full collection scan required
2. High number of documents examined vs returned (106,745 / 7,345 = ~14.5 ratio)
3. None of the existing indexes support the query predicates efficiently

## Recommendations

### Index Recommendations
1. Create a compound index to support the query:
```javascript
{
  "genres": 1,
  "year": 1
}
```

2. Alternative approach - create separate indexes:
```javascript
{ "genres": 1 }
{ "year": 1 }
```

### Query Optimization
1. Consider if both conditions in the \$or are necessary - could the business logic be satisfied with a single condition?
2. If possible, try to avoid \$or operations by restructuring the query or splitting into two separate queries

### Index Usage Analysis
- Current `genres_1_poster_1` index is being used frequently (46 ops) but doesn't fully support this query
- `plot_1` index shows no usage (0 ops) - consider removing if not needed
- The suggested index `{type: 1, imdb.rating: -1, year: 1}` appears to be for different query patterns

## Expected Improvements
- With recommended indexes, document examination ratio should drop significantly
- Query execution time should improve by reducing the number of documents scanned
- Server load will decrease due to more efficient query execution

## Additional Notes
- Monitor the new indexes' usage with `\$indexStats` to ensure they are being utilized
- Consider dropping unused indexes like `plot_1` to reduce write overhead
- Evaluate if the designed index `{type: 1, imdb.rating: -1, year: 1}` should be prioritized based on the query patterns in your application

In [13]:
# Follow-up question for the chat; the surrounding newlines are part of the prompt text
follow_up_question = "\nWhy genres_1_poster_1 index was not selected?\n"

In [14]:
# Chat
# Send the question together with the conversation so far, without touching `context`
# yet: if the API call fails, no dangling user turn is left behind to be duplicated
# when the cell is re-run.
question_message = {
    "role": "user",
    "content": follow_up_question,
}
response = anthropic_client.messages.create(
    max_tokens=1024,
    messages=context + [question_message],
    model="claude-3-5-sonnet-latest",
)
answer = response.content[0].text

display(Markdown(answer))

# Record both turns only after a successful exchange so the chat can continue from here.
context.extend([
    question_message,
    {
        "role": "assistant",
        "content": answer,
    }
])

Let me explain why the `genres_1_poster_1` index wasn't selected for this query:

1. **\$or Operator Behavior**:
   - When MongoDB encounters an \$or operator, it needs to satisfy both conditions:
     - `year: 2000`
     - `genres: 'Animation'`
   - The `genres_1_poster_1` index could only partially help with the `genres: 'Animation'` condition
   - The `year` condition would still require a collection scan
   - MongoDB query planner determined that using the index for just one part of the \$or would be less efficient than a single collection scan

2. **Index Coverage**:
   - The existing index `{genres: 1, poster: 1}` includes the `poster` field which is not used in the query
   - While MongoDB can use index prefixes, in this case with the \$or condition and need to check `year` field, the planner opted for collection scan

3. **Cost Calculation**:
   - MongoDB's query planner estimated that:
     - Using the index would require reading index entries for 'Animation' genre
     - Then performing random seeks to check the `year` field
     - Finally merging results from both conditions
   - A single collection scan would be more efficient as it:
     - Reads documents sequentially (better I/O pattern)
     - Can check both conditions in a single pass
     - Avoids the overhead of index lookups and result merging

4. **Better Index Strategy**:
   For this query pattern, either of these would work better:
   ```javascript
   // Option 1: Separate indexes for both fields
   { "genres": 1 }
   { "year": 1 }
   
   // Option 2: Compound index including both fields
   { "genres": 1, "year": 1 }
   ```

These indexes would allow MongoDB to:
- Use index intersection (with separate indexes)
- Or use a single compound index to satisfy both conditions
- Efficiently handle the \$or operation by scanning smaller portions of the data

Remember that the query planner makes decisions based on statistics and estimated costs. In some cases, even when an index exists, it might choose a collection scan if it determines that would be more efficient based on the data distribution and query conditions.