In [23]:
# Target of the analysis: the Atlas cluster (matched by name against the
# `atlas clusters list` output further down) and the database/collection
# whose indexes and queries are reviewed.
cluster_name = "echo"
database_name = "sample_mflix"
collection_name = "movies"

In [24]:
# Init
import json
import os

import requests
from IPython.core.display import Markdown
# https://github.com/anthropics/anthropic-sdk-python
from anthropic import Anthropic
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient

# Load env variables for MongoDB Atlas CLI, MongoDB database client and Anthropic client
load_dotenv()
uri = os.getenv("MONGO_URI")
if not uri:
    # Without a URI MongoClient falls back to its default host, so the
    # notebook would silently target the wrong server. Fail fast instead.
    raise RuntimeError("MONGO_URI is not set; define it in the environment or in a .env file")

# Create MongoDB client, point to a specific database/collection
mongodb_client = MongoClient(uri)
# "<database>.<collection>", used to filter Performance Advisor suggestions
namespace = database_name + "." + collection_name
# NOTE: despite its name, `db` is the collection handle, not the database
db = mongodb_client[database_name][collection_name]

# Create Anthropic client (no key is passed explicitly, so the SDK's default
# credential lookup applies)
anthropic_client = Anthropic()
# Shared prompt prefix; filled with documentation pages in the "MongoDB Context" section
base_context = []

## MongoDB Context

In [25]:
# Documentation pages that are downloaded and handed to the model as context
# before any question is asked.
docs = [
    "https://www.mongodb.com/docs/manual/core/query-optimization/",
    "https://www.mongodb.com/docs/manual/tutorial/equality-sort-range-rule/",
    "https://www.mongodb.com/docs/manual/core/index-partial/",
    "https://www.mongodb.com/docs/manual/reference/explain-results/",
    "https://www.mongodb.com/docs/manual/tutorial/analyze-query-plan/",
    "https://www.mongodb.com/docs/manual/tutorial/optimize-query-performance-with-indexes-and-projections/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/index-ranking/",
    "https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/",
    "https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/"
]

In [26]:
# Add to context
def webpage_as_context(url, timeout=30):
    """Download a web page and wrap its visible text as a chat message.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``{"role": "user", "content": ...}`` dict whose content is the
        page text prefixed with the source URL.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Do not pass an error page to the model as if it were documentation.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Prefer the main content container; when the page has no `div.body`,
    # use the whole document instead of failing with IndexError.
    body = soup.select_one('div.body')
    if body is None:
        body = soup
    text = body.get_text(separator=" ", strip=True)

    return {
        "role": "user",
        "content": f"MongoDB documentation from {url}\n{text}",
    }


# Fetch each documentation page once; the resulting messages form the shared
# prefix of every prompt built later in the notebook.
base_context = [webpage_as_context(page_url) for page_url in docs]


## Indexes

In [27]:
# Indexes the application design calls for, written as key-pattern documents.
# Field order is significant: the next cell converts each definition with
# str() and compares it to the str() of the keys reported by $indexStats.
designed_indexes = [
    {'_id': 1},
    {'type': 1, 'imdb.rating': -1, 'year': 1},
    {"genres": 1, "poster": 1}
]

In [28]:
# Organize indexes

# Designed indexes
designed_indexes = [str(index_definition) for index_definition in designed_indexes]

# Current indexes in database
# https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/
current_indexes = []
index_stats = list(db.aggregate([{"$indexStats": {}}]))

for idx in index_stats:
    current_indexes.append(str(idx["key"]))

# Suggested indexes
# API details: https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/
suggested_indexes = set()
suggested_indexes_details = []

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-clusters-list/
clusters_raw = !atlas clusters list -o json
clusters = json.loads(clusters_raw.s)["results"]
cluster = None

for c in clusters:
    if cluster_name == c["name"]:
        cluster = c
        break
cluster_hosts = cluster["connectionStrings"]["standard"].split("/")[2].split(",")

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-performanceAdvisor-suggestedIndexes/
for host in cluster_hosts:
    index_suggestions_raw = !atlas performanceAdvisor suggestedIndexes list --processName {host} -o json
    index_suggestions = json.loads(index_suggestions_raw.s)

    for suggestion in index_suggestions["suggestedIndexes"]:
        if namespace not in suggestion["namespace"]: continue

        index_definition = {}
        for record in suggestion["index"]:
            index_definition.update(record)

        if str(index_definition) in suggested_indexes:
            continue

        suggested_indexes.add(str(index_definition))

        sample_queries = []
        for sample_query in index_suggestions["shapes"]:
            if sample_query["id"] in suggestion["impact"]:
                del sample_query["operations"][0]["stats"]["ts"]
                suggested_indexes_details.append({
                    "index_definition": index_definition,
                    "sample_query": json.dumps(sample_query["operations"][0]["predicates"]),
                    "query_count": sample_query["count"],
                    "stats": json.dumps(sample_query["operations"][0]["stats"])
                })
                # Extract a single query example in index recommendation
                break

# Compose index summary
message = "Designed indexes:\n"

for idx in designed_indexes:
    status = "\033[91mx\033[0m "
    if idx in current_indexes:
        status = "\033[92m√\033[0m "
    if idx in suggested_indexes:
        status = "\033[91m+\033[0m "

    message += status + str(idx)
    message += "\n"

message += "\n"
message += "Current indexes:\n"

for idx in current_indexes:
    status = "\033[93m?\033[0m "
    if idx in designed_indexes:
        status = "\033[92m√\033[0m "
    if idx in suggested_indexes:
        status = "\033[92m+\033[0m "

    message += status + str(idx) + "\n"

message += "\n"
message += "Suggested indexes:\n"
for si in suggested_indexes_details:
    message += "\033[92m+\033[0m " + str(si["index_definition"]) + "\n"
    message += "    query: " + str(si["query_count"]) + " x " + str(si["sample_query"]) + "\n"
    message += "    stats: " + str(si["stats"]) + "\n\n"

print(message)

Designed indexes:
[92m√[0m {'_id': 1}
[91m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
[92m√[0m {'genres': 1, 'poster': 1}

Current indexes:
[92m√[0m {'genres': 1, 'poster': 1}
[93m?[0m {'plot': 1}
[92m√[0m {'_id': 1}

Suggested indexes:
[92m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
    query: 48 x [{"aggregate": [{"$match": {"type": "movie", "year": {"$in": [{"$numberInt": "2010"}, {"$numberInt": "2011"}, {"$numberInt": "2012"}, {"$numberInt": "2013"}, {"$numberInt": "2014"}, {"$numberInt": "2015"}]}}}, {"$sort": {"imdb.rating": {"$numberInt": "-1"}}}, {"$limit": {"$numberInt": "10000"}}]}]
    stats: {"ms": 155, "nReturned": 101, "nScanned": 106745}




## Queries

#### 1. Find movies by type, year, sort by imdb.rating

The index is present in the design but is missing from the database

Index `{'type': 1, 'imdb.rating': -1, 'year': 1}`

In [29]:
# Aggregation: documents of type "movie" from the listed years, sorted by
# imdb.rating in descending order and capped at 10000 results.
pipeline = [
    {"$match": {
        "type": "movie",
        "year": {"$in": [2010, 2011, 2012, 2013, 2014, 2015]}
    }},
    {"$sort": {"imdb.rating": -1}},
    {"$limit": 10000}
]
# Tells the "Query performance" cells to explain/time `pipeline` rather than `query`
is_aggregation = True

#### 2. Find movies with missing poster by genre

Optimal index is present

Index `{'genres': 1, 'poster': 1}`



In [30]:
# Cursor over Animation/Short titles with a missing poster; only _id and
# title are projected.
query = db.find(
    filter={"genres": {"$in": ["Animation", "Short"]}, "poster": None},
    projection={"_id": 1, "title": 1},
)
# Route the "Query performance" cells to the cursor branch instead of `pipeline`
is_aggregation = False

#### 3. Aggregation with an $or operator
Collection scan - indexes are missing

Indexes: `{'year': 1}`, `{'genres': 1}`


In [31]:
# Aggregation: documents that match either branch of the $or (year 2000 or
# genre "Animation"), projected down to _id and title.
pipeline = [
    {"$match": {
        "$or": [{"year": 2000}, {"genres": "Animation"}]
    }},
    {"$project": {"_id": 1, "title": 1}}
]
# Tells the "Query performance" cells to explain/time `pipeline` rather than `query`
is_aggregation = True

## Query performance

In [32]:
# Explain query
# https://www.mongodb.com/docs/manual/reference/explain-results/

# A find cursor can explain itself; a pipeline goes through the database-level
# `explain` command wrapped around `aggregate`.
if not is_aggregation:
    explain = query.clone().explain()
else:
    explain = mongodb_client.get_database(database_name).command(
        'explain',
        {'aggregate': collection_name, 'pipeline': pipeline, 'cursor': {}},
        verbosity='allPlansExecution',
    )

# Prompt = documentation prefix + formatting instruction + output template + raw plan
context = [
    *base_context,
    {
        "role": "user",
        "content": "Format a json explain plan using a template. Use markdown format for output. Do not provide text summary in the end ",
    },
    {
        "role": "user",
        "content": """Template:
### Query Overview
```
nReturned: 7345
executionTimeMillis: 16
totalKeysExamined: 7465
totalDocsExamined: 7345
```

### Indexes Used
```
- genres_1: {genres: 1}
- year_1: {year: 1}
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── FETCH
        └── OR
            ├── IXSCAN (genres_1)
            │   nReturned: 4560
            │   keysExamined: 4560
            │
            └── IXSCAN (year_1)
                nReturned: 2905
                keysExamined: 2905
```""",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
]
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
    messages=context,
)

# Render the model's markdown answer as the cell output
display(Markdown(response.content[0].text))

### Query Overview
```
nReturned: 7345
executionTimeMillis: 123
totalKeysExamined: 0
totalDocsExamined: 106745
```

### Indexes Used
```
No indexes used - COLLSCAN performed
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── COLLSCAN
        filter: $or: [
          {genres: {$eq: 'Animation'}},
          {year: {$eq: 2000}}
        ]
        nReturned: 7345
        docsExamined: 106745
        direction: forward
```

In [33]:
# Measure end-to-end (client-side) execution time.
# Aggregations are re-run through db.aggregate(); find queries are re-run from
# a fresh clone of the cursor. list() drains the results on every run.
# `-o` makes %timeit return its result object; it is kept in `time` and later
# embedded in the summary prompt.
time = %timeit -o list(db.aggregate(pipeline=pipeline) if is_aggregation else query.clone())

361 ms ± 8.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
# Generate summary
# Start from the documentation prefix, then add the task description and all
# evidence gathered above: index lists, index statistics, explain plan, timing.
context = base_context.copy()
context.extend([
    {
        "role": "user",
        "content": "Act as a MongoDB advisor. You will be provided with information on indexes, example query that is being evaluated and a query explain plan. Provide a brief summary of explain plan, recommendations for query performance and indexes. Use markdown format for output, escape $ $ character sequence in markdown if it's not in the code block. Include original query in the beginning for a reference",
    },
    {
        "role": "user",
        "content": f"Current indexes: {current_indexes}, designed indexes: {designed_indexes}, suggested indexes: {suggested_indexes_details}",
    },
    {
        "role": "user",
        "content": f"Index stats: {index_stats}",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
    {
        "role": "user",
        # Fixed the duplicated word ("on on") in the prompt text
        "content": f"Execution time on the client side: {time}",
    }
])
response = anthropic_client.messages.create(
    max_tokens=1024,
    messages=context,
    model="claude-3-5-sonnet-latest",
)

display(Markdown(response.content[0].text))
# Keep the answer in the conversation so the chat cell can ask follow-ups
context.append({
    "role": "assistant",
    "content": response.content[0].text,
})

# Query Analysis and Recommendations

## Original Query
```javascript
{
  "aggregate": "movies",
  "pipeline": [
    {
      "$match": {
        "$or": [
          {"year": 2000},
          {"genres": "Animation"}
        ]
      }
    },
    {
      "$project": {
        "_id": 1,
        "title": 1
      }
    }
  ]
}
```

## Explain Plan Summary
- Collection scan (COLLSCAN) was used instead of an index
- Examined 106,745 documents to return 7,345 results
- Execution time: 123ms on server side, 361ms on client side
- No indexes were used for the query
- Using SUBPLAN stage for handling the \$or operation

## Performance Issues
1. Full collection scan required examining all documents
2. High ratio of documents examined (106,745) vs returned (7,345)
3. No suitable index available for the \$or conditions
4. Current indexes don't support the query predicates on `year` and `genres`

## Recommendations

### Index Recommendations
1. Create a compound index for the `genres` field:
```javascript
{ "genres": 1, "year": 1 }
```

2. Consider creating separate indexes if these fields are frequently queried independently:
```javascript
{ "genres": 1 }
{ "year": 1 }
```

### Query Optimization
1. If possible, restructure the query to avoid \$or by splitting into two separate queries and combining results in the application
2. Consider adding a limit if you don't need all results
3. The projection is good as it reduces the amount of data returned

### Current Index Usage
- The existing indexes (`genres_1_poster_1`, `plot_1`, `_id_`) aren't helpful for this query
- The `genres_1_poster_1` index has been used 54 times recently, while `plot_1` hasn't been used at all
- Consider dropping unused indexes like `plot_1` if it's not needed for other queries

### Expected Improvement
With the recommended index on `{genres: 1, year: 1}`, we would expect:
- Reduction in documents examined from 106,745 to approximately 7,345
- Significant improvement in query execution time
- Better query performance especially for larger datasets

In [35]:
# Follow-up question appended to the conversation by the chat cell below;
# edit the text and re-run both cells to ask something else.
follow_up_question = """
Why genres_1_poster_1 index was not selected?
"""

In [36]:
# Chat
# Add the user's turn to the running conversation, ask the model, show the
# answer, then record the answer so later turns see the full history.
context.append({"role": "user", "content": follow_up_question})

response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
    messages=context,
)

display(Markdown(response.content[0].text))

context.append({"role": "assistant", "content": response.content[0].text})

# Index Selection Analysis

## Why `genres_1_poster_1` wasn't selected

1. **Partial Index Coverage**
   - While the index includes `genres` which is one part of the \$or condition, it also includes `poster` which isn't relevant to this query
   - For \$or queries, MongoDB needs an index that can efficiently satisfy each clause of the \$or

2. **\$or Operator Behavior**
   - The query contains two \$or conditions:
     ```javascript
     "$or": [
       {"genres": "Animation"},
       {"year": 2000}
     ]
     ```
   - The `genres_1_poster_1` index can only satisfy the first condition (`genres: "Animation"`)
   - There's no index available for the second condition (`year: 2000`)
   - When MongoDB can't find indexes to support all \$or conditions efficiently, it often falls back to a collection scan

3. **Query Planner Decision**
   - The query planner determined that using the `genres_1_poster_1` index for just one part of the \$or would be less efficient than a simple collection scan
   - This is because:
     - It would need to perform an index scan for `genres` condition
     - Plus a collection scan for the `year` condition
     - Then merge the results
     - This could be more expensive than a single collection scan

## How to Fix

1. **Create a compound index that covers both conditions:**
   ```javascript
   { "genres": 1, "year": 1 }
   ```

2. **Or create separate indexes for both fields:**
   ```javascript
   { "genres": 1 }
   { "year": 1 }
   ```
   - This would allow MongoDB to use index intersection or choose the most selective index for each \$or clause

3. **Alternative Query Approach:**
   - Split the query into two separate queries if possible:
     ```javascript
     const animationResults = db.movies.find({"genres": "Animation"});
     const year2000Results = db.movies.find({"year": 2000});
     ```
   - This would allow better use of indexes and potentially better performance

Remember: MongoDB's query planner makes decisions based on statistics and cost estimates. Sometimes a collection scan is chosen because it's estimated to be more efficient than partial index usage combined with additional operations.