In [37]:
# Target selection for the whole notebook.
# cluster_name is compared with the "name" field of each entry returned by
# `atlas clusters list`; database_name and collection_name select the
# collection to analyze and are joined into the "<database>.<collection>"
# namespace used to filter index suggestions.
cluster_name = "echo"
database_name = "sample_mflix"
collection_name = "movies"

In [38]:
# Init
import json
import os

import requests
from IPython.core.display import Markdown
# https://github.com/anthropics/anthropic-sdk-python
from anthropic import Anthropic
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient

# Load env variables for MongoDB Atlas CLI, MongoDB database client and Anthropic client
load_dotenv()
uri = os.getenv("MONGO_URI")
if not uri:
    # MongoClient(None) falls back to localhost:27017, so a missing variable
    # would silently point every later cell at the wrong deployment.
    raise RuntimeError(
        "MONGO_URI is not set; define it in the environment or in a .env file"
    )

# Create MongoDB client, point to a specific database/collection
mongodb_client = MongoClient(uri)
# "<database>.<collection>", used to filter Performance Advisor suggestions
namespace = database_name + "." + collection_name
# NOTE: despite its name, `db` is the collection handle, not the database
db = mongodb_client[database_name][collection_name]

# Create Anthropic client
# (presumably configured from the environment loaded above - confirm against the SDK docs)
anthropic_client = Anthropic()
# Messages shared by every model request; filled in by the context cell
base_context = []

## MongoDB Context

In [39]:
# MongoDB documentation pages that are downloaded and handed to the model as
# background context, one message per page.
docs = [
    "https://www.mongodb.com/docs/manual/core/query-optimization/",
    "https://www.mongodb.com/docs/manual/tutorial/equality-sort-range-rule/",
    "https://www.mongodb.com/docs/manual/core/index-partial/",
    "https://www.mongodb.com/docs/manual/reference/explain-results/",
    "https://www.mongodb.com/docs/manual/tutorial/analyze-query-plan/",
    "https://www.mongodb.com/docs/manual/tutorial/optimize-query-performance-with-indexes-and-projections/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/",
    "https://www.mongodb.com/docs/atlas/performance-advisor/index-ranking/",
    "https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/",
    "https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/"
]

In [40]:
# Add to context
def webpage_as_context(url, timeout=30):
    """Download a documentation page and wrap its text as a chat message.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A dict with "role" set to "user" and "content" holding the page URL
        followed by the visible text of the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never passed to the model as documentation.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Prefer the main content container. When the page layout has no
    # div.body, fall back to <body> (or the whole document) instead of
    # failing with an IndexError.
    body = soup.select_one('div.body')
    if body is None:
        body = soup.body if soup.body is not None else soup
    text = body.get_text(separator=" ", strip=True)

    return {
        "role": "user",
        "content": f"MongoDB documentation from {url}\n{text}",
    }


# Fetch every documentation page once; the resulting messages are the shared
# starting point for each model request.
base_context = [webpage_as_context(page_url) for page_url in docs]


## Indexes

In [41]:
# Key patterns of the indexes the schema design calls for
designed_indexes = [
    {"_id": 1},
    {"type": 1, "imdb.rating": -1, "year": 1},
    {"genres": 1, "poster": 1},
]

In [42]:
# Organize indexes

# Designed indexes
# Stringify each key pattern so the index lists can be compared with plain
# string membership tests.
designed_indexes = list(map(str, designed_indexes))

# Current indexes in database
# https://www.mongodb.com/docs/manual/reference/operator/aggregation/indexStats/
index_stats = list(db.aggregate([{"$indexStats": {}}]))
current_indexes = [str(stat["key"]) for stat in index_stats]

# Suggested indexes
# API details: https://www.mongodb.com/docs/cloud-manager/reference/api/performance-advisor/get-suggested-indexes/
suggested_indexes = set()  # str(key pattern) of every suggestion already recorded
suggested_indexes_details = []  # key pattern + one sample query for each recorded suggestion

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-clusters-list/
clusters_raw = !atlas clusters list -o json
clusters = json.loads(clusters_raw.s)["results"]

# Pick the configured cluster. Fail with a clear message when the name is
# unknown instead of "'NoneType' object is not subscriptable" further down.
cluster = next((c for c in clusters if c["name"] == cluster_name), None)
if cluster is None:
    available = ", ".join(str(c["name"]) for c in clusters)
    raise ValueError(
        f"Cluster '{cluster_name}' not found; available clusters: {available}"
    )

# The third "/"-separated part of the standard connection string is treated
# as a comma-separated host list (i.e. scheme://host1,host2,.../...).
cluster_hosts = cluster["connectionStrings"]["standard"].split("/")[2].split(",")

# CLI https://www.mongodb.com/docs/atlas/cli/current/command/atlas-performanceAdvisor-suggestedIndexes/
for host in cluster_hosts:
    index_suggestions_raw = !atlas performanceAdvisor suggestedIndexes list --processName {host} -o json
    index_suggestions = json.loads(index_suggestions_raw.s)

    for suggestion in index_suggestions["suggestedIndexes"]:
        # NOTE(review): this is a substring test, so a namespace that merely
        # contains ours (e.g. a longer collection name) also passes - confirm
        # whether an exact comparison is intended.
        if namespace not in suggestion["namespace"]:
            continue

        # Merge the records of the suggestion into one key-pattern dict
        # (presumably one field per record - verify against the API response).
        index_definition = {}
        for record in suggestion["index"]:
            index_definition.update(record)

        # The same index can be suggested by several hosts; record it once.
        index_key = str(index_definition)
        if index_key in suggested_indexes:
            continue
        suggested_indexes.add(index_key)

        for sample_query in index_suggestions["shapes"]:
            if sample_query["id"] not in suggestion["impact"]:
                continue

            operation = sample_query["operations"][0]
            # Leave "ts" out of a copy rather than deleting it in place: the
            # same shape may be listed in the impact of several suggestions,
            # and the in-place delete raised KeyError on the second visit.
            stats = {key: value for key, value in operation["stats"].items() if key != "ts"}
            suggested_indexes_details.append({
                "index_definition": index_definition,
                "sample_query": json.dumps(operation["predicates"]),
                "query_count": sample_query["count"],
                "stats": json.dumps(stats)
            })
            # Extract a single query example in index recommendation
            break

# Compose index summary
# Every entry is prefixed with an ANSI-colored marker; the report is collected
# line by line and joined once at the end.
summary_lines = ["Designed indexes:"]

for designed in designed_indexes:
    # A suggestion takes precedence over presence in the database
    if designed in suggested_indexes:
        marker = "\033[91m+\033[0m "
    elif designed in current_indexes:
        marker = "\033[92m√\033[0m "
    else:
        marker = "\033[91mx\033[0m "
    summary_lines.append(marker + str(designed))

summary_lines.append("")
summary_lines.append("Current indexes:")

for existing in current_indexes:
    if existing in suggested_indexes:
        marker = "\033[92m+\033[0m "
    elif existing in designed_indexes:
        marker = "\033[92m√\033[0m "
    else:
        marker = "\033[93m?\033[0m "
    summary_lines.append(marker + str(existing))

summary_lines.append("")
summary_lines.append("Suggested indexes:")

for detail in suggested_indexes_details:
    summary_lines.append("\033[92m+\033[0m " + str(detail["index_definition"]))
    summary_lines.append("    query: " + str(detail["query_count"]) + " x " + str(detail["sample_query"]))
    summary_lines.append("    stats: " + str(detail["stats"]))
    # Blank line between suggestions
    summary_lines.append("")

message = "\n".join(summary_lines) + "\n"
print(message)

Designed indexes:
[92m√[0m {'_id': 1}
[91m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
[92m√[0m {'genres': 1, 'poster': 1}

Current indexes:
[92m√[0m {'genres': 1, 'poster': 1}
[93m?[0m {'plot': 1}
[92m√[0m {'_id': 1}

Suggested indexes:
[92m+[0m {'type': 1, 'imdb.rating': -1, 'year': 1}
    query: 48 x [{"aggregate": [{"$match": {"type": "movie", "year": {"$in": [{"$numberInt": "2010"}, {"$numberInt": "2011"}, {"$numberInt": "2012"}, {"$numberInt": "2013"}, {"$numberInt": "2014"}, {"$numberInt": "2015"}]}}}, {"$sort": {"imdb.rating": {"$numberInt": "-1"}}}, {"$limit": {"$numberInt": "10000"}}]}]
    stats: {"ms": 155, "nReturned": 101, "nScanned": 106745}




## Queries

#### 1. Find movies by type, year, sort by imdb.rating

Index is present in the design but is missing from the database

Index `{'type': 1, 'imdb.rating': -1, 'year': 1}`

In [43]:
# Query 1: movies from the years 2010-2015, sorted by imdb.rating descending,
# limited to 10000 results.
# Each query cell assigns `pipeline`; the explain/timing cells use whichever
# assignment ran last.
pipeline = [
    {"$match": {"type": "movie", "year": {"$in": list(range(2010, 2016))}}},
    {"$sort": {"imdb.rating": -1}},
    {"$limit": 10000},
]

#### 2. Find movies with missing poster by genre

Optimal index is present

Index `{'genres': 1, 'poster': 1}`



In [44]:
# Query 2: Animation or Short titles whose poster is None, returning only
# _id and title.
# Each query cell assigns `pipeline`; the explain/timing cells use whichever
# assignment ran last.
missing_poster_match = {
    "$match": {
        "genres": {"$in": ["Animation", "Short"]},
        "poster": None,
    }
}
id_and_title = {"$project": {"_id": 1, "title": 1}}
pipeline = [missing_poster_match, id_and_title]

#### 3. Aggregation with an $or operator
Collection scan - indexes are missing

Indexes: `{'year': 1}`, `{'genres': 1}`

In [45]:
# Query 3: titles from the year 2000 or in the Animation genre, returning only
# _id and title.
# Each query cell assigns `pipeline`; the explain/timing cells use whichever
# assignment ran last.
year_or_genre_match = {
    "$match": {"$or": [{"year": 2000}, {"genres": "Animation"}]}
}
id_and_title = {"$project": {"_id": 1, "title": 1}}
pipeline = [year_or_genre_match, id_and_title]

## Query performance

In [46]:
# Explain query
# https://www.mongodb.com/docs/manual/reference/explain-results/
explain_target = {
    'aggregate': collection_name,
    'pipeline': pipeline,
    'cursor': {}
}
explain = mongodb_client.get_database(database_name).command(
    'explain',
    explain_target,
    verbosity='allPlansExecution'
)

# Instruction and layout example the model should follow when rendering the plan
format_instruction = "Format a json explain plan using a template. Use markdown format for output. Do not provide text summary in the end "
explain_template = """Template:
### Query Overview
```
nReturned: 7345
executionTimeMillis: 16
totalKeysExamined: 7465
totalDocsExamined: 7345
```

### Indexes Used
```
- genres_1: {genres: 1}
- year_1: {year: 1}
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── FETCH
        └── OR
            ├── IXSCAN (genres_1)
            │   nReturned: 4560
            │   keysExamined: 4560
            │
            └── IXSCAN (year_1)
                nReturned: 2905
                keysExamined: 2905
```"""

# New list: documentation context first, then the request-specific messages
context = [
    *base_context,
    {"role": "user", "content": format_instruction},
    {"role": "user", "content": explain_template},
    {"role": "user", "content": f"Query explain: {explain}"},
]
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    messages=context,
    max_tokens=1024,
)

# Render the first content block of the reply as markdown
display(Markdown(response.content[0].text))

### Query Overview
```
nReturned: 7345
executionTimeMillis: 123
totalKeysExamined: 0
totalDocsExamined: 106745
```

### Indexes Used
```
None - Collection Scan used
```

### Execution Stages
```
SUBPLAN
└── PROJECTION_SIMPLE
    └── COLLSCAN
        filter: {$or: [{genres: {$eq: 'Animation'}}, {year: {$eq: 2000}}]}
        nReturned: 7345
        docsExamined: 106745
        direction: forward
```

In [47]:
# Measure end-to-end execution time
# list() consumes everything the aggregation returns, so retrieving the
# results is part of the measurement; -o hands the timing result back so it
# can be stored in `time` and quoted in the summary prompt.
time = %timeit -o list(db.aggregate(pipeline=pipeline))

369 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
# Generate summary
# Start from the documentation context, then add the advisor instructions and
# everything gathered so far: index inventory, index usage stats, the explain
# plan and the client-side timing.
context = base_context.copy()
context.extend([
    {
        "role": "user",
        "content": "Act as a MongoDB advisor. You will be provided with information on indexes, example query that is being evaluated and a query explain plan. Provide a brief summary of explain plan, recommendations for query performance and indexes. Use markdown format for output, escape $ $ character sequence in markdown if it's not in the code block. Include original query in the beginning for a reference",
    },
    {
        "role": "user",
        "content": f"Current indexes: {current_indexes}, designed indexes: {designed_indexes}, suggested indexes: {suggested_indexes_details}",
    },
    {
        "role": "user",
        "content": f"Index stats: {index_stats}",
    },
    {
        "role": "user",
        "content": f"Query explain: {explain}",
    },
    {
        "role": "user",
        # Duplicated word ("on on") removed from the prompt
        "content": f"Execution time on the client side: {time}",
    }
])
response = anthropic_client.messages.create(
    max_tokens=1024,
    messages=context,
    model="claude-3-5-sonnet-latest",
)

display(Markdown(response.content[0].text))

# Keep the model's answer in the conversation so follow-up questions can
# build on it.
context.append({
    "role": "assistant",
    "content": response.content[0].text,
})

# Query Analysis and Recommendations

## Original Query
```javascript
db.movies.aggregate([
  {
    $match: {
      $or: [
        { year: 2000 },
        { genres: "Animation" }
      ]
    }
  },
  {
    $project: {
      _id: 1,
      title: 1
    }
  }
])
```

## Explain Plan Summary
- Query performs a COLLSCAN (full collection scan)
- Examined 106,745 documents to return 7,345 results
- Execution time: 123ms on server side, 369ms on client side
- No indexes were used despite having indexes on `genres` and other fields

## Performance Issues
1. Full collection scan is being performed instead of using indexes
2. High number of documents examined (106,745) compared to documents returned (7,345)
3. \$or operation without proper index coverage

## Recommendations

### Index Recommendations
1. Create a compound index to support both conditions in the \$or query:
```javascript
db.movies.createIndex({ "genres": 1, "year": 1 })
```

2. Consider creating separate indexes if these fields are commonly queried independently:
```javascript
db.movies.createIndex({ "genres": 1 })
db.movies.createIndex({ "year": 1 })
```

### Query Optimization
1. If possible, restructure the query to avoid \$or by doing two separate queries and combining results:
```javascript
db.movies.aggregate([
  {
    $match: { 
      $or: [
        { year: 2000 },
        { genres: "Animation" }
      ]
    },
    hint: { "genres": 1, "year": 1 }  // Force index usage
  },
  {
    $project: {
      _id: 1,
      title: 1
    }
  }
])
```

2. Consider adding a limit if you don't need all results

### Current Index Usage
- The existing `{ genres: 1, poster: 1 }` index has been used 54 times but isn't optimal for this query
- The `plot` index hasn't been used at all (0 ops) - consider dropping if not needed
- The `_id` index has minimal usage (9 ops)

Implementation of these recommendations should significantly reduce the number of documents examined and improve query performance by eliminating the full collection scan.

In [49]:
# Follow-up question for the chat cell, which appends it to the conversation
# held in `context` as a user message.
follow_up_question = """
Why genres_1_poster_1 index was not selected?
"""

In [50]:
# Chat
# Ask the follow-up question within the running conversation
context.append({"role": "user", "content": follow_up_question})
response = anthropic_client.messages.create(
    model="claude-3-5-sonnet-latest",
    messages=context,
    max_tokens=1024,
)

answer = response.content[0].text
display(Markdown(answer))

# Record the reply so the next question sees the whole exchange
context.append({"role": "assistant", "content": answer})

# Index Selection Analysis

The `genres_1_poster_1` index wasn't selected for several reasons:

1. **Partial Index Coverage**
   - While the index includes `genres` which is one part of the \$or condition, it doesn't help with the `year` condition
   - The second field `poster` is not relevant to this query
   - MongoDB query planner determined that using this index would require:
     * An index scan for the `genres: "Animation"` condition
     * A collection scan for the `year: 2000` condition
     * Merging results from both operations

2. **Query Planner Decision**
   - When dealing with \$or conditions, MongoDB needs indexes that can efficiently serve all conditions
   - In this case, the planner decided that a single collection scan would be more efficient than:
     * Using the index for one condition
     * Performing a collection scan for the other condition
     * Merging the results
   - This is known as the "index intersection penalty" for \$or queries

3. **Cost Calculation**
   - The query planner estimated that:
     * Cost of single COLLSCAN: examine all 106,745 documents once
     * Cost of index + COLLSCAN + merge: examine index entries + examine documents for year condition + merge overhead
   - Selected the simpler plan (COLLSCAN) as it was deemed more efficient

## Better Index Strategy
```javascript
// Option 1: Compound index covering both conditions
db.movies.createIndex({ "genres": 1, "year": 1 })

// Option 2: Separate indexes if these fields are queried independently
db.movies.createIndex({ "genres": 1 })
db.movies.createIndex({ "year": 1 })
```

With these indexes, the query planner would have better options for executing the \$or query, potentially using index intersection or separate index scans for each condition.