# MongoDB Sharded Cluster Examples

This notebook contains 30 different examples across 5 categories:

1. Basic Data Operations (6 examples)
   - Complex inserts, updates, deletes with sharding
   - Batch operations across shards
   
2. Advanced Aggregation (6 examples)
   - Multi-stage pipelines
   - Group operations
   - Window functions
   
3. Cluster Management (6 examples)
   - Shard balancing analysis
   - Replica set operations
   - Simulated node failures
   
4. Data Analysis and Reporting (6 examples)
   - Complex queries across shards
   - Performance analysis
   - Statistical operations
   
5. System Administration (6 examples)
   - Index management
   - Monitoring operations
   - Backup scenarios

Each example includes:
- A task description
- The code that performs the task
- Result analysis

The first code cell below installs the required packages (`pandas`, `pymongo`) before anything else runs.

In [6]:
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

import json
import os
import pprint
import time
import warnings
from datetime import datetime

import pandas as pd
from pymongo import MongoClient
from pymongo.errors import WriteError
warnings.filterwarnings('ignore')

def print_mongo(obj):
    """Dump a MongoDB result to stdout as indented JSON.

    Values the JSON encoder cannot serialize natively are rendered
    through ``str`` instead of raising.
    """
    rendered = json.dumps(obj, default=str, indent=2)
    print(rendered)

def get_mongo_client(max_retries=5, retry_delay=5, uri=None):
    """Connect to MongoDB, retrying until the server answers a ping.

    Args:
        max_retries: Total number of connection attempts; must be >= 1.
        retry_delay: Seconds to sleep between failed attempts.
        uri: MongoDB connection string. When omitted, the ``MONGODB_URI``
            environment variable is used, falling back to the demo
            cluster's router address.

    Returns:
        A ``MongoClient`` that has successfully answered ``ping``.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: Whatever the final attempt raised, once all attempts
            are exhausted.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function
        # would silently hand back None.
        raise ValueError("max_retries must be at least 1")

    if uri is None:
        # The fallback carries demo-only credentials; set MONGODB_URI (or
        # pass `uri`) for anything that is not the local example cluster.
        uri = os.environ.get(
            'MONGODB_URI',
            'mongodb://admin:admin@router1:27017/businessdb?authSource=admin',
        )

    for attempt in range(max_retries):
        client = None
        try:
            client = MongoClient(uri)
            client.admin.command('ping')
            print("Successfully connected to MongoDB")
            return client
        except Exception as e:
            # Release the half-open client so failed attempts do not leak
            # connections while we retry.
            if client is not None:
                client.close()
            print(f"Connection attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                raise

# Open the shared connection once; every example below works through `db`.
client = get_mongo_client()
db = client['businessdb']

Successfully connected to MongoDB


## Category 1: Basic Data Operations
Advanced CRUD operations demonstrating sharding behavior.

In [7]:
# Example 1: Insert with Validation
# Task: Insert a new organization with validation rules

# Document to insert; the collection's validator decides whether it is accepted.
new_org = {
    "organizationId": "TEST123",
    "name": "Test Company",
    "industry": "Technology",
    "country": "Czech Republic",
    "founded": 2023,
    "numberOfEmployees": 100
}

try:
    result = db.organizations.insert_one(new_org)
    print(f"Inserted document with ID: {result.inserted_id}")
except WriteError as e:
    # Server error code 121 is DocumentValidationFailure. Other write errors
    # (for example a duplicate key) are reported under their own label so the
    # output does not blame the schema for unrelated failures.
    if e.code == 121:
        print(f"Validation error: {e}")
    else:
        print(f"Write error: {e}")
except Exception as e:
    # Anything else (network, authentication, ...) is not a validation issue.
    print(f"Insert failed: {e}")

# Explanation: This command demonstrates document insertion with schema validation
# The document must match the schema defined in init-collections.js

Inserted document with ID: 6771e2c5ffa110f4979adcbf


In [8]:
# Example 1.1: Bulk Insert with Shard Distribution Analysis
# Task: Insert organizations and analyze how data is distributed across shards

def analyze_shard_distribution(collection_name):
    """Print per-shard document counts and data sizes for a collection.

    Runs ``collStats`` against the notebook-level ``db`` and returns the raw
    command result so the caller can inspect it further.
    """
    coll_stats = db.command('collStats', collection_name)
    print(f"\nDistribution for {collection_name}:")
    per_shard = coll_stats.get('shards', {})
    for shard_name in per_shard:
        shard_info = per_shard[shard_name]
        doc_count = shard_info.get('count', 0)
        raw_size = shard_info.get('size', 0)
        # The size is divided down twice by 1024 to report it in MB.
        print(f"Shard {shard_name}: {doc_count} documents, {raw_size/1024/1024:.2f}MB")
    return coll_stats

# Lookup tables the synthetic organizations cycle through.
industries = ['Technology', 'Healthcare', 'Finance', 'Education', 'Manufacturing', 'Retail']
countries = ['USA', 'UK', 'Germany', 'France', 'Japan', 'China']


def _build_test_org(index):
    """Return the synthetic organization document for position ``index``."""
    return {
        "organizationId": f"ORG_{index}",
        "name": f"Company {index}",
        # Industry and country rotate through their lists by position.
        "industry": industries[index % len(industries)],
        "country": countries[index % len(countries)],
        # Founding years cycle over 2000..2022.
        "founded": 2000 + (index % 23),
        "numberOfEmployees": (index + 1) * 100,
    }


# 100 deterministic documents, ORG_0 .. ORG_99.
test_orgs = [_build_test_org(index) for index in range(100)]

# Write the generated organizations in a single batch and report how many landed.
result = db.organizations.insert_many(test_orgs)
inserted_total = len(result.inserted_ids)
print(f"Inserted {inserted_total} documents")

# Summarize the per-shard spread, then dump the full collStats result.
distribution = analyze_shard_distribution('organizations')
print("\nDetailed stats:")
print_mongo(distribution)

Inserted 6 documents


OperationFailure: collection name has invalid type int, full error: {'ok': 0.0, 'errmsg': 'collection name has invalid type int', 'code': 2, 'codeName': 'BadValue', '$clusterTime': {'clusterTime': Timestamp(1735516877, 6), 'signature': {'hash': b'\x86w\x9c\xa4\x9f\xebJ\xe7\xfc\xf0\x00X\x16\x10\xd4n\x8c\xbf\r\xde', 'keyId': 7453987094499688473}}, 'operationTime': Timestamp(1735516877, 6)}

In [None]:
# Example 1.2: Bulk Update Operation
# Task: Update all organizations in the Technology industry

# Select every Technology organization ...
tech_filter = {"industry": "Technology"}
# ... tag it and bump its headcount. Note that `$inc` adds another 50 each
# time this cell is run.
tech_changes = {
    "$set": {"category": "Tech Sector"},
    "$inc": {"numberOfEmployees": 50},
}

result = db.organizations.update_many(tech_filter, tech_changes)
print(f"Modified {result.modified_count} documents")

## Category 2: Aggregation Framework
Complex data analysis using MongoDB's aggregation pipeline.

In [None]:
# Example 2.1: Advanced Aggregation Pipeline
# Task: Analyze employee distribution by industry and country

# Stage 1: one bucket per (industry, country) pair with headcount totals,
# averages and the list of company names.
group_by_industry_country = {
    "$group": {
        "_id": {"industry": "$industry", "country": "$country"},
        "totalEmployees": {"$sum": "$numberOfEmployees"},
        "avgEmployees": {"$avg": "$numberOfEmployees"},
        "companies": {"$push": "$name"},
    }
}
# Stage 2: largest total headcount first.
largest_first = {"$sort": {"totalEmployees": -1}}
# Stage 3: keep only the top five buckets.
top_five = {"$limit": 5}

pipeline = [group_by_industry_country, largest_first, top_five]

# Run the pipeline on `organizations` and materialize the cursor before printing.
org_cursor = db.organizations.aggregate(pipeline)
results = list(org_cursor)
print_mongo(results)

## Category 3: Sharding Analysis
Examples focusing on sharding behavior and data distribution.

In [None]:
# Example 3.1: Analyze Chunk Distribution
# Task: Check how data is distributed across shards

def analyze_sharding(collections=('organizations', 'people', 'customers'), database=None):
    """Print how each collection's documents are spread across shards.

    Args:
        collections: Names of the collections to report on. Defaults to the
            three collections used throughout this notebook.
        database: Object exposing ``command('collStats', name)``. Defaults
            to the notebook-level ``db``.

    For every collection the total document count is printed, followed by
    one line per shard with its count and percentage share. Collections
    with no documents report 0.0% instead of dividing by zero, and missing
    counts are treated as 0.
    """
    # Compare against None explicitly rather than relying on truthiness.
    target_db = db if database is None else database

    for collection in collections:
        print(f"\nAnalyzing {collection}:")
        stats = target_db.command('collStats', collection)
        total = stats.get('count', 0)
        print(f"Total documents: {total}")
        print("Shard distribution:")
        for shard, info in stats.get('shards', {}).items():
            shard_count = info.get('count', 0)
            # An empty collection has no meaningful percentages.
            share = (shard_count / total * 100) if total else 0.0
            print(f"{shard}: {shard_count} docs ({share:.1f}%)")

# Print the shard-distribution report.
analyze_sharding()

## Category 4: Complex Queries
Advanced query operations demonstrating MongoDB's query capabilities.

In [None]:
# Example 4.1: Complex Multi-Collection Query
# Task: Find customers and their related organizations

# Restrict the input documents to one country.
czech_only = {"$match": {"country": "Czech Republic"}}

# Join each document's `company` against organization names.
# NOTE(review): older MongoDB releases reject `$lookup` when the `from`
# collection is sharded; confirm the cluster version supports this.
join_organizations = {
    "$lookup": {
        "from": "organizations",
        "localField": "company",
        "foreignField": "name",
        "as": "organization_info",
    }
}

# Flatten the joined array but keep documents that had no match.
flatten_join = {
    "$unwind": {
        "path": "$organization_info",
        "preserveNullAndEmptyArrays": True,
    }
}

# Shape the output: full name, company, and the matched industry.
shape_output = {
    "$project": {
        "_id": 0,
        "customer": {"$concat": ["$firstName", " ", "$lastName"]},
        "company": "$company",
        "org_industry": "$organization_info.industry",
    }
}

pipeline = [czech_only, join_organizations, flatten_join, shape_output]

# Run the pipeline on `customers`, then print only the first five results.
customer_cursor = db.customers.aggregate(pipeline)
results = list(customer_cursor)
print_mongo(results[:5])

## Category 5: Performance Analytics
Examples focusing on query performance and optimization.

In [None]:
# Example 5.1: Query Performance Analysis
# Task: Analyze query execution across shards

def analyze_query(query, collection, database=None):
    """Print the explain output for ``find(query)`` on a collection.

    Args:
        query: Filter document passed to ``find``.
        collection: Name of the collection to run the query against.
        database: Mapping-like object that yields a collection by name.
            Defaults to the notebook-level ``db``.

    Prints the ``executionStats`` and ``shards`` sections of the explain
    document; an empty dict is shown for a section that is absent.
    """
    # Compare against None explicitly rather than relying on truthiness.
    target_db = db if database is None else database

    print(f"Analyzing query on {collection}:")
    explanation = target_db[collection].find(query).explain()
    print("\nExecution stats:")
    pprint.pprint(explanation.get('executionStats', {}))
    # NOTE(review): depending on the server version, per-shard details may be
    # nested under queryPlanner/executionStats rather than a top-level
    # 'shards' key - confirm against the cluster's actual explain output.
    print("\nShard execution:")
    pprint.pprint(explanation.get('shards', {}))

# Example query: Technology organizations with more than 1000 employees.
query = {
    "industry": "Technology",
    "numberOfEmployees": {"$gt": 1000},
}
analyze_query(query=query, collection='organizations')