# MongoDB Examples for Sharded Cluster
This notebook contains 30 examples divided into 5 categories:
1. Basic Data Operations (CRUD)
2. Aggregation Framework
3. Sharding and Distribution Analysis
4. Complex Queries and Indexes
5. Performance and Analytics

Each example includes:
- Description of the task
- MongoDB command
- Detailed explanation
- Results analysis

In [None]:
# Install required packages
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

import os
import pprint
import warnings
from datetime import datetime

import pandas as pd
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, PyMongoError, WriteError
warnings.filterwarnings('ignore')

In [None]:
# Connect to MongoDB
# The connection string can be overridden through the MONGODB_URI environment
# variable so credentials do not have to be edited into the notebook; the
# original URI remains the default, so existing setups keep working unchanged.
MONGODB_URI = os.environ.get(
    'MONGODB_URI',
    'mongodb://admin:admin123@router1:27017/businessdb?authSource=admin',
)
client = MongoClient(MONGODB_URI)
db = client.businessdb

pp = pprint.PrettyPrinter(indent=2)

# MongoClient connects lazily: constructing it succeeds even when no server is
# reachable. Run a cheap round-trip first so the message below is only printed
# when a server actually answered (this raises if none is reachable).
client.admin.command('ping')
print("Connected to MongoDB")

## Category 1: Basic Data Operations (CRUD)
Examples demonstrating fundamental database operations across shards.

In [None]:
# Example 1: Insert with Validation
# Task: Insert a new organization with validation rules

new_org = {
    "organizationId": "TEST123",
    "name": "Test Company",
    "industry": "Technology",
    "country": "Czech Republic",
    "founded": 2023,
    "numberOfEmployees": 100
}

# Report each failure mode under its own label instead of calling every
# exception a validation error.
try:
    result = db.organizations.insert_one(new_org)
except DuplicateKeyError as e:
    # Must be listed before WriteError: DuplicateKeyError is a subclass of it.
    # Typically seen when this cell is re-run and a unique index exists.
    print(f"Duplicate key error: {e}")
except WriteError as e:
    # Server error code 121 is DocumentValidationFailure.
    if e.code == 121:
        print(f"Validation error: {e}")
    else:
        print(f"Write error: {e}")
except PyMongoError as e:
    # Connection, authentication and other driver-level failures.
    print(f"Insert failed: {e}")
else:
    print(f"Inserted document with ID: {result.inserted_id}")

# Explanation: This command demonstrates document insertion with schema validation
# The document must match the schema defined in init-collections.js

In [None]:
# Example 1.1: Complex Insert Operation
# Task: Insert multiple organizations with different industries to demonstrate shard distribution

industries = ['Technology', 'Healthcare', 'Finance', 'Education', 'Manufacturing', 'Retail']

# NOTE: enumerate() starts at 0, so the first organization ("ORG_0") is
# inserted with numberOfEmployees == 0.
new_orgs = [
    {
        "organizationId": f"ORG_{i}",
        "name": f"Company {i}",
        "industry": industry,
        "country": "Czech Republic",
        "founded": 2023,
        "numberOfEmployees": i * 100
    } for i, industry in enumerate(industries)
]

result = db.organizations.insert_many(new_orgs)
print(f"Inserted {len(result.inserted_ids)} documents")

# Verify shard distribution
# getShardDistribution() is a mongo shell helper and does not exist in PyMongo
# (attribute access on a Collection yields a sub-collection, and calling it
# raises TypeError). Use the collStats command instead and report the
# per-shard document counts it returns.
stats = db.command('collStats', 'organizations')
pp.pprint({
    shard: info.get('count', 0)
    for shard, info in stats.get('shards', {}).items()
})

In [None]:
# Example 1.2: Bulk Update Operation
# Task: Update all organizations in the Technology industry

# Selector: every organization whose industry is "Technology".
tech_filter = {"industry": "Technology"}

# Modification: tag the document and bump its head-count by 50.
tech_changes = {
    "$set": {"category": "Tech Sector"},
    "$inc": {"numberOfEmployees": 50},
}

result = db.organizations.update_many(tech_filter, tech_changes)
print(f"Modified {result.modified_count} documents")

## Category 2: Aggregation Framework
Complex data analysis using MongoDB's aggregation pipeline.

In [None]:
# Example 2.1: Advanced Aggregation Pipeline
# Task: Analyze employee distribution by industry and country

# Stage 1: one bucket per (industry, country) pair with the employee total,
# the employee average and the list of company names in that bucket.
group_stage = {
    "$group": {
        "_id": {"industry": "$industry", "country": "$country"},
        "totalEmployees": {"$sum": "$numberOfEmployees"},
        "avgEmployees": {"$avg": "$numberOfEmployees"},
        "companies": {"$push": "$name"},
    }
}

# Stage 2: largest buckets first.
sort_stage = {"$sort": {"totalEmployees": -1}}

# Stage 3: keep only the top five buckets.
limit_stage = {"$limit": 5}

pipeline = [group_stage, sort_stage, limit_stage]

results = list(db.organizations.aggregate(pipeline))
pp.pprint(results)

## Category 3: Sharding and Distribution Analysis
Examples focusing on sharding behavior and data distribution.

In [None]:
# Example 3.1: Analyze Document Distribution
# Task: Check how data is distributed across shards

def analyze_sharding():
    """Print the per-shard document distribution of the example collections.

    For each of 'organizations', 'people' and 'customers', runs the
    ``collStats`` command and prints the total document count followed by one
    line per entry of the result's ``shards`` section with that shard's
    document count and its percentage of the total.

    An empty collection is reported with 0.0% shares instead of raising
    ZeroDivisionError, and a missing ``count`` field is treated as 0.
    """
    for collection in ['organizations', 'people', 'customers']:
        print(f"\nAnalyzing {collection}:")
        stats = db.command('collStats', collection)
        total = stats.get('count', 0)
        print(f"Total documents: {total}")
        print("Shard distribution:")
        for shard, info in stats.get('shards', {}).items():
            shard_count = info.get('count', 0)
            # Guard the division: an empty collection has total == 0.
            share = shard_count / total * 100 if total else 0.0
            print(f"{shard}: {shard_count} docs ({share:.1f}%)")

analyze_sharding()

## Category 4: Complex Queries and Indexes
Advanced query operations demonstrating MongoDB's query capabilities.

In [None]:
# Example 4.1: Complex Multi-Collection Query
# Task: Find customers and their related organizations

# Restrict the input to customers located in the Czech Republic.
match_stage = {"$match": {"country": "Czech Republic"}}

# Join each customer to organizations whose name equals the customer's company.
lookup_stage = {
    "$lookup": {
        "from": "organizations",
        "localField": "company",
        "foreignField": "name",
        "as": "organization_info",
    }
}

# Flatten the joined array; customers without a match are kept.
unwind_stage = {
    "$unwind": {
        "path": "$organization_info",
        "preserveNullAndEmptyArrays": True,
    }
}

# Shape the output: full customer name, company and the organization's industry.
project_stage = {
    "$project": {
        "_id": 0,
        "customer": {"$concat": ["$firstName", " ", "$lastName"]},
        "company": "$company",
        "org_industry": "$organization_info.industry",
    }
}

pipeline = [match_stage, lookup_stage, unwind_stage, project_stage]

results = list(db.customers.aggregate(pipeline))
# Show only the first five joined rows.
pp.pprint(results[:5])

## Category 5: Performance and Analytics
Examples focusing on query performance and optimization.

In [None]:
# Example 5.1: Query Performance Analysis
# Task: Analyze query execution across shards

def _shard_details(explanation):
    """Return the per-shard section of a find() explain document, or {} if none."""
    # Through mongos, a find() explain nests the per-shard data inside the
    # execution stages (or, without execution stats, inside the winning plan).
    # The top-level 'shards' key read by the original code is kept as a last
    # resort for other output layouts.
    candidates = (
        explanation.get('executionStats', {}).get('executionStages', {}).get('shards'),
        explanation.get('queryPlanner', {}).get('winningPlan', {}).get('shards'),
        explanation.get('shards'),
    )
    for found in candidates:
        if found:
            return found
    return {}


def analyze_query(query, collection):
    """Explain a find() query and print its execution and per-shard details.

    Args:
        query: Filter document passed to ``find``.
        collection: Name of the collection in ``db`` to run the query against.

    Prints the ``executionStats`` section of the explain output, followed by
    the per-shard section located by ``_shard_details`` (``{}`` when the
    explain output contains no shard information).
    """
    print(f"Analyzing query on {collection}:")
    explanation = db[collection].find(query).explain()
    print("\nExecution stats:")
    pp.pprint(explanation.get('executionStats', {}))
    print("\nShard execution:")
    pp.pprint(_shard_details(explanation))

# Example query
query = {"industry": "Technology", "numberOfEmployees": {"$gt": 1000}}
analyze_query(query, 'organizations')