# Data Loading Script for MongoDB Sharded Cluster
This notebook loads CSV data into the MongoDB sharded cluster.

In [1]:
# Install required packages
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

In [2]:
import pandas as pd
from pymongo import MongoClient
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Connect to MongoDB
client = MongoClient('mongodb://admin:admin@router1:27017/businessdb?authSource=admin')
db = client.businessdb

# Test connection with retry logic
max_retries = 5
retry_count = 0

while retry_count < max_retries:
    try:
        client.admin.command('ping')
        print("Successfully connected to MongoDB")
        break
    except Exception as e:
        retry_count += 1
        print(f"Connection attempt {retry_count} failed: {e}")
        if retry_count < max_retries:
            print("Retrying in 5 seconds...")
            time.sleep(5)
        else:
            print("Max retries reached. Please check your connection settings.")
            raise

Successfully connected to MongoDB


In [4]:
# Load Organizations
def load_organizations():
    df = pd.read_csv('organizations-100.csv')
    
    # Transform data
    organizations = df.apply(lambda x: {
        'organizationId': x['Organization Id'],
        'name': x['Name'],
        'website': x['Website'],
        'country': x['Country'],
        'description': x['Description'],
        'founded': int(x['Founded']),
        'industry': x['Industry'],
        'numberOfEmployees': int(x['Number of employees'])
    }, axis=1).tolist()
    
    # Insert data
    result = db.organizations.insert_many(organizations)
    print(f"Inserted {len(result.inserted_ids)} organizations")
    
    # Verify sharding
    sharding_info = client.admin.command('listShards')
    print("\nSharding distribution:")
    print(sharding_info)

In [5]:
# Load People
def load_people():
    df = pd.read_csv('people-100.csv')
    
    # Transform data
    people = df.apply(lambda x: {
        'userId': x['User Id'],
        'firstName': x['First Name'],
        'lastName': x['Last Name'],
        'sex': x['Sex'],
        'email': x['Email'],
        'phone': x['Phone'],
        'dateOfBirth': datetime.strptime(x['Date of birth'], '%Y-%m-%d'),
        'jobTitle': x['Job Title']
    }, axis=1).tolist()
    
    # Insert data
    result = db.people.insert_many(people)
    print(f"Inserted {len(result.inserted_ids)} people")
    
    # Check distribution across shards
    print("\nCollection stats:")
    print(db.command('collStats', 'people'))

In [6]:
# Load Customers
def load_customers():
    df = pd.read_csv('customers-100.csv')
    
    # Transform data
    customers = df.apply(lambda x: {
        'customerId': x['Customer Id'],
        'firstName': x['First Name'],
        'lastName': x['Last Name'],
        'company': x['Company'],
        'city': x['City'],
        'country': x['Country'],
        'phone1': x['Phone 1'],
        'phone2': x['Phone 2'],
        'email': x['Email'],
        'subscriptionDate': datetime.strptime(x['Subscription Date'], '%Y-%m-%d'),
        'website': x['Website']
    }, axis=1).tolist()
    
    # Insert data
    result = db.customers.insert_many(customers)
    print(f"Inserted {len(result.inserted_ids)} customers")
    
    # Check shard distribution
    print("\nShard distribution:")
    print(db.customers.aggregate([{'$collStats': {'storageStats': {}}}]).next())

In [7]:
# Execute data loading
print("Loading organizations...")
load_organizations()

print("\nLoading people...")
load_people()

print("\nLoading customers...")
load_customers()

print("\nData loading complete!")

Loading organizations...
Inserted 100 organizations

Sharding distribution:
{'shards': [{'_id': 'shard1rs', 'host': 'shard1rs/shard1-1:27017,shard1-2:27017,shard1-3:27017', 'state': 1, 'topologyTime': Timestamp(1735589939, 4)}, {'_id': 'shard2rs', 'host': 'shard2rs/shard2-1:27017,shard2-2:27017,shard2-3:27017', 'state': 1, 'topologyTime': Timestamp(1735589939, 10)}, {'_id': 'shard3rs', 'host': 'shard3rs/shard3-1:27017,shard3-2:27017,shard3-3:27017', 'state': 1, 'topologyTime': Timestamp(1735589940, 1)}], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1735591995, 135), 'signature': {'hash': b's\x80C\xbb\xad\x8e\xebX_P\x14\x87r\x89)\x8aN\xf4\xd8%', 'keyId': 7454301898422616089}}, 'operationTime': Timestamp(1735591995, 135)}

Loading people...
Inserted 100 people

Collection stats:
{'sharded': True, 'capped': False, 'wiredTiger': {'metadata': {'formatVersion': 1}, 'creationString': 'access_pattern_hint=none,allocation_size=4KB,app_metadata=(formatVersion=1),assert=(commit_timestamp=

In [8]:
# Verify data distribution across shards
def check_shard_distribution():
    for collection in ['organizations', 'people', 'customers']:
        print(f"\nDistribution for {collection}:")
        stats = db.command('collStats', collection)
        print(f"Total documents: {stats['count']}")
        print("Shards distribution:")
        if 'shards' in stats:
            for shard, info in stats['shards'].items():
                print(f"{shard}: {info['count']} documents")

check_shard_distribution()


Distribution for organizations:
Total documents: 203
Shards distribution:
shard1rs: 142 documents
shard2rs: 30 documents
shard3rs: 31 documents

Distribution for people:
Total documents: 100
Shards distribution:
shard1rs: 36 documents
shard2rs: 34 documents
shard3rs: 30 documents

Distribution for customers:
Total documents: 100
Shards distribution:
shard2rs: 26 documents
shard3rs: 35 documents
shard1rs: 39 documents
