# Data Loading Script for MongoDB Sharded Cluster
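This notebook loads the organizations, people, and customers CSV files into the `businessdb` database of the MongoDB sharded cluster, then reports how the documents are distributed across shards.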

In [None]:
# Install required packages
# The leading `!` is an IPython shell escape, so this cell only runs inside a
# notebook/IPython session. Invoking pip through `sys.executable` targets the
# Python interpreter that is running this code rather than whichever `pip`
# happens to be first on PATH.
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

In [None]:
import time
import warnings
from datetime import datetime

import pandas as pd
from pymongo import MongoClient
warnings.filterwarnings('ignore')

In [None]:
# Connect to MongoDB
# NOTE(review): the credentials are hard-coded in the URI; consider reading
# them from the environment before sharing this notebook.
# NOTE(review): host `router1` is presumably the cluster's mongos router — confirm.
client = MongoClient('mongodb://admin:admin@router1:27017/businessdb?authSource=admin')
db = client.businessdb

# Test connection with retry logic: ping the server up to `max_retries` times,
# pausing `retry_delay_seconds` between attempts, and re-raise the last error
# once the attempts are exhausted.
max_retries = 5
retry_delay_seconds = 5
retry_count = 0

while retry_count < max_retries:
    try:
        # Only the ping itself can fail; keep the try body minimal.
        client.admin.command('ping')
    except Exception as e:
        retry_count += 1
        print(f"Connection attempt {retry_count} failed: {e}")
        if retry_count < max_retries:
            print(f"Retrying in {retry_delay_seconds} seconds...")
            time.sleep(retry_delay_seconds)
        else:
            print("Max retries reached. Please check your connection settings.")
            raise
    else:
        print("Successfully connected to MongoDB")
        break

In [None]:
# Load Organizations
def load_organizations():
    """Load 'organizations-100000.csv' into the ``organizations`` collection.

    Reads the CSV with pandas, maps every row to a document with camelCase
    keys, bulk-inserts the documents through the module-level ``db`` handle,
    and finally prints the reply of the admin ``listShards`` command.

    A CSV without data rows is reported and skipped, because ``insert_many``
    does not accept an empty document list.

    Raises:
        KeyError: if an expected column is missing from the CSV.
        ValueError: if 'Founded' or 'Number of employees' holds a value
            that ``int()`` cannot convert (for example a missing value).
    """
    df = pd.read_csv('organizations-100000.csv')

    # Transform data. Plain dict records are used instead of a row-wise
    # DataFrame.apply, which builds a pandas Series per row and does not
    # return a list-able result for an empty frame.
    organizations = [
        {
            'organizationId': row['Organization Id'],
            'name': row['Name'],
            'website': row['Website'],
            'country': row['Country'],
            'description': row['Description'],
            'founded': int(row['Founded']),
            'industry': row['Industry'],
            'numberOfEmployees': int(row['Number of employees']),
        }
        for row in df.to_dict('records')
    ]

    # Nothing to insert: bail out before insert_many rejects the empty list.
    if not organizations:
        print("No organizations found in CSV; nothing inserted")
        return

    # Insert data
    result = db.organizations.insert_many(organizations)
    print(f"Inserted {len(result.inserted_ids)} organizations")

    # Print the cluster's reply to listShards.
    # NOTE(review): the label below says "distribution", but the command is
    # listShards — confirm it reports what this cell is meant to show.
    sharding_info = client.admin.command('listShards')
    print("\nSharding distribution:")
    print(sharding_info)

In [None]:
# Load People
def load_people():
    """Load 'people-100000.csv' into the ``people`` collection.

    Reads the CSV with pandas, maps every row to a document with camelCase
    keys (parsing 'Date of birth' as a ``YYYY-MM-DD`` date), bulk-inserts the
    documents through the module-level ``db`` handle, and finally prints the
    reply of the ``collStats`` command for the collection.

    A CSV without data rows is reported and skipped, because ``insert_many``
    does not accept an empty document list.

    Raises:
        KeyError: if an expected column is missing from the CSV.
        ValueError: if a 'Date of birth' string does not match '%Y-%m-%d'.
        TypeError: if a 'Date of birth' value is not a string (for example a
            missing value).
    """
    df = pd.read_csv('people-100000.csv')

    # Transform data. Plain dict records are used instead of a row-wise
    # DataFrame.apply, which builds a pandas Series per row and does not
    # return a list-able result for an empty frame.
    people = [
        {
            'userId': row['User Id'],
            'firstName': row['First Name'],
            'lastName': row['Last Name'],
            'sex': row['Sex'],
            'email': row['Email'],
            'phone': row['Phone'],
            'dateOfBirth': datetime.strptime(row['Date of birth'], '%Y-%m-%d'),
            'jobTitle': row['Job Title'],
        }
        for row in df.to_dict('records')
    ]

    # Nothing to insert: bail out before insert_many rejects the empty list.
    if not people:
        print("No people found in CSV; nothing inserted")
        return

    # Insert data
    result = db.people.insert_many(people)
    print(f"Inserted {len(result.inserted_ids)} people")

    # Print the collStats reply for the collection just loaded.
    print("\nCollection stats:")
    print(db.command('collStats', 'people'))

In [None]:
# Load Customers
def load_customers():
    """Load 'customers-100000.csv' into the ``customers`` collection.

    Reads the CSV with pandas, maps every row to a document with camelCase
    keys (parsing 'Subscription Date' as a ``YYYY-MM-DD`` date), bulk-inserts
    the documents through the module-level ``db`` handle, and finally prints
    the first result of a ``$collStats`` aggregation on the collection.

    A CSV without data rows is reported and skipped, because ``insert_many``
    does not accept an empty document list.

    Raises:
        KeyError: if an expected column is missing from the CSV.
        ValueError: if a 'Subscription Date' string does not match '%Y-%m-%d'.
        TypeError: if a 'Subscription Date' value is not a string (for
            example a missing value).
    """
    df = pd.read_csv('customers-100000.csv')

    # Transform data. Plain dict records are used instead of a row-wise
    # DataFrame.apply, which builds a pandas Series per row and does not
    # return a list-able result for an empty frame.
    customers = [
        {
            'customerId': row['Customer Id'],
            'firstName': row['First Name'],
            'lastName': row['Last Name'],
            'company': row['Company'],
            'city': row['City'],
            'country': row['Country'],
            'phone1': row['Phone 1'],
            'phone2': row['Phone 2'],
            'email': row['Email'],
            'subscriptionDate': datetime.strptime(row['Subscription Date'], '%Y-%m-%d'),
            'website': row['Website'],
        }
        for row in df.to_dict('records')
    ]

    # Nothing to insert: bail out before insert_many rejects the empty list.
    if not customers:
        print("No customers found in CSV; nothing inserted")
        return

    # Insert data
    result = db.customers.insert_many(customers)
    print(f"Inserted {len(result.inserted_ids)} customers")

    # Print the first document returned by the $collStats aggregation stage.
    print("\nShard distribution:")
    print(db.customers.aggregate([{'$collStats': {'storageStats': {}}}]).next())

In [None]:
# Run the three loaders in order, announcing each one before it starts.
for banner, loader in (
    ("Loading organizations...", load_organizations),
    ("\nLoading people...", load_people),
    ("\nLoading customers...", load_customers),
):
    print(banner)
    loader()

print("\nData loading complete!")

In [None]:
# Report how each loaded collection is spread across the shards
def check_shard_distribution():
    """Print total and per-shard document counts for the loaded collections.

    For 'organizations', 'people' and 'customers' this runs ``collStats``
    through the module-level ``db`` handle and prints the overall ``count``.
    Per-shard counts are printed only when the reply has a 'shards' entry.
    """
    collection_names = ('organizations', 'people', 'customers')
    for name in collection_names:
        print(f"\nDistribution for {name}:")
        coll_stats = db.command('collStats', name)
        print(f"Total documents: {coll_stats['count']}")
        print("Shards distribution:")
        # A reply without a 'shards' entry yields no per-shard lines.
        per_shard = coll_stats.get('shards', {})
        for shard_name, shard_stats in per_shard.items():
            print(f"{shard_name}: {shard_stats['count']} documents")


check_shard_distribution()