In [16]:
import sys
!{sys.executable} -m pip install pandas pymongo --quiet

import pandas as pd
from pymongo import MongoClient
import json
import time
import warnings
import threading

warnings.filterwarnings('ignore')

def print_mongo(obj):
    """Write a MongoDB response to stdout as 2-space-indented JSON.

    Any value the JSON encoder cannot serialise natively is rendered via ``str``.
    """
    rendered = json.dumps(obj, default=str, indent=2)
    print(rendered)

def get_mongo_router_client(max_retries=5, retry_delay=5):
    """Connect to MongoDB through the mongos router, retrying on failure.

    Each attempt builds a client for ``router1:27017`` and issues a ``ping``
    to confirm the server actually answers. A client whose attempt fails is
    closed before the next try so failed attempts do not pile up open clients.

    Args:
        max_retries: Total number of connection attempts; must be at least 1.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The connected ``MongoClient``.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (previously this
            silently returned ``None``).
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        client = None
        try:
            # Connect to the mongos router on 'router1:27017'
            # NOTE: demo credentials are hardcoded in the URI.
            client = MongoClient('mongodb://admin:admin@router1:27017/businessdb?authSource=admin')
            client.admin.command('ping')
            print("Successfully connected to MongoDB (via router1)")
            return client
        except Exception as e:
            # Release the half-open client before retrying or giving up.
            if client is not None:
                client.close()
            print(f"Connection attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                raise

def get_shard_primary_client(shard_primary_host='shard1-1:27017'):
    """
    Open an authenticated client against a shard node instead of the router,
    so that replica-set commands such as a step-down can be sent to the shard.

    NOTE: Adjust 'shard_primary_host' to match your actual primary node.
    NOTE(review): the URI does not set ``directConnection``; with PyMongo 4+
    the client may treat this host as a seed and discover the rest of the
    replica set rather than pinning to this single node. Confirm that is the
    intended behaviour.

    Args:
        shard_primary_host: ``host:port`` of the shard node to connect to.

    Returns:
        A ``MongoClient`` that has answered a ``ping``.

    Raises:
        Exception: Whatever the ``ping`` raised; the client is closed first.
    """
    # The shard is addressed with the same admin credentials as the router,
    # but the URI points at the shard node itself.
    uri = f'mongodb://admin:admin@{shard_primary_host}/admin?authSource=admin'
    shard_client = MongoClient(uri)
    try:
        # Check connection
        shard_client.admin.command('ping')
    except Exception:
        # Do not leak the client when the node cannot be reached.
        shard_client.close()
        raise
    print(f"Successfully connected to shard primary: {shard_primary_host}")
    return shard_client

In [17]:
# Open the router connection (with the helper's default retry settings) and
# keep a handle to the 'businessdb' database; later cells read both names.
router_client = get_mongo_router_client()
db = router_client.businessdb

Successfully connected to MongoDB (via router1)


In [18]:
# Verify shards: ask the router for its shard list and pretty-print the raw
# reply. Any failure is reported as a single line instead of a traceback.
try:
    shards_info = router_client.admin.command("listShards")
    print("\nSharded cluster details (listShards):")
    print_mongo(shards_info)
except Exception as e:
    print(f"Could not list shards: {e}")


Sharded cluster details (listShards):
{
  "shards": [
    {
      "_id": "shard1rs",
      "host": "shard1rs/shard1-1:27017,shard1-2:27017,shard1-3:27017",
      "state": 1,
      "topologyTime": "Timestamp(1735598398, 5)"
    },
    {
      "_id": "shard2rs",
      "host": "shard2rs/shard2-1:27017,shard2-2:27017,shard2-3:27017",
      "state": 1,
      "topologyTime": "Timestamp(1735598398, 12)"
    },
    {
      "_id": "shard3rs",
      "host": "shard3rs/shard3-1:27017,shard3-2:27017,shard3-3:27017",
      "state": 1,
      "topologyTime": "Timestamp(1735598399, 6)"
    }
  ],
  "ok": 1.0,
  "$clusterTime": {
    "clusterTime": "Timestamp(1735599455, 67)",
    "signature": {
      "hash": "b'5\\nz\\xd5<\\x84\\xac%)\\x87H\\xa2\\x8b*\\xc1\\xe6\\x8c\\xd3\\xa2\\x95'",
      "keyId": 7454338233845940249
    }
  },
  "operationTime": "Timestamp(1735599455, 67)"
}


In [19]:
# Collection handle used by the continuous insert/find worker further down.
organizations = db["organizations"]

In [None]:
def _find_primary_host(hosts):
    """Return the first host in ``hosts`` that reports itself as writable primary, else None."""
    for host in hosts:
        # Adjust user/password if your shard admin user is different
        uri = f"mongodb://admin:admin@{host}/admin?authSource=admin"
        try:
            # directConnection=True pins the client to this one node. Without
            # it, PyMongo 4+ treats the host as a seed, discovers the replica
            # set and routes the command to the current primary, so every
            # reachable host would appear to be the primary.
            # The 'with' block closes the client even when 'hello' raises.
            with MongoClient(uri, serverSelectionTimeoutMS=2000, directConnection=True) as node_client:
                hello_res = node_client.admin.command("hello")
        except Exception:
            # Node down, auth failure or timeout: this host is not a candidate.
            continue
        # 'isWritablePrimary' is the newer field; 'ismaster' is older
        if hello_res.get("isWritablePrimary") or hello_res.get("ismaster"):
            return host
    return None


def get_shard_primaries(router_client):
    """
    Returns a dict mapping:
        shard_id -> primary_host
    by parsing each shard's host string from 'listShards' and asking each
    member directly (via 'hello') whether it is the primary.

    A shard whose primary cannot be determined maps to None. A shard whose
    host string has no 'replicaSetName/' prefix maps to that host string.

    Requires that 'admin:admin' credentials *also* exist on each shard's local admin DB
    (or a cluster-wide user recognized by each shard).

    This is best-effort: if listing or parsing the shards fails, a one-line
    notice is printed and whatever was collected so far is returned.
    """
    shard_primaries = {}
    try:
        shards_data = router_client.admin.command("listShards")
        for shard in shards_data.get("shards", []):
            shard_id = shard["_id"]  # e.g. 'shard1rs'
            host_def = shard["host"] # e.g. 'shard1rs/shard1-1:27017,shard1-2:27017,shard1-3:27017'

            if "/" not in host_def:
                # If there's no slash, might be a single host unreplicated
                shard_primaries[shard_id] = host_def
                continue

            # Split 'replicaSetName/host1,host2,host3'
            _, hosts_part = host_def.split("/", 1)
            shard_primaries[shard_id] = _find_primary_host(hosts_part.split(","))
    except Exception as exc:
        # Stay best-effort for the polling caller, but do not hide the cause.
        print(f"Could not determine shard primaries: {exc}")

    return shard_primaries

In [20]:
# Stop flag for the worker: the loop below re-reads this module-level name on
# every pass, so rebinding it to False from another cell ends the loop.
keep_running = True

def continuous_operations():
    """Insert and read back one demo organization every two seconds until stopped.

    Each pass writes a ``failover-demo-<n>`` document through ``organizations``,
    looks it up again, and prints a timestamped line with the outcome and the
    elapsed time. Errors from the insert/find pair are printed, not raised.
    After that it prints one line listing the primary reported for each shard
    by ``get_shard_primaries(router_client)``. The loop runs while the
    module-level ``keep_running`` flag is truthy.
    """
    iteration = 0
    while keep_running:
        org_id = f"failover-demo-{iteration}"
        payload = {
            "organizationId": org_id,
            "name": f"Failover Org {iteration}",
            "industry": "FailoverTest",
            "founded": 2024,
            "numberOfEmployees": iteration * 10
        }
        began = time.time()
        try:
            # Write, then immediately read the same document back.
            organizations.insert_one(payload)
            hit = organizations.find_one({"organizationId": org_id})
            elapsed = time.time() - began

            report = (
                f"[{time.strftime('%X')}] Iter={iteration}, inserted+found doc={org_id}, elapsed={elapsed:.2f}s"
                if hit else
                f"[{time.strftime('%X')}] Iter={iteration}, could NOT find doc={org_id}, elapsed={elapsed:.2f}s"
            )
            print(report)
        except Exception as exc:
            elapsed = time.time() - began
            print(f"[{time.strftime('%X')}] Iter={iteration}, ERROR: {exc}, elapsed={elapsed:.2f}s")

        # One-line summary of who is primary per shard, refreshed on every pass,
        # e.g. "shard1rs=shard1-1:27017, shard2rs=shard2-1:27017".
        primaries_by_shard = get_shard_primaries(router_client)
        summary = ", ".join(
            f"{shard_id}={primary or 'UNKNOWN'}"
            for shard_id, primary in primaries_by_shard.items()
        )
        print(f"   Shard primaries: [{summary}]")

        iteration += 1
        time.sleep(2)

# Start the worker in a background daemon thread so this cell returns while
# continuous_operations() keeps inserting and reading documents.
worker_thread = threading.Thread(target=continuous_operations, daemon=True)
worker_thread.start()

# Let the worker run for a few seconds (it sleeps 2s per pass) so there is
# baseline output before the failover is triggered in the next cell.
time.sleep(6)
print("\nNow we'll trigger a failover on shard1 by stepping down its primary...")


Now we'll trigger a failover on shard1 by stepping down its primary...


In [None]:
# Resolve shard1's current primary instead of assuming a fixed node, so the
# cell still targets the right member after an earlier failover. Falls back to
# 'shard1-1:27017' when the primary cannot be determined.
shard1_primary_host = get_shard_primaries(router_client).get("shard1rs") or 'shard1-1:27017'

shard_primary_client = None
try:
    shard_primary_client = get_shard_primary_client(shard1_primary_host)
    # The 'replSetStepDown' command steps down the current primary for N seconds
    # forcing a new election in that replica set
    stepdown_result = shard_primary_client.admin.command({
        "replSetStepDown": 30,  # step down for 30s
        "force": True
    })
    print("\nStep down result:")
    print_mongo(stepdown_result)

except Exception as e:
    # Typically, the step-down command might throw an exception even when successful,
    # because the primary is no longer primary right after step-down
    print(f"Caught exception while stepping down primary (often normal): {e}")

finally:
    # The direct shard connection is only needed for the step-down itself;
    # close it so it is not left open for the rest of the session.
    if shard_primary_client is not None:
        shard_primary_client.close()

# We wait a bit to let the replica set elect a new primary
print("\nWaiting 10 seconds for the replica set to elect a new primary...")
time.sleep(10)

Successfully connected to shard primary: shard1-1:27017

Step down result:
{
  "ok": 1.0,
  "lastCommittedOpTime": "Timestamp(1735599500, 1)",
  "$clusterTime": {
    "clusterTime": "Timestamp(1735599508, 42)",
    "signature": {
      "hash": "b'(\\x94\\xc4q\\x9b\\x12}\\xaep\\xf0\\xcc\\xdc\\xbce\\xa0\\xb0h\\xa4\\x19\\xc0'",
      "keyId": 7454338233845940249
    }
  },
  "operationTime": "Timestamp(1735599500, 1)"
}

Waiting 10 seconds for the replica set to elect a new primary...


In [15]:
# Stop the worker thread: clear the flag that continuous_operations() checks
# at the top of each loop pass, then block until the thread has exited.
keep_running = False
worker_thread.join()

[22:56:50] Iter=131, inserted+found doc=failover-demo-131, shards=['shard1rs', 'shard2rs', 'shard3rs'], elapsed=0.01s
[22:56:52] Iter=132, inserted+found doc=failover-demo-132, shards=['shard1rs', 'shard2rs', 'shard3rs'], elapsed=0.01s
[22:56:54] Iter=133, inserted+found doc=failover-demo-133, shards=['shard1rs', 'shard2rs', 'shard3rs'], elapsed=0.01s
[22:56:56] Iter=134, inserted+found doc=failover-demo-134, shards=['shard1rs', 'shard2rs', 'shard3rs'], elapsed=0.01s
[22:56:58] Iter=135, inserted+found doc=failover-demo-135, shards=['shard1rs', 'shard2rs', 'shard3rs'], elapsed=0.01s
