# Nemotron Nano 9B v2 NIM Setup and Inference

This notebook will guide you through:
1. Setting up your NGC API key
2. Logging into the NVIDIA Container Registry
3. Pulling and running the Nemotron Nano 9B v2 NIM container
4. Performing inference against the model


## Step 1: Enter Your NGC API Key

Get your API key from: https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/deploy


In [None]:
import getpass
import os
import subprocess
import time
import requests
import json

# Prompt for the API key without echoing it to the notebook output.
# Pasted keys frequently carry a trailing newline or spaces, so strip them
# before the value is used anywhere else.
NGC_API_KEY = getpass.getpass("Enter your NGC API Key: ").strip()

# Fail here, next to the prompt, instead of letting an empty key surface
# later as an unrelated-looking docker login error.
if not NGC_API_KEY:
    raise ValueError("NGC API key is empty - re-run this cell and paste your key.")

# Export the key into this process's environment so child processes started
# by later cells inherit it.
os.environ['NGC_API_KEY'] = NGC_API_KEY

print("✓ API Key saved to environment")


## Step 2: Login to NVIDIA Container Registry

This will authenticate Docker with the NVIDIA Container Registry (nvcr.io)


In [None]:
# Authenticate the local Docker client against nvcr.io.
print("Logging into nvcr.io...")

# The command is given as an argument list (no shell), so '$oauthtoken' is sent
# literally as the username. The API key is written to docker's stdin
# (--password-stdin) rather than placed on the command line.
login_process = subprocess.run(
    ['docker', 'login', 'nvcr.io', '-u', '$oauthtoken', '--password-stdin'],
    input=NGC_API_KEY,
    capture_output=True,
    text=True,
)
stdout, stderr = login_process.stdout, login_process.stderr

# Report the outcome based on docker's exit status.
if login_process.returncode != 0:
    print("✗ Login failed")
    print(stderr)
else:
    print("✓ Successfully logged into nvcr.io")
    print(stdout)


## Step 3: Setup Local Cache and Pull Container

This creates a local cache directory and pulls the NIM container image.


In [None]:
# Host-side cache directory; "~" is expanded to the current user's home.
# exist_ok=True makes re-running this cell harmless.
LOCAL_NIM_CACHE = os.path.expanduser("~/.cache/nim")
os.makedirs(LOCAL_NIM_CACHE, exist_ok=True)
print(f"✓ Created cache directory: {LOCAL_NIM_CACHE}")

# Pull the NIM image. Output is captured, so nothing is shown until docker exits.
print("\nPulling container image (this may take several minutes)...")
pull_command = ['docker', 'pull', 'nvcr.io/nim/nvidia/nvidia-nemotron-nano-9b-v2:latest']
pull_result = subprocess.run(pull_command, capture_output=True, text=True)

# Only the error stream is printed, and only when the pull fails.
if pull_result.returncode != 0:
    print("✗ Failed to pull container")
    print(pull_result.stderr)
else:
    print("✓ Container image pulled successfully")


## Step 4: Run the NIM Container

**Note:** This will start the container in the background. The model may take a few minutes to load and be ready for inference.

**Important:** Make sure you have:
- NVIDIA GPU with appropriate drivers
- Docker with GPU support (NVIDIA Container Toolkit, formerly `nvidia-docker`)
- Sufficient disk space (~20GB+)


In [None]:
# Image this cell looks for and, if necessary, launches.
nim_image = 'nvcr.io/nim/nvidia/nvidia-nemotron-nano-9b-v2:latest'

# The container is started with the current user's numeric ID (-u below).
# NOTE(review): presumably so files written to the mounted cache are owned by
# this user - confirm against the NIM documentation.
user_id = os.getuid()

# Ask docker for running containers created from the image; -q prints IDs only.
check_container = subprocess.run(
    ['docker', 'ps', '--filter', f'ancestor={nim_image}', '-q'],
    capture_output=True,
    text=True
)
running_ids = check_container.stdout.split()

if running_ids:
    # Reuse the first match instead of starting a second container on port 8000.
    print("⚠ Container is already running")
    container_id = running_ids[0]
    print(f"Container ID: {container_id}")
else:
    print("Starting NIM container...")
    print("This will run in detached mode. Monitor logs with: docker logs -f <container_id>")

    # '-e NGC_API_KEY' (name only, no '=value') tells docker to copy the variable
    # from its own environment. The key is supplied through `env=` so the secret
    # never appears in the docker command line, where any local user could read
    # it from the process list.
    run_result = subprocess.run(
        [
            'docker', 'run', '-d',
            '--gpus', 'all',
            '--shm-size=16GB',
            '-e', 'NGC_API_KEY',
            '-v', f'{LOCAL_NIM_CACHE}:/opt/nim/.cache',
            '-u', str(user_id),
            '-p', '8000:8000',
            nim_image
        ],
        capture_output=True,
        text=True,
        env={**os.environ, 'NGC_API_KEY': NGC_API_KEY}
    )

    if run_result.returncode == 0:
        # In detached mode docker prints the new container's ID on stdout.
        container_id = run_result.stdout.strip()
        print("✓ Container started successfully")
        print(f"Container ID: {container_id}")
        print("\nWaiting for model to load (this may take 2-5 minutes)...")
    else:
        print("✗ Failed to start container")
        print(run_result.stderr)
        # Later cells test container_id for truthiness before using it.
        container_id = None


## Step 5: Wait for Model to be Ready

This cell will poll the health endpoint until the model is ready to accept requests.


In [None]:
import time

# Wait for the service to be ready.
# The wait is bounded by wall-clock time rather than by attempt count, because
# each attempt can itself block for up to the 5 s request timeout.
max_retries = 60  # together with retry_interval: 5 minutes max
retry_interval = 5  # seconds to sleep between polls
max_wait_seconds = max_retries * retry_interval
progress_interval = 30  # seconds between "Waiting..." messages

print("Checking if service is ready...")
started = time.monotonic()
next_progress = 0.0
service_ready = False

while True:
    try:
        response = requests.get('http://localhost:8000/v1/health/ready', timeout=5)
        service_ready = response.status_code == 200
    except requests.exceptions.RequestException:
        # Refused connections and timeouts are expected while the server starts.
        service_ready = False

    elapsed = time.monotonic() - started
    if service_ready or elapsed >= max_wait_seconds:
        break

    if elapsed >= next_progress:
        print(f"Waiting... ({int(elapsed)}s elapsed)")
        next_progress = elapsed + progress_interval

    time.sleep(retry_interval)

if service_ready:
    print(f"\n✓ Service is ready! (took {int(elapsed)} seconds)")
    # Listing models is informational only: a failure here must not undo the
    # readiness result or restart the polling loop.
    try:
        models_response = requests.get('http://localhost:8000/v1/models', timeout=5)
        if models_response.status_code == 200:
            print(f"Available models: {models_response.json()}")
    except (requests.exceptions.RequestException, ValueError) as exc:
        print(f"⚠ Could not list models: {exc}")
else:
    # container_id only exists if the container-start cell was run; fall back to
    # a placeholder instead of raising NameError.
    known_container = globals().get('container_id')
    print("\n✗ Service did not become ready in time. Check container logs with:")
    print(f"docker logs {known_container or '<container_id>'}")


## Step 6: Perform Inference

Now let's test the model with a sample question!


In [None]:
# Send a single chat-completion request, then show the raw JSON reply followed
# by just the answer text.
url = 'http://localhost:8000/v1/chat/completions'

chat_messages = [
    {"role": "user", "content": "Which number is larger, 9.11 or 9.8?"}
]

payload = {
    "model": "nvidia/nvidia-nemotron-nano-9b-v2",
    "messages": chat_messages,
    "max_tokens": 2048,  # Increased default for better responses
    "temperature": 0.6,
    "top_p": 0.95
}

headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json'
}

print("Sending inference request...\n")
response = requests.post(url, json=payload, headers=headers)

# Handle the failure case first; anything other than HTTP 200 is reported as-is.
if response.status_code != 200:
    print(f"✗ Inference failed with status code: {response.status_code}")
    print(response.text)
else:
    result = response.json()
    answer_rule = "=" * 60
    print("✓ Inference successful!\n")
    print("Response:")
    print(json.dumps(result, indent=2))
    print("\n" + answer_rule)
    print("Model's Answer:")
    print(result['choices'][0]['message']['content'])
    print(answer_rule)


## Step 7: Try Your Own Questions!

Use this cell to experiment with your own prompts.


In [None]:
# Custom inference function
def ask_nemotron(question, max_tokens=2048, temperature=0.6, top_p=0.95, timeout=300):
    """
    Send a single-turn question to the local NIM endpoint and return the reply text.

    Args:
        question: Text sent as the only user message.
        max_tokens: Value forwarded as the request's "max_tokens".
        temperature: Value forwarded as the request's "temperature".
        top_p: Value forwarded as the request's "top_p".
        timeout: Seconds passed to requests as the request timeout. Without one,
            a stalled server would block the cell indefinitely.

    Returns:
        The content of the first choice's message on HTTP 200. Otherwise a
        string starting with "Error:" that describes the HTTP status or the
        network failure. Network failures are reported the same way as HTTP
        errors instead of raising.
    """
    url = 'http://localhost:8000/v1/chat/completions'

    payload = {
        "model": "nvidia/nvidia-nemotron-nano-9b-v2",
        "messages": [
            {"role": "user", "content": question}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p
    }

    try:
        response = requests.post(
            url,
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=timeout
        )
    except requests.exceptions.RequestException as exc:
        # Covers refused connections and timeouts, e.g. the container is not up.
        return f"Error: request failed - {exc}"

    if response.status_code != 200:
        return f"Error: {response.status_code} - {response.text}"

    result = response.json()
    return result['choices'][0]['message']['content']

# Demo: send one sample question and print it next to the model's reply.
question = "What are the three laws of robotics?"
answer = ask_nemotron(question)
# A single print with a newline separator yields the same blank line between
# the question and the answer.
print(f"Q: {question}\n", f"A: {answer}", sep="\n")


## Step 8: Thinking Mode - A Unique Capability!

**Important:** Nemotron Nano 9B v2 has a special thinking capability that allows it to reason through problems before answering.

- **By default, the model thinks** unless you explicitly disable it
- To **enable thinking explicitly**, add `"/think"` to the system message
- To **disable thinking**, add `"/no_think"` to the front of your prompt
- You can control thinking with `min_thinking_tokens` and `max_thinking_tokens` parameters

Let's explore this powerful feature!


In [None]:
### Example 1: Thinking Mode (Explicit)
# A "/think" system message asks the model to reason before it answers.

# Later example cells reuse this module-level `url`.
url = 'http://localhost:8000/v1/chat/completions'

thinking_messages = [
    {"role": "system", "content": "/think"},
    {"role": "user", "content": "If a train travels 120 miles in 2 hours, then slows down and travels 90 miles in 3 hours, what is its average speed for the entire journey?"}
]

payload = {
    "model": "nvidia/nvidia-nemotron-nano-9b-v2",
    "messages": thinking_messages,
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 2048,
    "min_thinking_tokens": 512,
    "max_thinking_tokens": 2048,
    "stream": False
}

print("🧠 THINKING MODE - Asking the model to solve a problem...\n")
response = requests.post(url, json=payload, headers={'Content-Type': 'application/json'})

# Report failures first; the success path prints the answer and token usage.
if response.status_code != 200:
    print(f"Error: {response.status_code} - {response.text}")
else:
    result = response.json()
    wide_rule = "=" * 80
    print(wide_rule)
    print("Model's Response (with internal reasoning):")
    print(wide_rule)
    print(result['choices'][0]['message']['content'])
    print("\n" + wide_rule)
    print(f"Usage stats: {json.dumps(result.get('usage', {}), indent=2)}")
    print(wide_rule)


In [None]:
### Example 2: No-Thinking Mode
# Prefixing the prompt with "/no_think" asks for a direct answer without reasoning.

train_question = "If a train travels 120 miles in 2 hours, then slows down and travels 90 miles in 3 hours, what is its average speed for the entire journey?"

payload_no_think = {
    "model": "nvidia/nvidia-nemotron-nano-9b-v2",
    "messages": [
        {"role": "user", "content": "/no_think " + train_question}
    ],
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 2048,
    "stream": False
}

print("💨 NO-THINKING MODE - Direct response without reasoning...\n")
# `url` is the module-level endpoint assigned by an earlier cell.
response = requests.post(url, json=payload_no_think, headers={'Content-Type': 'application/json'})

# Report failures first; the success path prints the answer and token usage.
if response.status_code != 200:
    print(f"Error: {response.status_code} - {response.text}")
else:
    result = response.json()
    wide_rule = "=" * 80
    print(wide_rule)
    print("Model's Response (direct, no reasoning):")
    print(wide_rule)
    print(result['choices'][0]['message']['content'])
    print("\n" + wide_rule)
    print(f"Usage stats: {json.dumps(result.get('usage', {}), indent=2)}")
    print(wide_rule)


In [None]:
### Example 3: Side-by-Side Comparison
# Let's compare thinking vs no-thinking on a challenging problem

def compare_thinking_modes(question, url='http://localhost:8000/v1/chat/completions'):
    """
    Ask the same question with and without thinking and print both results.

    Args:
        question: Text of the user question sent in both requests.
        url: Chat-completions endpoint to post to. It is a parameter with a
            default so the function does not rely on a module-level `url`
            left behind by another cell.

    Returns:
        None. For each mode the answer is printed, truncated to 500 characters
        plus "..." when longer. The "usage" dictionaries of both responses are
        printed afterwards. On a non-200 reply only the status code is printed
        and that mode's usage is shown as empty.
    """
    model_name = "nvidia/nvidia-nemotron-nano-9b-v2"
    sampling = {"temperature": 0.6, "top_p": 0.95, "max_tokens": 2048}

    # With thinking: "/think" system message plus thinking-token bounds.
    payload_think = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "/think"},
            {"role": "user", "content": question}
        ],
        **sampling,
        "min_thinking_tokens": 512,
        "max_thinking_tokens": 2048
    }

    # Without thinking: "/no_think" is prefixed to the user prompt.
    payload_no_think = {
        "model": model_name,
        "messages": [
            {"role": "user", "content": f"/no_think {question}"}
        ],
        **sampling
    }

    def _post_and_show(payload):
        """Post one payload, print the (possibly truncated) answer, return its usage dict."""
        response = requests.post(url, json=payload, headers={'Content-Type': 'application/json'})
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return {}
        result = response.json()
        # Treat a null content field as empty text so len() below cannot fail.
        content = result['choices'][0]['message']['content'] or ""
        print(content[:500] + "..." if len(content) > 500 else content)
        return result.get('usage', {})

    print(f"Question: {question}\n")
    print("🧠 WITH THINKING:")
    print("-" * 80)
    think_tokens = _post_and_show(payload_think)

    print("\n\n💨 WITHOUT THINKING:")
    print("-" * 80)
    no_think_tokens = _post_and_show(payload_no_think)

    print("\n\n📊 TOKEN COMPARISON:")
    print("-" * 80)
    print(f"With thinking:    {json.dumps(think_tokens, indent=2)}")
    print(f"Without thinking: {json.dumps(no_think_tokens, indent=2)}")

# Run the comparison on a short logic puzzle.
sheep_puzzle = "A farmer has 17 sheep, and all but 9 die. How many sheep are left?"
compare_thinking_modes(sheep_puzzle)


In [None]:
### Example 4: Streaming with Thinking Tokens
# Stream the response and print each piece of content as soon as it arrives.

print("🌊 STREAMING MODE - Watch the model think and respond in real-time!\n")
print("=" * 80)

payload_stream = {
    "model": "nvidia/nvidia-nemotron-nano-9b-v2",
    "messages": [
        {"role": "system", "content": "/think"},
        {"role": "user", "content": "Explain the concept of recursion with a simple example."}
    ],
    "temperature": 0.6,
    "top_p": 0.95,
    "max_tokens": 2048,
    "min_thinking_tokens": 256,
    "max_thinking_tokens": 1024,
    "stream": True
}

# A streamed response keeps its connection open until the body is fully read
# or the response is closed. The loop below stops early at the [DONE] marker,
# so the response is used as a context manager to guarantee it is closed.
# `url` is the module-level endpoint assigned by an earlier cell.
with requests.post(
    url,
    json=payload_stream,
    headers={'Content-Type': 'application/json'},
    stream=True
) as response:
    if response.status_code == 200:
        full_response = ""
        for line in response.iter_lines():
            if not line:
                continue  # blank separator line between events
            line = line.decode('utf-8')
            if not line.startswith('data: '):
                continue
            data = line[6:]  # Remove 'data: ' prefix
            if data.strip() == '[DONE]':
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                # Best effort: skip a chunk that is not valid JSON.
                continue
            if 'choices' in chunk and len(chunk['choices']) > 0:
                delta = chunk['choices'][0].get('delta', {})
                content = delta.get('content', '')
                if content:
                    print(content, end='', flush=True)
                    full_response += content

        print("\n" + "=" * 80)
        print("✓ Streaming complete!")
    else:
        print(f"Error: {response.status_code} - {response.text}")


In [None]:
### Example 5: Custom Thinking Function
# Here's a reusable function for easy experimentation with thinking parameters

def ask_with_thinking(
    question,
    enable_thinking=True,
    max_tokens=2048,
    min_thinking_tokens=512,
    max_thinking_tokens=2048,
    temperature=0.6,
    top_p=0.95,
    stream=False
):
    """
    Ask the model a question with optional thinking mode.

    Args:
        question: The question to ask
        enable_thinking: Whether to enable thinking mode (default: True). When
            True a "/think" system message and the thinking-token bounds are
            sent; when False the prompt is prefixed with "/no_think".
        max_tokens: Maximum tokens for the response
        min_thinking_tokens: Minimum thinking tokens (if thinking enabled)
        max_thinking_tokens: Maximum thinking tokens (if thinking enabled)
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        stream: Whether to request a streamed response. The streamed chunks are
            collected and joined, so the return value has the same shape as in
            the non-streaming case; nothing is printed while streaming.

    Returns:
        A `(content, usage)` tuple. `content` is the answer text and `usage`
        is the "usage" dictionary from the response. When streaming, `usage`
        is taken from a chunk that carries one and is `{}` if none does.
        On a non-200 reply `content` is an "Error: ..." string and `usage`
        is `{}`.
    """
    url = 'http://localhost:8000/v1/chat/completions'

    if enable_thinking:
        messages = [
            {"role": "system", "content": "/think"},
            {"role": "user", "content": question}
        ]
    else:
        messages = [
            {"role": "user", "content": f"/no_think {question}"}
        ]

    payload = {
        "model": "nvidia/nvidia-nemotron-nano-9b-v2",
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": stream
    }
    if enable_thinking:
        payload["min_thinking_tokens"] = min_thinking_tokens
        payload["max_thinking_tokens"] = max_thinking_tokens

    # The context manager closes the response even when the streaming loop
    # stops early at the [DONE] marker.
    with requests.post(
        url,
        json=payload,
        headers={'Content-Type': 'application/json'},
        stream=stream
    ) as response:
        if response.status_code != 200:
            return f"Error: {response.status_code} - {response.text}", {}

        if not stream:
            result = response.json()
            return result['choices'][0]['message']['content'], result.get('usage', {})

        # A streamed reply is a sequence of "data: <json>" lines, not a single
        # JSON document, so response.json() cannot be used here.
        pieces = []
        usage = {}
        for raw_line in response.iter_lines():
            if not raw_line:
                continue
            line = raw_line.decode('utf-8')
            if not line.startswith('data: '):
                continue
            data = line[6:].strip()
            if data == '[DONE]':
                break
            try:
                chunk = json.loads(data)
            except json.JSONDecodeError:
                continue  # best effort: skip a chunk that is not valid JSON
            if chunk.get('usage'):
                usage = chunk['usage']
            choices = chunk.get('choices') or []
            if choices:
                piece = choices[0].get('delta', {}).get('content')
                if piece:
                    pieces.append(piece)

        return "".join(pieces), usage

# Demo: ask a coding question with thinking enabled, then print the question,
# the answer and the token usage, each separated by a blank line.
question = "Write a Python function that finds the longest palindrome in a string."
answer, usage = ask_with_thinking(question, enable_thinking=True)

usage_report = json.dumps(usage, indent=2)
print(f"Q: {question}\n", f"A: {answer}\n", f"Token usage: {usage_report}", sep="\n")


## Step 9: Cleanup (Optional)

When you're done, you can stop and remove the container.


In [None]:
# Stop and remove the NIM container(s).

def _stop_and_remove(cid):
    """Stop, then remove, one container; return True only if both docker commands succeed."""
    print(f"Stopping container {cid}...")
    for action in ('stop', 'rm'):
        outcome = subprocess.run(['docker', action, cid], capture_output=True, text=True)
        if outcome.returncode != 0:
            # Report the failure instead of claiming success unconditionally.
            print(f"✗ docker {action} failed for {cid}")
            print(outcome.stderr)
            return False
    return True

# container_id only exists if the container-start cell ran in this kernel.
known_container = globals().get('container_id')

if known_container:
    if _stop_and_remove(known_container):
        print("✓ Container stopped and removed")
        # Forget the ID so re-running this cell does not target a removed container.
        container_id = None
else:
    # No ID recorded in this session: look for running containers created from
    # the Nemotron image and clean up each of them.
    result = subprocess.run(
        ['docker', 'ps', '--filter', 'ancestor=nvcr.io/nim/nvidia/nvidia-nemotron-nano-9b-v2:latest', '-q'],
        capture_output=True,
        text=True
    )
    containers = result.stdout.split()
    if containers:
        all_cleaned = True
        for cid in containers:
            if not _stop_and_remove(cid):
                all_cleaned = False
        if all_cleaned:
            print("✓ All containers stopped and removed")
    else:
        print("No running containers found")
