# Serve FacebookAI XLM Roberta Base using Triton Inference Server on AWS Neuron

This notebook shows how to serve the [FacebookAI/xlm-roberta-base](https://huggingface.co/FacebookAI/xlm-roberta-base) model using [Triton Inference Server](https://github.com/triton-inference-server) on [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) with [torch-neuronx](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/torch-neuronx.html).

## Setup and Imports

In [None]:
! pip install kubernetes
! pip install boto3

In [None]:
import os
import subprocess
import sys

# Run from the repository root: the chart and container-script paths used in
# later cells are relative to it.
os.chdir(os.path.expanduser('~/amazon-eks-machine-learning-with-terraform-and-kubeflow'))
repo_root = os.getcwd()
print(f"Working directory: {repo_root}")

# Put <repo>/src at the front of the import path so the k8s helpers resolve
src_dir = os.path.join(repo_root, "src")
sys.path.insert(0, src_dir)

from k8s.utils import (
    find_matching_helm_services,
    wait_for_helm_release_pods,
    wait_for_triton_server,
)

# Directory of this example inside the repository
notebook_dir = os.path.join(
    repo_root,
    'examples', 'inference', 'triton-inference-server',
    'python_backend', 'xlm-roberta-base-neuron',
)
print(f"Notebook directory: {notebook_dir}")

# Names shared by the cells below
release_name = 'triton-server-xlm-roberta-base-neuronx'
namespace = 'kubeflow-user-example-com'
hf_model_id = 'FacebookAI/xlm-roberta-base'

## Step 1: Build and Push Docker Container

Build and push the Docker container image to your current AWS region.

In [None]:
import sys
import boto3

# Resolve the region from the default boto3 session. region_name is None when
# no region is configured, which would otherwise surface as an opaque
# TypeError when the command list is handed to Popen.
session = boto3.session.Session()
current_region = session.region_name
if not current_region:
    raise RuntimeError(
        "No AWS region is configured for the default boto3 session; "
        "set AWS_DEFAULT_REGION or run 'aws configure' and re-run this cell."
    )

cmd = ['./containers/tritonserver-neuronx/build_tools/build_and_push.sh', current_region]

# stderr is merged into stdout and the pipe is line-buffered so the build log
# streams in order; the context manager closes the pipe when the loop ends.
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                      text=True, bufsize=1) as process:
    for line in process.stdout:
        # Each line already ends with a newline; flush for immediate output
        print(line, end='', flush=True)

    # Wait for the process to complete and get the return code
    return_code = process.wait()

if return_code != 0:
    print(f"\nProcess exited with return code: {return_code}")
else:
    print("\nProcess completed successfully")

## Step 2: Download Hugging Face FacebookAI XLM Roberta Base Model Weights

Below we download the Hugging Face model.

In [None]:

# Install the hf-snapshot chart under release_name, handing it the model id
# through the HF_MODEL_ID entry of the chart's `env` value.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/model-prep/hf-snapshot',
    '--set-json', f'env=[{{"name":"HF_MODEL_ID","value":"{hf_model_id}"}}]',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
# The presence of stderr text alone does not tell success from failure, so
# report a non-zero exit status explicitly.
if result.returncode != 0:
    print(f"\nhelm install exited with return code: {result.returncode}")

In [None]:
# Wait for model download to complete
# NOTE(review): wait_for_helm_release_pods comes from k8s.utils (not shown here);
# presumably it blocks until the release's pods have finished — confirm.
wait_for_helm_release_pods(release_name, namespace)

In [None]:
# Uninstall the model download job
cmd = ['helm', 'uninstall', release_name, '-n', namespace]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
# Report a non-zero exit status explicitly: the same release name is reused
# for the Triton server install that follows.
if result.returncode != 0:
    print(f"\nhelm uninstall exited with return code: {result.returncode}")

## Step 3: Launch Triton Server

In [None]:
# Install the triton-inference-server chart under release_name, using the
# triton_server.yaml values file from this example's directory.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/serving/triton-inference-server',
    '-f', f'{notebook_dir}/triton_server.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
# The presence of stderr text alone does not tell success from failure, so
# report a non-zero exit status explicitly.
if result.returncode != 0:
    print(f"\nhelm install exited with return code: {result.returncode}")

In [None]:

# NOTE(review): wait_for_triton_server comes from k8s.utils (not shown here);
# presumably it blocks until the Triton server of this release is up — confirm.
wait_for_triton_server(release_name, namespace)

## Step 4: Check Service Status

In [None]:
# Check service status
services = find_matching_helm_services(release_name, namespace)
if not services:
    # Say so explicitly instead of printing nothing at all
    print(f"No services found for release {release_name} in namespace {namespace}")

for service in services or []:
    service_name = service.metadata.name
    print(f"Service {service_name} is available.")
    print(f"Service type: {service.spec.type}")
    print(f"Service ports: {service.spec.ports} ")
    # kubectl needs the service's own name, which is what was just looked up
    print(f"Run  'kubectl port-forward svc/{service_name} 8000:8000 -n {namespace}' in a separate terminal")

## Step 5: Test the Deployed Model

**Prerequisites:**
- Run `kubectl port-forward svc/YOUR_SERVICE_NAME 8000:8000 -n YOUR_NAMESPACE` in a separate terminal
- Install required packages: `pip install requests numpy`

In [None]:
# Install additional packages for testing
! pip install requests numpy

In [None]:
import json
import requests
import numpy as np
from typing import List, Dict, Any

# Configuration for testing
# Local address; port 8000 is the local side of the kubectl port-forward
# command printed by this notebook.
BASE_URL = "http://localhost:8000"
# NOTE(review): this is spelled "xml-...", while the Hugging Face model id is
# "xlm-roberta-base" — confirm it matches the model name the server exposes.
MODEL_NAME = "xml-roberta-base"  # Update this based on your model deployment

### Check Model is Ready

Below we check that the Triton Inference Server is healthy, and that the model has been deployed in the server and is ready.

In [None]:
def check_server_health(base_url: str = BASE_URL) -> bool:
    """Check whether the Triton server reports itself ready.

    Sends a GET request to ``{base_url}/v2/health/ready`` and prints the
    outcome. When the request itself fails, the port-forward command for the
    current release is printed as a hint.

    Args:
        base_url: Base URL of the Triton HTTP endpoint.

    Returns:
        True if the endpoint answered with HTTP 200; False for any other
        status code or if the request could not be completed.
    """
    health_url = f"{base_url}/v2/health/ready"

    # Only the network call can raise RequestException, so only it is guarded
    try:
        response = requests.get(health_url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"✗ Cannot connect to Triton server: {e}")
        print("\nPlease ensure kubectl port-forward is running:")
        print(f"kubectl port-forward svc/{release_name} 8000:8000 -n {namespace}")
        return False

    if response.status_code == 200:
        print("✓ Triton server is healthy and ready")
        return True

    print(f"✗ Triton server health check failed: {response.status_code}")
    return False

def check_model_ready(base_url: str = BASE_URL, model_name: str = MODEL_NAME) -> List[str]:
    """Check whether a single model is ready on the Triton server.

    Sends a GET request to ``{base_url}/v2/models/{model_name}/ready`` and
    prints the outcome.

    Args:
        base_url: Base URL of the Triton HTTP endpoint.
        model_name: Name of the model to check.

    Returns:
        ``[model_name]`` if the endpoint answered with HTTP 200; an empty
        list for any other status code or if the request failed.
    """
    model_url = f"{base_url}/v2/models/{model_name}/ready"

    # Only the network call can raise RequestException, so only it is guarded
    try:
        response = requests.get(model_url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Cannot check readiness of model '{model_name}': {e}")
        return []

    if response.status_code == 200:
        print(f"Available model: {model_name}")
        return [model_name]

    # This endpoint checks one model; it does not list models
    print(f"Model '{model_name}' is not ready: {response.status_code}")
    return []

# Query the model's readiness only when the server itself answered as ready;
# otherwise there are no models to test against.
server_healthy = check_server_health()
available_models = check_model_ready() if server_healthy else []

### Define Tests for Masked Language Model

Below we define the tests for Masked Language Model.

In [None]:
def test_masked_lm(text: str, model_name: str = MODEL_NAME, base_url: str = BASE_URL) -> Dict[str, Any]:
    """Send one text to the model's Triton infer endpoint and summarize the logits.

    Posts ``text`` as a single BYTES input named ``text_input`` to
    ``{base_url}/v2/models/{model_name}/infer`` and requests the ``logits``
    output.

    Args:
        text: Input sentence to send to the model.
        model_name: Name of the model on the Triton server.
        base_url: Base URL of the Triton HTTP endpoint.

    Returns:
        A dict that always contains ``text``, ``status_code`` (None when no
        HTTP response was received) and ``success``. ``success`` is True only
        when the response was HTTP 200 and its logits were parsed; in that
        case the dict also holds ``logits_shape``, ``logits_datatype``,
        ``logits_array`` and ``logits_stats``, plus ``sequence_length``,
        ``vocabulary_size`` and ``top_predictions`` when the output has at
        least two dimensions. Otherwise ``error`` describes what went wrong.
        This function does not raise for request or parsing failures.
    """
    # Triton inference endpoint
    url = f"{base_url}/v2/models/{model_name}/infer"

    # Prepare the request payload for Triton
    payload = {
        "inputs": [
            {
                "name": "text_input",
                "shape": [1, 1],  # a single string element
                "datatype": "BYTES",
                "data": [text]
            }
        ],
        "outputs": [
            {
                "name": "logits"
            }
        ]
    }

    # Created before the request so the except handlers below can always
    # record an error, even when the POST itself fails.
    result: Dict[str, Any] = {
        "text": text,
        "status_code": None,
        "success": False
    }

    try:
        # Send request to Triton server
        response = requests.post(url, json=payload, timeout=30)
        result["status_code"] = response.status_code

        if response.status_code != 200:
            result["error"] = response.text
            return result

        response_data = response.json()

        if "outputs" not in response_data or len(response_data["outputs"]) == 0:
            result["error"] = "No outputs found in response"
            result["raw_response"] = response_data
            return result

        # Only the first returned output is inspected
        logits_output = response_data["outputs"][0]
        logits_shape = logits_output["shape"]

        # Convert the flat data list to a numpy array of the reported shape
        logits_array = np.array(logits_output["data"]).reshape(logits_shape)

        result.update({
            "logits_shape": logits_shape,
            "logits_datatype": logits_output.get("datatype", "Unknown"),
            "logits_array": logits_array,
            "logits_stats": {
                "min": float(logits_array.min()),
                "max": float(logits_array.max()),
                "mean": float(logits_array.mean()),
                "std": float(logits_array.std())
            }
        })

        # Analyze top predictions per token position
        if len(logits_shape) >= 2:
            seq_len, vocab_size = logits_shape[-2], logits_shape[-1]
            result["sequence_length"] = seq_len
            result["vocabulary_size"] = vocab_size

            # Rank-2 output is used as is; otherwise take the first entry
            # along the leading axis.
            token_matrix = logits_array if len(logits_shape) == 2 else logits_array[0]

            top_predictions = []
            for token_idx in range(min(5, seq_len)):  # first 5 positions only
                token_logits = token_matrix[token_idx]
                # argsort is ascending: take the last 5 and reverse them
                top_indices = np.argsort(token_logits)[-5:][::-1]
                top_predictions.append({
                    "position": token_idx,
                    "top_token_ids": top_indices.tolist(),
                    "top_scores": token_logits[top_indices].tolist()
                })

            result["top_predictions"] = top_predictions

        # Reached only when the logits were parsed without an exception
        result["success"] = True

    except requests.exceptions.RequestException as e:
        result["error"] = f"Request failed: {e}"
    except Exception as e:
        # Deliberately broad: a malformed response must not abort the test run
        result["error"] = f"Unexpected error: {e}"

    return result

# Test texts for masked language modeling. The XLM-RoBERTa tokenizer uses
# "<mask>" as its mask token; "[MASK]" is the BERT spelling and would be
# treated as ordinary text.
test_texts = [
    "The quick brown <mask> jumps over the lazy dog.",
    "Paris is the capital of <mask>.",
    "Machine learning is a subset of <mask> intelligence.",
    "The <mask> panda is native to China.",
    "Python is a popular <mask> language."
]

print(f"Testing {len(test_texts)} examples with model: {MODEL_NAME}")
print("=" * 60)

### Run Tests

Now we run the defined tests.

In [None]:
# Test all examples
if server_healthy and available_models:
    results = []
    successful_tests = 0

    for i, text in enumerate(test_texts, 1):
        print(f"\nTest {i}/{len(test_texts)}: {text}")
        print("-" * 50)

        result = test_masked_lm(text)
        results.append(result)

        # A test passes only if logits came back; a 200 response without
        # them is reported as a failure instead of raising KeyError below.
        if result.get('success') and 'logits_stats' in result:
            successful_tests += 1
            stats = result['logits_stats']
            print(f"✓ Success - Shape: {result['logits_shape']}")
            print(f"  Sequence length: {result.get('sequence_length', 'Unknown')}")
            print(f"  Vocabulary size: {result.get('vocabulary_size', 'Unknown')}")
            print(f"  Logits stats: min={stats['min']:.3f}, max={stats['max']:.3f}, mean={stats['mean']:.3f}")

            # Show the top predictions for the first reported position.
            # This is position 0 of the sequence, not necessarily the masked token.
            top_predictions = result.get('top_predictions')
            if top_predictions:
                first_token_pred = top_predictions[0]
                print(f"  Top predictions for token position {first_token_pred['position']}:")
                top_three = zip(first_token_pred['top_token_ids'][:3], first_token_pred['top_scores'][:3])
                for rank, (token_id, score) in enumerate(top_three, 1):
                    print(f"    {rank}. Token ID {token_id}: {score:.4f}")
        else:
            print(f"✗ Failed: {result.get('error', 'Unknown error')}")

    # Summary
    print(f"\n{'='*60}")
    print(f"Test Summary: {successful_tests}/{len(results)} tests passed")

    if successful_tests == len(results):
        print("🎉 All tests passed! Your masked language model is working correctly.")
    elif successful_tests > 0:
        print("⚠️  Some tests passed, but there were failures. Check the errors above.")
    else:
        print("❌ All tests failed. Please check your model deployment and configuration.")

else:
    print("Cannot run tests - server not healthy or no models available")
    print("\nTroubleshooting:")
    print("1. Ensure your model is deployed via Helm")
    print("2. Check that kubectl port-forward is running:")
    print(f"   kubectl port-forward svc/{release_name} 8000:8000 -n {namespace}")
    print("3. Verify the service is running:")
    print(f"   kubectl get pods -n {namespace}")

## Step 6: Stop Service

To stop the Triton server service:

In [None]:
# Remove the Triton server release from the namespace
cmd = ['helm', 'uninstall', release_name, '-n', namespace]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
# Report a non-zero exit status explicitly so a failed teardown is not missed
if result.returncode != 0:
    print(f"\nhelm uninstall exited with return code: {result.returncode}")