# Serve Llama3 8B Instruct using Triton Inference Server with TensorRT-LLM

This notebook shows how to serve [Meta Llama 3 8B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) with the Triton Inference Server [TensorRT-LLM backend](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main).

## Setup and Imports

In [None]:
! pip install kubernetes
! pip install boto3

In [None]:
import os
import subprocess
import time
from kubernetes import client, config

# Load Kubernetes configuration and create the CoreV1Api client.
# `v1` is a module-level name: the helper functions in this notebook
# (pod and service lookups) read it directly instead of taking a client argument.
config.load_kube_config()
v1 = client.CoreV1Api()

def find_matching_helm_pods(release_name, namespace='kubeflow-user-example-com'):
    """Return the pods in ``namespace`` that belong to a Helm release.

    A pod matches when its metadata annotations contain both
    ``app.kubernetes.io/managed-by == 'Helm'`` and
    ``app.kubernetes.io/instance == release_name``. Pods with no
    annotations never match.

    Args:
        release_name: Helm release name to look for.
        namespace: Namespace whose pods are listed.

    Returns:
        A list of the matching pod objects (possibly empty).
    """
    # NOTE(review): the match is made on annotations, not labels -- confirm
    # the charts used here publish these keys as pod annotations.
    def _belongs_to_release(pod):
        annotations = pod.metadata.annotations
        if not annotations:
            return False
        if annotations.get('app.kubernetes.io/managed-by') != 'Helm':
            return False
        return annotations.get('app.kubernetes.io/instance') == release_name

    # Uses the module-level CoreV1Api client `v1`.
    pod_list = v1.list_namespaced_pod(namespace=namespace)
    return [pod for pod in pod_list.items if _belongs_to_release(pod)]

def wait_for_helm_release_pods(release_name, namespace='kubeflow-user-example-com', timeout=1800, poll_interval=60):
    """Poll until every pod of a Helm release has completed successfully.

    Args:
        release_name: Helm release whose pods are looked up with
            ``find_matching_helm_pods``.
        namespace: Namespace to search.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between polls.

    Returns:
        True once all matching pods report phase ``Succeeded``. False as soon
        as any matching pod reports ``Failed``, or when ``timeout`` elapses
        before all pods have succeeded.
    """
    print(f"Waiting for pods in release '{release_name}' to complete...")
    start_time = time.time()

    while time.time() - start_time < timeout:
        try:
            matching_pods = find_matching_helm_pods(release_name, namespace)

            if not matching_pods:
                print(f"No pods found in Helm release: {release_name} waiting...")
                time.sleep(poll_interval)
                continue

            all_completed = True
            for pod in matching_pods:
                status = pod.status.phase
                print(f"Pod {pod.metadata.name}: {status}")

                if status == 'Failed':
                    print(f"Pod {pod.metadata.name} failed!")
                    return False
                if status != 'Succeeded':
                    # Pending, Running and any other non-terminal phase
                    # (e.g. Unknown) must not be reported as success.
                    all_completed = False

            if all_completed:
                print("All pods completed successfully!")
                return True

        except Exception as e:
            # Deliberately broad and best-effort: a failed poll is reported
            # and retried on the next iteration instead of aborting the wait.
            print(f"Error checking pods: {e}")

        time.sleep(poll_interval)

    print("Timeout waiting for pods to complete")
    return False

# Switch into the local checkout of the repository. Later cells refer to
# charts, example values files and the build script by relative path, so they
# are resolved from this directory.
repo_root = os.path.expanduser('~/amazon-eks-machine-learning-with-terraform-and-kubeflow')
os.chdir(repo_root)
print(f"Working directory: {os.getcwd()}")

## Step 1: Build and Push Docker Container

In [None]:
import sys
import boto3

# Create a Boto3 session
session = boto3.session.Session()

# The session's region is passed to the build script as its only argument.
current_region = session.region_name
if not current_region:
    # Without this guard Popen fails with an opaque TypeError on the None
    # argument; fail early with an actionable message instead.
    raise RuntimeError(
        "No AWS region is configured for the Boto3 session; configure a "
        "default region (for example via AWS_DEFAULT_REGION) and re-run this cell."
    )

cmd = ['./containers/tritonserver-trtllm/build_tools/build_and_push.sh', current_region]

# Start the subprocess with streaming output: stderr is merged into stdout and
# the pipe is line-buffered text. The context manager closes the pipe when the
# script finishes or the cell is interrupted.
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                      text=True, bufsize=1) as process:
    # Stream output line by line
    for line in process.stdout:
        # end='' prevents double newlines; flush forces immediate output
        print(line, end='', flush=True)

    # Wait for the process to complete and get the return code
    return_code = process.wait()

if return_code != 0:
    print(f"\nProcess exited with return code: {return_code}")
else:
    print("\nProcess completed successfully")

## Step 2: Download Hugging Face Llama3 8B Instruct Model Weights

**Note:** Set your Hugging Face token below before running the cell.

In [None]:
import json

# Replace with your actual Hugging Face token
HF_TOKEN = None
if not HF_TOKEN:
    # Explicit raise rather than `assert`, which is skipped when Python runs
    # with optimizations enabled.
    raise ValueError("Please set HF_TOKEN")

# Build the chart's `env` value with json.dumps so the token is always quoted
# and escaped correctly. The compact separators produce the same text as a
# hand-written JSON literal for ordinary tokens.
hf_env = json.dumps(
    [
        {"name": "HF_MODEL_ID", "value": "meta-llama/Meta-Llama-3-8B-Instruct"},
        {"name": "HF_TOKEN", "value": HF_TOKEN},
    ],
    separators=(',', ':'),
)

# NOTE(review): `helm install --debug` prints user-supplied values, which
# would include HF_TOKEN -- confirm, and clear this cell's output before
# sharing the notebook.
cmd = [
    'helm', 'install', '--debug', 'triton-server-llama3-8b-instruct-trtllm',
    'charts/machine-learning/model-prep/hf-snapshot',
    '--set-json', f'env={hf_env}',
    '-n', 'kubeflow-user-example-com'
]

# Capture both streams as text; stderr is echoed only when it is non-empty.
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

In [None]:
# Wait for model download to complete.
# Blocks while polling the release's pods; the call returns True once they
# have completed and False if a pod fails or the default 1800-second timeout
# expires.
wait_for_helm_release_pods('triton-server-llama3-8b-instruct-trtllm')

In [None]:
# Remove the Helm release that ran the model download job. The next step
# installs a release with the same name.
cmd = [
    'helm', 'uninstall',
    'triton-server-llama3-8b-instruct-trtllm',
    '-n', 'kubeflow-user-example-com',
]
result = subprocess.run(cmd, text=True, capture_output=True)

# Always show stdout; show stderr only when helm wrote something to it.
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

## Step 3: Convert HuggingFace Checkpoint to TensorRT-LLM Checkpoint

In [None]:
# Install the data-process chart with the hf_to_trtllm.yaml values file into
# the kubeflow-user-example-com namespace.
cmd = ['helm', 'install', '--debug', 'triton-server-llama3-8b-instruct-trtllm']
cmd += ['charts/machine-learning/data-prep/data-process']
cmd += ['-f', 'examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/hf_to_trtllm.yaml']
cmd += ['-n', 'kubeflow-user-example-com']

# Capture both streams as text; stderr is echoed only when it is non-empty.
result = subprocess.run(cmd, text=True, capture_output=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

In [None]:
# Wait for checkpoint conversion to complete.
# Blocks while polling the release's pods; the call returns True once they
# have completed and False if a pod fails or the default 1800-second timeout
# expires.
wait_for_helm_release_pods('triton-server-llama3-8b-instruct-trtllm')

In [None]:
# Remove the Helm release that ran the checkpoint conversion job. The next
# step installs a release with the same name.
cmd = [
    'helm', 'uninstall',
    'triton-server-llama3-8b-instruct-trtllm',
    '-n', 'kubeflow-user-example-com',
]
result = subprocess.run(cmd, text=True, capture_output=True)

# Always show stdout; show stderr only when helm wrote something to it.
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

## Step 4: Build TensorRT-LLM Engine

In [None]:
# Install the data-process chart with the trtllm_engine.yaml values file into
# the kubeflow-user-example-com namespace.
cmd = ['helm', 'install', '--debug', 'triton-server-llama3-8b-instruct-trtllm']
cmd += ['charts/machine-learning/data-prep/data-process']
cmd += ['-f', 'examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/trtllm_engine.yaml']
cmd += ['-n', 'kubeflow-user-example-com']

# Capture both streams as text; stderr is echoed only when it is non-empty.
result = subprocess.run(cmd, text=True, capture_output=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

In [None]:
# Wait for engine build to complete.
# Blocks while polling the release's pods; the call returns True once they
# have completed and False if a pod fails or the default 1800-second timeout
# expires.
wait_for_helm_release_pods('triton-server-llama3-8b-instruct-trtllm')

In [None]:
# Remove the Helm release that ran the engine build job. The next step
# installs a release with the same name.
cmd = [
    'helm', 'uninstall',
    'triton-server-llama3-8b-instruct-trtllm',
    '-n', 'kubeflow-user-example-com',
]
result = subprocess.run(cmd, text=True, capture_output=True)

# Always show stdout; show stderr only when helm wrote something to it.
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

## Step 5: Build Triton Model

In [None]:
# Install the data-process chart with the triton_model.yaml values file into
# the kubeflow-user-example-com namespace.
cmd = ['helm', 'install', '--debug', 'triton-server-llama3-8b-instruct-trtllm']
cmd += ['charts/machine-learning/data-prep/data-process']
cmd += ['-f', 'examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/triton_model.yaml']
cmd += ['-n', 'kubeflow-user-example-com']

# Capture both streams as text; stderr is echoed only when it is non-empty.
result = subprocess.run(cmd, text=True, capture_output=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

In [None]:
# Wait for Triton model build to complete.
# Blocks while polling the release's pods; the call returns True once they
# have completed and False if a pod fails or the default 1800-second timeout
# expires.
wait_for_helm_release_pods('triton-server-llama3-8b-instruct-trtllm')

In [None]:
# Remove the Helm release that ran the Triton model build job. The next step
# installs a release with the same name.
cmd = [
    'helm', 'uninstall',
    'triton-server-llama3-8b-instruct-trtllm',
    '-n', 'kubeflow-user-example-com',
]
result = subprocess.run(cmd, text=True, capture_output=True)

# Always show stdout; show stderr only when helm wrote something to it.
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

## Step 6: Launch Triton Server

In [None]:
# Install the triton-inference-server chart with the triton_server.yaml values
# file into the kubeflow-user-example-com namespace.
cmd = ['helm', 'install', '--debug', 'triton-server-llama3-8b-instruct-trtllm']
cmd += ['charts/machine-learning/serving/triton-inference-server']
cmd += ['-f', 'examples/inference/triton-inference-server/tensorrtllm_backend/llama3-8b-instruct/triton_server.yaml']
cmd += ['-n', 'kubeflow-user-example-com']

# Capture both streams as text; stderr is echoed only when it is non-empty.
result = subprocess.run(cmd, text=True, capture_output=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

In [None]:
# Wait for Triton server to be ready
def wait_for_triton_server(release_name, namespace='kubeflow-user-example-com', timeout=1800, poll_interval=60):
    """Poll until every pod of the Triton server release is Running and Ready.

    Args:
        release_name: Helm release whose pods are looked up with
            ``find_matching_helm_pods``.
        namespace: Namespace to search.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between polls.

    Returns:
        True once all matching pods are in phase ``Running`` with a ``Ready``
        condition whose status is ``'True'``. False as soon as any matching
        pod reports phase ``Failed``, or when ``timeout`` elapses first.
    """
    print(f"Waiting for Triton server '{release_name}' to be ready...")
    start_time = time.time()

    while time.time() - start_time < timeout:
        try:
            matching_pods = find_matching_helm_pods(release_name, namespace)

            if not matching_pods:
                print(f"No pods found in Helm release: {release_name} waiting...")
                time.sleep(poll_interval)
                continue

            all_ready = True
            for pod in matching_pods:
                status = pod.status.phase
                # Guard against a missing conditions list, and require an
                # explicit Ready == 'True' condition: a pod with no Ready
                # condition at all is not treated as ready.
                conditions = pod.status.conditions or []
                ready = any(
                    condition.type == 'Ready' and condition.status == 'True'
                    for condition in conditions
                )
                print(f"Pod {pod.metadata.name}: {status}, Ready: {ready}")

                # Check for failure first; testing "not Running" first would
                # swallow the Failed phase and keep polling until timeout.
                if status == 'Failed':
                    print(f"Pod {pod.metadata.name} failed!")
                    return False
                if status != 'Running' or not ready:
                    all_ready = False

            if all_ready:
                print("Triton Inference Server Pod is Ready!")
                return True

        except Exception as e:
            # Deliberately broad and best-effort: a failed poll is reported
            # and retried on the next iteration instead of aborting the wait.
            print(f"Error checking pods: {e}")

        time.sleep(poll_interval)

    print("Timeout waiting for Triton Inference Server to be Ready")
    return False

# Block until the release's pods are Running and Ready; the call returns
# False if the default 1800-second timeout expires first.
wait_for_triton_server('triton-server-llama3-8b-instruct-trtllm')

## Step 7: Check Service Status

In [None]:
def find_matching_helm_services(release_name, namespace='kubeflow-user-example-com'):
    """Return the services in ``namespace`` that belong to a Helm release.

    A service matches when its metadata annotations contain both
    ``app.kubernetes.io/managed-by == 'Helm'`` and
    ``app.kubernetes.io/instance == release_name``. Services with no
    annotations never match.

    Args:
        release_name: Helm release name to look for.
        namespace: Namespace whose services are listed.

    Returns:
        A list of the matching service objects (possibly empty).
    """
    # NOTE(review): the match is made on annotations, not labels -- confirm
    # the chart publishes these keys as service annotations.
    def _belongs_to_release(service):
        annotations = service.metadata.annotations
        if not annotations:
            return False
        if annotations.get('app.kubernetes.io/managed-by') != 'Helm':
            return False
        return annotations.get('app.kubernetes.io/instance') == release_name

    # Uses the module-level CoreV1Api client `v1`.
    service_list = v1.list_namespaced_service(namespace=namespace)
    return [svc for svc in service_list.items if _belongs_to_release(svc)]

# Check service status: collect the services annotated as belonging to the
# release and print the returned list of client objects as-is (an empty list
# means no service matched).
services = find_matching_helm_services('triton-server-llama3-8b-instruct-trtllm')
print(services)

## Step 8: Stop Service

When you're done with the service, run this cell to clean up resources.

In [None]:
# Uninstall the Triton server release from the kubeflow-user-example-com
# namespace.
cmd = [
    'helm', 'uninstall',
    'triton-server-llama3-8b-instruct-trtllm',
    '-n', 'kubeflow-user-example-com',
]
result = subprocess.run(cmd, text=True, capture_output=True)

# Always show stdout; show stderr only when helm wrote something to it.
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)