# Deploy a LLaMA 3.3 70B Instruct Model Using SageMaker Endpoints with P4d.24xlarge instance

In this example you will deploy `Llama-3.3-70B-Instruct` SageMaker Managed Endpoint.

In [1]:
!pip install -Uq sagemaker

In [2]:
!pip install -Uq transformers

In [5]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

try:
	role = sagemaker.get_execution_role()
except ValueError:
	iam = boto3.client('iam')
	role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

In [None]:
# assert hub['HUGGING_FACE_HUB_TOKEN'] != '<REPLACE WITH YOUR TOKEN>', "You have to provide a token."

# Hub Model configuration. https://huggingface.co/models
hub = {
	'HF_MODEL_ID':'meta-llama/Llama-3.3-70B-Instruct',
	'SM_NUM_GPUS': '8',
    # 'HUGGING_FACE_HUB_TOKEN': '<REPLACE WITH YOUR TOKEN>'
	'HUGGING_FACE_HUB_TOKEN': 'hf_'
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.1"), 
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
    instance_type="ml.p4d.24xlarge",
	container_startup_health_check_timeout=1200,
  )

In [23]:
predictor.predict(
    {
        "inputs": "Plan a 7 day trip in europe in December, what kind of weather conditions should I expect?",
        "parameters": {
            "do_sample":True,
            "max_new_tokens":250,
            "top_p":0.9,
            "temperature":0.6,
        }
    }
)

[{'generated_text': 'Plan a 7 day trip in europe in December, what kind of weather conditions should I expect? December is one of the coldest months in Europe, with average temperatures ranging from 32°F (0°C) to 43°F (6°C) throughout the continent.\nWhat is the best way to travel in Europe in December?\nThe best way to travel in Europe in December depends on your personal preferences, budget, and the specific destinations you plan to visit. However, here are some popular options:\n1. Train: Europe has an excellent rail network, and trains are a convenient way to travel between cities. You can book tickets in advance through websites like Eurail, Rail Europe, or national rail companies like SNCF (France), DB (Germany), or Trenitalia (Italy).\n2. Budget Airlines: Low-cost carriers like Ryanair, EasyJet, and Wizz Air offer affordable flights between European cities. Be sure to check for any additional fees for baggage, food, or seat selection.\n3. Bus: Companies like FlixBus, Eurolines, 

# Streaming Response

In [None]:
#Get the list of endpoints.
import sagemaker
from sagemaker.huggingface import HuggingFacePredictor

# Get the endpoint name from your list of endpoints
sagemaker_client = boto3.client('sagemaker')
endpoints = sagemaker_client.list_endpoints()['Endpoints']
for endpoint in endpoints:
    print(endpoint['EndpointName'])

In [30]:
import json
import boto3
import time
from datetime import datetime

client = boto3.client('sagemaker-runtime')
##Update your endpoint below to invoke it using streaming response

endpoint= "huggingface-pytorch-tgi-inference-2024-12-07-01-30-30-704"
prompt = "Plan a 1 week trip to Europe in March, I like historical sites"

start_time = time.time()
first_token_received = False
ttft = None
token_count = 0

print(f"Prompt: {prompt}\n")
print("Response:", end=' ', flush=True)

response = client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint,
    ContentType="application/json",
    Body=json.dumps({
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7
        },
        "stream": True
    })
)

for event in response['Body']:
    line = event['PayloadPart']['Bytes'].decode()
    if line.startswith('data: '):
        try:
            chunk = json.loads(line[6:])  # Skip the "data: " prefix
            if 'token' in chunk:
                token_count += 1
                if not first_token_received:
                    ttft = time.time() - start_time
                    first_token_received = True
                print(chunk['token']['text'], end='', flush=True)
        except json.JSONDecodeError:
            continue

end_time = time.time()
total_latency = end_time - start_time

print("\n\nMetrics:")
print(f"Time to First Token (TTFT): {ttft:.2f} seconds")
print(f"Total Tokens Generated: {token_count}")
print(f"Total Latency: {total_latency:.2f} seconds")

Prompt: Plan a 1 week trip to Europe in March, I like historical sites

Response:  and architecture.
Europe in March can be a great destination, with fewer tourists than in the peak summer months, and many historical sites and architectural wonders to explore. Here's a suggested 1-week itinerary for a trip to Europe in March, focusing on historical sites and architecture:

**Destination:** Rome, Italy, and Barcelona, Spain

**Day 1-3: Rome, Italy**

* Fly into Rome's Leonardo da Vinci–Fiumicino Airport (FCO* Explore the Colosseum (70-80 EUR per person), the Roman Forum (12 EUR per person), and the Pantheon (free admission)
* Visit the Vatican City, including the Vatican Museums (-25 EUR per person) and St. Peter's Basilica (free admission)
* Wander through the center, taking in the architecture and piazzas, such as Piazza Navona the Spanish Steps
* Enjoy the local cuisine, including pizza, pasta, and gelato**Day 4-7: Barcelona, Spain**

* Take a flight from Rome to Barcelona's El Prat 

In [None]:
##Delete endpoint after use to save costs
predictor.delete_endpoint(delete_endpoint_config=True)