**Describe the bug**

I'm trying to deploy a compiled model to an inf1 instance. The deployment succeeds when I select `ml.inf1.xlarge` as the `instance_type`. However, after changing `ml.inf1.xlarge` to `ml.inf1.2xlarge` to get more vCPUs, invoking the deployed endpoint fails with a `ModelError`.
**To reproduce**

Before deploying the SageMaker endpoint, the NER model is compiled for a Neuron instance with the following code.
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import torch.neuron
import os

model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

sample_text = "this is a dummy input for neuron.trace"
max_length = 128
tokens = tokenizer(
    sample_text,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
    truncation=True,
)
example_input = (
    tokens["input_ids"],
    tokens["attention_mask"],
    tokens["token_type_ids"],
)

model_neuron = torch.neuron.trace(
    model,
    example_input,
    strict=False,
    verbose=1,
)

save_dir = "tmp"
os.makedirs(save_dir, exist_ok=True)
model_neuron.save(os.path.join(save_dir, "neuron_model.pt"))
tokenizer.save_pretrained(save_dir)
model.config.update({"traced_sequence_length": max_length})
model.config.save_pretrained(save_dir)
```
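As a quick sanity check before packaging (not part of the original report, and assuming it runs on an inf1 machine with torch-neuron installed), the saved artifact can be reloaded and run against the same example input:

```python
# Hypothetical sanity check: reload the traced model exactly as the
# inference handler will (torch.jit.load) and run the traced input.
reloaded = torch.jit.load(os.path.join(save_dir, "neuron_model.pt"))
with torch.inference_mode():
    out = reloaded(*example_input)
print(out["logits"].shape)  # expected shape: (1, max_length, num_labels)
```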
Then I created `model.tar.gz`. The file structure is as below.
```
$ tree
.
├── code
│   └── inference.py
├── config.json
├── neuron_model.pt
├── special_tokens_map.json
├── tokenizer.json
├── tokenizer_config.json
└── vocab.txt
```
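The report does not show how the archive was built; as one hedged sketch, Python's standard `tarfile` module can produce the layout above (run from the directory shown by `tree`):

```python
import tarfile

# Bundle the compiled model, tokenizer files, and inference script into
# model.tar.gz with the same relative paths as the tree listing above.
files = [
    "code/inference.py",
    "config.json",
    "neuron_model.pt",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.txt",
]
with tarfile.open("model.tar.gz", "w:gz") as tar:
    for path in files:
        tar.add(path)
```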
`inference.py` is below.
```python
from transformers import AutoConfig, AutoTokenizer
import numpy as np
import os
import torch
import torch.neuron

os.environ["NEURON_RT_NUM_CORES"] = "1"
AWS_NEURON_TRACED_WEIGHTS_NAME = "neuron_model.pt"


def model_fn(model_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = torch.jit.load(os.path.join(model_dir, AWS_NEURON_TRACED_WEIGHTS_NAME))
    model_config = AutoConfig.from_pretrained(model_dir)
    return model, tokenizer, model_config


def predict_fn(data, model_tokenizer_model_config):
    model, tokenizer, model_config = model_tokenizer_model_config
    input_text = data.pop("text", data)
    tokens = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=model_config.traced_sequence_length,
        padding="max_length",
        truncation=True,
    )
    inputs = (
        tokens["input_ids"],
        tokens["attention_mask"],
        tokens["token_type_ids"],
    )
    with torch.inference_mode():
        logits = model(*inputs)["logits"]
    preds = np.argmax(logits, axis=2)

    outputs = []
    index = 0
    # ignore [CLS] (0th index) and [SEP] (last index)
    for label_id, token_id in zip(
        preds.flatten()[1:-1], tokens.input_ids.flatten()[1:]
    ):
        if token_id == tokenizer.sep_token_id:
            break
        outputs.append(
            {
                "index": index,
                "token": tokenizer.decode(token_id),
                "label": model_config.id2label[label_id.item()],
            }
        )
        index += 1
    res = {"predictions": outputs}
    return res
```
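For completeness, the handler pair can be smoke-tested outside SageMaker by calling the functions directly (a sketch, not from the report; it assumes the compiled files are still in `tmp` and that it runs on an inf1 machine with torch-neuron installed):

```python
# Hypothetical local smoke test: model_fn/predict_fn called directly,
# mirroring what the SageMaker inference toolkit does per request.
artifacts = model_fn("tmp")
print(predict_fn({"text": "Huggingface has offices in NYC and Paris."}, artifacts))
```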
After uploading `model.tar.gz` to my S3 bucket, I created the SageMaker endpoint as below.
```python
from pprint import pprint
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import boto3
import sagemaker

sess = sagemaker.Session()
sagemaker_session_bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

huggingface_model = HuggingFaceModel(
    model_data="s3://my-sagemaker-bucket/ner-model/model.tar.gz",
    role=role,
    transformers_version="4.12",
    pytorch_version="1.9",
    py_version="py37",
)
huggingface_model._is_compiled_model = True

# deploy
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf1.xlarge",  # "ml.inf1.2xlarge" does not work
    endpoint_name="ner-endpoint",
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

res = predictor.predict(data={"text": "Huggingface has offices in NYC and Paris."})
pprint(res)
"""
{'predictions': [{'index': 0, 'label': 'B-ORG', 'token': 'Hu'},
                 {'index': 1, 'label': 'I-ORG', 'token': '##gging'},
                 {'index': 2, 'label': 'I-ORG', 'token': '##face'},
                 {'index': 3, 'label': 'O', 'token': 'has'},
                 {'index': 4, 'label': 'O', 'token': 'offices'},
                 {'index': 5, 'label': 'O', 'token': 'in'},
                 {'index': 6, 'label': 'B-LOC', 'token': 'NYC'},
                 {'index': 7, 'label': 'O', 'token': 'and'},
                 {'index': 8, 'label': 'B-LOC', 'token': 'Paris'},
                 {'index': 9, 'label': 'O', 'token': '.'}]}
"""
```
**Expected behavior**

The same output as with `instance_type="ml.inf1.xlarge"`.
**Screenshots or logs**

When I change `instance_type="ml.inf1.xlarge"` to `instance_type="ml.inf1.2xlarge"`, a `ModelError` occurs.
```
---------------------------------------------------------------------------
ModelError                                Traceback (most recent call last)
<ipython-input-14-61ba969d077b> in <module>
     29     deserializer=JSONDeserializer(),
     30 )
---> 31 res = predictor.predict(data={"text": "Huggingface has offices in NYC and Paris."})
     32 pprint(res)

/usr/local/lib/python3.6/site-packages/sagemaker/predictor.py in predict(self, data, initial_args, target_model, target_variant, inference_id)
    159             data, initial_args, target_model, target_variant, inference_id
    160         )
--> 161         response = self.sagemaker_session.sagemaker_runtime_client.invoke_endpoint(**request_args)
    162         return self._handle_response(response)
    163

/usr/local/lib/python3.6/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
    506             )
    507             # The "self" in this scope is referring to the BaseClient.
--> 508             return self._make_api_call(operation_name, kwargs)
    509
    510         _api_call.__name__ = str(py_operation_name)

/usr/local/lib/python3.6/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
    909             error_code = parsed_response.get("Error", {}).get("Code")
    910             error_class = self.exceptions.from_code(error_code)
--> 911             raise error_class(parsed_response, operation_name)
    912         else:
    913             return parsed_response

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from model with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "The PyTorch Neuron Runtime could not be initialized. Neuron Driver issues are logged\nto your system logs. See the Neuron Runtime's troubleshooting guide for help on this\ntopic: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/"
}
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/ner-endpoint in account 123456789 for more information.
```
**System information**
A description of your system. Please provide:
- SageMaker Python SDK version: 2.109.0
- Framework name (e.g. PyTorch) or algorithm (e.g. KMeans): transformers and pytorch
- Framework version:
  - transformers_version: 4.12
  - pytorch_version: 1.9
- Python version: py37
- CPU or GPU: Inferentia
- Custom Docker image (Y/N): N
The SageMaker endpoint is deployed from a SageMaker notebook instance (ml.t3.medium); the image is "Base Python" and the kernel is "Python3".