
Deploying Huggingface model into ml.inf1.xlarge succeeded, but ml.inf1.2xlarge failed #3376

@tagucci

Description


Describe the bug
I'm trying to deploy a compiled model to an inf1 instance. The deployment succeeded when I selected ml.inf1.xlarge as the instance_type. However, model.deploy() failed when I changed ml.inf1.xlarge to ml.inf1.2xlarge to increase the number of vCPUs.

To reproduce

Before deploying the SageMaker endpoint, the NER model is compiled for a Neuron (Inferentia) instance with the following code.

from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import torch.neuron
import os

model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

sample_text = "this is a dummy input for neuron.trace"
max_length = 128
tokens = tokenizer(
    sample_text,
    padding="max_length",
    max_length=max_length,
    return_tensors="pt",
    truncation=True,
)
example_input = (
    tokens["input_ids"],
    tokens["attention_mask"],
    tokens["token_type_ids"],
)

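# strict=False is needed because the model returns a dict-like output;
# torch.neuron.trace follows torch.jit.trace semantics here.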
model_neuron = torch.neuron.trace(
    model,
    example_input,
    strict=False,
    verbose=1,
)

save_dir = "tmp"
os.makedirs("tmp", exist_ok=True)
model_neuron.save(os.path.join(save_dir, "neuron_model.pt"))
tokenizer.save_pretrained(save_dir)
model.config.update({"traced_sequence_length": max_length})
model.config.save_pretrained(save_dir)
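
As an optional sanity check, the traced artifact can be reloaded and compared against the eager model. This is a minimal sketch, assuming it runs on an inf1 host where the Neuron runtime can execute the compiled graph:

# Reload the traced model and compare its logits against the eager model.
reloaded = torch.jit.load(os.path.join(save_dir, "neuron_model.pt"))
with torch.inference_mode():
    traced_logits = reloaded(*example_input)["logits"]
    eager_logits = model(*example_input)["logits"]
# Loose tolerance: Neuron may compute in reduced precision.
print(torch.allclose(traced_logits, eager_logits, atol=1e-3))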

Then I created model.tar.gz. The file structure is shown below.

$ tree
.
├── code
│   └── inference.py
├── config.json
├── neuron_model.pt
├── special_tokens_map.json
├── tokenizer.json
├── tokenizer_config.json
└── vocab.txt
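
For reference, the archive can be built from that directory with Python's tarfile module. This is a minimal sketch; it assumes the working directory is the one shown by tree, so the archive paths stay relative to the archive root as SageMaker expects:

import tarfile

members = [
    "code/inference.py",
    "config.json",
    "neuron_model.pt",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.txt",
]
with tarfile.open("model.tar.gz", "w:gz") as tar:
    for member in members:
        # Paths are added relative to the current directory.
        tar.add(member)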

inference.py is shown below.

from transformers import AutoConfig, AutoTokenizer
import numpy as np
import os
import torch
import torch.neuron

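# Limit the Neuron runtime in this worker process to a single NeuronCore.
# The variable must be set before the runtime is initialized.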
os.environ["NEURON_RT_NUM_CORES"] = "1"
AWS_NEURON_TRACED_WEIGHTS_NAME = "neuron_model.pt"


def model_fn(model_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = torch.jit.load(os.path.join(model_dir, AWS_NEURON_TRACED_WEIGHTS_NAME))
    model_config = AutoConfig.from_pretrained(model_dir)
    return model, tokenizer, model_config


def predict_fn(data, model_tokenizer_model_config):
    model, tokenizer, model_config = model_tokenizer_model_config
    input_text = data.pop("text", data)
    tokens = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=model_config.traced_sequence_length,
        padding="max_length",
        truncation=True,
    )
    inputs = (
        tokens["input_ids"],
        tokens["attention_mask"],
        tokens["token_type_ids"],
    )
    with torch.inference_mode():
        logits = model(*inputs)["logits"]
        preds = np.argmax(logits, axis=2)

    outputs = []
    index = 0
    # ignore [CLS] (0th index) and [SEP] (last index)
    for label_id, token_id in zip(
        preds.flatten()[1:-1], tokens.input_ids.flatten()[1:]
    ):
        if token_id == tokenizer.sep_token_id:
            break
        outputs.append(
            {
                "index": index,
                "token": tokenizer.decode(token_id),
                "label": model_config.id2label[label_id.item()],
            }
        )
        index += 1
    res = {"predictions": outputs}
    return res
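
The handlers above can be exercised directly, the same way the SageMaker inference toolkit calls them. This is a hypothetical smoke test; it assumes an inf1 host, the artifacts extracted to ./tmp, and inference.py on the Python path:

from inference import model_fn, predict_fn

artifacts = model_fn("tmp")
res = predict_fn(
    {"text": "Huggingface has offices in NYC and Paris."}, artifacts
)
print(res)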

After uploading model.tar.gz to my S3 bucket, I created the SageMaker endpoint as below.

from pprint import pprint
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
import boto3
import sagemaker

sess = sagemaker.Session()
sagemaker_session_bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

huggingface_model = HuggingFaceModel(
    model_data="s3://my-sagemaker-bucket/ner-model/model.tar.gz",
    role=role,
    transformers_version="4.12",
    pytorch_version="1.9",
    py_version="py37",
)

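# _is_compiled_model is a private SDK attribute; setting it tells the SDK to
# treat model_data as an already-compiled (Neuron) artifact when deploying.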
huggingface_model._is_compiled_model = True

# deploy
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.inf1.xlarge", # "ml.inf1.2xlarge" does not work
    endpoint_name="ner-endpoint",
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)
res = predictor.predict(data={"text": "Huggingface has offices in NYC and Paris."})
pprint(res)
"""
{'predictions': [{'index': 0, 'label': 'B-ORG', 'token': 'Hu'},
                 {'index': 1, 'label': 'I-ORG', 'token': '##gging'},
                 {'index': 2, 'label': 'I-ORG', 'token': '##face'},
                 {'index': 3, 'label': 'O', 'token': 'has'},
                 {'index': 4, 'label': 'O', 'token': 'offices'},
                 {'index': 5, 'label': 'O', 'token': 'in'},
                 {'index': 6, 'label': 'B-LOC', 'token': 'NYC'},
                 {'index': 7, 'label': 'O', 'token': 'and'},
                 {'index': 8, 'label': 'B-LOC', 'token': 'Paris'},
                 {'index': 9, 'label': 'O', 'token': '.'}]}
"""

Expected behavior

The same output as with instance_type="ml.inf1.xlarge".

Screenshots or logs
When I change instance_type="ml.inf1.xlarge" to instance_type="ml.inf1.2xlarge", a ModelError occurs.

---------------------------------------------------------------------------
ModelError                                Traceback (most recent call last)
<ipython-input-14-61ba969d077b> in <module>
     29     deserializer=JSONDeserializer(),
     30 )
---> 31 res = predictor.predict(data={"text": "Huggingface has offices in NYC and Paris."})
     32 pprint(res)

/usr/local/lib/python3.6/site-packages/sagemaker/predictor.py in predict(self, data, initial_args, target_model, target_variant, inference_id)
    159             data, initial_args, target_model, target_variant, inference_id
    160         )
--> 161         response = self.sagemaker_session.sagemaker_runtime_client.invoke_endpoint(**request_args)
    162         return self._handle_response(response)
    163 

/usr/local/lib/python3.6/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
    506                 )
    507             # The "self" in this scope is referring to the BaseClient.
--> 508             return self._make_api_call(operation_name, kwargs)
    509 
    510         _api_call.__name__ = str(py_operation_name)

/usr/local/lib/python3.6/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
    909             error_code = parsed_response.get("Error", {}).get("Code")
    910             error_class = self.exceptions.from_code(error_code)
--> 911             raise error_class(parsed_response, operation_name)
    912         else:
    913             return parsed_response

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from model with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "The PyTorch Neuron Runtime could not be initialized. Neuron Driver issues are logged\nto your system logs. See the Neuron Runtime\u0027s troubleshooting guide for help on this\ntopic: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/"
}
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/ner-endpoint in account 123456789 for more information.

System information
A description of your system. Please provide:

  • SageMaker Python SDK version: 2.109.0
  • Framework name (e.g. PyTorch) or algorithm (e.g. KMeans): transformers and pytorch
  • Framework version:
    • transformers_version: 4.12
    • pytorch_version: 1.9
  • Python version: py37
  • CPU or GPU: Inferentia
  • Custom Docker image (Y/N): N

The SageMaker endpoint was deployed from a SageMaker notebook instance (ml.t3.medium); the image is "Base Python" and the kernel is "Python3".
