Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: Move accelerate dependency closer to size calculations #4502

Merged
merged 6 commits into from Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
43 changes: 13 additions & 30 deletions src/sagemaker/serve/builder/model_builder.py
Expand Up @@ -20,7 +20,6 @@

from pathlib import Path

from accelerate.commands.estimate import estimate_command_parser, gather_data
from sagemaker import Session
from sagemaker.model import Model
from sagemaker.base_predictor import PredictorBase
Expand All @@ -43,7 +42,11 @@
from sagemaker.serve.utils import task
from sagemaker.serve.utils.exceptions import TaskNotFoundException
from sagemaker.serve.utils.predictors import _get_local_mode_predictor
from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
from sagemaker.serve.utils.hardware_detector import (
_get_gpu_info,
_get_gpu_info_fallback,
_total_inference_model_size_mib,
)
from sagemaker.serve.detector.image_detector import (
auto_detect_container,
_detect_framework_and_version,
Expand All @@ -70,11 +73,8 @@
ModelServer.DJL_SERVING,
}

MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer


# pylint: disable=attribute-defined-outside-init
# pylint: disable=attribute-defined-outside-init, disable=E1101
@dataclass
class ModelBuilder(Triton, DJL, JumpStart, TGI, Transformers):
"""Class that builds a deployable model.
Expand Down Expand Up @@ -719,39 +719,22 @@ def _schema_builder_init(self, model_task: str):
except ValueError:
raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.")

def _total_inference_model_size_mib(self):
"""Calculates the model size from HF accelerate

This function gets the model size from accelerate. It also adds a
padding and converts to size MiB. When performing inference, expect
to add up to an additional 20% to the given model size as found by EleutherAI.
"""
dtypes = self.env_vars.get("dtypes", "float32")
parser = estimate_command_parser()
args = parser.parse_args([self.model, "--dtypes", dtypes])

output = gather_data(
args
) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"

if output is None:
raise ValueError(f"Could not get Model size for {self.model}")

total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
logger.info("Total memory size MIB: %s", total_memory_size_mib)
return total_memory_size_mib

def _can_fit_on_single_gpu(self) -> Type[bool]:
"""Check if model can fit on a single GPU

If the size of the model is <= single gpu memory size, returns True else False
"""
try:
single_gpu_size_mib = self._try_fetch_gpu_info()
if self._total_inference_model_size_mib() <= single_gpu_size_mib:
if (
_total_inference_model_size_mib(self.model, self.env_vars.get("dtypes", "float32"))
<= single_gpu_size_mib
):
logger.info(
"Total inference model size MIB %s, single GPU size for instance MIB %s",
self._total_inference_model_size_mib(),
_total_inference_model_size_mib(
self.model, self.env_vars.get("dtypes", "float32")
),
single_gpu_size_mib,
)
return True
Expand Down
14 changes: 10 additions & 4 deletions src/sagemaker/serve/builder/schema_builder.py
Expand Up @@ -208,12 +208,18 @@

def __repr__(self):
"""Placeholder docstring"""
if hasattr(self, "input_serializer") and hasattr(self, "output_serializer"):
return (

Check warning on line 212 in src/sagemaker/serve/builder/schema_builder.py

View check run for this annotation

Codecov / codecov/patch

src/sagemaker/serve/builder/schema_builder.py#L211-L212

Added lines #L211 - L212 were not covered by tests
f"SchemaBuilder(\n"
f"input_serializer={self.input_serializer}\n"
f"output_serializer={self.output_serializer}\n"
f"input_deserializer={self.input_deserializer._deserializer}\n"
f"output_deserializer={self.output_deserializer._deserializer})"
)
return (
f"SchemaBuilder(\n"
f"input_serializer={self.input_serializer}\n"
f"output_serializer={self.output_serializer}\n"
f"input_deserializer={self.input_deserializer._deserializer}\n"
f"output_deserializer={self.output_deserializer._deserializer})"
f"custom_input_translator={self.custom_input_translator}\n"
f"custom_output_translator={self.custom_output_translator}\n"
)

def generate_marshalling_map(self) -> dict:
Expand Down
27 changes: 27 additions & 0 deletions src/sagemaker/serve/utils/hardware_detector.py
Expand Up @@ -18,12 +18,18 @@

from botocore.exceptions import ClientError

from accelerate.commands.estimate import estimate_command_parser, gather_data
from sagemaker import Session
from sagemaker.model import Model
from sagemaker import instance_types_gpu_info

logger = logging.getLogger(__name__)


MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer


def _get_gpu_info(instance_type: str, session: Session) -> Tuple[int, int]:
"""Get GPU info for the provided instance

Expand Down Expand Up @@ -108,3 +114,24 @@ def _format_instance_type(instance_type: str) -> str:

ec2_instance = ".".join(split_instance)
return ec2_instance


def _total_inference_model_size_mib(model: str, dtype: str) -> float:
    """Estimate a model's total inference memory footprint in MiB.

    Fetches the model size in bytes via HF accelerate's ``estimate`` command,
    adds a 20% inference-time buffer (as found by EleutherAI), and converts
    the result to MiB.

    Args:
        model: Hugging Face model id, e.g. ``"stable-diffusion"``. (Passed as a
            CLI-style string to accelerate's argument parser, so this is a
            ``str``, not a ``Model`` object.)
        dtype: dtype string understood by accelerate, e.g. ``"float32"``.

    Returns:
        Estimated buffered memory requirement in MiB (float).

    Raises:
        ValueError: If accelerate could not determine the model size.
    """
    args = estimate_command_parser().parse_args([model, "--dtypes", dtype])

    # Each row of output is: ("dtype", "Largest Layer", "Total Size Bytes", "Training using Adam")
    output = gather_data(args)

    if output is None:
        raise ValueError(f"Could not get Model size for {model}")

    # output[0][2] is the total size in bytes; buffer it, then convert bytes -> MiB.
    total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
    logger.info("Total memory size MIB: %s", total_memory_size_mib)
    return total_memory_size_mib
5 changes: 3 additions & 2 deletions tests/integ/sagemaker/serve/test_serve_pt_happy.py
Expand Up @@ -181,7 +181,6 @@ def model_builder(request):
# ), f"{caught_ex} was thrown when running pytorch squeezenet local container test"


@pytest.mark.skip(reason="Failing test. Fix is pending.")
@pytest.mark.skipif(
PYTHON_VERSION_IS_NOT_310, # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE,
reason="The goal of these test are to test the serving components of our feature",
Expand Down Expand Up @@ -222,8 +221,10 @@ def test_happy_pytorch_sagemaker_endpoint(
)
if caught_ex:
logger.exception(caught_ex)
ignore_if_worker_dies = "Worker died." in str(caught_ex)
# https://github.com/pytorch/serve/issues/3032
assert (
False
ignore_if_worker_dies
), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"


Expand Down
66 changes: 4 additions & 62 deletions tests/unit/sagemaker/serve/builder/test_model_builder.py
Expand Up @@ -53,9 +53,6 @@
ModelServer.DJL_SERVING,
}

MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer

mock_session = MagicMock()


Expand Down Expand Up @@ -1205,7 +1202,7 @@ def test_build_for_transformers_happy_case(

@patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers")
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._try_fetch_gpu_info")
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib")
@patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib")
@patch("sagemaker.image_uris.retrieve")
@patch("sagemaker.djl_inference.model.urllib")
@patch("sagemaker.djl_inference.model.json")
Expand Down Expand Up @@ -1248,7 +1245,7 @@ def test_build_for_transformers_happy_case_with_values(

@patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_djl", Mock())
@patch("sagemaker.serve.builder.model_builder._get_gpu_info")
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib")
@patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib")
@patch("sagemaker.image_uris.retrieve")
@patch("sagemaker.djl_inference.model.urllib")
@patch("sagemaker.djl_inference.model.json")
Expand Down Expand Up @@ -1293,7 +1290,7 @@ def test_build_for_transformers_happy_case_with_valid_gpu_info(
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers", Mock())
@patch("sagemaker.serve.builder.model_builder._get_gpu_info")
@patch("sagemaker.serve.builder.model_builder._get_gpu_info_fallback")
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib")
@patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib")
@patch("sagemaker.image_uris.retrieve")
@patch("sagemaker.djl_inference.model.urllib")
@patch("sagemaker.djl_inference.model.json")
Expand Down Expand Up @@ -1342,61 +1339,6 @@ def test_build_for_transformers_happy_case_with_valid_gpu_fallback(
)
self.assertEqual(model_builder._can_fit_on_single_gpu(), True)

@patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers", Mock())
@patch("sagemaker.serve.builder.model_builder.estimate_command_parser")
@patch("sagemaker.serve.builder.model_builder.gather_data")
@patch("sagemaker.image_uris.retrieve")
@patch("sagemaker.djl_inference.model.urllib")
@patch("sagemaker.djl_inference.model.json")
@patch("sagemaker.huggingface.llm_utils.urllib")
@patch("sagemaker.huggingface.llm_utils.json")
@patch("sagemaker.model_uris.retrieve")
@patch("sagemaker.serve.builder.model_builder._ServeSettings")
def test_build_for_transformers_happy_case_hugging_face_responses(
self,
mock_serveSettings,
mock_model_uris_retrieve,
mock_llm_utils_json,
mock_llm_utils_urllib,
mock_model_json,
mock_model_urllib,
mock_image_uris_retrieve,
mock_gather_data,
mock_parser,
):
mock_setting_object = mock_serveSettings.return_value
mock_setting_object.role_arn = mock_role_arn
mock_setting_object.s3_model_data_url = mock_s3_model_data_url

mock_model_uris_retrieve.side_effect = KeyError
mock_llm_utils_json.load.return_value = {"pipeline_tag": "text-classification"}
mock_llm_utils_urllib.request.Request.side_effect = Mock()

mock_model_json.load.return_value = {"some": "config"}
mock_model_urllib.request.Request.side_effect = Mock()
mock_image_uris_retrieve.return_value = "https://some-image-uri"

mock_parser.return_value = Mock()
mock_gather_data.return_value = [[1, 1, 1, 1]]
product = MIB_CONVERSION_FACTOR * 1 * MEMORY_BUFFER_MULTIPLIER

model_builder = ModelBuilder(
model="stable-diffusion",
sagemaker_session=mock_session,
instance_type=mock_instance_type,
)
self.assertEqual(model_builder._total_inference_model_size_mib(), product)

mock_parser.return_value = Mock()
mock_gather_data.return_value = None
model_builder = ModelBuilder(
model="stable-diffusion",
sagemaker_session=mock_session,
instance_type=mock_instance_type,
)
with self.assertRaises(ValueError) as _:
model_builder._total_inference_model_size_mib()

@patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_djl")
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._can_fit_on_single_gpu")
@patch("sagemaker.image_uris.retrieve")
Expand Down Expand Up @@ -1556,7 +1498,7 @@ def test_try_fetch_gpu_info_throws(
self.assertEqual(model_builder._can_fit_on_single_gpu(), False)

@patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers", Mock())
@patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib")
@patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib")
@patch("sagemaker.image_uris.retrieve")
@patch("sagemaker.djl_inference.model.urllib")
@patch("sagemaker.djl_inference.model.json")
Expand Down
24 changes: 24 additions & 0 deletions tests/unit/sagemaker/serve/utils/test_hardware_detector.py
Expand Up @@ -13,6 +13,7 @@
from __future__ import absolute_import

from botocore.exceptions import ClientError
from unittest.mock import patch, Mock
import pytest

from sagemaker.serve.utils import hardware_detector
Expand All @@ -21,6 +22,8 @@
VALID_INSTANCE_TYPE = "ml.g5.48xlarge"
INVALID_INSTANCE_TYPE = "fl.c5.57xxlarge"
EXPECTED_INSTANCE_GPU_INFO = (8, 196608)
MIB_CONVERSION_FACTOR = 0.00000095367431640625
MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer


def test_get_gpu_info_success(sagemaker_session, boto_session):
Expand Down Expand Up @@ -96,3 +99,24 @@ def test_format_instance_type_without_ml_success():
formatted_instance_type = hardware_detector._format_instance_type("g5.48xlarge")

assert formatted_instance_type == "g5.48xlarge"


@patch("sagemaker.serve.utils.hardware_detector.estimate_command_parser")
@patch("sagemaker.serve.utils.hardware_detector.gather_data")
def test_total_inference_model_size_mib(
mock_gather_data,
mock_parser,
):
mock_parser.return_value = Mock()
mock_gather_data.return_value = [[1, 1, 1, 1]]
product = MIB_CONVERSION_FACTOR * 1 * MEMORY_BUFFER_MULTIPLIER

assert (
hardware_detector._total_inference_model_size_mib("stable-diffusion", "float32") == product
)

mock_parser.return_value = Mock()
mock_gather_data.return_value = None

with pytest.raises(ValueError):
hardware_detector._total_inference_model_size_mib("stable-diffusion", "float32")