diff --git a/src/sagemaker/jumpstart/model.py b/src/sagemaker/jumpstart/model.py
index 6f263d9a7e..534e93c285 100644
--- a/src/sagemaker/jumpstart/model.py
+++ b/src/sagemaker/jumpstart/model.py
@@ -465,13 +465,13 @@ def benchmark_metrics(self) -> pd.DataFrame:
             Benchmark Metrics: Pandas DataFrame object.
         """
         df = pd.DataFrame(self._get_deployment_configs_benchmarks_data())
-        default_mask = df.apply(lambda row: any("Default" in str(val) for val in row), axis=1)
-        sorted_df = pd.concat([df[default_mask], df[~default_mask]])
-        return sorted_df
+        blank_index = [""] * len(df)
+        df.index = blank_index
+        return df
 
     def display_benchmark_metrics(self, *args, **kwargs) -> None:
         """Display deployment configs benchmark metrics."""
-        print(self.benchmark_metrics.to_markdown(index=False), *args, **kwargs)
+        print(self.benchmark_metrics.to_markdown(index=False, floatfmt=".2f"), *args, **kwargs)
 
     def list_deployment_configs(self) -> List[Dict[str, Any]]:
         """List deployment configs for ``This`` model.
@@ -911,16 +911,34 @@ def _get_deployment_configs(
                 )
             )
 
+            config_components = metadata_config.config_components.get(config_name)
+            image_uri = (
+                (
+                    config_components.hosting_instance_type_variants.get("regional_aliases", {})
+                    .get(self.region, {})
+                    .get("alias_ecr_uri_1")
+                )
+                if config_components
+                else self.image_uri
+            )
+
             init_kwargs = get_init_kwargs(
+                config_name=config_name,
                 model_id=self.model_id,
                 instance_type=instance_type_to_use,
                 sagemaker_session=self.sagemaker_session,
+                image_uri=image_uri,
+                region=self.region,
+                model_version=self.model_version,
             )
             deploy_kwargs = get_deploy_kwargs(
                 model_id=self.model_id,
                 instance_type=instance_type_to_use,
                 sagemaker_session=self.sagemaker_session,
+                region=self.region,
+                model_version=self.model_version,
             )
+
             deployment_config_metadata = DeploymentConfigMetadata(
                 config_name,
                 metadata_config.benchmark_metrics,
diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py
index 7a51d075ae..7a43f0ff67 100644
--- a/src/sagemaker/jumpstart/types.py
+++ b/src/sagemaker/jumpstart/types.py
@@ -746,7 +746,7 @@ def _get_regional_property(
 class JumpStartBenchmarkStat(JumpStartDataHolderType):
     """Data class JumpStart benchmark stat."""
 
-    __slots__ = ["name", "value", "unit"]
+    __slots__ = ["name", "value", "unit", "concurrency"]
 
     def __init__(self, spec: Dict[str, Any]):
         """Initializes a JumpStartBenchmarkStat object.
@@ -765,6 +765,7 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:
         self.name: str = json_obj["name"]
         self.value: str = json_obj["value"]
         self.unit: Union[int, str] = json_obj["unit"]
+        self.concurrency: Union[int, str] = json_obj["concurrency"]
 
     def to_json(self) -> Dict[str, Any]:
         """Returns json representation of JumpStartBenchmarkStat object."""
diff --git a/src/sagemaker/jumpstart/utils.py b/src/sagemaker/jumpstart/utils.py
index 44be0ea813..7a2ed4c599 100644
--- a/src/sagemaker/jumpstart/utils.py
+++ b/src/sagemaker/jumpstart/utils.py
@@ -1082,7 +1082,9 @@ def add_instance_rate_stats_to_benchmark_metrics(
 
             if not benchmark_metric_stats:
                 benchmark_metric_stats = []
-            benchmark_metric_stats.append(JumpStartBenchmarkStat(instance_type_rate))
+            benchmark_metric_stats.append(
+                JumpStartBenchmarkStat({"concurrency": None, **instance_type_rate})
+            )
 
             final_benchmark_metrics[instance_type] = benchmark_metric_stats
     except ClientError as e:
@@ -1127,43 +1129,94 @@ def get_metrics_from_deployment_configs(
     if not deployment_configs:
         return {}
 
-    data = {"Instance Type": [], "Config Name": []}
+    data = {"Instance Type": [], "Config Name": [], "Concurrent Users": []}
     instance_rate_data = {}
     for index, deployment_config in enumerate(deployment_configs):
         benchmark_metrics = deployment_config.benchmark_metrics
         if not deployment_config.deployment_args or not benchmark_metrics:
             continue
 
-        for inner_index, current_instance_type in enumerate(benchmark_metrics):
-            current_instance_type_metrics = benchmark_metrics[current_instance_type]
-
-            data["Config Name"].append(deployment_config.deployment_config_name)
-            instance_type_to_display = (
-                f"{current_instance_type} (Default)"
-                if index == 0
-                and current_instance_type == deployment_config.deployment_args.default_instance_type
-                else current_instance_type
+        for current_instance_type, current_instance_type_metrics in benchmark_metrics.items():
+            instance_type_rate, concurrent_users = _normalize_benchmark_metrics(
+                current_instance_type_metrics
             )
-            data["Instance Type"].append(instance_type_to_display)
-
-            for metric in current_instance_type_metrics:
-                column_name = f"{metric.name} ({metric.unit})"
-
-                if metric.name.lower() == "instance rate":
-                    if column_name not in instance_rate_data:
-                        instance_rate_data[column_name] = []
-                    instance_rate_data[column_name].append(metric.value)
-                else:
-                    if column_name not in data:
-                        data[column_name] = []
-                    for _ in range(len(data[column_name]), inner_index):
-                        data[column_name].append(" - ")
+
+            for concurrent_user, metrics in concurrent_users.items():
+                instance_type_to_display = (
+                    f"{current_instance_type} (Default)"
+                    if index == 0
+                    and int(concurrent_user) == 1
+                    and current_instance_type
+                    == deployment_config.deployment_args.default_instance_type
+                    else current_instance_type
+                )
+
+                data["Config Name"].append(deployment_config.deployment_config_name)
+                data["Instance Type"].append(instance_type_to_display)
+                data["Concurrent Users"].append(concurrent_user)
+
+                if instance_type_rate:
+                    instance_rate_column_name = (
+                        f"{instance_type_rate.name} ({instance_type_rate.unit})"
+                    )
+                    instance_rate_data[instance_rate_column_name] = instance_rate_data.get(
+                        instance_rate_column_name, []
+                    )
+                    instance_rate_data[instance_rate_column_name].append(instance_type_rate.value)
+
+                for metric in metrics:
+                    column_name = _normalize_benchmark_metric_column_name(metric.name)
+                    data[column_name] = data.get(column_name, [])
                     data[column_name].append(metric.value)
 
     data = {**data, **instance_rate_data}
     return data
 
 
+def _normalize_benchmark_metric_column_name(name: str) -> str:
+    """Normalizes benchmark metric column name.
+
+    Args:
+        name (str): Name of the metric.
+    Returns:
+        str: Normalized metric column name.
+    """
+    if "latency" in name.lower():
+        name = "Latency for each user (TTFT in ms)"
+    elif "throughput" in name.lower():
+        name = "Throughput per user (token/seconds)"
+    return name
+
+
+def _normalize_benchmark_metrics(
+    benchmark_metric_stats: List[JumpStartBenchmarkStat],
+) -> Tuple[JumpStartBenchmarkStat, Dict[str, List[JumpStartBenchmarkStat]]]:
+    """Normalizes benchmark metrics dict.
+
+    Args:
+        benchmark_metric_stats (List[JumpStartBenchmarkStat]):
+            List of benchmark metrics stats.
+    Returns:
+        Tuple[JumpStartBenchmarkStat, Dict[str, List[JumpStartBenchmarkStat]]]:
+            Normalized benchmark metrics dict.
+    """
+    instance_type_rate = None
+    concurrent_users = {}
+    for current_instance_type_metric in benchmark_metric_stats:
+        if current_instance_type_metric.name.lower() == "instance rate":
+            instance_type_rate = current_instance_type_metric
+        elif current_instance_type_metric.concurrency not in concurrent_users:
+            concurrent_users[current_instance_type_metric.concurrency] = [
+                current_instance_type_metric
+            ]
+        else:
+            concurrent_users[current_instance_type_metric.concurrency].append(
+                current_instance_type_metric
+            )
+
+    return instance_type_rate, concurrent_users
+
+
 def deployment_config_response_data(
     deployment_configs: Optional[List[DeploymentConfigMetadata]],
 ) -> List[Dict[str, Any]]:
diff --git a/tests/unit/sagemaker/jumpstart/constants.py b/tests/unit/sagemaker/jumpstart/constants.py
index 3815bfc9ef..a9c067a8da 100644
--- a/tests/unit/sagemaker/jumpstart/constants.py
+++ b/tests/unit/sagemaker/jumpstart/constants.py
@@ -7662,25 +7662,33 @@
         "inference_configs": {
             "neuron-inference": {
                 "benchmark_metrics": {
-                    "ml.inf2.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
+                    "ml.inf2.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                    ]
                 },
                 "component_names": ["neuron-inference"],
             },
             "neuron-inference-budget": {
                 "benchmark_metrics": {
-                    "ml.inf2.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
+                    "ml.inf2.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                    ]
                 },
                 "component_names": ["neuron-base"],
             },
             "gpu-inference-budget": {
                 "benchmark_metrics": {
-                    "ml.p3.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
+                    "ml.p3.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                    ]
                },
                "component_names": ["gpu-inference-budget"],
            },
            "gpu-inference": {
                "benchmark_metrics": {
-                    "ml.p3.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
+                    "ml.p3.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                    ]
                },
                "component_names": ["gpu-inference"],
            },
@@ -7748,8 +7756,12 @@
         "training_configs": {
             "neuron-training": {
                 "benchmark_metrics": {
-                    "ml.tr1n1.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}],
-                    "ml.tr1n1.4xlarge": [{"name": "Latency", "value": "50", "unit": "Tokens/S"}],
+                    "ml.tr1n1.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                    ],
+                    "ml.tr1n1.4xlarge": [
+                        {"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
+                    ],
                 },
                 "component_names": ["neuron-training"],
                 "default_inference_config": "neuron-inference",
@@ -7759,8 +7771,12 @@
             },
             "neuron-training-budget": {
                 "benchmark_metrics": {
-                    "ml.tr1n1.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}],
-                    "ml.tr1n1.4xlarge": [{"name": "Latency", "value": "50", "unit": "Tokens/S"}],
+                    "ml.tr1n1.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                    ],
+                    "ml.tr1n1.4xlarge": [
+                        {"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
+                    ],
                 },
                 "component_names": ["neuron-training-budget"],
                 "default_inference_config": "neuron-inference-budget",
@@ -7770,7 +7786,9 @@
             },
             "gpu-training": {
                 "benchmark_metrics": {
-                    "ml.p3.2xlarge": [{"name": "Latency", "value": "200", "unit": "Tokens/S"}],
+                    "ml.p3.2xlarge": [
+                        {"name": "Latency", "value": "200", "unit": "Tokens/S", "concurrency": "1"}
+                    ],
                 },
                 "component_names": ["gpu-training"],
                 "default_inference_config": "gpu-inference",
@@ -7780,7 +7798,9 @@
             },
             "gpu-training-budget": {
                 "benchmark_metrics": {
-                    "ml.p3.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
+                    "ml.p3.2xlarge": [
+                        {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": "1"}
+                    ]
                },
                "component_names": ["gpu-training-budget"],
                "default_inference_config": "gpu-inference-budget",
@@ -7966,7 +7986,9 @@
             "ContainerStartupHealthCheckTimeout": None,
         },
         "AccelerationConfigs": None,
-        "BenchmarkMetrics": [{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}],
+        "BenchmarkMetrics": [
+            {"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs", "concurrency": 1}
+        ],
     },
     {
         "DeploymentConfigName": "neuron-inference-budget",
@@ -7998,7 +8020,9 @@
             "ContainerStartupHealthCheckTimeout": None,
         },
         "AccelerationConfigs": None,
-        "BenchmarkMetrics": [{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}],
+        "BenchmarkMetrics": [
+            {"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs", "concurrency": 1}
+        ],
     },
     {
         "DeploymentConfigName": "gpu-inference-budget",
@@ -8030,7 +8054,9 @@
             "ContainerStartupHealthCheckTimeout": None,
         },
         "AccelerationConfigs": None,
-        "BenchmarkMetrics": [{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}],
+        "BenchmarkMetrics": [
+            {"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs", "concurrency": 1}
+        ],
     },
     {
         "DeploymentConfigName": "gpu-inference",
diff --git a/tests/unit/sagemaker/jumpstart/test_types.py b/tests/unit/sagemaker/jumpstart/test_types.py
index f8c9c81a38..7b9e8c4519 100644
--- a/tests/unit/sagemaker/jumpstart/test_types.py
+++ b/tests/unit/sagemaker/jumpstart/test_types.py
@@ -1027,7 +1027,9 @@ def test_inference_configs_parsing():
 
     assert config.benchmark_metrics == {
         "ml.inf2.2xlarge": [
-            JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+            ),
         ]
     }
     assert len(config.config_components) == 1
@@ -1191,10 +1193,14 @@ def test_training_configs_parsing():
 
     assert config.benchmark_metrics == {
         "ml.tr1n1.2xlarge": [
-            JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+            ),
        ],
        "ml.tr1n1.4xlarge": [
-            JumpStartBenchmarkStat({"name": "Latency", "value": "50", "unit": "Tokens/S"})
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
+            ),
        ],
    }
    assert len(config.config_components) == 1
diff --git a/tests/unit/sagemaker/jumpstart/test_utils.py b/tests/unit/sagemaker/jumpstart/test_utils.py
index e6ea212994..8c3bb067be 100644
--- a/tests/unit/sagemaker/jumpstart/test_utils.py
+++ b/tests/unit/sagemaker/jumpstart/test_utils.py
@@ -1709,22 +1709,30 @@ def test_get_jumpstart_benchmark_stats_full_list(
     ) == {
         "neuron-inference": {
             "ml.inf2.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ]
         },
         "neuron-inference-budget": {
             "ml.inf2.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ]
         },
         "gpu-inference-budget": {
             "ml.p3.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
            ]
        },
        "gpu-inference": {
            "ml.p3.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
            ]
        },
    }
@@ -1744,12 +1752,16 @@ def test_get_jumpstart_benchmark_stats_partial_list(
     ) == {
         "neuron-inference-budget": {
             "ml.inf2.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ]
         },
         "gpu-inference-budget": {
             "ml.p3.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ]
         },
     }
@@ -1769,7 +1781,9 @@ def test_get_jumpstart_benchmark_stats_single_stat(
     ) == {
         "neuron-inference-budget": {
             "ml.inf2.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ]
         }
     }
@@ -1797,6 +1811,16 @@ def test_get_jumpstart_benchmark_stats_training(
 ):
     patched_get_model_specs.side_effect = get_base_spec_with_prototype_configs
 
+    print(
+        utils.get_benchmark_stats(
+            "mock-region",
+            "mock-model",
+            "mock-model-version",
+            scope=JumpStartScriptScope.TRAINING,
+            config_names=["neuron-training", "gpu-training-budget"],
+        )
+    )
+
     assert utils.get_benchmark_stats(
         "mock-region",
         "mock-model",
@@ -1806,15 +1830,21 @@ def test_get_jumpstart_benchmark_stats_training(
     ) == {
         "neuron-training": {
             "ml.tr1n1.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ],
             "ml.tr1n1.4xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "50", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
+                )
             ],
         },
         "gpu-training-budget": {
             "ml.p3.2xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": "1"}
+                )
            ]
        },
    }
@@ -1845,10 +1875,14 @@ def test_add_instance_rate_stats_to_benchmark_metrics(
         "us-west-2",
         {
             "ml.p2.xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ],
             "ml.gd4.xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ],
         },
     )
@@ -1862,9 +1896,67 @@ def test_add_instance_rate_stats_to_benchmark_metrics(
         "name": "Instance Rate",
         "unit": "USD/Hrs",
         "value": "3.76",
+        "concurrency": None,
     }
 
 
+def test__normalize_benchmark_metrics():
+    rate, metrics = utils._normalize_benchmark_metrics(
+        [
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+            ),
+            JumpStartBenchmarkStat(
+                {"name": "Throughput", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+            ),
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 2}
+            ),
+            JumpStartBenchmarkStat(
+                {"name": "Throughput", "value": "100", "unit": "Tokens/S", "concurrency": 2}
+            ),
+            JumpStartBenchmarkStat(
+                {"name": "Instance Rate", "unit": "USD/Hrs", "value": "3.76", "concurrency": None}
+            ),
+        ]
+    )
+
+    assert rate == JumpStartBenchmarkStat(
+        {"name": "Instance Rate", "unit": "USD/Hrs", "value": "3.76", "concurrency": None}
+    )
+    assert metrics == {
+        1: [
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+            ),
+            JumpStartBenchmarkStat(
+                {"name": "Throughput", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+            ),
+        ],
+        2: [
+            JumpStartBenchmarkStat(
+                {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 2}
+            ),
+            JumpStartBenchmarkStat(
+                {"name": "Throughput", "value": "100", "unit": "Tokens/S", "concurrency": 2}
+            ),
+        ],
+    }
+
+
+@pytest.mark.parametrize(
+    "name, expected",
+    [
+        ("latency", "Latency for each user (TTFT in ms)"),
+        ("throughput", "Throughput per user (token/seconds)"),
+    ],
+)
+def test__normalize_benchmark_metric_column_name(name, expected):
+    out = utils._normalize_benchmark_metric_column_name(name)
+
+    assert out == expected
+
+
 @patch("sagemaker.jumpstart.utils.get_instance_rate_per_hour")
 def test_add_instance_rate_stats_to_benchmark_metrics_client_ex(
     mock_get_instance_rate_per_hour,
@@ -1883,7 +1975,9 @@ def test_add_instance_rate_stats_to_benchmark_metrics_client_ex(
         "us-west-2",
         {
             "ml.p2.xlarge": [
-                JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
+                )
             ],
         },
     )
@@ -1899,10 +1993,26 @@
     [
         (None, True),
         (
-            [JumpStartBenchmarkStat({"name": "Instance Rate", "unit": "USD/Hrs", "value": "3.76"})],
+            [
+                JumpStartBenchmarkStat(
+                    {
+                        "name": "Instance Rate",
+                        "unit": "USD/Hrs",
+                        "value": "3.76",
+                        "concurrency": None,
+                    }
+                )
+            ],
             True,
         ),
-        ([JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})], False),
+        (
+            [
+                JumpStartBenchmarkStat(
+                    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": None}
+                )
+            ],
+            False,
+        ),
     ],
 )
 def test_has_instance_rate_stat(stats, expected):
diff --git a/tests/unit/sagemaker/jumpstart/utils.py b/tests/unit/sagemaker/jumpstart/utils.py
index 63b964e16e..97ee36e998 100644
--- a/tests/unit/sagemaker/jumpstart/utils.py
+++ b/tests/unit/sagemaker/jumpstart/utils.py
@@ -366,7 +366,12 @@ def get_base_deployment_configs_metadata(
     for instance_type in benchmark_metrics:
         benchmark_metrics[instance_type].append(
             JumpStartBenchmarkStat(
-                {"name": "Instance Rate", "unit": "USD/Hrs", "value": "3.76"}
+                {
+                    "name": "Instance Rate",
+                    "unit": "USD/Hrs",
+                    "value": "3.76",
+                    "concurrency": None,
+                }
             )
         )
 
@@ -409,7 +414,12 @@ def append_instance_stat_metrics(
     for key in metrics:
         metrics[key].append(
             JumpStartBenchmarkStat(
-                {"name": "Instance Rate", "value": "3.76", "unit": "USD/Hrs"}
+                {
+                    "name": "Instance Rate",
+                    "value": "3.76",
+                    "unit": "USD/Hrs",
+                    "concurrency": None,
+                }
            )
        )
    return metrics