Fix benchmark feature read-only APIs #4675

5 changes: 1 addition & 4 deletions src/sagemaker/jumpstart/model.py
@@ -464,10 +464,7 @@ def benchmark_metrics(self) -> pd.DataFrame:
Returns:
Benchmark Metrics: Pandas DataFrame object.
"""
df = pd.DataFrame(self._get_deployment_configs_benchmarks_data())
default_mask = df.apply(lambda row: any("Default" in str(val) for val in row), axis=1)
sorted_df = pd.concat([df[default_mask], df[~default_mask]])
return sorted_df
return pd.DataFrame(self._get_deployment_configs_benchmarks_data())

def display_benchmark_metrics(self, *args, **kwargs) -> None:
"""Display deployment configs benchmark metrics."""
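For context, a minimal usage sketch of the read-only benchmark APIs this hunk touches, assuming benchmark_metrics is exposed as a property and display_benchmark_metrics prints the same table; the model id is hypothetical:

from sagemaker.jumpstart.model import JumpStartModel

# Hypothetical model id; any JumpStart model with benchmark metadata works.
model = JumpStartModel(model_id="meta-textgeneration-llama-2-7b")

# After this change, benchmark_metrics returns the raw DataFrame built from
# _get_deployment_configs_benchmarks_data(), without the "Default"-row sorting.
df = model.benchmark_metrics
print(df.head())

# Pretty-prints the same benchmark table.
model.display_benchmark_metrics()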
3 changes: 2 additions & 1 deletion src/sagemaker/jumpstart/types.py
@@ -746,7 +746,7 @@ def _get_regional_property(
class JumpStartBenchmarkStat(JumpStartDataHolderType):
"""Data class JumpStart benchmark stat."""

__slots__ = ["name", "value", "unit"]
__slots__ = ["name", "value", "unit", "concurrency"]

def __init__(self, spec: Dict[str, Any]):
"""Initializes a JumpStartBenchmarkStat object.
@@ -765,6 +765,7 @@ def from_json(self, json_obj: Dict[str, Any]) -> None:
self.name: str = json_obj["name"]
self.value: str = json_obj["value"]
self.unit: Union[int, str] = json_obj["unit"]
self.concurrency: Union[int, str] = json_obj["concurrency"]

def to_json(self) -> Dict[str, Any]:
"""Returns json representation of JumpStartBenchmarkStat object."""
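A short sketch of how the new required concurrency key flows through JumpStartBenchmarkStat; the payload is hypothetical and to_json is assumed to serialize the declared slots:

from sagemaker.jumpstart.types import JumpStartBenchmarkStat

# "concurrency" is now read in from_json alongside name, value, and unit.
stat = JumpStartBenchmarkStat(
    {"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
)
print(stat.name, stat.value, stat.unit, stat.concurrency)  # Latency 100 Tokens/S 1
print(stat.to_json())  # assumed to include the new concurrency slot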
100 changes: 75 additions & 25 deletions src/sagemaker/jumpstart/utils.py
@@ -1082,7 +1082,9 @@ def add_instance_rate_stats_to_benchmark_metrics(

if not benchmark_metric_stats:
benchmark_metric_stats = []
benchmark_metric_stats.append(JumpStartBenchmarkStat(instance_type_rate))
benchmark_metric_stats.append(
JumpStartBenchmarkStat({"concurrency": None, **instance_type_rate})
)

final_benchmark_metrics[instance_type] = benchmark_metric_stats
except ClientError as e:
@@ -1127,43 +1129,91 @@ def get_metrics_from_deployment_configs(
if not deployment_configs:
return {}

data = {"Instance Type": [], "Config Name": []}
data = {"Instance Type": [], "Concurrent Users": [], "Config Name": []}
instance_rate_data = {}
for index, deployment_config in enumerate(deployment_configs):
benchmark_metrics = deployment_config.benchmark_metrics
if not deployment_config.deployment_args or not benchmark_metrics:
continue

for inner_index, current_instance_type in enumerate(benchmark_metrics):
current_instance_type_metrics = benchmark_metrics[current_instance_type]

data["Config Name"].append(deployment_config.deployment_config_name)
instance_type_to_display = (
f"{current_instance_type} (Default)"
if index == 0
and current_instance_type == deployment_config.deployment_args.default_instance_type
else current_instance_type
for current_instance_type, current_instance_type_metrics in benchmark_metrics.items():
instance_type_rate, concurrent_users = _normalize_benchmark_metrics(
current_instance_type_metrics
)
data["Instance Type"].append(instance_type_to_display)

for metric in current_instance_type_metrics:
column_name = f"{metric.name} ({metric.unit})"

if metric.name.lower() == "instance rate":
if column_name not in instance_rate_data:
instance_rate_data[column_name] = []
instance_rate_data[column_name].append(metric.value)
else:
if column_name not in data:
data[column_name] = []
for _ in range(len(data[column_name]), inner_index):
data[column_name].append(" - ")

for concurrent_user, metrics in concurrent_users.items():
instance_type_to_display = (
f"{current_instance_type} (Default)"
if index == 0
and int(concurrent_user) == 1
and current_instance_type
== deployment_config.deployment_args.default_instance_type
else current_instance_type
)

instance_rate_column_name = f"{instance_type_rate.name} ({instance_type_rate.unit})"
instance_rate_data[instance_rate_column_name] = instance_rate_data.get(
instance_rate_column_name, []
)

data["Config Name"].append(deployment_config.deployment_config_name)
data["Instance Type"].append(instance_type_to_display)
data["Concurrent Users"].append(concurrent_user)
instance_rate_data[instance_rate_column_name].append(instance_type_rate.value)

for metric in metrics:
column_name = _normalize_benchmark_metric_column_name(metric.name)
data[column_name] = data.get(column_name, [])
data[column_name].append(metric.value)

data = {**data, **instance_rate_data}
return data


def _normalize_benchmark_metric_column_name(name: str) -> str:
"""Normalizes benchmark metric column name.

Args:
name (str): Name of the metric.
Returns:
str: Normalized metric column name.
"""
if "latency" in name.lower():
name = "Latency for each user (TTFT in ms)"

Reviewer: could we use the metric unit from metadata directly?

Author (Collaborator): We can, but only if the metadata is updated to the desired unit. Here is what it looks like today:

{
    "name": "latency",
    "value": "36",
    "unit": "ms/token",
    "concurrency": "2"
},

elif "throughput" in name.lower():
name = "Throughput per user (token/seconds)"
return name


def _normalize_benchmark_metrics(
benchmark_metric_stats: List[JumpStartBenchmarkStat],
) -> Tuple[JumpStartBenchmarkStat, Dict[str, List[JumpStartBenchmarkStat]]]:
"""Normalizes benchmark metrics dict.

Args:
benchmark_metric_stats (List[JumpStartBenchmarkStat]):
List of benchmark metrics stats.
Returns:
Tuple[JumpStartBenchmarkStat, Dict[str, List[JumpStartBenchmarkStat]]]:
Normalized benchmark metrics dict.
"""
instance_type_rate = None
concurrent_users = {}
for current_instance_type_metric in benchmark_metric_stats:
if current_instance_type_metric.name.lower() == "instance rate":
instance_type_rate = current_instance_type_metric
elif current_instance_type_metric.concurrency not in concurrent_users:
concurrent_users[current_instance_type_metric.concurrency] = [
current_instance_type_metric
]
else:
concurrent_users[current_instance_type_metric.concurrency].append(
current_instance_type_metric
)

return instance_type_rate, concurrent_users


def deployment_config_response_data(
deployment_configs: Optional[List[DeploymentConfigMetadata]],
) -> List[Dict[str, Any]]:
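To make the new table layout concrete, a hedged sketch of the dict get_metrics_from_deployment_configs now builds, with hypothetical values; the real columns depend on the metric names and units present in the model metadata:

import pandas as pd

# One row per (instance type, concurrent users) pair; instance-rate columns are
# appended after the per-concurrency metrics via {**data, **instance_rate_data}.
data = {
    "Instance Type": ["ml.p3.2xlarge (Default)", "ml.p3.2xlarge"],
    "Concurrent Users": ["1", "2"],
    "Config Name": ["gpu-inference", "gpu-inference"],
    "Latency for each user (TTFT in ms)": ["33", "36"],
    "Throughput per user (token/seconds)": ["12", "10"],
    "Instance Rate (USD/Hrs)": ["0.0083000000", "0.0083000000"],
}
print(pd.DataFrame(data))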
52 changes: 39 additions & 13 deletions tests/unit/sagemaker/jumpstart/constants.py
@@ -7662,25 +7662,33 @@
"inference_configs": {
"neuron-inference": {
"benchmark_metrics": {
"ml.inf2.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
"ml.inf2.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
]
},
"component_names": ["neuron-inference"],
},
"neuron-inference-budget": {
"benchmark_metrics": {
"ml.inf2.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
"ml.inf2.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
]
},
"component_names": ["neuron-base"],
},
"gpu-inference-budget": {
"benchmark_metrics": {
"ml.p3.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
"ml.p3.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
]
},
"component_names": ["gpu-inference-budget"],
},
"gpu-inference": {
"benchmark_metrics": {
"ml.p3.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
"ml.p3.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
]
},
"component_names": ["gpu-inference"],
},
@@ -7748,8 +7756,12 @@
"training_configs": {
"neuron-training": {
"benchmark_metrics": {
"ml.tr1n1.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}],
"ml.tr1n1.4xlarge": [{"name": "Latency", "value": "50", "unit": "Tokens/S"}],
"ml.tr1n1.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
],
"ml.tr1n1.4xlarge": [
{"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
],
},
"component_names": ["neuron-training"],
"default_inference_config": "neuron-inference",
@@ -7759,8 +7771,12 @@
},
"neuron-training-budget": {
"benchmark_metrics": {
"ml.tr1n1.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}],
"ml.tr1n1.4xlarge": [{"name": "Latency", "value": "50", "unit": "Tokens/S"}],
"ml.tr1n1.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
],
"ml.tr1n1.4xlarge": [
{"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
],
},
"component_names": ["neuron-training-budget"],
"default_inference_config": "neuron-inference-budget",
@@ -7770,7 +7786,9 @@
},
"gpu-training": {
"benchmark_metrics": {
"ml.p3.2xlarge": [{"name": "Latency", "value": "200", "unit": "Tokens/S"}],
"ml.p3.2xlarge": [
{"name": "Latency", "value": "200", "unit": "Tokens/S", "concurrency": "1"}
],
},
"component_names": ["gpu-training"],
"default_inference_config": "gpu-inference",
@@ -7780,7 +7798,9 @@
},
"gpu-training-budget": {
"benchmark_metrics": {
"ml.p3.2xlarge": [{"name": "Latency", "value": "100", "unit": "Tokens/S"}]
"ml.p3.2xlarge": [
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": "1"}
]
},
"component_names": ["gpu-training-budget"],
"default_inference_config": "gpu-inference-budget",
@@ -7966,7 +7986,9 @@
"ContainerStartupHealthCheckTimeout": None,
},
"AccelerationConfigs": None,
"BenchmarkMetrics": [{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}],
"BenchmarkMetrics": [
{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs", "concurrency": 1}
],
},
{
"DeploymentConfigName": "neuron-inference-budget",
@@ -7998,7 +8020,9 @@
"ContainerStartupHealthCheckTimeout": None,
},
"AccelerationConfigs": None,
"BenchmarkMetrics": [{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}],
"BenchmarkMetrics": [
{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs", "concurrency": 1}
],
},
{
"DeploymentConfigName": "gpu-inference-budget",
@@ -8030,7 +8054,9 @@
"ContainerStartupHealthCheckTimeout": None,
},
"AccelerationConfigs": None,
"BenchmarkMetrics": [{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}],
"BenchmarkMetrics": [
{"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs", "concurrency": 1}
],
},
{
"DeploymentConfigName": "gpu-inference",
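The fixtures above now carry a concurrency field on every benchmark entry. A hedged sketch of why the utils.py change spreads in a None default when wrapping instance rates, assuming the runtime rate payload need not include that key:

from sagemaker.jumpstart.types import JumpStartBenchmarkStat

# Hypothetical instance-rate payload without a concurrency key.
instance_type_rate = {"name": "Instance Rate", "value": "0.0083000000", "unit": "USD/Hrs"}

# The leading "concurrency": None keeps from_json's json_obj["concurrency"] lookup
# from raising KeyError, while an explicit value in the payload would still win.
stat = JumpStartBenchmarkStat({"concurrency": None, **instance_type_rate})
print(stat.concurrency)  # None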
12 changes: 9 additions & 3 deletions tests/unit/sagemaker/jumpstart/test_types.py
@@ -1027,7 +1027,9 @@ def test_inference_configs_parsing():

assert config.benchmark_metrics == {
"ml.inf2.2xlarge": [
JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
JumpStartBenchmarkStat(
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
),
]
}
assert len(config.config_components) == 1
@@ -1191,10 +1193,14 @@ def test_training_configs_parsing():

assert config.benchmark_metrics == {
"ml.tr1n1.2xlarge": [
JumpStartBenchmarkStat({"name": "Latency", "value": "100", "unit": "Tokens/S"})
JumpStartBenchmarkStat(
{"name": "Latency", "value": "100", "unit": "Tokens/S", "concurrency": 1}
),
],
"ml.tr1n1.4xlarge": [
JumpStartBenchmarkStat({"name": "Latency", "value": "50", "unit": "Tokens/S"})
JumpStartBenchmarkStat(
{"name": "Latency", "value": "50", "unit": "Tokens/S", "concurrency": 1}
),
],
}
assert len(config.config_components) == 1