Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions tests/assets/eks-pod-identity/config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
{{$clusterName := DefaultParam .CL2_CLUSTER_NAME "default-cluster-name"}}
{{$metricDimensionName := DefaultParam .CL2_METRIC_DIMENSION_NAME "ClusterName"}}
{{$metricNamespace := DefaultParam .CL2_METRIC_NAMESPACE "EKSPodIdentityScalabilityTests"}}
{{$metricLatencyName := DefaultParam .CL2_METRIC_LATENCY_NAME "CredentialFetchLatency"}}
{{$metricPeriod := DefaultParam .CL2_METRIC_PERIOD 300}}
{{$namespacePrefix := DefaultParam .CL2_NAMESPACE_PREFIX "default"}}
{{$namespaceCount := DefaultParam .CL2_NAMESPACE_COUNT 1}}
{{$totalEksPodIdentityPods := DefaultParam .CL2_EKS_POD_IDENTITY_PODS 5000}}
Expand Down Expand Up @@ -42,6 +47,11 @@ steps:
objectTemplatePath: pod-default.yaml
templateFillMap:
Group: eks-pod-identity
ClusterName: {{$clusterName}}
MetricDimensionName: {{$metricDimensionName}}
MetricNamespace: {{$metricNamespace}}
MetricLatencyName: {{$metricLatencyName}}
MetricPeriod: {{$metricPeriod}}
- name: Waiting for eks pod identity pods to be created
measurements:
- Identifier: WaitForEksPodIdentityPods
Expand Down Expand Up @@ -70,3 +80,8 @@ steps:
objectTemplatePath: pod-default.yaml
templateFillMap:
Group: eks-pod-identity
ClusterName: {{$clusterName}}
MetricDimensionName: {{$metricDimensionName}}
MetricNamespace: {{$metricNamespace}}
MetricLatencyName: {{$metricLatencyName}}
MetricPeriod: {{$metricPeriod}}
86 changes: 81 additions & 5 deletions tests/assets/eks-pod-identity/pod-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,94 @@ metadata:
group: {{.Group}}
spec:
containers:
- image: registry.k8s.io/pause:3.9
imagePullPolicy: IfNotPresent
name: pause
initContainers:
- name: app-init
- name: app-with-awsapi
image: public.ecr.aws/aws-cli/aws-cli:latest
imagePullPolicy: IfNotPresent
env:
- name: CLUSTER_NAME
value: "{{.ClusterName}}"
- name: DIMENSION_NAME
value: "{{.MetricDimensionName}}"
- name: NAMESPACE
value: "{{.MetricNamespace}}"
- name: METRIC_LATENCY_NAME
value: "{{.MetricLatencyName}}"
- name: PERIOD
value: "{{.MetricPeriod}}"
command:
- sh
- -c
- |
AUTH_TOKEN=$(cat $AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE)
MAX_ATTEMPTS=7
INITIAL_DELAY=0.2 # 200ms

DIMENSION_VALUE=$CLUSTER_NAME
METRIC_MAX_RETRIES=5
METRIC_RETRY_DELAY=1

# Make up to MAX_ATTEMPTS credential-fetch attempts with exponential backoff, and measure the elapsed time.
# Publish the measured credential-fetch latency as a CloudWatch metric.
# To minimize transient CloudWatch failures, retry put-metric-data with exponential backoff.
start_epoch=$(date +%s%3N)
# fetch credentials
for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do
status_code=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials)
if [ "$status_code" -eq 200 ]; then
end_epoch=$(date +%s%3N)
printf "Endpoint is reachable at try %d\n" "$i"

latency_ms=$((end_epoch - start_epoch))
latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }")

# send CredentialFetchLatency metric
for ((j=1; j<=METRIC_MAX_RETRIES; j++)); do
aws cloudwatch put-metric-data \
--namespace "$NAMESPACE" \
--metric-name "$METRIC_LATENCY_NAME" \
--dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \
--value "$latency_sec" \
--unit Seconds && {
echo "Metric CredentialFetchLatency sent successfully."
break
}

if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then
echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." >&2
sleep $METRIC_RETRY_DELAY
METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff
else
echo "Failed to send metric CredentialFetchLatency after $METRIC_MAX_RETRIES attempts." >&2
exit 1
fi
done

break
fi

if [ "$i" -eq $((MAX_ATTEMPTS - 1)) ]; then
echo "Max attempts reached. Exiting with failure."
exit 1
fi

SLEEP_TIME=$(echo "$INITIAL_DELAY * (2 ^ $i)" | bc -l)
printf "Failed. Sleeping %.3f seconds before retry...\n" "$SLEEP_TIME"
sleep "$SLEEP_TIME"
done

# Note: a Pod on the host network would fall back to the Node role permissions, which include this S3 access;
# however, in this test case we are not using the host network.
# https://github.com/awslabs/kubernetes-iteration-toolkit/blob/main/tests/assets/eks_node_role.json
# We intentionally skip an STS get-caller-identity verification, mainly because of the STS API quota consumed by scale tests.

# s3 api call
while ! aws s3 ls; do
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think its better to verify the role being used to make sure we are not using the instance role.
aws sts get-caller-identity | grep <role_name>

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit concerned about sts quota we consume running in this account

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i understand about the limits but i think we need a way to make sure that the command is successful because of token by pod identity and not using the instance role.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a comment on how we can make sure this test is working as intended without checking on the assumed role identity

echo "Waiting for S3 bucket access..."
done
echo "S3 bucket is accessible, proceeding."

# pause
while true; do
echo "Sleeping for 1 hour..."
sleep 3600
done
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,22 @@ spec:
default: "200"
- name: cl2-uniform-qps
default: "100"
- name: cl2-metric-dimension-name
description: "default metric dimension name"
default: "ClusterName"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have the same defaults at task level, we don't have to carry this to pipeline. Pipeline would take task level defaults when not supplied.

Same for other params as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but when we make a runpipeline, that's on pipeline level? So we don't need to make another code change here to change the name

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, but run pipeline will take task level defaults when not supplied.

- name: cl2-metric-namespace
description: "default metric namespace for pod identity"
default: "EKSPodIdentityScalabilityTests"
- name: cl2-metric-latency-name
description: "default metric latency name for pod identity"
default: "CredentialFetchLatency"
- name: cl2-metric-period
description: "default metric period"
default: "300"
- name: timeout-pia-pod-creation
default: "20m"
default: "80s"
- name: timeout-pia-pod-startup
default: "5m"
default: "60s"
- name: launch-template-ami
default: ""
description: "Launch template ImageId value, which may be an AMI ID or resolve:ssm reference. By default resolves to the latest AL2023 AMI for the cluster version"
Expand Down Expand Up @@ -189,7 +201,7 @@ spec:
value: "$(params.kubernetes-version)"
- name: endpoint
value: $(params.endpoint)
- name: node-role-name
- name: node-role-name
value: $(params.cluster-name)-node-role
- name: ami
value: $(params.launch-template-ami)
Expand Down Expand Up @@ -279,12 +291,22 @@ spec:
value: $(params.cl2-default-burst)
- name: cl2-uniform-qps
value: $(params.cl2-uniform-qps)
- name: cl2-metric-dimension-name
value: $(params.cl2-metric-dimension-name)
- name: cl2-metric-namespace
value: $(params.cl2-metric-namespace)
- name: cl2-metric-latency-name
value: $(params.cl2-metric-latency-name)
- name: cl2-metric-period
value: $(params.cl2-metric-period)
- name: results-bucket
value: $(params.results-bucket)
- name: nodes
value: $(params.desired-nodes)
- name: cluster-name
value: $(params.cluster-name)
- name: endpoint
value: $(params.endpoint)
- name: namespace-prefix
value: $(params.namespace-prefix)
- name: namespace-count
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@ spec:
- name: cl2-uniform-qps
description: "uniform qps"
default: "100"
- name: cl2-metric-dimension-name
description: "default metric dimension name"
default: "ClusterName"
- name: cl2-metric-namespace
description: "default metric namespace for pod identity"
default: "EKSPodIdentityScalabilityTests"
- name: cl2-metric-latency-name
description: "default metric latency name for pod identity"
default: "CredentialFetchLatency"
- name: cl2-metric-period
description: "default metric period"
default: "300"
- name: nodes
description: "number of dataplane nodes to run the load test against"
default: "1000"
Expand All @@ -35,6 +47,8 @@ spec:
description: The region where the cluster is in.
- name: cluster-name
description: "The name of the EKS cluster you want to spin up"
- name: endpoint
default: ""
- name: namespace-prefix
default: "default"
description: "The prefix of namespaces for EKS Pod Identity test."
Expand Down Expand Up @@ -89,6 +103,11 @@ spec:
CL2_DEFAULT_QPS: $(params.cl2-default-qps)
CL2_DEFAULT_BURST: $(params.cl2-default-burst)
CL2_UNIFORM_QPS: $(params.cl2-uniform-qps)
CL2_CLUSTER_NAME: $(params.cluster-name)
CL2_METRIC_DIMENSION_NAME: $(params.cl2-metric-dimension-name)
CL2_METRIC_NAMESPACE: $(params.cl2-metric-namespace)
CL2_METRIC_LATENCY_NAME: $(params.cl2-metric-latency-name)
CL2_METRIC_PERIOD: $(params.cl2-metric-period)
CL2_NAMESPACE_PREFIX: $(params.namespace-prefix)
CL2_NAMESPACE_COUNT: $(params.namespace-count)
CL2_TIMEOUT_EKS_POD_IDENTITY_POD_CREATION: $(params.timeout-pia-pod-creation)
Expand Down Expand Up @@ -172,9 +191,90 @@ spec:
image: amazon/aws-cli
workingDir: $(workspaces.results.path)
script: |
yum install -y jq

S3_RESULT_PATH=$(cat $(results.s3_result.path))
echo "S3 Path: $S3_RESULT_PATH"
aws sts get-caller-identity

REGION=$(params.region)
ENDPOINT_FLAG=""
if [ -n "$(params.endpoint)" ]; then
ENDPOINT_FLAG="--endpoint $(params.endpoint)"
fi

CLUSTER_NAME=$(params.cluster-name)
NAMESPACE=$(params.cl2-metric-namespace)
DIMENSION_NAME=$(params.cl2-metric-dimension-name)
DIMENSION_VALUE=$CLUSTER_NAME
METRIC_LATENCY_NAME=$(params.cl2-metric-latency-name)
PERIOD=$(params.cl2-metric-period)

# Because scalability tests reuse the same cluster name across cluster recreations,
# the metric query must be bounded to the current run's time window:
# we use the cluster creation time as the start time and the current time as the end time.
START_TIME=$(aws eks $ENDPOINT_FLAG --region $REGION describe-cluster \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you add comments on why you consider this as start time ?

Also, please add comments overall to make it more readable for future users/consumers on your team, especially wherever you are making assumptions.

--name "$CLUSTER_NAME" \
--query "cluster.createdAt" \
--output text)

END_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

response=$(aws cloudwatch get-metric-statistics \
--region "$REGION" \
--namespace "$NAMESPACE" \
--metric-name "$METRIC_LATENCY_NAME" \
--dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \
--start-time "$START_TIME" \
--end-time "$END_TIME" \
--period "$PERIOD" \
--extended-statistics p50 p99 p99.95 \
--output json)

# extract p50 p99 p99.95 of credential fetching
latest=$(echo "$response" | jq -r '.Datapoints | sort_by(.Timestamp) | last')
p50=$(echo "$latest" | jq -r '."ExtendedStatistics"."p50" // "N/A"')
p99=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99" // "N/A"')
p9995=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99.95" // "N/A"')

response=$(aws cloudwatch get-metric-statistics \
--region "$REGION" \
--namespace "$NAMESPACE" \
--metric-name "$METRIC_LATENCY_NAME" \
--dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \
--start-time "$START_TIME" \
--end-time "$END_TIME" \
--period "$PERIOD" \
--statistics SampleCount \
--output json)

total_samples=$(echo "$response" | jq '[.Datapoints[].SampleCount] | add // 0')
Copy link
Contributor

@hakuna-matatah hakuna-matatah Jun 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to capture what is the rate at which credentials are fetched from service. To be able to compare that against the scheduler throughput.

Currently you have some kind of measuring by able to control the timeout param https://github.com/awslabs/kubernetes-iteration-toolkit/pull/512/files#diff-d2d660edac904aa96e330bfae7bf67ef6885190877c5ab7668f0f157057da03fR61
accordingly by computing total number of pods and client pod creation rate/scheduler rate and what your service throughput could be.

This will only give some kind of approximation but not fully accurate throughput of your service using this kind of measurement.

rate=$(params.cl2-default-qps)

# save metric results for s3 upload
cat <<EOF > eks_pod_identity_test_summary.json
{
"start_time": "$START_TIME",
"end_time": "$END_TIME",
"total_samples": $total_samples,
"rate": $rate,
"p50": $p50,
"p99": $p99,
"p99.95": $p9995
}
EOF

# we expect to see all files from loadtest that clusterloader2 outputs here in this dir
ls -larth
aws s3 cp . s3://$S3_RESULT_PATH/ --recursive

# if p99.95 is equal to or more than 1 second, exit with failure
int_p9995=$(echo "$p9995" | awk '{printf "%d", $1}')
if [ "$int_p9995" -lt 1 ]; then
echo "p99.95 is less than 1 second"
echo "1" | tee $(results.datapoint.path)
else
echo "p99.95 is 1 second or more"
echo "0" | tee $(results.datapoint.path)
exit 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to exit and fail the test, we need to capture the test result like this

exit_code=$?
if [ $exit_code -eq 0 ]; then
echo "1" | tee $(results.datapoint.path)
else
echo "0" | tee $(results.datapoint.path)
fi
exit $exit_code

and use that to emit result like how we do it here -

So it can used configure alarm and cut tickets to your team.

fi
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ spec:
ENDPOINT_FLAG="--endpoint $(params.endpoint)"
fi

MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
S3_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
CLOUDWATCH_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/CloudWatchFullAccess"
TRUST_POLICY_FILE="pia-trust-policy.json"
# create a trust policy json file
curl -s $(params.pia-trust-policy-url) -o ./$TRUST_POLICY_FILE
Expand All @@ -57,7 +58,8 @@ spec:

PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i
aws iam create-role --role-name $PIA_ROLE_NAME --assume-role-policy-document file://$TRUST_POLICY_FILE
aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $MANAGED_POLICY_ARN
aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $S3_MANAGED_POLICY_ARN
aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $CLOUDWATCH_MANAGED_POLICY_ARN
PIA_ROLE_ARN=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.Arn' --output text)
echo "$PIA_ROLE_ARN is created"

Expand Down