From 75cc295aceb849a711d3627221a51d14c6c1817c Mon Sep 17 00:00:00 2001 From: xdu31 Date: Wed, 28 May 2025 20:05:57 -0700 Subject: [PATCH 1/4] Calculate metrics of credential fetching from Pods & upload to s3 --- tests/assets/eks-pod-identity/config.yaml | 2 + .../assets/eks-pod-identity/pod-default.yaml | 74 +++++++++++++++++-- .../eks/awscli-cl2-load-with-addons-slos.yaml | 4 +- .../clusterloader/load-pod-identity.yaml | 65 ++++++++++++++++ .../eks/awscli-pod-identity-association.yaml | 6 +- 5 files changed, 143 insertions(+), 8 deletions(-) diff --git a/tests/assets/eks-pod-identity/config.yaml b/tests/assets/eks-pod-identity/config.yaml index 08448b56..055414c3 100644 --- a/tests/assets/eks-pod-identity/config.yaml +++ b/tests/assets/eks-pod-identity/config.yaml @@ -1,3 +1,4 @@ +{{$clusterName := DefaultParam .CL2_CLUSTER_NAME "default-cluster-name"}} {{$namespacePrefix := DefaultParam .CL2_NAMESPACE_PREFIX "default"}} {{$namespaceCount := DefaultParam .CL2_NAMESPACE_COUNT 1}} {{$totalEksPodIdentityPods := DefaultParam .CL2_EKS_POD_IDENTITY_PODS 5000}} @@ -42,6 +43,7 @@ steps: objectTemplatePath: pod-default.yaml templateFillMap: Group: eks-pod-identity + ClusterName: {{$clusterName}} - name: Waiting for eks pod identity pods to be created measurements: - Identifier: WaitForEksPodIdentityPods diff --git a/tests/assets/eks-pod-identity/pod-default.yaml b/tests/assets/eks-pod-identity/pod-default.yaml index 84702a6c..8eb54359 100644 --- a/tests/assets/eks-pod-identity/pod-default.yaml +++ b/tests/assets/eks-pod-identity/pod-default.yaml @@ -6,18 +6,82 @@ metadata: group: {{.Group}} spec: containers: - - image: registry.k8s.io/pause:3.9 - imagePullPolicy: IfNotPresent - name: pause - initContainers: - - name: app-init + - name: app-with-awsapi image: public.ecr.aws/aws-cli/aws-cli:latest imagePullPolicy: IfNotPresent + env: + - name: CLUSTER_NAME + value: "{{.ClusterName}}" command: - sh - -c - | + AUTH_TOKEN=$(cat $AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE) + 
MAX_ATTEMPTS=7 + INITIAL_DELAY=0.2 # 200ms + + NAMESPACE="EKSPodIdentityScalabilityTests" + DIMENSION_NAME="ClusterName" + DIMENSION_VALUE=$CLUSTER_NAME + METRIC_LATENCY_NAME="CredentialFetchLatency" + PERIOD=300 + + METRIC_MAX_RETRIES=5 + METRIC_RETRY_DELAY=1 + + start_epoch=$(date +%s%3N) + # fetch credentials + for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do + if curl -S -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials; then + end_epoch=$(date +%s%3N) + printf "Endpoint is reachable at try %d\n" "$i" + + latency_ms=$((end_epoch - start_epoch)) + latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }") + + # send CredentialFetchLatency metric + for ((j=1; j<=METRIC_MAX_RETRIES; j++)); do + aws cloudwatch put-metric-data \ + --namespace "$NAMESPACE" \ + --metric-name "$METRIC_LATENCY_NAME" \ + --dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \ + --value "$latency_sec" \ + --unit Seconds && { + echo "Metric CredentialFetchLatency sent successfully." + break + } + + if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then + echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." >&2 + sleep $METRIC_RETRY_DELAY + METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff + else + echo "Failed to send metric CredentialFetchLatency after $METRIC_MAX_RETRIES attempts." >&2 + exit 1 + fi + done + + break + fi + + if [ "$i" -eq $((MAX_ATTEMPTS - 1)) ]; then + echo "Max attempts reached. Exiting with failure." + exit 1 + fi + + SLEEP_TIME=$(echo "$INITIAL_DELAY * (2 ^ $i)" | bc -l) + printf "Failed. Sleeping %.3f seconds before retry...\n" "$SLEEP_TIME" + sleep "$SLEEP_TIME" + done + + # s3 api call while ! aws s3 ls; do echo "Waiting for S3 bucket access..." done echo "S3 bucket is accessible, proceeding." + + # pause + while true; do + echo "Sleeping for 1 hour..." 
+ sleep 3600 + done diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index c58c5cb5..d573dffd 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -189,7 +189,7 @@ spec: value: "$(params.kubernetes-version)" - name: endpoint value: $(params.endpoint) - - name: node-role-name + - name: node-role-name value: $(params.cluster-name)-node-role - name: ami value: $(params.launch-template-ami) @@ -285,6 +285,8 @@ spec: value: $(params.desired-nodes) - name: cluster-name value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) - name: namespace-prefix value: $(params.namespace-prefix) - name: namespace-count diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml index 47477be5..f6186408 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml @@ -35,6 +35,8 @@ spec: description: The region where the cluster is in. - name: cluster-name description: "The name of the EKS cluster you want to spin" + - name: endpoint + default: "" - name: namespace-prefix default: "default" description: "The prefix of namespaces for EKS Pod Identity test." 
@@ -89,6 +91,7 @@ spec: CL2_DEFAULT_QPS: $(params.cl2-default-qps) CL2_DEFAULT_BURST: $(params.cl2-default-burst) CL2_UNIFORM_QPS: $(params.cl2-uniform-qps) + CL2_CLUSTER_NAME: $(params.cluster-name) CL2_NAMESPACE_PREFIX: $(params.namespace-prefix) CL2_NAMESPACE_COUNT: $(params.namespace-count) CL2_TIMEOUT_EKS_POD_IDENTITY_POD_CREATION: $(params.timeout-pia-pod-creation) @@ -175,6 +178,68 @@ spec: S3_RESULT_PATH=$(cat $(results.s3_result.path)) echo "S3 Path: $S3_RESULT_PATH" aws sts get-caller-identity + + REGION=$(params.region) + ENDPOINT_FLAG="" + if [ -n "$(params.endpoint)" ]; then + ENDPOINT_FLAG="--endpoint $(params.endpoint)" + fi + + CLUSTER_NAME=$(params.cluster-name) + NAMESPACE="EKSPodIdentityScalabilityTests" + DIMENSION_NAME="ClusterName" + DIMENSION_VALUE=$CLUSTER_NAME + METRIC_LATENCY_NAME="CredentialFetchLatency" + PERIOD=300 + + START_TIME=$(aws eks $ENDPOINT_FLAG --region $REGION describe-cluster \ + --name "$CLUSTER_NAME" \ + --query "cluster.createdAt" \ + --output text) + + END_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + response=$(aws cloudwatch get-metric-statistics \ + --region "$REGION" \ + --namespace "$NAMESPACE" \ + --metric-name "$METRIC_LATENCY_NAME" \ + --dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \ + --start-time "$START_TIME" \ + --end-time "$END_TIME" \ + --period "$PERIOD" \ + --extended-statistics p50 p90 p99 \ + --output json) + + # extract p50 p90 p99 + latest=$(echo "$response" | jq -r '.Datapoints | sort_by(.Timestamp) | last') + p50=$(echo "$latest" | jq -r '."ExtendedStatistics"."p50" // "N/A"') + p90=$(echo "$latest" | jq -r '."ExtendedStatistics"."p90" // "N/A"') + p99=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99" // "N/A"') + + response=$(aws cloudwatch get-metric-statistics \ + --region "$REGION" \ + --namespace "$NAMESPACE" \ + --metric-name "$METRIC_LATENCY_NAME" \ + --dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \ + --start-time "$START_TIME" \ + --end-time "$END_TIME" \ + 
--period "$PERIOD" \ + --statistics SampleCount \ + --output json) + + total_samples=$(echo "$response" | jq '[.Datapoints[].SampleCount] | add // 0') + + cat <<EOF > eks_pod_identity_test_summary.json + { + "start_time": "$START_TIME", + "end_time": "$END_TIME", + "total_samples": $total_samples, + "p50": $p50, + "p90": $p90, + "p99": $p99 + } + EOF + # we expect to see all files from loadtest that clusterloader2 outputs here in this dir ls -larth aws s3 cp . s3://$S3_RESULT_PATH/ --recursive diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml index a569c702..6fbd0e5f 100644 --- a/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml +++ b/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml @@ -48,7 +48,8 @@ spec: ENDPOINT_FLAG="--endpoint $(params.endpoint)" fi - MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" + S3_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" + CLOUDWATCH_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/CloudWatchFullAccess" TRUST_POLICY_FILE="pia-trust-policy.json" # create a trust policy json file curl -s $(params.pia-trust-policy-url) -o ./$TRUST_POLICY_FILE @@ -57,7 +58,8 @@ spec: PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i aws iam create-role --role-name $PIA_ROLE_NAME --assume-role-policy-document file://$TRUST_POLICY_FILE - aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $MANAGED_POLICY_ARN + aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $S3_MANAGED_POLICY_ARN + aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $CLOUDWATCH_MANAGED_POLICY_ARN PIA_ROLE_ARN=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.Arn' --output text) echo "$PIA_ROLE_ARN is created" From 835c9e28ddb7bfb825880844f77d9fcbef714034 Mon Sep 17 00:00:00 2001 From: xdu31 Date: Fri, 30 May 2025 08:26:01 -0700 
Subject: [PATCH 2/4] review comments --- tests/assets/eks-pod-identity/config.yaml | 13 +++++++ .../assets/eks-pod-identity/pod-default.yaml | 16 +++++--- .../eks/awscli-cl2-load-with-addons-slos.yaml | 20 ++++++++++ .../clusterloader/load-pod-identity.yaml | 39 ++++++++++++++++--- 4 files changed, 78 insertions(+), 10 deletions(-) diff --git a/tests/assets/eks-pod-identity/config.yaml b/tests/assets/eks-pod-identity/config.yaml index 055414c3..e139471d 100644 --- a/tests/assets/eks-pod-identity/config.yaml +++ b/tests/assets/eks-pod-identity/config.yaml @@ -1,4 +1,8 @@ {{$clusterName := DefaultParam .CL2_CLUSTER_NAME "default-cluster-name"}} +{{$metricDimensionName := DefaultParam .CL2_METRIC_DIMENSION_NAME "ClusterName"}} +{{$metricNamespace := DefaultParam .CL2_METRIC_NAMESPACE "EKSPodIdentityScalabilityTests"}} +{{$metricLatencyName := DefaultParam .CL2_METRIC_LATENCY_NAME "CredentialFetchLatency"}} +{{$metricPeriod := DefaultParam .CL2_METRIC_PERIOD 300}} {{$namespacePrefix := DefaultParam .CL2_NAMESPACE_PREFIX "default"}} {{$namespaceCount := DefaultParam .CL2_NAMESPACE_COUNT 1}} {{$totalEksPodIdentityPods := DefaultParam .CL2_EKS_POD_IDENTITY_PODS 5000}} @@ -44,6 +48,10 @@ steps: templateFillMap: Group: eks-pod-identity ClusterName: {{$clusterName}} + MetricDimensionName: {{$metricDimensionName}} + MetricNamespace: {{$metricNamespace}} + MetricLatencyName: {{$metricLatencyName}} + MetricPeriod: {{$metricPeriod}} - name: Waiting for eks pod identity pods to be created measurements: - Identifier: WaitForEksPodIdentityPods @@ -72,3 +80,8 @@ steps: objectTemplatePath: pod-default.yaml templateFillMap: Group: eks-pod-identity + ClusterName: {{$clusterName}} + MetricDimensionName: {{$metricDimensionName}} + MetricNamespace: {{$metricNamespace}} + MetricLatencyName: {{$metricLatencyName}} + MetricPeriod: {{$metricPeriod}} diff --git a/tests/assets/eks-pod-identity/pod-default.yaml b/tests/assets/eks-pod-identity/pod-default.yaml index 8eb54359..e14cf438 100644 --- 
a/tests/assets/eks-pod-identity/pod-default.yaml +++ b/tests/assets/eks-pod-identity/pod-default.yaml @@ -12,6 +12,14 @@ spec: env: - name: CLUSTER_NAME value: "{{.ClusterName}}" + - name: DIMENSION_NAME + value: "{{.MetricDimensionName}}" + - name: NAMESPACE + value: "{{.MetricNamespace}}" + - name: METRIC_LATENCY_NAME + value: "{{.MetricLatencyName}}" + - name: PERIOD + value: "{{.MetricPeriod}}" command: - sh - -c @@ -20,15 +28,13 @@ spec: MAX_ATTEMPTS=7 INITIAL_DELAY=0.2 # 200ms - NAMESPACE="EKSPodIdentityScalabilityTests" - DIMENSION_NAME="ClusterName" DIMENSION_VALUE=$CLUSTER_NAME - METRIC_LATENCY_NAME="CredentialFetchLatency" - PERIOD=300 - METRIC_MAX_RETRIES=5 METRIC_RETRY_DELAY=1 + # make 7 attempts on credential fetching with exponential retries, and calculate the time taken + # push metrics on time taken on credential fetching + # to minimize failure from cloudwatch metrics, add retries on put-metric-data start_epoch=$(date +%s%3N) # fetch credentials for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index d573dffd..45063461 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -84,6 +84,18 @@ spec: default: "200" - name: cl2-uniform-qps default: "100" + - name: cl2-metric-dimension-name + description: "default metric dimension name" + default: "ClusterName" + - name: cl2-metric-namespace + description: "default metric namespace for pod identity" + default: "EKSPodIdentityScalabilityTests" + - name: cl2-metric-latency-name + description: "default metric latency name for pod identity" + default: "CredentialFetchLatency" + - name: cl2-metric-period + description: "default metric period" + default: "300" - name: timeout-pia-pod-creation default: "20m" - name: timeout-pia-pod-startup @@ 
-279,6 +291,14 @@ spec: value: $(params.cl2-default-burst) - name: cl2-uniform-qps value: $(params.cl2-uniform-qps) + - name: cl2-metric-dimension-name + value: $(params.cl2-metric-dimension-name) + - name: cl2-metric-namespace + value: $(params.cl2-metric-namespace) + - name: cl2-metric-latency-name + value: $(params.cl2-metric-latency-name) + - name: cl2-metric-period + value: $(params.cl2-metric-period) - name: results-bucket value: $(params.results-bucket) - name: nodes diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml index f6186408..db51e09e 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml @@ -25,6 +25,18 @@ spec: - name: cl2-uniform-qps description: "uniform qps" default: "100" + - name: cl2-metric-dimension-name + description: "default metric dimension name" + default: "ClusterName" + - name: cl2-metric-namespace + description: "default metric namespace for pod identity" + default: "EKSPodIdentityScalabilityTests" + - name: cl2-metric-latency-name + description: "default metric latency name for pod identity" + default: "CredentialFetchLatency" + - name: cl2-metric-period + description: "default metric period" + default: "300" - name: nodes description: "number of dataplane nodes to run the load test against" default: "1000" @@ -92,6 +104,10 @@ spec: CL2_DEFAULT_BURST: $(params.cl2-default-burst) CL2_UNIFORM_QPS: $(params.cl2-uniform-qps) CL2_CLUSTER_NAME: $(params.cluster-name) + CL2_METRIC_DIMENSION_NAME: $(params.cl2-metric-dimension-name) + CL2_METRIC_NAMESPACE: $(params.cl2-metric-namespace) + CL2_METRIC_LATENCY_NAME: $(params.cl2-metric-latency-name) + CL2_METRIC_PERIOD: $(params.cl2-metric-period) CL2_NAMESPACE_PREFIX: $(params.namespace-prefix) CL2_NAMESPACE_COUNT: $(params.namespace-count) 
CL2_TIMEOUT_EKS_POD_IDENTITY_POD_CREATION: $(params.timeout-pia-pod-creation) @@ -186,12 +202,15 @@ spec: fi CLUSTER_NAME=$(params.cluster-name) - NAMESPACE="EKSPodIdentityScalabilityTests" - DIMENSION_NAME="ClusterName" + NAMESPACE=$(params.cl2-metric-namespace) + DIMENSION_NAME=$(params.cl2-metric-dimension-name) DIMENSION_VALUE=$CLUSTER_NAME - METRIC_LATENCY_NAME="CredentialFetchLatency" - PERIOD=300 + METRIC_LATENCY_NAME=$(params.cl2-metric-latency-name) + PERIOD=$(params.cl2-metric-period) + # since the scalability test is running with the same cluster name, with cluster recreation + # it is important to know the range of start and end time to query metrics for the current run + # here we use cluster creation time start as start time and the current time as end time START_TIME=$(aws eks $ENDPOINT_FLAG --region $REGION describe-cluster \ --name "$CLUSTER_NAME" \ --query "cluster.createdAt" \ @@ -210,7 +229,7 @@ spec: --extended-statistics p50 p90 p99 \ --output json) - # extract p50 p90 p99 + # extract p50 p90 p99 of credential fetching latest=$(echo "$response" | jq -r '.Datapoints | sort_by(.Timestamp) | last') p50=$(echo "$latest" | jq -r '."ExtendedStatistics"."p50" // "N/A"') p90=$(echo "$latest" | jq -r '."ExtendedStatistics"."p90" // "N/A"') @@ -229,6 +248,7 @@ spec: total_samples=$(echo "$response" | jq '[.Datapoints[].SampleCount] | add // 0') + # save metric results for s3 upload cat <<EOF > eks_pod_identity_test_summary.json { "start_time": "$START_TIME", @@ -243,3 +263,12 @@ spec: # we expect to see all files from loadtest that clusterloader2 outputs here in this dir ls -larth aws s3 cp . 
s3://$S3_RESULT_PATH/ --recursive + + # if p99 is equal to or more than 2 seconds, exit with failure + int_p99=$(echo "$p99" | awk '{printf "%d", $1}') + if [ "$int_p99" -lt 2 ]; then + echo "p99 is less than 2" + else + echo "p99 is 2 or more" + exit 1 + fi From 265ec49925ab2efc5742bed4a0aa430c64e5fb0c Mon Sep 17 00:00:00 2001 From: xdu31 Date: Mon, 2 Jun 2025 14:24:19 -0700 Subject: [PATCH 3/4] review comments --- .../assets/eks-pod-identity/pod-default.yaml | 30 ++++++++++++++++++- .../clusterloader/load-pod-identity.yaml | 6 ++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/tests/assets/eks-pod-identity/pod-default.yaml b/tests/assets/eks-pod-identity/pod-default.yaml index e14cf438..12788938 100644 --- a/tests/assets/eks-pod-identity/pod-default.yaml +++ b/tests/assets/eks-pod-identity/pod-default.yaml @@ -38,7 +38,8 @@ spec: start_epoch=$(date +%s%3N) # fetch credentials for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do - if curl -S -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials; then + status_code=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials) + if [ "$status_code" -eq 200 ]; then end_epoch=$(date +%s%3N) printf "Endpoint is reachable at try %d\n" "$i" @@ -80,6 +81,33 @@ spec: sleep "$SLEEP_TIME" done + PIA_ROLE_PREFIX=$CLUSTER_NAME-pia-role + STS_MAX_RETRIES=5 + STS_RETRY_DELAY=1 + STS_SUCCESS=0 + + # call sts to check on associated role + for ((j=1; j<=STS_MAX_RETRIES; j++)); do + ARN=$(aws sts get-caller-identity --query Arn --output text 2>/dev/null) + if [[ $? -eq 0 && -n "$ARN" ]]; then + if [[ "$ARN" == *"$PIA_ROLE_PREFIX"* ]]; then + echo "ARN contains role name prefix '$PIA_ROLE_PREFIX': $ARN" + STS_SUCCESS=1 + break + else + echo "ARN does not contain expected role name prefix '$PIA_ROLE_PREFIX': $ARN" + exit 1 + fi + fi + echo "Attempt $i failed to get caller identity. Retrying in $STS_RETRY_DELAY seconds..." 
+ sleep $STS_RETRY_DELAY + done + + if [[ $STS_SUCCESS -ne 1 ]]; then + echo "Failed to retrieve matching caller identity after $STS_MAX_RETRIES attempts." + exit 1 + fi + # s3 api call while ! aws s3 ls; do echo "Waiting for S3 bucket access..." diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml index db51e09e..039410d0 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml @@ -191,6 +191,8 @@ spec: image: amazon/aws-cli workingDir: $(workspaces.results.path) script: | + yum install -y jq + S3_RESULT_PATH=$(cat $(results.s3_result.path)) echo "S3 Path: $S3_RESULT_PATH" aws sts get-caller-identity @@ -247,6 +249,7 @@ spec: --output json) total_samples=$(echo "$response" | jq '[.Datapoints[].SampleCount] | add // 0') + rate=$(params.cl2-default-qps) # save metric results for s3 upload cat <<EOF > eks_pod_identity_test_summary.json @@ -254,6 +257,7 @@ spec: "start_time": "$START_TIME", "end_time": "$END_TIME", "total_samples": $total_samples, + "rate": $rate, "p50": $p50, "p90": $p90, "p99": $p99 @@ -268,7 +272,9 @@ spec: int_p99=$(echo "$p99" | awk '{printf "%d", $1}') if [ "$int_p99" -lt 2 ]; then echo "p99 is less than 2" + echo "1" | tee $(results.datapoint.path) else echo "p99 is 2 or more" + echo "0" | tee $(results.datapoint.path) exit 1 fi From 17053ff3aa85263deebfbf37e5630f3de06c8809 Mon Sep 17 00:00:00 2001 From: xdu31 Date: Thu, 5 Jun 2025 16:41:54 -0700 Subject: [PATCH 4/4] Remove calls to STS --- .../assets/eks-pod-identity/pod-default.yaml | 30 +++---------------- .../eks/awscli-cl2-load-with-addons-slos.yaml | 4 +-- .../clusterloader/load-pod-identity.yaml | 20 ++++++------- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/tests/assets/eks-pod-identity/pod-default.yaml 
b/tests/assets/eks-pod-identity/pod-default.yaml index 12788938..748bc8e5 100644 --- a/tests/assets/eks-pod-identity/pod-default.yaml +++ b/tests/assets/eks-pod-identity/pod-default.yaml @@ -81,32 +81,10 @@ spec: sleep "$SLEEP_TIME" done - PIA_ROLE_PREFIX=$CLUSTER_NAME-pia-role - STS_MAX_RETRIES=5 - STS_RETRY_DELAY=1 - STS_SUCCESS=0 - - # call sts to check on associated role - for ((j=1; j<=STS_MAX_RETRIES; j++)); do - ARN=$(aws sts get-caller-identity --query Arn --output text 2>/dev/null) - if [[ $? -eq 0 && -n "$ARN" ]]; then - if [[ "$ARN" == *"$PIA_ROLE_PREFIX"* ]]; then - echo "ARN contains role name prefix '$PIA_ROLE_PREFIX': $ARN" - STS_SUCCESS=1 - break - else - echo "ARN does not contain expected role name prefix '$PIA_ROLE_PREFIX': $ARN" - exit 1 - fi - fi - echo "Attempt $i failed to get caller identity. Retrying in $STS_RETRY_DELAY seconds..." - sleep $STS_RETRY_DELAY - done - - if [[ $STS_SUCCESS -ne 1 ]]; then - echo "Failed to retrieve matching caller identity after $STS_MAX_RETRIES attempts." - exit 1 - fi + # it is noted that a Pod with host network will fallback to Node role permissions that includes this s3 access + # however, in our test case, we are not using host network + # https://github.com/awslabs/kubernetes-iteration-toolkit/blob/main/tests/assets/eks_node_role.json + # the main reason we are not doing an STS get identity verification is about the quota of STS APIs with scale tests # s3 api call while ! 
aws s3 ls; do diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index 45063461..0aec2e39 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -97,9 +97,9 @@ spec: description: "default metric period" default: "300" - name: timeout-pia-pod-creation - default: "20m" + default: "80s" - name: timeout-pia-pod-startup - default: "5m" + default: "60s" - name: launch-template-ami default: "" description: "Launch template ImageId value, which may be an AMI ID or resolve:ssm reference. By default resolve to the lates AL2023 ami for cluster version" diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml index 039410d0..9666c77a 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml @@ -228,14 +228,14 @@ spec: --start-time "$START_TIME" \ --end-time "$END_TIME" \ --period "$PERIOD" \ - --extended-statistics p50 p90 p99 \ + --extended-statistics p50 p99 p99.95 \ --output json) - # extract p50 p90 p99 of credential fetching + # extract p50 p99 p99.95 of credential fetching latest=$(echo "$response" | jq -r '.Datapoints | sort_by(.Timestamp) | last') p50=$(echo "$latest" | jq -r '."ExtendedStatistics"."p50" // "N/A"') - p90=$(echo "$latest" | jq -r '."ExtendedStatistics"."p90" // "N/A"') p99=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99" // "N/A"') + p9995=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99.95" // "N/A"') response=$(aws cloudwatch get-metric-statistics \ --region "$REGION" \ @@ -259,8 +259,8 @@ spec: "total_samples": $total_samples, "rate": $rate, "p50": $p50, - "p90": $p90, - "p99": $p99 + "p99": 
$p99, + "p99.95": $p9995 } EOF @@ -268,13 +268,13 @@ spec: ls -larth aws s3 cp . s3://$S3_RESULT_PATH/ --recursive - # if p99 is equal to or more than 2 seconds, exit with failure - int_p99=$(echo "$p99" | awk '{printf "%d", $1}') - if [ "$int_p99" -lt 2 ]; then - echo "p99 is less than 2" + # if p99.95 is equal to or more than 1 second, exit with failure + int_p9995=$(echo "$p9995" | awk '{printf "%d", $1}') + if [ "$int_p9995" -lt 1 ]; then + echo "p99.95 is less than 1 second" echo "1" | tee $(results.datapoint.path) else - echo "p99 is 2 or more" + echo "p99.95 is 1 second or more" echo "0" | tee $(results.datapoint.path) exit 1 fi