diff --git a/tests/assets/eks-pod-identity/config.yaml b/tests/assets/eks-pod-identity/config.yaml
index 08448b56..e139471d 100644
--- a/tests/assets/eks-pod-identity/config.yaml
+++ b/tests/assets/eks-pod-identity/config.yaml
@@ -1,3 +1,8 @@
+{{$clusterName := DefaultParam .CL2_CLUSTER_NAME "default-cluster-name"}}
+{{$metricDimensionName := DefaultParam .CL2_METRIC_DIMENSION_NAME "ClusterName"}}
+{{$metricNamespace := DefaultParam .CL2_METRIC_NAMESPACE "EKSPodIdentityScalabilityTests"}}
+{{$metricLatencyName := DefaultParam .CL2_METRIC_LATENCY_NAME "CredentialFetchLatency"}}
+{{$metricPeriod := DefaultParam .CL2_METRIC_PERIOD 300}}
 {{$namespacePrefix := DefaultParam .CL2_NAMESPACE_PREFIX "default"}}
 {{$namespaceCount := DefaultParam .CL2_NAMESPACE_COUNT 1}}
 {{$totalEksPodIdentityPods := DefaultParam .CL2_EKS_POD_IDENTITY_PODS 5000}}
@@ -42,6 +47,11 @@ steps:
             objectTemplatePath: pod-default.yaml
             templateFillMap:
               Group: eks-pod-identity
+              ClusterName: {{$clusterName}}
+              MetricDimensionName: {{$metricDimensionName}}
+              MetricNamespace: {{$metricNamespace}}
+              MetricLatencyName: {{$metricLatencyName}}
+              MetricPeriod: {{$metricPeriod}}
   - name: Waiting for eks pod identity pods to be created
     measurements:
       - Identifier: WaitForEksPodIdentityPods
@@ -70,3 +80,8 @@ steps:
             objectTemplatePath: pod-default.yaml
             templateFillMap:
               Group: eks-pod-identity
+              ClusterName: {{$clusterName}}
+              MetricDimensionName: {{$metricDimensionName}}
+              MetricNamespace: {{$metricNamespace}}
+              MetricLatencyName: {{$metricLatencyName}}
+              MetricPeriod: {{$metricPeriod}}
diff --git a/tests/assets/eks-pod-identity/pod-default.yaml b/tests/assets/eks-pod-identity/pod-default.yaml
index 84702a6c..748bc8e5 100644
--- a/tests/assets/eks-pod-identity/pod-default.yaml
+++ b/tests/assets/eks-pod-identity/pod-default.yaml
@@ -6,18 +6,94 @@ metadata:
     group: {{.Group}}
 spec:
   containers:
-  - image: registry.k8s.io/pause:3.9
-    imagePullPolicy: IfNotPresent
-    name: pause
-  initContainers:
-  - name: app-init
+  - name: app-with-awsapi
     image: public.ecr.aws/aws-cli/aws-cli:latest
     imagePullPolicy: IfNotPresent
+    env:
+    - name: CLUSTER_NAME
+      value: "{{.ClusterName}}"
+    - name: DIMENSION_NAME
+      value: "{{.MetricDimensionName}}"
+    - name: NAMESPACE
+      value: "{{.MetricNamespace}}"
+    - name: METRIC_LATENCY_NAME
+      value: "{{.MetricLatencyName}}"
+    - name: PERIOD
+      value: "{{.MetricPeriod}}"
     command:
     - sh
     - -c
     - |
+      AUTH_TOKEN=$(cat $AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE)
+      MAX_ATTEMPTS=7
+      INITIAL_DELAY=0.2 # 200ms
+
+      DIMENSION_VALUE=$CLUSTER_NAME
+      METRIC_MAX_RETRIES=5
+      METRIC_RETRY_DELAY=1
+
+      # make 7 attempts on credential fetching with exponential retries, and calculate the time taken
+      # push metrics on time taken on credential fetching
+      # to minimize failure from cloudwatch metrics, add retries on put-metric-data
+      start_epoch=$(date +%s%3N)
+      # fetch credentials
+      for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do
+        status_code=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials)
+        if [ "$status_code" -eq 200 ]; then
+          end_epoch=$(date +%s%3N)
+          printf "Endpoint is reachable at try %d\n" "$i"
+
+          latency_ms=$((end_epoch - start_epoch))
+          latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }")
+
+          # send CredentialFetchLatency metric
+          for ((j=1; j<=METRIC_MAX_RETRIES; j++)); do
+            aws cloudwatch put-metric-data \
+              --namespace "$NAMESPACE" \
+              --metric-name "$METRIC_LATENCY_NAME" \
+              --dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \
+              --value "$latency_sec" \
+              --unit Seconds && {
+              echo "Metric CredentialFetchLatency sent successfully."
+              break
+            }
+
+            if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then
+              echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." >&2
+              sleep $METRIC_RETRY_DELAY
+              METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff
+            else
+              echo "Failed to send metric CredentialFetchLatency after $METRIC_MAX_RETRIES attempts." >&2
+              exit 1
+            fi
+          done
+
+          break
+        fi
+
+        if [ "$i" -eq $((MAX_ATTEMPTS - 1)) ]; then
+          echo "Max attempts reached. Exiting with failure."
+          exit 1
+        fi
+
+        SLEEP_TIME=$(awk "BEGIN { print $INITIAL_DELAY * (2 ^ $i) }") # awk (already used for latency_sec) so the back-off does not depend on bc
+        printf "Failed. Sleeping %.3f seconds before retry...\n" "$SLEEP_TIME"
+        sleep "$SLEEP_TIME"
+      done
+
+      # it is noted that a Pod with host network will fallback to Node role permissions that includes this s3 access
+      # however, in our test case, we are not using host network
+      # https://github.com/awslabs/kubernetes-iteration-toolkit/blob/main/tests/assets/eks_node_role.json
+      # the main reason we are not doing an STS get identity verification is about the quota of STS APIs with scale tests
+
+      # s3 api call
       while ! aws s3 ls; do
         echo "Waiting for S3 bucket access..."
       done
       echo "S3 bucket is accessible, proceeding."
+
+      # pause
+      while true; do
+        echo "Sleeping for 1 hour..."
+        sleep 3600
+      done
diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
index c58c5cb5..0aec2e39 100644
--- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
+++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
@@ -84,10 +84,22 @@ spec:
     default: "200"
   - name: cl2-uniform-qps
     default: "100"
+  - name: cl2-metric-dimension-name
+    description: "default metric dimension name"
+    default: "ClusterName"
+  - name: cl2-metric-namespace
+    description: "default metric namespace for pod identity"
+    default: "EKSPodIdentityScalabilityTests"
+  - name: cl2-metric-latency-name
+    description: "default metric latency name for pod identity"
+    default: "CredentialFetchLatency"
+  - name: cl2-metric-period
+    description: "default metric period"
+    default: "300"
   - name: timeout-pia-pod-creation
-    default: "20m"
+    default: "80s"
   - name: timeout-pia-pod-startup
-    default: "5m"
+    default: "60s"
   - name: launch-template-ami
     default: ""
     description: "Launch template ImageId value, which may be an AMI ID or resolve:ssm reference. By default resolve to the lates AL2023 ami for cluster version"
@@ -189,7 +201,7 @@ spec:
       value: "$(params.kubernetes-version)"
     - name: endpoint
       value: $(params.endpoint)
-    - name: node-role-name 
+    - name: node-role-name
       value: $(params.cluster-name)-node-role
     - name: ami
       value: $(params.launch-template-ami)
@@ -279,12 +291,22 @@ spec:
       value: $(params.cl2-default-burst)
     - name: cl2-uniform-qps
       value: $(params.cl2-uniform-qps)
+    - name: cl2-metric-dimension-name
+      value: $(params.cl2-metric-dimension-name)
+    - name: cl2-metric-namespace
+      value: $(params.cl2-metric-namespace)
+    - name: cl2-metric-latency-name
+      value: $(params.cl2-metric-latency-name)
+    - name: cl2-metric-period
+      value: $(params.cl2-metric-period)
     - name: results-bucket
       value: $(params.results-bucket)
     - name: nodes
       value: $(params.desired-nodes)
     - name: cluster-name
       value: $(params.cluster-name)
+    - name: endpoint
+      value: $(params.endpoint)
     - name: namespace-prefix
       value: $(params.namespace-prefix)
     - name: namespace-count
diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml
index 47477be5..9666c77a 100644
--- a/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml
+++ b/tests/tekton-resources/tasks/generators/clusterloader/load-pod-identity.yaml
@@ -25,6 +25,18 @@ spec:
   - name: cl2-uniform-qps
     description: "uniform qps"
     default: "100"
+  - name: cl2-metric-dimension-name
+    description: "default metric dimension name"
+    default: "ClusterName"
+  - name: cl2-metric-namespace
+    description: "default metric namespace for pod identity"
+    default: "EKSPodIdentityScalabilityTests"
+  - name: cl2-metric-latency-name
+    description: "default metric latency name for pod identity"
+    default: "CredentialFetchLatency"
+  - name: cl2-metric-period
+    description: "default metric period"
+    default: "300"
   - name: nodes
     description: "number of dataplane nodes to run the load test against"
     default: "1000"
@@ -35,6 +47,8 @@ spec:
     description: The region where the cluster is in.
   - name: cluster-name
     description: "The name of the EKS cluster you want to spin"
+  - name: endpoint
+    default: ""
   - name: namespace-prefix
     default: "default"
     description: "The prefix of namespaces for EKS Pod Identity test."
@@ -89,6 +103,11 @@ spec:
       CL2_DEFAULT_QPS: $(params.cl2-default-qps)
       CL2_DEFAULT_BURST: $(params.cl2-default-burst)
       CL2_UNIFORM_QPS: $(params.cl2-uniform-qps)
+      CL2_CLUSTER_NAME: $(params.cluster-name)
+      CL2_METRIC_DIMENSION_NAME: $(params.cl2-metric-dimension-name)
+      CL2_METRIC_NAMESPACE: $(params.cl2-metric-namespace)
+      CL2_METRIC_LATENCY_NAME: $(params.cl2-metric-latency-name)
+      CL2_METRIC_PERIOD: $(params.cl2-metric-period)
       CL2_NAMESPACE_PREFIX: $(params.namespace-prefix)
       CL2_NAMESPACE_COUNT: $(params.namespace-count)
       CL2_TIMEOUT_EKS_POD_IDENTITY_POD_CREATION: $(params.timeout-pia-pod-creation)
@@ -172,9 +191,90 @@ spec:
     image: amazon/aws-cli
     workingDir: $(workspaces.results.path)
     script: |
+      yum install -y jq
+
       S3_RESULT_PATH=$(cat $(results.s3_result.path))
       echo "S3 Path: $S3_RESULT_PATH"
       aws sts get-caller-identity
+
+      REGION=$(params.region)
+      ENDPOINT_FLAG=""
+      if [ -n "$(params.endpoint)" ]; then
+        ENDPOINT_FLAG="--endpoint $(params.endpoint)"
+      fi
+
+      CLUSTER_NAME=$(params.cluster-name)
+      NAMESPACE=$(params.cl2-metric-namespace)
+      DIMENSION_NAME=$(params.cl2-metric-dimension-name)
+      DIMENSION_VALUE=$CLUSTER_NAME
+      METRIC_LATENCY_NAME=$(params.cl2-metric-latency-name)
+      PERIOD=$(params.cl2-metric-period)
+
+      # since the scalability test is running with the same cluster name, with cluster recreation
+      # it is important to know the range of start and end time to query metrics for the current run
+      # here we use cluster creation time start as start time and the current time as end time
+      START_TIME=$(aws eks $ENDPOINT_FLAG --region $REGION describe-cluster \
+        --name "$CLUSTER_NAME" \
+        --query "cluster.createdAt" \
+        --output text)
+
+      END_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+      response=$(aws cloudwatch get-metric-statistics \
+        --region "$REGION" \
+        --namespace "$NAMESPACE" \
+        --metric-name "$METRIC_LATENCY_NAME" \
+        --dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \
+        --start-time "$START_TIME" \
+        --end-time "$END_TIME" \
+        --period "$PERIOD" \
+        --extended-statistics p50 p99 p99.95 \
+        --output json)
+
+      # extract p50 p99 p99.95 from the most recent datapoint; a missing value becomes null so the summary stays valid JSON
+      latest=$(echo "$response" | jq -r '.Datapoints | sort_by(.Timestamp) | last')
+      p50=$(echo "$latest" | jq -r '."ExtendedStatistics"."p50" // "null"')
+      p99=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99" // "null"')
+      p9995=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99.95" // "null"')
+
+      response=$(aws cloudwatch get-metric-statistics \
+        --region "$REGION" \
+        --namespace "$NAMESPACE" \
+        --metric-name "$METRIC_LATENCY_NAME" \
+        --dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \
+        --start-time "$START_TIME" \
+        --end-time "$END_TIME" \
+        --period "$PERIOD" \
+        --statistics SampleCount \
+        --output json)
+
+      total_samples=$(echo "$response" | jq '[.Datapoints[].SampleCount] | add // 0')
+      rate=$(params.cl2-default-qps)
+
+      # save metric results for s3 upload
+      cat <<EOF > eks_pod_identity_test_summary.json
+      {
+        "start_time": "$START_TIME",
+        "end_time": "$END_TIME",
+        "total_samples": $total_samples,
+        "rate": $rate,
+        "p50": $p50,
+        "p99": $p99,
+        "p99.95": $p9995
+      }
+      EOF
+
       # we expect to see all files from loadtest that clusterloader2 outputs here in this dir
       ls -larth
       aws s3 cp . s3://$S3_RESULT_PATH/ --recursive
+
+      # fail if p99.95 is missing/non-numeric (mapped to -1 so "no data" cannot pass) or is 1 second or more
+      int_p9995=$(echo "$p9995" | awk '$1 ~ /^[0-9]/ { printf "%d", $1; next } { printf "-1" }')
+      if [ "$int_p9995" -ge 0 ] && [ "$int_p9995" -lt 1 ]; then
+        echo "p99.95 is less than 1 second"
+        echo "1" | tee $(results.datapoint.path)
+      else
+        echo "p99.95 is missing or is 1 second or more"
+        echo "0" | tee $(results.datapoint.path)
+        exit 1
+      fi
diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml
index a569c702..6fbd0e5f 100644
--- a/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml
+++ b/tests/tekton-resources/tasks/setup/eks/awscli-pod-identity-association.yaml
@@ -48,7 +48,8 @@ spec:
         ENDPOINT_FLAG="--endpoint $(params.endpoint)"
       fi
 
-      MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
+      S3_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
+      CLOUDWATCH_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/CloudWatchFullAccess"
       TRUST_POLICY_FILE="pia-trust-policy.json"
       # create a trust policy json file
       curl -s $(params.pia-trust-policy-url) -o ./$TRUST_POLICY_FILE
@@ -57,7 +58,8 @@ spec:
         PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i
 
         aws iam create-role --role-name $PIA_ROLE_NAME --assume-role-policy-document file://$TRUST_POLICY_FILE
-        aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $MANAGED_POLICY_ARN
+        aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $S3_MANAGED_POLICY_ARN
+        aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $CLOUDWATCH_MANAGED_POLICY_ARN
         PIA_ROLE_ARN=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.Arn' --output text)
         echo "$PIA_ROLE_ARN is created"
 