Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions tests/assets/eks-pod-identity/config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
{{$clusterName := DefaultParam .CL2_CLUSTER_NAME "default-cluster-name"}}
{{$metricDimensionName := DefaultParam .CL2_METRIC_DIMENSION_NAME "ClusterName"}}
{{$metricNamespace := DefaultParam .CL2_METRIC_NAMESPACE "EKSPodIdentityScalabilityTests"}}
{{$metricLatencyName := DefaultParam .CL2_METRIC_LATENCY_NAME "CredentialFetchLatency"}}
{{$metricPeriod := DefaultParam .CL2_METRIC_PERIOD 300}}
{{$namespacePrefix := DefaultParam .CL2_NAMESPACE_PREFIX "default"}}
{{$namespaceCount := DefaultParam .CL2_NAMESPACE_COUNT 1}}
{{$totalEksPodIdentityPods := DefaultParam .CL2_EKS_POD_IDENTITY_PODS 5000}}
Expand Down Expand Up @@ -42,6 +47,11 @@ steps:
objectTemplatePath: pod-default.yaml
templateFillMap:
Group: eks-pod-identity
ClusterName: {{$clusterName}}
MetricDimensionName: {{$metricDimensionName}}
MetricNamespace: {{$metricNamespace}}
MetricLatencyName: {{$metricLatencyName}}
MetricPeriod: {{$metricPeriod}}
- name: Waiting for eks pod identity pods to be created
measurements:
- Identifier: WaitForEksPodIdentityPods
Expand Down Expand Up @@ -70,3 +80,8 @@ steps:
objectTemplatePath: pod-default.yaml
templateFillMap:
Group: eks-pod-identity
ClusterName: {{$clusterName}}
MetricDimensionName: {{$metricDimensionName}}
MetricNamespace: {{$metricNamespace}}
MetricLatencyName: {{$metricLatencyName}}
MetricPeriod: {{$metricPeriod}}
86 changes: 81 additions & 5 deletions tests/assets/eks-pod-identity/pod-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,94 @@ metadata:
group: {{.Group}}
spec:
containers:
- image: registry.k8s.io/pause:3.9
imagePullPolicy: IfNotPresent
name: pause
initContainers:
- name: app-init
- name: app-with-awsapi
image: public.ecr.aws/aws-cli/aws-cli:latest
imagePullPolicy: IfNotPresent
env:
- name: CLUSTER_NAME
value: "{{.ClusterName}}"
- name: DIMENSION_NAME
value: "{{.MetricDimensionName}}"
- name: NAMESPACE
value: "{{.MetricNamespace}}"
- name: METRIC_LATENCY_NAME
value: "{{.MetricLatencyName}}"
- name: PERIOD
value: "{{.MetricPeriod}}"
command:
- sh
- -c
- |
AUTH_TOKEN=$(cat $AWS_CONTAINER_AUTHORIZATION_TOKEN_FILE)
MAX_ATTEMPTS=7
INITIAL_DELAY=0.2 # 200ms

DIMENSION_VALUE=$CLUSTER_NAME
METRIC_MAX_RETRIES=5
METRIC_RETRY_DELAY=1

# Make up to MAX_ATTEMPTS credential-fetch attempts with exponential backoff, and measure the elapsed time.
# Publish the measured credential-fetch latency as a CloudWatch metric.
# To minimize transient CloudWatch failures, retry put-metric-data with exponential backoff.
start_epoch=$(date +%s%3N)
# fetch credentials
for i in $(seq 0 $((MAX_ATTEMPTS - 1))); do
status_code=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: $AUTH_TOKEN" http://169.254.170.23/v1/credentials)
if [ "$status_code" -eq 200 ]; then
end_epoch=$(date +%s%3N)
printf "Endpoint is reachable at try %d\n" "$i"

latency_ms=$((end_epoch - start_epoch))
latency_sec=$(awk "BEGIN { print $latency_ms / 1000 }")

# send CredentialFetchLatency metric
for ((j=1; j<=METRIC_MAX_RETRIES; j++)); do
aws cloudwatch put-metric-data \
--namespace "$NAMESPACE" \
--metric-name "$METRIC_LATENCY_NAME" \
--dimensions "$DIMENSION_NAME=$DIMENSION_VALUE" \
--value "$latency_sec" \
--unit Seconds && {
echo "Metric CredentialFetchLatency sent successfully."
break
}

if [ "$j" -lt "$METRIC_MAX_RETRIES" ]; then
echo "Attempt $j failed. Retrying in $METRIC_RETRY_DELAY seconds..." >&2
sleep $METRIC_RETRY_DELAY
METRIC_RETRY_DELAY=$((METRIC_RETRY_DELAY * 2)) # exponential backoff
else
echo "Failed to send metric CredentialFetchLatency after $METRIC_MAX_RETRIES attempts." >&2
exit 1
fi
done

break
fi

if [ "$i" -eq $((MAX_ATTEMPTS - 1)) ]; then
echo "Max attempts reached. Exiting with failure."
exit 1
fi

SLEEP_TIME=$(echo "$INITIAL_DELAY * (2 ^ $i)" | bc -l)
printf "Failed. Sleeping %.3f seconds before retry...\n" "$SLEEP_TIME"
sleep "$SLEEP_TIME"
done

# Note: a Pod on the host network would fall back to the Node role permissions, which include this S3 access;
# however, in this test case we are not using the host network.
# https://github.com/awslabs/kubernetes-iteration-toolkit/blob/main/tests/assets/eks_node_role.json
# We intentionally skip an STS get-caller-identity verification, mainly because of the STS API quota consumed by scale tests.

# s3 api call
while ! aws s3 ls; do
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think its better to verify the role being used to make sure we are not using the instance role.
aws sts get-caller-identity | grep <role_name>

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit concerned about sts quota we consume running in this account

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i understand about the limits but i think we need a way to make sure that the command is successful because of token by pod identity and not using the instance role.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a comment on how we can make sure this test is working as intended without checking on the assumed role identity

echo "Waiting for S3 bucket access..."
done
echo "S3 bucket is accessible, proceeding."

# pause
while true; do
echo "Sleeping for 1 hour..."
sleep 3600
done
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,22 @@ spec:
default: "200"
- name: cl2-uniform-qps
default: "100"
- name: cl2-metric-dimension-name
description: "default metric dimension name"
default: "ClusterName"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have the same defaults at task level, we don't have to carry this to pipeline. Pipeline would take task level defaults when not supplied.

Same for other params as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but when we make a runpipeline, that's on pipeline level? So we don't need to make another code change here to change the name

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, but run pipeline will take task level defaults when not supplied.

- name: cl2-metric-namespace
description: "default metric namespace for pod identity"
default: "EKSPodIdentityScalabilityTests"
- name: cl2-metric-latency-name
description: "default metric latency name for pod identity"
default: "CredentialFetchLatency"
- name: cl2-metric-period
description: "default metric period"
default: "300"
- name: timeout-pia-pod-creation
default: "20m"
default: "80s"
- name: timeout-pia-pod-startup
default: "5m"
default: "60s"
- name: launch-template-ami
default: ""
description: "Launch template ImageId value, which may be an AMI ID or resolve:ssm reference. By default resolves to the latest AL2023 AMI for the cluster version"
Expand Down Expand Up @@ -189,7 +201,7 @@ spec:
value: "$(params.kubernetes-version)"
- name: endpoint
value: $(params.endpoint)
- name: node-role-name
- name: node-role-name
value: $(params.cluster-name)-node-role
- name: ami
value: $(params.launch-template-ami)
Expand Down Expand Up @@ -279,12 +291,22 @@ spec:
value: $(params.cl2-default-burst)
- name: cl2-uniform-qps
value: $(params.cl2-uniform-qps)
- name: cl2-metric-dimension-name
value: $(params.cl2-metric-dimension-name)
- name: cl2-metric-namespace
value: $(params.cl2-metric-namespace)
- name: cl2-metric-latency-name
value: $(params.cl2-metric-latency-name)
- name: cl2-metric-period
value: $(params.cl2-metric-period)
- name: results-bucket
value: $(params.results-bucket)
- name: nodes
value: $(params.desired-nodes)
- name: cluster-name
value: $(params.cluster-name)
- name: endpoint
value: $(params.endpoint)
- name: namespace-prefix
value: $(params.namespace-prefix)
- name: namespace-count
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@ spec:
- name: cl2-uniform-qps
description: "uniform qps"
default: "100"
- name: cl2-metric-dimension-name
description: "default metric dimension name"
default: "ClusterName"
- name: cl2-metric-namespace
description: "default metric namespace for pod identity"
default: "EKSPodIdentityScalabilityTests"
- name: cl2-metric-latency-name
description: "default metric latency name for pod identity"
default: "CredentialFetchLatency"
- name: cl2-metric-period
description: "default metric period"
default: "300"
- name: nodes
description: "number of dataplane nodes to run the load test against"
default: "1000"
Expand All @@ -35,6 +47,8 @@ spec:
description: The region where the cluster is in.
- name: cluster-name
description: "The name of the EKS cluster you want to spin up"
- name: endpoint
default: ""
- name: namespace-prefix
default: "default"
description: "The prefix of namespaces for EKS Pod Identity test."
Expand Down Expand Up @@ -89,6 +103,11 @@ spec:
CL2_DEFAULT_QPS: $(params.cl2-default-qps)
CL2_DEFAULT_BURST: $(params.cl2-default-burst)
CL2_UNIFORM_QPS: $(params.cl2-uniform-qps)
CL2_CLUSTER_NAME: $(params.cluster-name)
CL2_METRIC_DIMENSION_NAME: $(params.cl2-metric-dimension-name)
CL2_METRIC_NAMESPACE: $(params.cl2-metric-namespace)
CL2_METRIC_LATENCY_NAME: $(params.cl2-metric-latency-name)
CL2_METRIC_PERIOD: $(params.cl2-metric-period)
CL2_NAMESPACE_PREFIX: $(params.namespace-prefix)
CL2_NAMESPACE_COUNT: $(params.namespace-count)
CL2_TIMEOUT_EKS_POD_IDENTITY_POD_CREATION: $(params.timeout-pia-pod-creation)
Expand Down Expand Up @@ -172,9 +191,90 @@ spec:
image: amazon/aws-cli
workingDir: $(workspaces.results.path)
script: |
yum install -y jq

S3_RESULT_PATH=$(cat $(results.s3_result.path))
echo "S3 Path: $S3_RESULT_PATH"
aws sts get-caller-identity

REGION=$(params.region)
ENDPOINT_FLAG=""
if [ -n "$(params.endpoint)" ]; then
ENDPOINT_FLAG="--endpoint $(params.endpoint)"
fi

CLUSTER_NAME=$(params.cluster-name)
NAMESPACE=$(params.cl2-metric-namespace)
DIMENSION_NAME=$(params.cl2-metric-dimension-name)
DIMENSION_VALUE=$CLUSTER_NAME
METRIC_LATENCY_NAME=$(params.cl2-metric-latency-name)
PERIOD=$(params.cl2-metric-period)

# Because scalability tests reuse the same cluster name across cluster recreations,
# the metric query must be bounded to the current run's time window:
# we use the cluster creation time as the start time and the current time as the end time.
START_TIME=$(aws eks $ENDPOINT_FLAG --region $REGION describe-cluster \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you add comments on why you consider this as start time ?

Also, please add comments overall to make it more readable for future users/consumers on your team, especially wherever you are making assumptions.

--name "$CLUSTER_NAME" \
--query "cluster.createdAt" \
--output text)

END_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

response=$(aws cloudwatch get-metric-statistics \
--region "$REGION" \
--namespace "$NAMESPACE" \
--metric-name "$METRIC_LATENCY_NAME" \
--dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \
--start-time "$START_TIME" \
--end-time "$END_TIME" \
--period "$PERIOD" \
--extended-statistics p50 p99 p99.95 \
--output json)

# extract p50 p99 p99.95 of credential fetching
latest=$(echo "$response" | jq -r '.Datapoints | sort_by(.Timestamp) | last')
p50=$(echo "$latest" | jq -r '."ExtendedStatistics"."p50" // "N/A"')
p99=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99" // "N/A"')
p9995=$(echo "$latest" | jq -r '."ExtendedStatistics"."p99.95" // "N/A"')

response=$(aws cloudwatch get-metric-statistics \
--region "$REGION" \
--namespace "$NAMESPACE" \
--metric-name "$METRIC_LATENCY_NAME" \
--dimensions Name="$DIMENSION_NAME",Value="$DIMENSION_VALUE" \
--start-time "$START_TIME" \
--end-time "$END_TIME" \
--period "$PERIOD" \
--statistics SampleCount \
--output json)

total_samples=$(echo "$response" | jq '[.Datapoints[].SampleCount] | add // 0')
Copy link
Contributor

@hakuna-matatah hakuna-matatah Jun 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to capture what is the rate at which credentials are fetched from service. To be able to compare that against the scheduler throughput.

Currently you have some kind of measuring by able to control the timeout param https://github.com/awslabs/kubernetes-iteration-toolkit/pull/512/files#diff-d2d660edac904aa96e330bfae7bf67ef6885190877c5ab7668f0f157057da03fR61
accordingly by computing total number of pods and client pod creation rate/scheduler rate and what your service throughput could be.

This will only give some kind of approximation but not fully accurate throughput of your service using this kind of measurement.

rate=$(params.cl2-default-qps)

# save metric results for s3 upload
cat <<EOF > eks_pod_identity_test_summary.json
{
"start_time": "$START_TIME",
"end_time": "$END_TIME",
"total_samples": $total_samples,
"rate": $rate,
"p50": $p50,
"p99": $p99,
"p99.95": $p9995
}
EOF

# we expect to see all files from loadtest that clusterloader2 outputs here in this dir
ls -larth
aws s3 cp . s3://$S3_RESULT_PATH/ --recursive

# if p99.95 is equal to or more than 1 second, exit with failure
int_p9995=$(echo "$p9995" | awk '{printf "%d", $1}')
if [ "$int_p9995" -lt 1 ]; then
echo "p99.95 is less than 1 second"
echo "1" | tee $(results.datapoint.path)
else
echo "p99.95 is 1 second or more"
echo "0" | tee $(results.datapoint.path)
exit 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to exit and fail the test, we need to capture the test result like this

exit_code=$?
if [ $exit_code -eq 0 ]; then
echo "1" | tee $(results.datapoint.path)
else
echo "0" | tee $(results.datapoint.path)
fi
exit $exit_code

and use that to emit result like how we do it here -

So it can used configure alarm and cut tickets to your team.

fi
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ spec:
ENDPOINT_FLAG="--endpoint $(params.endpoint)"
fi

MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
S3_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
CLOUDWATCH_MANAGED_POLICY_ARN="arn:aws:iam::aws:policy/CloudWatchFullAccess"
TRUST_POLICY_FILE="pia-trust-policy.json"
# create a trust policy json file
curl -s $(params.pia-trust-policy-url) -o ./$TRUST_POLICY_FILE
Expand All @@ -57,7 +58,8 @@ spec:

PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i
aws iam create-role --role-name $PIA_ROLE_NAME --assume-role-policy-document file://$TRUST_POLICY_FILE
aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $MANAGED_POLICY_ARN
aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $S3_MANAGED_POLICY_ARN
aws iam attach-role-policy --role-name $PIA_ROLE_NAME --policy-arn $CLOUDWATCH_MANAGED_POLICY_ARN
PIA_ROLE_ARN=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.Arn' --output text)
echo "$PIA_ROLE_ARN is created"

Expand Down