## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
## SPDX-License-Identifier: Apache-2.0
# This is a reusable workflow for running the E2E test for Application Signals.
# It is meant to be called from another workflow.
# Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview
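# A minimal sketch of a caller job (the workflow path, job name, and input values below are
# illustrative, not prescriptive):
#
#   jobs:
#     python-eks-e2e:
#       uses: ./.github/workflows/application-signals-python-e2e-eks-test.yml
#       with:
#         aws-region: us-east-1
#         test-cluster-name: e2e-python-canary-test
#         caller-workflow-name: appsignals-e2e-eks-canary-test
#       secrets: inherit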
name: Application Signals Enablement E2E Testing - Python EKS
on:
workflow_call:
inputs:
aws-region:
required: true
type: string
test-cluster-name:
required: true
type: string
application-signals-adot-image:
required: false
type: string
application-signals-adot-image-tag:
required: false
type: string
caller-workflow-name:
required: true
type: string
permissions:
id-token: write
contents: read
env:
METRIC_NAMESPACE: ApplicationSignals
LOG_GROUP_NAME: /aws/application-signals/data
TEST_RESOURCES_FOLDER: ${GITHUB_WORKSPACE}
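# Note: ${GITHUB_WORKSPACE} is stored as a literal here; it is expanded by the shell when
# ${{ env.TEST_RESOURCES_FOLDER }} is substituted into run steps below.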
jobs:
python-e2e-eks-test:
runs-on: ubuntu-latest
container:
image: public.ecr.aws/h6o3z5z9/aws-application-signals-test-framework-workflow-container:latest
steps:
- uses: actions/checkout@v4
with:
repository: aws-observability/aws-application-signals-test-framework
ref: ga-python
fetch-depth: 0
- name: Download enablement script
uses: ./.github/workflows/actions/execute_and_retry
with:
pre-command: "mkdir enablement-script && cd enablement-script"
command: "wget https://raw.githubusercontent.com/aws-observability/application-signals-demo/main/scripts/eks/appsignals/enable-app-signals.sh
&& wget https://raw.githubusercontent.com/aws-observability/application-signals-demo/main/scripts/eks/appsignals/clean-app-signals.sh"
cleanup: "rm -f enable-app-signals.sh && rm -f clean-app-signals.sh"
post-command: "chmod +x enable-app-signals.sh && chmod +x clean-app-signals.sh"
# Pin the add-on version in preparation for the next release so we do not get alarms once the release is done
- name: Enforce EKS add on v1.6.0-eksbuild.1
working-directory: enablement-script
run: |
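# Append the version flag directly after the add-on name flag; the resulting call contains (illustrative):
#   --addon-name amazon-cloudwatch-observability \
#   --addon-version v1.6.0-eksbuild.1 \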
sed -i '\#--addon-name amazon-cloudwatch-observability \\#a--addon-version v1.6.0-eksbuild.1 \\' enable-app-signals.sh
- name: Remove log group deletion command
if: always()
working-directory: enablement-script
run: |
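# Blank out the delete-log-group call so the shared Application Signals log group is not
# removed from under other tests running against the same account (assumed rationale).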
delete_log_group="aws logs delete-log-group --log-group-name '/aws/appsignals/eks' --region \$REGION"
sed -i "s#$delete_log_group##g" clean-app-signals.sh
- name: Generate testing id and python sample app namespace
run: |
echo TESTING_ID="${{ inputs.aws-region }}-${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
echo PYTHON_SAMPLE_APP_NAMESPACE="ns-${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.E2E_SECRET_TEST_ROLE_ARN }}
aws-region: us-east-1
- name: Retrieve account
uses: aws-actions/aws-secretsmanager-get-secrets@v1
with:
secret-ids:
ACCOUNT_ID, region-account/${{ inputs.aws-region }}
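# The retrieved secret is exported to the job environment as ACCOUNT_ID; it is used below
# for the test role ARN and the ECR image URIs.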
# ADOT_E2E_TEST_ROLE_ARN is used to access main build e2e test cluster
# E2E_TEST_ROLE_ARN is used to access canary e2e test cluster
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ env.ACCOUNT_ID }}:role/${{ inputs['caller-workflow-name'] == 'main-build' && secrets.ADOT_E2E_TEST_ROLE_ARN || secrets.E2E_TEST_ROLE_ARN }}
aws-region: ${{ inputs.aws-region }}
- name: Set up kubeconfig
run: aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}
- name: Add eksctl to GitHub Path
run: |
echo "${{ github.workspace }}/eksctl" >> $GITHUB_PATH
- name: Create role for AWS access from the sample app
id: create_service_account
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "eksctl create iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--role-name eks-s3-access-${{ env.TESTING_ID }} \
--attach-policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess \
--region ${{ inputs.aws-region }} \
--approve"
cleanup: "eksctl delete iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--region ${{ inputs.aws-region }}"
sleep_time: 60
- name: Initiate Terraform
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/python/eks && terraform init && terraform validate"
cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"
sleep_time: 60
- name: Deploy sample app via terraform and wait for the endpoint to come online
id: deploy-python-app
working-directory: terraform/python/eks
run: |
# Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online.
# There may be occasional failures due to transient issues, so try up to 2 times.
# A deployment_failed value of 0 means the terraform deployment succeeded and the endpoint is reachable,
# while 1 means one of them failed at some point.
retry_counter=0
max_retry=2
while [ $retry_counter -lt $max_retry ]; do
echo "Attempt $retry_counter"
deployment_failed=0
terraform apply -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ inputs.aws-region }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
-var="test_namespace=${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="python_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_FE_SA_IMG }}" \
-var="python_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_RE_SA_IMG }}" \
|| deployment_failed=$?
if [ $deployment_failed -eq 1 ]; then
echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
fi
# If deployment_failed is still 0, the terraform deployment succeeded; install Application Signals and
# try to connect to the endpoint. Connection attempts are made for up to 10 minutes.
if [ $deployment_failed -eq 0 ]; then
echo "Installing application signals to the sample app"
. ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 3 \
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/enable-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} && \
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}" \
60
# if [ "${{ inputs.caller-workflow-name }}" == "main-build" ]; then
# echo "Patching staging adot image for main build:"
# execute_and_retry 2 "kubectl patch deploy -namazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' \
# -p='[{"op": \"replace\", \"path\": \"/spec/template/spec/containers/0/args/0\", \"value\": \"--auto-instrumentation-python-image=${{ inputs.application-signals-adot-image }}:${{ inputs.application-signals-adot-image-tag }}\"}]'"
# execute_and_retry 2 "kubectl delete pods --all -n amazon-cloudwatch"
# execute_and_retry 2 "kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch"
# fi
echo "Patching staging adot image for main build"
execute_and_retry 2 "kubectl patch deploy -namazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' \
-p='[{"op": \"replace\", \"path\": \"/spec/template/spec/containers/0/args/2\", \"value\": \"--auto-instrumentation-python-image=${{ inputs.application-signals-adot-image }}:${{ inputs.application-signals-adot-image-tag }}\"}]'"

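# Point the CloudWatch agent custom resource at the latest agent image, then recycle the
# amazon-cloudwatch and sample app pods so the webhook re-injects them with the patched images.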
execute_and_retry 2 "kubectl patch amazoncloudwatchagents -n amazon-cloudwatch cloudwatch-agent --type='json' -p='[{"op": "replace", "path": "/spec/image", "value": "public.ecr.aws/y8s3a7r9/cloudwatch-agent:latest"}]'"
execute_and_retry 2 "kubectl delete pods --all -n amazon-cloudwatch"
execute_and_retry 2 "kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch"
execute_and_retry 2 "kubectl delete pods --all -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" "" 60
execute_and_retry 2 "kubectl wait --for=condition=Ready --request-timeout '5m' pod --all -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" "" 10
echo "Attempting to connect to the main sample app endpoint"
python_app_endpoint=http://$(terraform output python_app_endpoint)
attempt_counter=0
max_attempts=60
until curl --output /dev/null --silent --head --fail "$(echo "$python_app_endpoint" | tr -d '"')"; do
if [ ${attempt_counter} -eq ${max_attempts} ]; then
echo "Failed to connect to endpoint ($python_app_endpoint). Will attempt to redeploy sample app."
deployment_failed=1
break
fi
printf '.'
attempt_counter=$(($attempt_counter+1))
sleep 10
done
fi
# If deployment_failed is 1, either the terraform deployment or the endpoint connection failed, so first destroy the
# resources created by terraform and try again.
if [ $deployment_failed -eq 1 ]; then
echo "Cleaning up Application Signal"
${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
# Running clean-app-signals.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}
echo "Destroying terraform"
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ inputs.aws-region }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
-var="test_namespace=${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="python_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_FE_SA_IMG }}" \
-var="python_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_RE_SA_IMG }}"
retry_counter=$(($retry_counter+1))
else
# If deployment succeeded, then exit the loop
break
fi
if [ $retry_counter -ge $max_retry ]; then
echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
exit 1
fi
done
- name: Get remote service pod name and IP
run: |
echo "REMOTE_SERVICE_DEPLOYMENT_NAME=$(kubectl get deployments -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].metadata.name}')" >> $GITHUB_ENV
echo "REMOTE_SERVICE_POD_IP=$(kubectl get pods -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].status.podIP}')" >> $GITHUB_ENV
- name: Verify pod ADOT image
run: |
kubectl get pods -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} --output json | \
jq '.items[0].status.initContainerStatuses[0].imageID'
- name: Verify pod CWAgent image
run: |
kubectl get pods -n amazon-cloudwatch --output json | \
jq '.items[0].status.containerStatuses[0].imageID'
- name: Get the sample app endpoint
run: echo "APP_ENDPOINT=$(terraform output python_app_endpoint)" >> $GITHUB_ENV
working-directory: terraform/python/eks
# This step speeds up the validation by generating the telemetry data in advance
- name: Call all test APIs
continue-on-error: true
run: |
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/outgoing-http-call"; echo
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/aws-sdk-call?ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}"; echo
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}"; echo
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/client-call"; echo
- name: Initiate Gradlew Daemon
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "./gradlew"
cleanup: "./gradlew clean"
max_retry: 4
sleep_time: 30
# Validation for application signals telemetry data
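# Each validation step runs when the app deployed successfully or an earlier validation failed,
# so a single run surfaces every failing signal type rather than stopping at the first failure.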
- name: Call endpoint and validate generated EMF logs
id: log-validation
if: steps.deploy-python-app.outcome == 'success' && !cancelled()
run: ./gradlew validator:run --args='-c python/eks/log-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ inputs.aws-region }}
--account-id ${{ env.ACCOUNT_ID }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP_NAME }}
--app-namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name python-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--query-string ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}
--rollup'
- name: Call endpoints and validate generated metrics
id: metric-validation
if: (steps.deploy-python-app.outcome == 'success' || steps.log-validation.outcome == 'failure') && !cancelled()
run: ./gradlew validator:run --args='-c python/eks/metric-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ inputs.aws-region }}
--account-id ${{ env.ACCOUNT_ID }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP_NAME }}
--app-namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name python-application-${{ env.TESTING_ID }}
--remote-service-name python-remote-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--query-string ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}
--rollup'
- name: Call endpoints and validate generated traces
id: trace-validation
if: (steps.deploy-python-app.outcome == 'success' || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
run: ./gradlew validator:run --args='-c python/eks/trace-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ inputs.aws-region }}
--account-id ${{ env.ACCOUNT_ID }}
--log-group ${{ env.LOG_GROUP_NAME }}
--app-namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name python-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--query-string ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}
--rollup'
- name: Publish metric on test result
if: always()
run: |
if [ "${{ steps.log-validation.outcome }}" = "success" ] && [ "${{ steps.metric-validation.outcome }}" = "success" ] && [ "${{ steps.trace-validation.outcome }}" = "success" ]; then
aws cloudwatch put-metric-data --namespace 'ADOT/GitHubActions' \
--metric-name Failure \
--dimensions repository=${{ github.repository }},branch=${{ github.ref_name }},workflow=${{ inputs.caller-workflow-name }} \
--value 0.0 \
--region ${{ inputs.aws-region }}
else
aws cloudwatch put-metric-data --namespace 'ADOT/GitHubActions' \
--metric-name Failure \
--dimensions repository=${{ github.repository }},branch=${{ github.ref_name }},workflow=${{ inputs.caller-workflow-name }} \
--value 1.0 \
--region ${{ inputs.aws-region }}
fi
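# A canary alarm could consume this metric, e.g. (sketch; alarm name and threshold are assumptions):
#   aws cloudwatch put-metric-alarm --alarm-name "python-eks-e2e-failure" \
#     --namespace 'ADOT/GitHubActions' --metric-name Failure --statistic Sum \
#     --period 300 --evaluation-periods 1 --threshold 1 \
#     --comparison-operator GreaterThanOrEqualToThreshold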
# Clean up Procedures
- name: Clean Up Application Signals
if: always()
continue-on-error: true
working-directory: enablement-script
run: |
./clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
# This step also deletes lingering resources from previous test runs
- name: Delete all sample app resources
if: always()
continue-on-error: true
timeout-minutes: 10
run: kubectl delete namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
- name: Terraform destroy
if: always()
continue-on-error: true
timeout-minutes: 5
working-directory: terraform/python/eks
run: |
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ inputs.aws-region }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="test_namespace=${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="python_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_FE_SA_IMG }}" \
-var="python_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_RE_SA_IMG }}"
- name: Remove aws access service account
if: always()
continue-on-error: true
run: |
eksctl delete iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--region ${{ inputs.aws-region }}