## Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
## SPDX-License-Identifier: Apache-2.0
# This is a reusable workflow for running the E2E test for Application Signals.
# It is meant to be called from another workflow.
# Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview
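# A minimal sketch of a caller job (the workflow path, job name, and input values below are
# illustrative, not prescriptive):
#
#   jobs:
#     python-eks-e2e:
#       uses: ./.github/workflows/application-signals-python-e2e-eks-test.yml
#       with:
#         aws-region: us-east-1
#         test-cluster-name: e2e-python-canary-test
#         caller-workflow-name: appsignals-e2e-eks-canary-test
#       secrets: inherit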
name: Application Signals Enablement E2E Testing - Python EKS
on:
workflow_call:
inputs:
aws-region:
required: true
type: string
test-cluster-name:
required: true
type: string
application-signals-adot-image:
required: false
type: string
application-signals-adot-image-tag:
required: false
type: string
caller-workflow-name:
required: true
type: string
permissions:
id-token: write
contents: read
env:
METRIC_NAMESPACE: ApplicationSignals
LOG_GROUP_NAME: /aws/application-signals/data
TEST_RESOURCES_FOLDER: ${GITHUB_WORKSPACE}
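# Note: ${GITHUB_WORKSPACE} is stored as a literal here; it is expanded by the shell when
# ${{ env.TEST_RESOURCES_FOLDER }} is substituted into run steps below.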
jobs:
python-e2e-eks-test:
runs-on: ubuntu-latest
container:
image: public.ecr.aws/h6o3z5z9/aws-application-signals-test-framework-workflow-container:latest
steps:
- uses: actions/checkout@v4
with:
repository: aws-observability/aws-application-signals-test-framework
ref: ga-python
fetch-depth: 0
- name: Download enablement script
uses: ./.github/workflows/actions/execute_and_retry
with:
pre-command: "mkdir enablement-script && cd enablement-script"
command: "wget https://raw.githubusercontent.com/aws-observability/application-signals-demo/main/scripts/eks/appsignals/enable-app-signals.sh
&& wget https://raw.githubusercontent.com/aws-observability/application-signals-demo/main/scripts/eks/appsignals/clean-app-signals.sh"
cleanup: "rm -f enable-app-signals.sh && rm -f clean-app-signals.sh"
post-command: "chmod +x enable-app-signals.sh && chmod +x clean-app-signals.sh"
# Pin the add-on version in preparation for the next release so we do not get alarms once the release is done
- name: Enforce EKS add on v1.6.0-eksbuild.1
working-directory: enablement-script
run: |
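# Append the version flag directly after the add-on name flag; the resulting call contains (illustrative):
#   --addon-name amazon-cloudwatch-observability \
#   --addon-version v1.6.0-eksbuild.1 \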
sed -i '\#--addon-name amazon-cloudwatch-observability \\#a--addon-version v1.6.0-eksbuild.1 \\' enable-app-signals.sh
- name: Remove log group deletion command
if: always()
working-directory: enablement-script
run: |
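# Blank out the delete-log-group call so the shared Application Signals log group is not
# removed from under other tests running against the same account (assumed rationale).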
delete_log_group="aws logs delete-log-group --log-group-name '/aws/appsignals/eks' --region \$REGION"
sed -i "s#$delete_log_group##g" clean-app-signals.sh
- name: Generate testing id and python sample app namespace
run: |
echo TESTING_ID="${{ inputs.aws-region }}-${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
echo PYTHON_SAMPLE_APP_NAMESPACE="ns-${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.E2E_SECRET_TEST_ROLE_ARN }}
aws-region: us-east-1
- name: Retrieve account
uses: aws-actions/aws-secretsmanager-get-secrets@v1
with:
secret-ids:
ACCOUNT_ID, region-account/${{ inputs.aws-region }}
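# The retrieved secret is exported to the job environment as ACCOUNT_ID; it is used below
# for the test role ARN and the ECR image URIs.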
# ADOT_E2E_TEST_ROLE_ARN is used to access main build e2e test cluster
# E2E_TEST_ROLE_ARN is used to access canary e2e test cluster
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ env.ACCOUNT_ID }}:role/${{ inputs['caller-workflow-name'] == 'main-build' && secrets.ADOT_E2E_TEST_ROLE_ARN || secrets.E2E_TEST_ROLE_ARN }}
aws-region: ${{ inputs.aws-region }}
- name: Set up kubeconfig
run: aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}
- name: Add eksctl to GitHub Path
run: |
echo "${{ github.workspace }}/eksctl" >> $GITHUB_PATH
- name: Create role for AWS access from the sample app
id: create_service_account
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "eksctl create iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--role-name eks-s3-access-${{ env.TESTING_ID }} \
--attach-policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess \
--region ${{ inputs.aws-region }} \
--approve"
cleanup: "eksctl delete iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--region ${{ inputs.aws-region }}"
sleep_time: 60
- name: Initiate Terraform
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/python/eks && terraform init && terraform validate"
cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl"
sleep_time: 60
- name: Deploy sample app via terraform and wait for the endpoint to come online
id: deploy-python-app
working-directory: terraform/python/eks
run: |
# Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online.
# There may be occasional failures due to transient issues, so try up to 2 times.
# A deployment_failed value of 0 means the terraform deployment succeeded and the endpoint is reachable,
# while 1 means one of them failed at some point.
retry_counter=0
max_retry=2
while [ $retry_counter -lt $max_retry ]; do
echo "Attempt $retry_counter"
deployment_failed=0
terraform apply -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ inputs.aws-region }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
-var="test_namespace=${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="python_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_FE_SA_IMG }}" \
-var="python_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_RE_SA_IMG }}" \
|| deployment_failed=$?
if [ $deployment_failed -eq 1 ]; then
echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
fi
# If deployment_failed is still 0, the terraform deployment succeeded; install Application Signals and
# try to connect to the endpoint. Connection attempts are made for up to 10 minutes.
if [ $deployment_failed -eq 0 ]; then
echo "Installing application signals to the sample app"
. ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh
execute_and_retry 3 \
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/enable-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
"${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} && \
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}" \
60
# if [ "${{ inputs.caller-workflow-name }}" == "main-build" ]; then
# echo "Patching staging adot image for main build:"
# execute_and_retry 2 "kubectl patch deploy -namazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' \
# -p='[{"op": \"replace\", \"path\": \"/spec/template/spec/containers/0/args/0\", \"value\": \"--auto-instrumentation-python-image=${{ inputs.application-signals-adot-image }}:${{ inputs.application-signals-adot-image-tag }}\"}]'"
# execute_and_retry 2 "kubectl delete pods --all -n amazon-cloudwatch"
# execute_and_retry 2 "kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch"
# fi
echo "Patching staging adot image for main build"
execute_and_retry 2 "kubectl patch deploy -namazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' \
-p='[{"op": \"replace\", \"path\": \"/spec/template/spec/containers/0/args/2\", \"value\": \"--auto-instrumentation-python-image=${{ inputs.application-signals-adot-image }}:${{ inputs.application-signals-adot-image-tag }}\"}]'"

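# Point the CloudWatch agent custom resource at the latest agent image, then recycle the
# amazon-cloudwatch and sample app pods so the webhook re-injects them with the patched images.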
execute_and_retry 2 "kubectl patch amazoncloudwatchagents -n amazon-cloudwatch cloudwatch-agent --type='json' -p='[{"op": "replace", "path": "/spec/image", "value": "public.ecr.aws/y8s3a7r9/cloudwatch-agent:latest"}]'"
execute_and_retry 2 "kubectl delete pods --all -n amazon-cloudwatch"
execute_and_retry 2 "kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch"
execute_and_retry 2 "kubectl delete pods --all -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" "" 60
execute_and_retry 2 "kubectl wait --for=condition=Ready --request-timeout '5m' pod --all -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" "" 10
echo "Attempting to connect to the main sample app endpoint"
python_app_endpoint=http://$(terraform output python_app_endpoint)
attempt_counter=0
max_attempts=60
until curl --output /dev/null --silent --head --fail "$(echo "$python_app_endpoint" | tr -d '"')"; do
if [ ${attempt_counter} -eq ${max_attempts} ]; then
echo "Failed to connect to endpoint ($python_app_endpoint). Will attempt to redeploy sample app."
deployment_failed=1
break
fi
printf '.'
attempt_counter=$(($attempt_counter+1))
sleep 10
done
fi
# If deployment_failed is 1, either the terraform deployment or the endpoint connection failed, so first destroy the
# resources created by terraform and try again.
if [ $deployment_failed -eq 1 ]; then
echo "Cleaning up Application Signal"
${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
# Running clean-app-signals.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ inputs.aws-region }}
echo "Destroying terraform"
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ inputs.aws-region }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
-var="test_namespace=${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="python_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_FE_SA_IMG }}" \
-var="python_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_RE_SA_IMG }}"
retry_counter=$(($retry_counter+1))
else
# If deployment succeeded, then exit the loop
break
fi
if [ $retry_counter -ge $max_retry ]; then
echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
exit 1
fi
done
- name: Get remote service pod name and IP
run: |
echo "REMOTE_SERVICE_DEPLOYMENT_NAME=$(kubectl get deployments -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].metadata.name}')" >> $GITHUB_ENV
echo "REMOTE_SERVICE_POD_IP=$(kubectl get pods -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].status.podIP}')" >> $GITHUB_ENV
- name: Verify pod ADOT image
run: |
kubectl get pods -n ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} --output json | \
jq '.items[0].status.initContainerStatuses[0].imageID'
- name: Verify pod CWAgent image
run: |
kubectl get pods -n amazon-cloudwatch --output json | \
jq '.items[0].status.containerStatuses[0].imageID'
- name: Get the sample app endpoint
run: echo "APP_ENDPOINT=$(terraform output python_app_endpoint)" >> $GITHUB_ENV
working-directory: terraform/python/eks
# This step speeds up the validation by generating the telemetry data in advance
- name: Call all test APIs
continue-on-error: true
run: |
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/outgoing-http-call"; echo
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/aws-sdk-call?ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}"; echo
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}"; echo
curl -S -s -o /dev/null "http://${{ env.APP_ENDPOINT }}/client-call"; echo
- name: Initiate Gradlew Daemon
uses: ./.github/workflows/actions/execute_and_retry
with:
command: "./gradlew"
cleanup: "./gradlew clean"
max_retry: 4
sleep_time: 30
# Validation for application signals telemetry data
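# Each validation step runs when the app deployed successfully or an earlier validation failed,
# so a single run surfaces every failing signal type rather than stopping at the first failure.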
- name: Call endpoint and validate generated EMF logs
id: log-validation
if: steps.deploy-python-app.outcome == 'success' && !cancelled()
run: ./gradlew validator:run --args='-c python/eks/log-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ inputs.aws-region }}
--account-id ${{ env.ACCOUNT_ID }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP_NAME }}
--app-namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name python-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--query-string ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}
--rollup'
- name: Call endpoints and validate generated metrics
id: metric-validation
if: (steps.deploy-python-app.outcome == 'success' || steps.log-validation.outcome == 'failure') && !cancelled()
run: ./gradlew validator:run --args='-c python/eks/metric-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ inputs.aws-region }}
--account-id ${{ env.ACCOUNT_ID }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP_NAME }}
--app-namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name python-application-${{ env.TESTING_ID }}
--remote-service-name python-remote-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--query-string ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}
--rollup'
- name: Call endpoints and validate generated traces
id: trace-validation
if: (steps.deploy-python-app.outcome == 'success' || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
run: ./gradlew validator:run --args='-c python/eks/trace-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ inputs.aws-region }}
--account-id ${{ env.ACCOUNT_ID }}
--log-group ${{ env.LOG_GROUP_NAME }}
--app-namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name python-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--query-string ip=${{ env.REMOTE_SERVICE_POD_IP }}&testingId=${{ env.TESTING_ID }}
--rollup'
- name: Publish metric on test result
if: always()
run: |
if [ "${{ steps.log-validation.outcome }}" = "success" ] && [ "${{ steps.metric-validation.outcome }}" = "success" ] && [ "${{ steps.trace-validation.outcome }}" = "success" ]; then
aws cloudwatch put-metric-data --namespace 'ADOT/GitHubActions' \
--metric-name Failure \
--dimensions repository=${{ github.repository }},branch=${{ github.ref_name }},workflow=${{ inputs.caller-workflow-name }} \
--value 0.0 \
--region ${{ inputs.aws-region }}
else
aws cloudwatch put-metric-data --namespace 'ADOT/GitHubActions' \
--metric-name Failure \
--dimensions repository=${{ github.repository }},branch=${{ github.ref_name }},workflow=${{ inputs.caller-workflow-name }} \
--value 1.0 \
--region ${{ inputs.aws-region }}
fi
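# A canary alarm could consume this metric, e.g. (sketch; alarm name and threshold are assumptions):
#   aws cloudwatch put-metric-alarm --alarm-name "python-eks-e2e-failure" \
#     --namespace 'ADOT/GitHubActions' --metric-name Failure --statistic Sum \
#     --period 300 --evaluation-periods 1 --threshold 1 \
#     --comparison-operator GreaterThanOrEqualToThreshold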
# Clean up Procedures
- name: Clean Up Application Signals
if: always()
continue-on-error: true
working-directory: enablement-script
run: |
./clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ inputs.aws-region }} \
${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
# This step also deletes lingering resources from previous test runs
- name: Delete all sample app resources
if: always()
continue-on-error: true
timeout-minutes: 10
run: kubectl delete namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}
- name: Terraform destroy
if: always()
continue-on-error: true
timeout-minutes: 5
working-directory: terraform/python/eks
run: |
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ inputs.aws-region }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="test_namespace=${{ env.PYTHON_SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="python_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_FE_SA_IMG }}" \
-var="python_remote_app_image=${{ env.ACCOUNT_ID }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/${{ secrets.APP_SIGNALS_PYTHON_E2E_RE_SA_IMG }}"
- name: Remove aws access service account
if: always()
continue-on-error: true
run: |
eksctl delete iamserviceaccount \
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.PYTHON_SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--region ${{ inputs.aws-region }}