diff --git a/CODEOWNERS b/CODEOWNERS
index 24f705f0..e133bd49 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -3,4 +3,4 @@
 # https://github.com/blog/2392-introducing-code-owners
 
 # Amazon SageMaker CodeOwners
-* @akartsky @jkuruba @mbaijal @RedbackThomson @surajkota
+* @akartsky @mbaijal @surajkota
diff --git a/test/canary/Dockerfile.canary b/test/canary/Dockerfile.canary
new file mode 100644
index 00000000..4fc03850
--- /dev/null
+++ b/test/canary/Dockerfile.canary
@@ -0,0 +1,50 @@
+FROM ubuntu:18.04
+
+# Build time parameters
+ARG SERVICE=sagemaker
+
+RUN apt-get update && apt-get install -y curl \
+    wget \
+    git \
+    python3.8 \
+    python3-pip \
+    python3.8-dev \
+    vim \
+    sudo \
+    jq \
+    unzip
+
+# Install awscli
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip -qq awscliv2.zip \
+    && ./aws/install
+
+# Add yq repository and install yq
+RUN apt-get update && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:rmescandon/yq \
+    && apt-get update && apt-get install -y yq
+
+# Install kubectl
+RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.6/bin/linux/amd64/kubectl \
+    && chmod +x ./kubectl \
+    && cp ./kubectl /bin
+
+# Install eksctl
+RUN curl --silent --location "https://github.com/weaveworks/eksctl/releases/download/latest_release/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && mv /tmp/eksctl /bin
+
+# Install Helm
+RUN curl -q -L "https://get.helm.sh/helm-v3.2.4-linux-amd64.tar.gz" | tar zxf - -C /usr/local/bin/ \
+    && mv /usr/local/bin/linux-amd64/helm /usr/local/bin/helm \
+    && rm -r /usr/local/bin/linux-amd64 \
+    && chmod +x /usr/local/bin/helm
+
+ENV SERVICE_REPO_PATH=/$SERVICE-controller
+COPY ./test/e2e/requirements.txt requirements.txt
+
+RUN ln -s /usr/bin/python3.8 /usr/bin/python \
+    && python -m pip install --upgrade pip
+
+RUN python -m pip install -r requirements.txt
+
+WORKDIR $SERVICE_REPO_PATH
+CMD ["./test/canary/scripts/run_test.sh"]
\ No newline at end of file
diff --git a/test/canary/canary.buildspec.yaml b/test/canary/canary.buildspec.yaml
new file mode 100644
index 00000000..6e94affe
--- /dev/null
+++ b/test/canary/canary.buildspec.yaml
@@ -0,0 +1,29 @@
+version: 0.2
+
+phases:
+  pre_build:
+    commands:
+      # Make all shell scripts executable. This is required when running code copied from S3
+      - find ./ -type f -name "*.sh" -exec chmod +x {} \;
+
+      # Pull the cached test image
+      - aws ecr get-login-password --region $CLUSTER_REGION | docker login --username AWS --password-stdin $ECR_CACHE_URI || true
+      - docker pull ${ECR_CACHE_URI}:latest --quiet || true
+
+      # Login to dockerhub to avoid hitting the pull throttle limit
+      - echo "$DOCKER_CONFIG_PASSWORD" | docker login -u "$DOCKER_CONFIG_USERNAME" --password-stdin
+
+      # Build test image
+      - >
+        docker build -f ./test/canary/Dockerfile.canary .
+        -t ${ECR_CACHE_URI}:latest
+        --build-arg SERVICE="${SERVICE##*/}" --quiet
+        || echo "Docker Build Failed"
+  build:
+    commands:
+      # Run tests, forwarding the build environment into the container
+      - docker run --name ack-canary $(env | cut -f1 -d= | sed 's/^/-e /') --mount type=bind,source="$(pwd)/",target="/${SERVICE}-controller/" ${ECR_CACHE_URI}:latest
+
+      # Push test image to cache ECR repo
+      - docker push ${ECR_CACHE_URI}:latest || true
+
diff --git a/test/canary/scripts/install_controller_helm.sh b/test/canary/scripts/install_controller_helm.sh
new file mode 100755
index 00000000..9b1363a3
--- /dev/null
+++ b/test/canary/scripts/install_controller_helm.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# Deploy the ACK controller Helm chart
+
+function install_helm_chart() {
+  local service="$1"
+  local oidc_role_arn="$2"
+  local region="$3"
+  local namespace="$4"
+
+  yq w -i helm/values.yaml "serviceAccount.annotations" ""
+  yq w -i helm/values.yaml 'serviceAccount.annotations."eks.amazonaws.com/role-arn"' "$oidc_role_arn"
+  yq w -i helm/values.yaml "aws.region" "$region"
+
+  kubectl create namespace "$namespace"
+  helm install -n "$namespace" "ack-$service-controller" helm
+}
\ No newline at end of file
diff --git a/test/canary/scripts/run_test.sh b/test/canary/scripts/run_test.sh
new file mode 100755
index 00000000..4546650f
--- /dev/null
+++ b/test/canary/scripts/run_test.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Runs the canary tests; cleanup runs on EXIT regardless of errors
+
+# Inputs to this script, as environment variables:
+# SERVICE
+# SERVICE_REGION
+# CLUSTER_REGION
+# CLUSTER_NAME
+# SERVICE_REPO_PATH
+# NAMESPACE
+
+set -euo pipefail
+export NAMESPACE=${NAMESPACE:-"ack-system"}
+export AWS_DEFAULT_REGION=$SERVICE_REGION
+export E2E_DIR=$SERVICE_REPO_PATH/test/e2e/
+SCRIPTS_DIR=${SERVICE_REPO_PATH}/test/canary/scripts
+
+source $SCRIPTS_DIR/setup_oidc.sh
+source $SCRIPTS_DIR/install_controller_helm.sh
+
+function print_controller_logs() {
+  pod_id=$( kubectl get pods -n $NAMESPACE --field-selector="status.phase=Running" \
+    --sort-by=.metadata.creationTimestamp \
+    | grep "ack-$SERVICE-controller" | awk '{print $1}' 2>/dev/null )
+
+  kubectl -n $NAMESPACE logs "$pod_id"
+}
+
+function cleanup {
+  echo "Cleaning up resources"
+  set +e
+  kubectl delete endpoints.sagemaker --all
+  kubectl delete endpointconfigs --all
+  kubectl delete models --all
+  kubectl delete trainingjobs --all
+  kubectl delete processingjobs --all
+  kubectl delete transformjobs --all
+  kubectl delete hyperparametertuningjobs --all
+  kubectl delete dataqualityjobdefinitions --all
+  kubectl delete modelbiasjobdefinitions --all
+  kubectl delete modelexplainabilityjobdefinitions --all
+  kubectl delete modelqualityjobdefinitions --all
+  kubectl delete monitoringschedules --all
+  kubectl delete adoptedresources --all
+
+  print_controller_logs
+
+  helm delete -n $NAMESPACE ack-$SERVICE-controller
+  kubectl delete namespace $NAMESPACE
+
+  cd $E2E_DIR
+  export PYTHONPATH=..
+  python service_cleanup.py
+
+}
+trap cleanup EXIT
+
+# Update kubeconfig
+aws --region $CLUSTER_REGION eks update-kubeconfig --name $CLUSTER_NAME
+
+# Setup OIDC
+create_oidc_role "$CLUSTER_NAME" "$CLUSTER_REGION" "$NAMESPACE"
+
+# Install service helm chart
+install_helm_chart "$SERVICE" "$OIDC_ROLE_ARN" "$SERVICE_REGION" "$NAMESPACE"
+
+echo "Verifying that the helm chart deployed properly"
+kubectl -n $NAMESPACE get pods
+kubectl get crds
+
+pushd $E2E_DIR
+  export PYTHONPATH=..
+  # create resources for test
+  python service_bootstrap.py
+  sleep 5m
+
+  # run tests
+  echo "Run Tests"
+  pytest -n 10 --dist loadfile --log-cli-level INFO -m canary
+popd
\ No newline at end of file
diff --git a/test/canary/scripts/setup_oidc.sh b/test/canary/scripts/setup_oidc.sh
new file mode 100755
index 00000000..691d8d75
--- /dev/null
+++ b/test/canary/scripts/setup_oidc.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# OIDC Setup
+
+# Associate an IAM OIDC provider with an EKS cluster and print its OIDC issuer URL (scheme stripped)
+function get_oidc_id() {
+  local cluster_name="$1"
+  local region="$2"
+  eksctl utils associate-iam-oidc-provider --cluster $cluster_name --region $region --approve >&2  # keep stdout clean for the echoed URL
+  local oidc_url=$(aws eks describe-cluster --region $region --name $cluster_name --query "cluster.identity.oidc.issuer" --output text | cut -c9-)
+  echo "${oidc_url}"
+}
+
+
+function generate_trust_policy() {
+  local oidc_url="$1"
+  local namespace="$2"
+  local account_id=$(aws sts get-caller-identity --output text --query "Account")
+
+  cat <<EOF > trust.json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "arn:aws:iam::${account_id}:oidc-provider/${oidc_url}"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "${oidc_url}:aud": "sts.amazonaws.com",
+          "${oidc_url}:sub": ["system:serviceaccount:${namespace}:ack-sagemaker-controller"]
+        }
+      }
+    }
+  ]
+}
+EOF
+}
+
+function create_oidc_role() {
+  local cluster_name="$1"
+  local region="$2"
+  local namespace="$3"
+  local oidc_role_name=ack-oidc-role-$cluster_name-$namespace
+
+  # Create the role only if it does not exist
+  set +e
+  aws iam get-role --role-name ${oidc_role_name} >/dev/null 2>&1
+  exit_code=$?
+  set -euo pipefail
+
+  if [[ $exit_code -eq 0 ]]; then
+    echo "A role for this cluster and namespace already exists in this account; assuming it grants SageMaker access and proceeding."
+  else
+    echo "Creating new IAM role: $oidc_role_name"
+    local oidc_url=$(get_oidc_id "$cluster_name" "$region")
+    local trustfile="trust.json"
+    generate_trust_policy "$oidc_url" "$namespace"
+    aws iam create-role --role-name "$oidc_role_name" --assume-role-policy-document file://${trustfile}
+    aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+    aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
+    rm "${trustfile}"
+  fi
+  local oidc_role_arn=$(aws iam get-role --role-name $oidc_role_name --output text --query 'Role.Arn')
+  export OIDC_ROLE_ARN=$oidc_role_arn
+}
\ No newline at end of file
diff --git a/test/e2e/requirements.txt b/test/e2e/requirements.txt
index fb0a773b..8afa5151 100644
--- a/test/e2e/requirements.txt
+++ b/test/e2e/requirements.txt
@@ -1,2 +1,2 @@
-acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@955d7831ee374a212250179e95a5f3b75e555fd9
+acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@5ed60a505afa953096e53c9d3d6779830250915b
 black==20.8b1
diff --git a/test/e2e/service_bootstrap.py b/test/e2e/service_bootstrap.py
index b9f1f787..c2063b9e 100644
--- a/test/e2e/service_bootstrap.py
+++ b/test/e2e/service_bootstrap.py
@@ -17,6 +17,7 @@
 import json
 import logging
 import time
+import subprocess
 
 from acktest import resources
 from acktest.aws.identity import get_region, get_account_id
@@ -88,7 +89,13 @@ def create_data_bucket() -> str:
     source_bucket = s3_resource.Bucket(SAGEMAKER_SOURCE_DATA_BUCKET)
     destination_bucket = s3_resource.Bucket(bucket_name)
 
-    duplicate_bucket_contents(source_bucket, destination_bucket)
+    temp_dir = "/tmp/ack_s3_data"
+    # duplicate_bucket_contents(source_bucket, destination_bucket)
+    # workaround to copy objects when the buckets are in different regions
+    # TODO: check if there is a better way and merge to test-infra
+    subprocess.call(['mkdir', '-p', temp_dir])
+    subprocess.call(['aws', 's3', 'sync', f's3://{SAGEMAKER_SOURCE_DATA_BUCKET}', f'{temp_dir}/', '--quiet'])
+    subprocess.call(['aws', 's3', 'sync', f'{temp_dir}/', f's3://{bucket_name}', '--quiet'])
 
     logging.info(f"Synced data bucket")
 
diff --git a/test/e2e/tests/test_endpoint_config.py b/test/e2e/tests/test_endpoint_config.py
index 5c60a757..f4ad9dc3 100644
--- a/test/e2e/tests/test_endpoint_config.py
+++ b/test/e2e/tests/test_endpoint_config.py
@@ -26,7 +26,7 @@
     create_sagemaker_resource,
 )
 from e2e.replacement_values import REPLACEMENT_VALUES
-from e2e.common.config import config as cfg
+from e2e.common import config as cfg
 
 
 @pytest.fixture(scope="module")
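
For reference, a minimal sketch of exercising the canary outside CodeBuild. It mirrors the docker build/run invocations in canary.buildspec.yaml; the image tag, cluster name, and regions below are illustrative placeholders, and it assumes the calling environment has AWS credentials and an existing EKS cluster:

    # Build the canary image (SERVICE defaults to sagemaker in Dockerfile.canary)
    docker build -f ./test/canary/Dockerfile.canary -t ack-canary:local .

    # Run the tests from the repo root, mounting the controller repo at the
    # path run_test.sh expects; all values below are placeholders
    docker run --rm --name ack-canary \
      -e SERVICE=sagemaker \
      -e SERVICE_REGION=us-west-2 \
      -e CLUSTER_REGION=us-west-2 \
      -e CLUSTER_NAME=my-canary-cluster \
      -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_SESSION_TOKEN \
      --mount type=bind,source="$(pwd)/",target=/sagemaker-controller/ \
      ack-canary:local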