2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -3,4 +3,4 @@
# https://github.com/blog/2392-introducing-code-owners

# Amazon SageMaker CodeOwners
* @akartsky @jkuruba @mbaijal @RedbackThomson @surajkota
* @akartsky @mbaijal @surajkota
50 changes: 50 additions & 0 deletions test/canary/Dockerfile.canary
@@ -0,0 +1,50 @@
FROM ubuntu:18.04

# Build time parameters
ARG SERVICE=sagemaker

RUN apt-get update && apt-get install -y curl \
    wget \
    git \
    python3.8 \
    python3-pip \
    python3.8-dev \
    vim \
    sudo \
    jq \
    unzip

# Install awscli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
&& unzip -qq awscliv2.zip \
&& ./aws/install

# Add yq repository and install yq
RUN apt-get update && apt install -y software-properties-common \
&& sudo add-apt-repository ppa:rmescandon/yq \
&& apt update && apt install -y yq

# Install kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.6/bin/linux/amd64/kubectl \
&& chmod +x ./kubectl \
&& cp ./kubectl /bin

# Install eksctl
RUN curl --silent --location "https://github.com/weaveworks/eksctl/releases/download/latest_release/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && mv /tmp/eksctl /bin

# Install Helm
RUN curl -q -L "https://get.helm.sh/helm-v3.2.4-linux-amd64.tar.gz" | tar zxf - -C /usr/local/bin/ \
&& mv /usr/local/bin/linux-amd64/helm /usr/local/bin/helm \
&& rm -r /usr/local/bin/linux-amd64 \
&& chmod +x /usr/local/bin/helm

ENV SERVICE_REPO_PATH=/$SERVICE-controller
COPY ./test/e2e/requirements.txt requirements.txt

RUN ln -s /usr/bin/python3.8 /usr/bin/python \
&& python -m pip install --upgrade pip

RUN python -m pip install -r requirements.txt

WORKDIR /$SERVICE_REPO_PATH
CMD ["./test/canary/scripts/run_test.sh"]
29 changes: 29 additions & 0 deletions test/canary/canary.buildspec.yaml
@@ -0,0 +1,29 @@
version: 0.2

phases:
  pre_build:
    commands:
      # Make all shell scripts executable. This is required when running code copied from S3
      - find ./ -type f -name "*.sh" -exec chmod +x {} \;

      # Get cached test image
      - aws ecr get-login-password --region $CLUSTER_REGION | docker login --username AWS --password-stdin $ECR_CACHE_URI || true
      - docker pull ${ECR_CACHE_URI}:latest --quiet || true

      # Login to dockerhub to avoid hitting throttle limit
      - docker login -u $DOCKER_CONFIG_USERNAME -p $DOCKER_CONFIG_PASSWORD

      # Build test image
      - >
        docker build -f ./test/canary/Dockerfile.canary . -t ${ECR_CACHE_URI}:latest
        --build-arg SERVICE="${SERVICE##*/}" --quiet
        || echo "Docker Build Failed" || true
  build:
    commands:
      # Run tests
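      # The env | cut | sed pipeline turns every current environment variable name into a -e flag,
      # forwarding the host environment into the container; the bind mount exposes the repo at /${SERVICE}-controller/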
      - docker run --name ack-canary $(env | cut -f1 -d= | sed 's/^/-e /') --mount type=bind,source="$(pwd)/",target="/${SERVICE}-controller/" ${ECR_CACHE_URI}:latest

      # Push test image to cache ECR repo
      - docker push ${ECR_CACHE_URI}:latest || true


17 changes: 17 additions & 0 deletions test/canary/scripts/install_controller_helm.sh
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Deploy ACK Helm Charts

function install_helm_chart() {
    local service="$1"
    local oidc_role_arn="$2"
    local region="$3"
    local namespace="$4"

    yq w -i helm/values.yaml "serviceAccount.annotations" ""
    yq w -i helm/values.yaml 'serviceAccount.annotations."eks.amazonaws.com/role-arn"' "$oidc_role_arn"
    yq w -i helm/values.yaml "aws.region" $region

    kubectl create namespace $namespace
    helm install -n $namespace ack-$service-controller helm
}
81 changes: 81 additions & 0 deletions test/canary/scripts/run_test.sh
@@ -0,0 +1,81 @@
#!/bin/bash

# cleanup on EXIT regardless of error

# Inputs to this file as environment variables
# SERVICE
# SERVICE_REGION
# CLUSTER_REGION
# CLUSTER_NAME
# SERVICE_REPO_PATH
# NAMESPACE

set -euo pipefail
export NAMESPACE=${NAMESPACE:-"ack-system"}
export AWS_DEFAULT_REGION=$SERVICE_REGION
export E2E_DIR=$SERVICE_REPO_PATH/test/e2e/
SCRIPTS_DIR=${SERVICE_REPO_PATH}/test/canary/scripts

source $SCRIPTS_DIR/setup_oidc.sh
source $SCRIPTS_DIR/install_controller_helm.sh

function print_controller_logs() {
    pod_id=$( kubectl get pods -n $NAMESPACE --field-selector="status.phase=Running" \
        --sort-by=.metadata.creationTimestamp \
        | grep ack-sagemaker-controller | awk '{print $1}' 2>/dev/null )

    kubectl -n $NAMESPACE logs "$pod_id"
}

function cleanup {
    echo "Cleaning up resources"
    set +e
    kubectl delete endpoints.sagemaker --all
Contributor

This means we won't be able to run CodeBuild tests in parallel, since one test might delete resources being used by another test.

Right now we don't need this, as we don't have a problem of leaking resources. And by doing this we won't find out if resources start leaking in the future.

I think we should just handle this kind of cleanup better in the tests themselves. (This will help when running tests locally too, since they might otherwise leave resources behind.)
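
For illustration, one way to make each test own its cleanup is a pytest yield fixture that always deletes the resource it created. This is only a rough sketch, not part of this PR; the CRD group/version/plural, namespace, and empty spec below are placeholders:

import pytest
from kubernetes import client, config

# Placeholder coordinates for a SageMaker ACK custom resource.
GROUP, VERSION, PLURAL = "sagemaker.services.k8s.aws", "v1alpha1", "models"
NAMESPACE = "default"

@pytest.fixture
def model_resource():
    config.load_kube_config()
    api = client.CustomObjectsApi()
    body = {
        "apiVersion": f"{GROUP}/{VERSION}",
        "kind": "Model",
        "metadata": {"name": "canary-example-model"},
        "spec": {},  # placeholder spec, not a valid Model
    }
    api.create_namespaced_custom_object(GROUP, VERSION, NAMESPACE, PLURAL, body)
    yield "canary-example-model"
    # Teardown runs whether the test passed or failed, so each test deletes only
    # the resource it created instead of relying on a global "--all" sweep.
    api.delete_namespaced_custom_object(GROUP, VERSION, NAMESPACE, PLURAL, "canary-example-model")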

Contributor Author

There will be some more work needed to make them run in parallel, such as not installing the Helm charts.

Let's keep this until then.

Contributor

There is also the problem of finalizers, where a delete might get stuck forever.
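
For reference, a hedged sketch (also not part of this PR) of force-clearing finalizers on a stuck resource with the Kubernetes Python client; the resource coordinates are placeholders, and this should only be a last resort during cleanup:

from kubernetes import client, config

def clear_finalizers(group, version, namespace, plural, name):
    # Emptying metadata.finalizers lets a delete that is stuck on them complete.
    config.load_kube_config()
    api = client.CustomObjectsApi()
    api.patch_namespaced_custom_object(
        group, version, namespace, plural, name,
        body={"metadata": {"finalizers": []}},
    )

# Example with placeholder names:
# clear_finalizers("sagemaker.services.k8s.aws", "v1alpha1", "ack-system", "endpoints", "my-endpoint")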

    kubectl delete endpointconfigs --all
    kubectl delete models --all
    kubectl delete trainingjobs --all
    kubectl delete processingjobs --all
    kubectl delete transformjobs --all
    kubectl delete hyperparametertuningjobs --all
    kubectl delete dataqualityjobdefinitions --all
    kubectl delete modelbiasjobdefinitions --all
    kubectl delete modelexplainabilityjobdefinitions --all
    kubectl delete modelqualityjobdefinitions --all
    kubectl delete monitoringschedules --all
    kubectl delete adoptedresources --all

    print_controller_logs

    helm delete -n $NAMESPACE ack-$SERVICE-controller
    kubectl delete namespace $NAMESPACE

    cd $E2E_DIR
    export PYTHONPATH=..
    python service_cleanup.py

}
trap cleanup EXIT

# Update kubeconfig
aws --region $CLUSTER_REGION eks update-kubeconfig --name $CLUSTER_NAME

# Setup OIDC
create_oidc_role "$CLUSTER_NAME" "$CLUSTER_REGION" "$NAMESPACE"

# Install service helm chart
install_helm_chart $SERVICE $OIDC_ROLE_ARN $SERVICE_REGION $NAMESPACE

echo "Log helm charts are deployed properly"
kubectl -n $NAMESPACE get pods
kubectl get crds

pushd $E2E_DIR
export PYTHONPATH=..
# create resources for test
python service_bootstrap.py
sleep 5m

# run tests
echo "Run Tests"
pytest -n 10 --dist loadfile --log-cli-level INFO -m canary
popd
67 changes: 67 additions & 0 deletions test/canary/scripts/setup_oidc.sh
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# OIDC Setup

# Associate the cluster's IAM OIDC provider and return the OIDC issuer URL without the https:// prefix
function get_oidc_id() {
    local cluster_name="$1"
    local region="$2"
    eksctl utils associate-iam-oidc-provider --cluster $cluster_name --region $region --approve
    local oidc_url=$(aws eks describe-cluster --region $region --name $cluster_name --query "cluster.identity.oidc.issuer" --output text | cut -c9-)
    echo "${oidc_url}"
}


# Write an IRSA trust policy (trust.json) that lets the controller's service account assume the role via the cluster's OIDC provider
function generate_trust_policy() {
    local oidc_url="$1"
    local namespace="$2"
    local account_id=$(aws sts get-caller-identity --output text --query "Account")

    cat <<EOF > trust.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::${account_id}:oidc-provider/${oidc_url}"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "${oidc_url}:aud": "sts.amazonaws.com",
          "${oidc_url}:sub": ["system:serviceaccount:${namespace}:ack-sagemaker-controller"]
        }
      }
    }
  ]
}
EOF
}

function create_oidc_role() {
    local cluster_name="$1"
    local region="$2"
    local namespace="$3"
    local oidc_role_name=ack-oidc-role-$cluster_name-$namespace

    # Create role only if it does not exist
    set +e
    aws iam get-role --role-name ${oidc_role_name}
    exit_code=$?
    set -euo pipefail

    if [[ $exit_code -eq 0 ]]; then
        echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding."
    else
        echo "Creating new IAM role: $oidc_role_name"
        local oidc_url=$(get_oidc_id "$cluster_name" "$region")
        local trustfile="trust.json"
        generate_trust_policy "$oidc_url" "$namespace"
        aws iam create-role --role-name "$oidc_role_name" --assume-role-policy-document file://${trustfile}
        aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
        aws iam attach-role-policy --role-name "$oidc_role_name" --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
        rm "${trustfile}"
    fi
    local oidc_role_arn=$(aws iam get-role --role-name $oidc_role_name --output text --query 'Role.Arn')
    export OIDC_ROLE_ARN=$oidc_role_arn
}
2 changes: 1 addition & 1 deletion test/e2e/requirements.txt
@@ -1,2 +1,2 @@
acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@955d7831ee374a212250179e95a5f3b75e555fd9
acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@5ed60a505afa953096e53c9d3d6779830250915b
black==20.8b1
9 changes: 8 additions & 1 deletion test/e2e/service_bootstrap.py
@@ -17,6 +17,7 @@
import json
import logging
import time
import subprocess

from acktest import resources
from acktest.aws.identity import get_region, get_account_id
@@ -88,7 +89,13 @@ def create_data_bucket() -> str:

    source_bucket = s3_resource.Bucket(SAGEMAKER_SOURCE_DATA_BUCKET)
    destination_bucket = s3_resource.Bucket(bucket_name)
    duplicate_bucket_contents(source_bucket, destination_bucket)
    temp_dir = "/tmp/ack_s3_data"
    # duplicate_bucket_contents(source_bucket, destination_bucket)
    # workaround to copy if buckets are across regions
    # TODO: check if there is a better way and merge to test-infra
    subprocess.call(['mkdir', f'{temp_dir}'])
    subprocess.call(['aws', 's3', 'sync', f's3://{SAGEMAKER_SOURCE_DATA_BUCKET}', f'./{temp_dir}/', '--quiet'])
    subprocess.call(['aws', 's3', 'sync', f'./{temp_dir}/', f's3://{bucket_name}', '--quiet'])

    logging.info(f"Synced data bucket")
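
As an aside on the TODO above, a possible alternative (only a sketch with assumed parameters, not part of this change) is to let S3 perform server-side copies, which work across regions without a local staging directory:

import boto3

def sync_bucket_cross_region(source_bucket, source_region, dest_bucket, dest_region):
    # CopyObject is performed server-side by S3, so nothing is downloaded locally.
    src = boto3.resource("s3", region_name=source_region).Bucket(source_bucket)
    dst = boto3.resource("s3", region_name=dest_region).Bucket(dest_bucket)
    for obj in src.objects.all():
        dst.copy({"Bucket": source_bucket, "Key": obj.key}, obj.key)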

2 changes: 1 addition & 1 deletion test/e2e/tests/test_endpoint_config.py
@@ -26,7 +26,7 @@
    create_sagemaker_resource,
)
from e2e.replacement_values import REPLACEMENT_VALUES
from e2e.common.config import config as cfg
from e2e.common import config as cfg


@pytest.fixture(scope="module")