From f18afeaba2ec77bd50da27b05e06552f24f84e06 Mon Sep 17 00:00:00 2001 From: Derek Frank Date: Mon, 16 Jun 2025 11:17:41 -0700 Subject: [PATCH 01/13] add script to validate that karpenter will not drift a cluster --- tests/assets/karpenter-no-drift-validation.sh | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100755 tests/assets/karpenter-no-drift-validation.sh diff --git a/tests/assets/karpenter-no-drift-validation.sh b/tests/assets/karpenter-no-drift-validation.sh new file mode 100755 index 00000000..85a1be08 --- /dev/null +++ b/tests/assets/karpenter-no-drift-validation.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Script to validate that an EKS cluster with Karpenter installed will not drift +# Validates: +# 1. NodePools have a disruption budget of 0% set +# 2. NodeClasses have pinned AMIs +echo "Checking disruption budgets in nodepool specs..." +echo "-----------------------------------" + +FAILED=0 +# Get all nodepools and check their disruption budget settings +for nodepool in $(kubectl get nodepools -o json | jq -r '.items[] | {name: .metadata.name, budgets: .spec.disruption.budgets[].nodes} | @json'); do + echo "$nodepool" + NAME=$(echo "$nodepool" | jq -r '.name') + NDB=$(echo "$nodepool" | jq -r '.budgets') + + # Remove any % symbol and convert to number + NDB_NUM=$(echo "$NDB" | sed 's/%//') + + if [ "$NDB_NUM" -eq 0 ]; then + echo "✅ Disruption budget correctly set to $NDB for nodepool: $NAME" + else + echo "❌ Disruption budget too high for nodepool: $NAME (current: $NDB)" + FAILED=1 + fi +done + +echo "Checking AMI versions in EC2NodeClass resources..." +echo "------------------------------------------------" + +# Get EC2NodeClass resources and check for @latest +for nodeclass in $(kubectl get ec2nodeclasses -o json | jq -r '.items[] | {name: .metadata.name, ami: .spec.amiFamily, amiSelector: .spec.amiSelectorTerms} | @json'); do + NAME=$(echo "$nodeclass" | jq -r '.name') + AMI_FAMILY=$(echo "$nodeclass" | jq -r '.ami') + AMI_SELECTOR=$(echo "$nodeclass" | jq -r '.amiSelector') + + echo "NodeClass: $NAME" + echo "AMI Family: $AMI_FAMILY" + echo "AMI Selector Terms: $AMI_SELECTOR" + + # Check if @latest is used in any selector terms + if echo "$AMI_SELECTOR" | grep -q "@latest"; then + echo "❌ WARNING: @latest version detected in NodeClass $NAME" + FAILED=1 + else + echo "✅ No @latest version found in NodeClass $NAME" + fi + echo "------------------------------------------------" +done + +echo "-----------------------------------" +if [ "$FAILED" -eq 1 ]; then + echo "❌ Some nodepools or nodeclasses do not have the correct configuration" + exit 1 +else + echo "✅ All nodepools and nodeclasses have the correct configuration" + exit 0 +fi From f9abe30c293d158ceb16e18009890bcbc33213e7 Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Fri, 11 Jul 2025 16:01:58 -0700 Subject: [PATCH 02/13] WIP draft of karpenter setup --- .../controller-role-policy-document.json | 112 ++++++++++++ .../controller-role-trust-policy.json | 18 ++ .../karpenter/node-role-policy-document.json | 12 ++ .../generators/karpenter/kubectl-drift.yaml | 0 .../generators/karpenter/kubectl-scale.yaml | 0 .../karpenter/awscli-controller-role.yaml | 172 ++++++++++++++++++ .../karpenter/awscli-instanceprofiles.yaml | 17 ++ .../tasks/setup/karpenter/awscli-mng.yaml | 39 ++++ .../setup/karpenter/awscli-node-role.yaml | 42 +++++ .../karpenter/awscli-securitygroups.yaml | 23 +++ .../tasks/setup/karpenter/awscli-subnets.yaml | 24 +++ .../karpenter/helm-karpenter-install.yaml | 53 ++++++
.../karpenter/kubectl-create-namespace.yaml | 18 ++ .../setup/karpenter/kubectl-nodeclass.yaml | 95 ++++++++++ .../setup/karpenter/kubectl-nodepools.yaml | 85 +++++++++ .../karpenter/awscli-controller-role.yaml | 0 .../karpenter/awscli-instanceprofiles.yaml | 0 .../tasks/teardown/karpenter/awscli-mng.yaml | 0 .../teardown/karpenter/awscli-node-role.yaml | 0 .../karpenter/helm-karpenter-uninstall.yaml | 0 20 files changed, 710 insertions(+) create mode 100644 tests/assets/karpenter/controller-role-policy-document.json create mode 100644 tests/assets/karpenter/controller-role-trust-policy.json create mode 100644 tests/assets/karpenter/node-role-policy-document.json create mode 100644 tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml create mode 100644 tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-mng.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-node-role.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/helm-karpenter-uninstall.yaml diff --git a/tests/assets/karpenter/controller-role-policy-document.json b/tests/assets/karpenter/controller-role-policy-document.json new file mode 100644 index 00000000..483c6cd0 --- /dev/null +++ b/tests/assets/karpenter/controller-role-policy-document.json @@ -0,0 +1,112 @@ +{ + "Statement": [ + { + "Action": [ + "ssm:GetParameter", + "ec2:DescribeImages", + "ec2:RunInstances", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeInstances", + "ec2:DescribeInstanceTypes", + "ec2:DescribeInstanceTypeOfferings", + "ec2:DeleteLaunchTemplate", + "ec2:CreateTags", + "ec2:CreateLaunchTemplate", + "ec2:CreateFleet", + "ec2:DescribeSpotPriceHistory", + "pricing:GetProducts" + ], + "Effect": "Allow", + "Resource": "*", + "Sid": "Karpenter" + }, + { + "Action": "ec2:TerminateInstances", + "Condition": { + "StringLike": { + "ec2:ResourceTag/karpenter.sh/nodepool": "*" + } + }, + "Effect": "Allow", + "Resource": "*", + "Sid": "ConditionalEC2Termination" + }, + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:$(params.aws-partition):iam::$(params.aws-account-id):role/KarpenterNodeRole-$(params.cluster-name)", + "Sid": "PassNodeIAMRole" + }, + { + "Effect": "Allow", + "Action": "eks:DescribeCluster", + "Resource": 
"arn:$(params.aws-partition):eks:$(params.aws-region):$(params.aws-account-id):cluster/$(params.cluster-name)", + "Sid": "EKSClusterEndpointLookup" + }, + { + "Sid": "AllowScopedInstanceProfileCreationActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:CreateInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" + }, + "StringLike": { + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowScopedInstanceProfileTagActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:TagInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)", + "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowScopedInstanceProfileActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:AddRoleToInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:DeleteInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowInstanceProfileReadActions", + "Effect": "Allow", + "Resource": "*", + "Action": "iam:GetInstanceProfile" + } + ], + "Version": "2012-10-17" +} \ No newline at end of file diff --git a/tests/assets/karpenter/controller-role-trust-policy.json b/tests/assets/karpenter/controller-role-trust-policy.json new file mode 100644 index 00000000..83acbb2f --- /dev/null +++ b/tests/assets/karpenter/controller-role-trust-policy.json @@ -0,0 +1,18 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "arn:$(params.aws-partition):iam::$(params.aws-account-id):oidc-provider/${OIDC_ENDPOINT#*//}" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "${OIDC_ENDPOINT#*//}:aud": "sts.amazonaws.com", + "${OIDC_ENDPOINT#*//}:sub": "system:serviceaccount:karpenter:karpenter" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/assets/karpenter/node-role-policy-document.json b/tests/assets/karpenter/node-role-policy-document.json new file mode 100644 index 00000000..19859682 --- /dev/null +++ b/tests/assets/karpenter/node-role-policy-document.json @@ -0,0 +1,12 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "ec2.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} \ No newline at end of file diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml new file mode 100644 index 00000000..e69de29b diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml new file mode 100644 index 00000000..e69de29b diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml 
b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml new file mode 100644 index 00000000..2da13759 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml @@ -0,0 +1,172 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-controller-role + namespace: karpenter +spec: + description: | + Creates the karpenter Controller Role + results: + - name: node-role-arn + description: Stores the controller role arn created by the task + params: + - name: cluster-name + description: The name of the cluster + - name: eks-endpoint + description: endpoint + - name: aws-region + description: region + - name: aws-account-id + description: account id + - name: aws-partition + description: partition + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + OIDC_ENDPOINT="$(aws eks --endpoint $(params.eks-endpoint) describe-cluster --name "$(params.cluster-name)" \ + --query "cluster.identity.oidc.issuer" --output text)" + OIDC_ID=$(aws eks --endpoint $(params.eks-endpoint) describe-cluster --name $(params.cluster-name) --region $(params.aws-region) --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) + + cat << EOF > controller-trust-policy.json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "arn:$(params.aws-partition):iam::$(params.aws-account-id):oidc-provider/${OIDC_ENDPOINT#*//}" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "${OIDC_ENDPOINT#*//}:aud": "sts.amazonaws.com", + "${OIDC_ENDPOINT#*//}:sub": "system:serviceaccount:karpenter:karpenter" + } + } + } + ] + } + EOF + + aws iam create-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ + --assume-role-policy-document file://controller-trust-policy.json + + cat << EOF > controller-policy.json + { + "Statement": [ + { + "Action": [ + "ssm:GetParameter", + "ec2:DescribeImages", + "ec2:RunInstances", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeInstances", + "ec2:DescribeInstanceTypes", + "ec2:DescribeInstanceTypeOfferings", + "ec2:DeleteLaunchTemplate", + "ec2:CreateTags", + "ec2:CreateLaunchTemplate", + "ec2:CreateFleet", + "ec2:DescribeSpotPriceHistory", + "pricing:GetProducts" + ], + "Effect": "Allow", + "Resource": "*", + "Sid": "Karpenter" + }, + { + "Action": "ec2:TerminateInstances", + "Condition": { + "StringLike": { + "ec2:ResourceTag/karpenter.sh/nodepool": "*" + } + }, + "Effect": "Allow", + "Resource": "*", + "Sid": "ConditionalEC2Termination" + }, + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:$(params.aws-partition):iam::$(params.aws-account-id):role/KarpenterNodeRole-$(params.cluster-name)", + "Sid": "PassNodeIAMRole" + }, + { + "Effect": "Allow", + "Action": "eks:DescribeCluster", + "Resource": "arn:$(params.aws-partition):eks:$(params.aws-region):$(params.aws-account-id):cluster/$(params.cluster-name)", + "Sid": "EKSClusterEndpointLookup" + }, + { + "Sid": "AllowScopedInstanceProfileCreationActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:CreateInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" + }, + "StringLike": { + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowScopedInstanceProfileTagActions", + "Effect": 
"Allow", + "Resource": "*", + "Action": [ + "iam:TagInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)", + "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowScopedInstanceProfileActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:AddRoleToInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:DeleteInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowInstanceProfileReadActions", + "Effect": "Allow", + "Resource": "*", + "Action": "iam:GetInstanceProfile" + } + ], + "Version": "2012-10-17" + } + E + + aws iam put-role-policy --role-name "KarpenterControllerRole-$(params.cluster-name)" \ + --policy-name "KarpenterControllerPolicy-$(params.cluster-name)" \ + --policy-document file://controller-policy.json \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml new file mode 100644 index 00000000..579fceb4 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml @@ -0,0 +1,17 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-instanceprofiles + namespace: karpenter +spec: + description: | + Creates the karpenter instance profile + params: + - name: cluster-name + description: The name of the cluster + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + aws iam create-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" + aws iam add-role-to-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml new file mode 100644 index 00000000..ac1d3317 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml @@ -0,0 +1,39 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-mng + namespace: karpenter +spec: + description: | + Creates the karpenter MNG + params: + - name: cluster-name + description: The name of the cluster + - name: node-role-arn + description: + steps: + - name: create-mng + image: alpine/k8s:1.23.7 + script: | + SUBNET_IDS=$(aws ec2 describe-subnets \ + --filters "Name=tag:karpenter.sh/discovery,Values=$(params.cluster-name)" \ + --query 'Subnets[*].SubnetId' \ + --output text) + + aws eks create-nodegroup \ + --cluster-name $(params.cluster-name) \ + --nodegroup-name karpenter-system-large \ + --node-role $(params.node-role-arn) \ + --instance-types r5.24xlarge \ + --scaling-config minSize=2,maxSize=3,desiredSize=2 \ + --subnets ${SUBNET_IDS} \ + --labels dedicated=karpenter \ + --region $(params.aws-region) \ + --endpoint-url 
https://api.beta.us-west-2.wesley.amazonaws.com + + + # quick validation + aws eks list-nodegroups \ + --endpoint https://api.beta.us-west-2.wesley.amazonaws.com \ + --cluster-name eks-titan-perflab-c05 \ + --region $(params.aws-region) \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml new file mode 100644 index 00000000..00e7e292 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml @@ -0,0 +1,42 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-node-role + namespace: karpenter +spec: + description: | + Creates the karpenter Node Role + results: + - name: node-role-arn + description: Stores the node role arn created by the task + params: + - name: cluster-name + description: The name of the cluster + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + aws iam create-role --role-name "KarpenterNodeRole-$(params.cluster-name)" \ + --assume-role-policy-document '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "ec2.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }' + + # Attach required policies + aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ + --policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy + aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ + --policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy + aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ + --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly + aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ + --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + aws iam get-role --role-name KarpenterNodeRole-$(params.cluster-name) --query 'Role.[Arn]' --output text > $(results.role-arn.path) \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml new file mode 100644 index 00000000..7bbe52ec --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml @@ -0,0 +1,23 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-securitygroups + namespace: karpenter +spec: + description: | + Creates the karpenter Subnet Role + params: + - name: cluster-name + description: The name of the cluster + - name: cfn-stack-arn + description: The arn of the stack that the security groups are created by + - name: aws-region + description: AWS region that test is running in + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + aws ec2 create-tags \ + --resources $(aws ec2 describe-security-groups --filter "Name=tag:aws:cloudformation:stack-id,Values=$(params.cfn-stack-arn)" --region $(params.aws-region) --output text --query 'SecurityGroupId') \ + --region $(params.aws-region) \ + --tags Key=karpenter.sh/discovery,Value=$(params.cluster-name) \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml new file mode 100644 index 00000000..e68c4b70 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml @@ -0,0 +1,24 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: 
awscli-subnets + namespace: karpenter +spec: + description: | + Creates the karpenter Subnet Role + params: + - name: cluster-name + description: The name of the cluster + - name: aws-region + description: AWS region that test is running in + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + for SUBNET in $(aws ec2 describe-subnets --filter "Name=tag:aws:cloudformation:stack-id,Values=$(params.cfn-stack-arn)" --region $(params.aws-region) --output text --query 'SubnetId') + do + aws ec2 create-tags \ + --resources ${SUBNET} \ + --region $(params.aws-region) \ + --tags Key=karpenter.sh/discovery,Value=$(params.cluster-name) + done \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml new file mode 100644 index 00000000..92e16616 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml @@ -0,0 +1,53 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: helm-karpenter-install + namespace: karpenter +spec: + description: | + Install karpenter on the cluster + params: + - name: cluster-name + description: The name of the cluster + - name: node-role-arn + description: + steps: + - name: install karpenter + image: alpine/k8s:1.23.7 + script: | + aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 953421922360.dkr.ecr.us-west-2.amazonaws.com + + helm upgrade --install karpenter oci://953421922360.dkr.ecr.us-west-2.amazonaws.com/karpenter/karpenter --version ${KARPENTER_VERSION} \ + --namespace "karpenter" \ + --create-namespace \ + --set "settings.clusterName=${CLUSTER_NAME}" \ + --set "settings.interruptionQueue=" \ + --set "settings.eksControlPlane=true" \ + --set-string "settings.awsCreateQPS=60" \ + --set "settings.featureGates.disableMetricsControllers=true" \ + --set "settings.featureGates.nodeRepair=true" \ + --set settings.featureGates.reservedCapacity="true" \ + --set settings.featureGates.spotToSpotConsolidation="true" \ + --set settings.featureGates.disableMetricsControllers="true" \ + --set settings.preferencePolicy=Ignore \ + --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=${CONTROLLER_ROLE_ARN}" \ + --set controller.resources.requests.cpu=60 \ + --set controller.resources.requests.memory=200Gi \ + --set controller.resources.limits.cpu=60 \ + --set controller.resources.limits.memory=200Gi \ + --set "controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=dedicated" \ + --set "controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=In" \ + --set "controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]=karpenter" \ + --set "tolerations[0].key=dedicated" \ + --set "tolerations[0].value=karpenter" \ + --set "tolerations[0].operator=Equal" \ + --set "dnsPolicy=Default" \ + --set-string "controller.env[0].name=AWS_ENDPOINT_URL_EKS" \ + --set-string "controller.env[0].value=https://api.beta.us-west-2.wesley.amazonaws.com" \ + --set-string "controller.env[1].name=KUBE_CLIENT_QPS" \ + --set-string "controller.env[1].value=50000" \ + --set-string "controller.env[2].name=KUBE_CLIENT_BURST" \ + --set-string "controller.env[2].value=50000" \ + --set-string "controller.env[3].name=ENABLE_PROFILING" \ + --set-string 
"controller.env[3].value=true" \ + --wait \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml new file mode 100644 index 00000000..e610c464 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: kubectl-create-namespace + namespace: karpenter +spec: + description: | + Create a namespace within the cluster + workspaces: + - name: config + description: | + A workspace into which a kubeconfig file called `kubeconfig` will be written that will contain the information required to access the cluster. The `kubeconfig` will expect to use [aws-iam-authenticator](https://github.com/kubernetes-sigs/aws-iam-authenticator/) to authenticate, so in order for it to be used it must be run in a container which contains both `kubectl` and `aws-iam-authenticator`. + steps: + - name: create namespace + image: docker.io/weaveworks/eksctl:0.66.0 + script: | + kubeconfig create namespace karpenter --kubeconfig $(workspaces.config.path)/kubeconfig diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml new file mode 100644 index 00000000..0622bc94 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml @@ -0,0 +1,95 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: create-ec2nodeclass + namespace: karpenter +spec: + description: | + Install karpenter on the cluster + params: + - name: cluster-name + description: The name of the cluster + - name: node-role-arn + description: + steps: + - name: create-ec2nodeclass + image: alpine/k8s:1.23.7 + script: | + CLUSTER_CA=$(aws eks describe-cluster \ + --name $(params.cluster-name) \ + --endpoint-url https://api.beta.us-west-2.wesley.amazonaws.com \ + --query 'cluster.certificateAuthority.data' \ + --output text) + + + CLUSTER_ENDPOINT=$(aws eks describe-cluster \ + --name $(params.cluster-name) \ + --endpoint-url https://api.beta.us-west-2.wesley.amazonaws.com \ + --query 'cluster.endpoint' \ + --output text) + + echo "Cluster endpoint: ${CLUSTER_ENDPOINT}" + + + cat < Date: Mon, 28 Jul 2025 14:47:05 -0700 Subject: [PATCH 03/13] fix env var in policy templates --- .../karpenter/controller-role-policy-document.json | 12 ++++++------ ...on => controller-role-trust-policy-document.json} | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) rename tests/assets/karpenter/{controller-role-trust-policy.json => controller-role-trust-policy-document.json} (78%) diff --git a/tests/assets/karpenter/controller-role-policy-document.json b/tests/assets/karpenter/controller-role-policy-document.json index 483c6cd0..2d58e1d3 100644 --- a/tests/assets/karpenter/controller-role-policy-document.json +++ b/tests/assets/karpenter/controller-role-policy-document.json @@ -36,13 +36,13 @@ { "Effect": "Allow", "Action": "iam:PassRole", - "Resource": "arn:$(params.aws-partition):iam::$(params.aws-account-id):role/KarpenterNodeRole-$(params.cluster-name)", + "Resource": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}", "Sid": "PassNodeIAMRole" }, { "Effect": "Allow", "Action": "eks:DescribeCluster", - "Resource": "arn:$(params.aws-partition):eks:$(params.aws-region):$(params.aws-account-id):cluster/$(params.cluster-name)", + "Resource": 
"arn:${AWS_PARTITION}:eks:${AWS_ACCOUNT_ID}:$(params.aws-account-id):cluster/${CLUSTER_NAME}", "Sid": "EKSClusterEndpointLookup" }, { @@ -55,7 +55,7 @@ "Condition": { "StringEquals": { "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" + "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" @@ -72,9 +72,9 @@ "Condition": { "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)", + "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}", "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" + "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", @@ -94,7 +94,7 @@ "Condition": { "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)" + "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" diff --git a/tests/assets/karpenter/controller-role-trust-policy.json b/tests/assets/karpenter/controller-role-trust-policy-document.json similarity index 78% rename from tests/assets/karpenter/controller-role-trust-policy.json rename to tests/assets/karpenter/controller-role-trust-policy-document.json index 83acbb2f..2c97c0bf 100644 --- a/tests/assets/karpenter/controller-role-trust-policy.json +++ b/tests/assets/karpenter/controller-role-trust-policy-document.json @@ -4,7 +4,7 @@ { "Effect": "Allow", "Principal": { - "Federated": "arn:$(params.aws-partition):iam::$(params.aws-account-id):oidc-provider/${OIDC_ENDPOINT#*//}" + "Federated": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_ENDPOINT#*//}" }, "Action": "sts:AssumeRoleWithWebIdentity", "Condition": { From 02618b548116aad70a65b188f903629354ef437e Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Mon, 28 Jul 2025 14:58:01 -0700 Subject: [PATCH 04/13] fix env var in policy templates 2 --- .../karpenter/controller-role-policy-document.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/assets/karpenter/controller-role-policy-document.json b/tests/assets/karpenter/controller-role-policy-document.json index 2d58e1d3..f4c95a0d 100644 --- a/tests/assets/karpenter/controller-role-policy-document.json +++ b/tests/assets/karpenter/controller-role-policy-document.json @@ -42,7 +42,7 @@ { "Effect": "Allow", "Action": "eks:DescribeCluster", - "Resource": "arn:${AWS_PARTITION}:eks:${AWS_ACCOUNT_ID}:$(params.aws-account-id):cluster/${CLUSTER_NAME}", + "Resource": "arn:${AWS_PARTITION}:eks:${AWS_ACCOUNT_ID}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", "Sid": "EKSClusterEndpointLookup" }, { @@ -54,7 +54,7 @@ ], "Condition": { "StringEquals": { - "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { @@ -71,9 +71,9 @@ ], "Condition": { "StringEquals": { - "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", 
"aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}", - "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { @@ -93,7 +93,7 @@ ], "Condition": { "StringEquals": { - "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", + "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { From 621267eb2af8eebb3368184fa62d964792fa3a5b Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Mon, 28 Jul 2025 16:01:11 -0700 Subject: [PATCH 05/13] fixing env vars 3 --- .../karpenter/controller-role-trust-policy-document.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/assets/karpenter/controller-role-trust-policy-document.json b/tests/assets/karpenter/controller-role-trust-policy-document.json index 2c97c0bf..18b6e23b 100644 --- a/tests/assets/karpenter/controller-role-trust-policy-document.json +++ b/tests/assets/karpenter/controller-role-trust-policy-document.json @@ -4,13 +4,13 @@ { "Effect": "Allow", "Principal": { - "Federated": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_ENDPOINT#*//}" + "Federated": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_ENDPOINT}" }, "Action": "sts:AssumeRoleWithWebIdentity", "Condition": { "StringEquals": { - "${OIDC_ENDPOINT#*//}:aud": "sts.amazonaws.com", - "${OIDC_ENDPOINT#*//}:sub": "system:serviceaccount:karpenter:karpenter" + "${OIDC_ENDPOINT}:aud": "sts.amazonaws.com", + "${OIDC_ENDPOINT}:sub": "system:serviceaccount:karpenter:karpenter" } } } From 6399244376031689cc6fa375ae70e8c8ba2ef224 Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Mon, 28 Jul 2025 17:08:27 -0700 Subject: [PATCH 06/13] Adding nodepool and nodeclass yaml --- .../controller-role-policy-document.json | 2 +- tests/assets/karpenter/nodeclass.yaml | 60 +++++++++++++++++++ tests/assets/karpenter/nodepool.yaml | 47 +++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 tests/assets/karpenter/nodeclass.yaml create mode 100644 tests/assets/karpenter/nodepool.yaml diff --git a/tests/assets/karpenter/controller-role-policy-document.json b/tests/assets/karpenter/controller-role-policy-document.json index f4c95a0d..18f11f94 100644 --- a/tests/assets/karpenter/controller-role-policy-document.json +++ b/tests/assets/karpenter/controller-role-policy-document.json @@ -42,7 +42,7 @@ { "Effect": "Allow", "Action": "eks:DescribeCluster", - "Resource": "arn:${AWS_PARTITION}:eks:${AWS_ACCOUNT_ID}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", + "Resource": "arn:${AWS_PARTITION}:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", "Sid": "EKSClusterEndpointLookup" }, { diff --git a/tests/assets/karpenter/nodeclass.yaml b/tests/assets/karpenter/nodeclass.yaml new file mode 100644 index 00000000..d11a1cf9 --- /dev/null +++ b/tests/assets/karpenter/nodeclass.yaml @@ -0,0 +1,60 @@ +apiVersion: karpenter.k8s.aws/v1 +kind: EC2NodeClass +metadata: + name: default +spec: + amiFamily: Custom + instanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" + amiSelectorTerms: + - alias: "al2023@${ALIAS_VERSION}" + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: "${CLUSTER_NAME}" + - tags: + aws:cloudformation:stack-name: "${CLUSTER_NAME}" + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: "${CLUSTER_NAME}" + - tags: 
+ aws:cloudformation:stack-name: "${CLUSTER_NAME}" + - tags: + kubernetes.io/cluster/${CLUSTER_NAME}: owned + kubelet: + maxPods: 110 + systemReserved: + cpu: 100m + memory: 100Mi + ephemeral-storage: 1Gi + kubeReserved: + cpu: 100m + memory: 100Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 5% + nodefs.available: 10% + nodefs.inodesFree: 10% + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: application/node.eks.aws + + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + cluster: + name: ${CLUSTER_NAME} + apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint + certificateAuthority: ${CLUSTER_CA} + cidr: "172.20.0.0/16" + kubelet: + config: + nodeStatusReportFrequency: "60m" + nodeLeaseDurationSeconds: 60 + maxPods: 110 + clusterDNS: ["172.20.0.10"] + flags: + - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool + - --register-with-taints=karpenter.sh/unregistered:NoExecute + --BOUNDARY-- \ No newline at end of file diff --git a/tests/assets/karpenter/nodepool.yaml b/tests/assets/karpenter/nodepool.yaml new file mode 100644 index 00000000..47dc873f --- /dev/null +++ b/tests/assets/karpenter/nodepool.yaml @@ -0,0 +1,47 @@ +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: ${CLUSTER_NAME} +spec: + disruption: + budgets: + - nodes: 5% + consolidateAfter: 0s + consolidationPolicy: WhenEmptyOrUnderutilized + replicas: 0 + template: + spec: + expireAfter: 720h + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: default + requirements: + - key: topology.kubernetes.io/zone + operator: In + values: + - $AZ + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: kubernetes.io/os + operator: In + values: + - linux + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - key: node.kubernetes.io/instance-category + operator: In + values: + - c + - m + - r + - t + - key: karpenter.k8s.aws/instance-size + operator: In + values: + - medium + - large \ No newline at end of file From f6fb725dea1e0bce1d1a1e0cf7e750ac6c240e43 Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 29 Jul 2025 10:51:11 -0700 Subject: [PATCH 07/13] Fixing nodepool name --- tests/assets/karpenter/nodepool.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/assets/karpenter/nodepool.yaml b/tests/assets/karpenter/nodepool.yaml index 47dc873f..23ca72fa 100644 --- a/tests/assets/karpenter/nodepool.yaml +++ b/tests/assets/karpenter/nodepool.yaml @@ -1,7 +1,7 @@ apiVersion: karpenter.sh/v1 kind: NodePool metadata: - name: ${CLUSTER_NAME} + name: ${CLUSTER_NAME}-${AZ} spec: disruption: budgets: From bb40c79486a72ad6ba17e6f79c08b9fcb3dfc25f Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 29 Jul 2025 10:53:22 -0700 Subject: [PATCH 08/13] Fixing nodepool AZ --- tests/assets/karpenter/nodepool.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/assets/karpenter/nodepool.yaml b/tests/assets/karpenter/nodepool.yaml index 23ca72fa..0eacf187 100644 --- a/tests/assets/karpenter/nodepool.yaml +++ b/tests/assets/karpenter/nodepool.yaml @@ -20,7 +20,7 @@ spec: - key: topology.kubernetes.io/zone operator: In values: - - $AZ + - ${AZ} - key: kubernetes.io/arch operator: In values: From 321a192f9c9221ee6e282415a8e06f59f54c97f3 Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 29 Jul 2025 14:11:58 -0700 Subject: [PATCH 09/13] Tested tasks --- .../karpenter/kubectl-cluster-wait.yaml 
| 49 +++++ .../generators/karpenter/kubectl-drift.yaml | 22 +++ .../generators/karpenter/kubectl-scale.yaml | 23 +++ .../karpenter/awscli-controller-role.yaml | 178 ++++-------------- .../karpenter/awscli-instanceprofiles.yaml | 2 +- .../karpenter/awscli-karpenter-cfn-stack.yaml | 65 +++++++ .../tasks/setup/karpenter/awscli-mng.yaml | 28 ++- .../setup/karpenter/awscli-node-role.yaml | 4 +- .../karpenter/awscli-securitygroups.yaml | 23 --- .../tasks/setup/karpenter/awscli-subnets.yaml | 24 --- .../karpenter/helm-karpenter-install.yaml | 55 +++++- .../karpenter/kubectl-create-namespace.yaml | 18 -- .../setup/karpenter/kubectl-nodeclass.yaml | 95 +++------- .../setup/karpenter/kubectl-nodepools.yaml | 90 +++------ .../karpenter/awscli-controller-role.yaml | 16 ++ .../karpenter/awscli-instanceprofiles.yaml | 16 ++ .../karpenter/awscli-karpenter-cfn-stack.yaml | 36 ++++ .../tasks/teardown/karpenter/awscli-mng.yaml | 0 .../teardown/karpenter/awscli-node-role.yaml | 0 .../karpenter/helm-karpenter-uninstall.yaml | 0 20 files changed, 383 insertions(+), 361 deletions(-) create mode 100644 tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml create mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml delete mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml delete mode 100644 tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml delete mode 100644 tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml delete mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-mng.yaml delete mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-node-role.yaml delete mode 100644 tests/tekton-resources/tasks/teardown/karpenter/helm-karpenter-uninstall.yaml diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml new file mode 100644 index 00000000..653e3757 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml @@ -0,0 +1,49 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: node-condition-wait + namespace: scalability +spec: + description: "waits for there to be no nodes with the specific condition" + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + - name: initial-delay + default: 30m + - name: condition + description: condition to check + - name: value + description: value of the condition to validate + steps: + - name: drift-nodepool + image: amazon/aws-cli + script: | + sleep $(params.initial-delay) + CHECK_INTERVAL=300 + while true; do + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "$(date): Checking node conditions..." + # Get nodes that still have the unwanted condition + nodes_with_condition=$(kubectl get nodes -o json | jq -r --arg type $(params.condition) --arg status $(params.value) ' + .items[] | + select(.status.conditions[] | select(.type == $type and .status == $status)) | + .metadata.name + ') + if [ -z "$nodes_with_condition" ]; then + echo "$(date): All nodes are clear of condition $(params.condition)=$(params.value)" + echo "Condition check completed successfully!" 
+ exit 0 + else + echo "$(date): The following nodes still have $(params.condition)=$(params.value):" + echo "$nodes_with_condition" + echo "Waiting 5 minutes before next check..." + sleep $CHECK_INTERVAL + fi + done + + exit 1 + diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml index e69de29b..30c70746 100644 --- a/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: nodepool-drift + namespace: scalability +spec: + description: "drift the specified nodepool by adding a new label to its template" + params: + - name: nodepool + description: Name of the nodepool to drift + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + steps: + - name: drift-nodepool + image: amazon/aws-cli + script: | + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + kubectl patch nodepool $(params.nodepool) --patch '{"spec": {"template": {"metadata": {"labels": {"myLabel": "myValue"}}}}}' \ No newline at end of file diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml index e69de29b..0738c994 100644 --- a/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: nodepool-scale + namespace: scalability +spec: + description: "scale the specified nodepool to the desired number of replicas" + params: + - name: replicas + description: Number of replicas to scale to + - name: nodepool + description: Name of the nodepool to scale + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + steps: + - name: scale-nodepool + image: amazon/aws-cli + script: | + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + kubectl scale nodepool $(params.nodepool) --replicas $(params.replicas) \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml index 2da13759..83c11465 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml @@ -2,7 +2,7 @@ apiVersion: tekton.dev/v1beta1 kind: Task metadata: name: awscli-controller-role - namespace: karpenter + namespace: scalability spec: description: | Creates the karpenter Controller Role @@ -12,161 +12,59 @@ spec: params: - name: cluster-name description: The name of the cluster - - name: eks-endpoint + - name: endpoint description: endpoint - name: aws-region description: region + default: us-west-2 - name: aws-account-id description: account id - name: aws-partition description: partition + default: aws + - name: karpenter-controller-role-trust-policy-url + default: https://raw.githubusercontent.com/DerekFrank/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/controller-role-trust-policy-document.json + - name: karpenter-controller-role-policy-url + default:
https://raw.githubusercontent.com/DerekFrank/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/controller-role-policy-document.json + workspaces: + - name: source + mountPath: /src/karpenter/ steps: - name: create-role image: alpine/k8s:1.23.7 script: | - OIDC_ENDPOINT="$(aws eks --endpoint $(params.eks-endpoint) describe-cluster --name "$(params.cluster-name)" \ + echo "Starting controller role" + export RAW_OIDC_ENDPOINT="$(aws eks --endpoint $(params.endpoint) describe-cluster --name "$(params.cluster-name)" \ --query "cluster.identity.oidc.issuer" --output text)" - OIDC_ID=$(aws eks --endpoint $(params.eks-endpoint) describe-cluster --name $(params.cluster-name) --region $(params.aws-region) --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) + export OIDC_ID=$(aws eks --endpoint $(params.endpoint) describe-cluster --name $(params.cluster-name) --region $(params.aws-region) --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) + export AWS_PARTITION=$(params.aws-partition) + export AWS_ACCOUNT_ID=$(params.aws-account-id) + export AWS_REGION=$(params.aws-region) + export CLUSTER_NAME=$(params.cluster-name) + echo $RAW_OIDC_ENDPOINT + echo $OIDC_ID - cat << EOF > controller-trust-policy.json - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Federated": "arn:$(params.aws-partition):iam::$(params.aws-account-id):oidc-provider/${OIDC_ENDPOINT#*//}" - }, - "Action": "sts:AssumeRoleWithWebIdentity", - "Condition": { - "StringEquals": { - "${OIDC_ENDPOINT#*//}:aud": "sts.amazonaws.com", - "${OIDC_ENDPOINT#*//}:sub": "system:serviceaccount:karpenter:karpenter" - } - } - } - ] - } - EOF + export OIDC_ENDPOINT=$(echo ${RAW_OIDC_ENDPOINT#*//}) + + echo $OIDC_ENDPOINT + + curl -fsSL $(params.karpenter-controller-role-trust-policy-url) -o $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json + + envsubst < $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json > $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + + cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json + + cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json aws iam create-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ - --assume-role-policy-document file://controller-trust-policy.json + --assume-role-policy-document file://$(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + + curl -fsSL $(params.karpenter-controller-role-policy-url) -o $(workspaces.source.path)karpenter-controller-role-policy-url.json - cat << EOF > controller-policy.json - { - "Statement": [ - { - "Action": [ - "ssm:GetParameter", - "ec2:DescribeImages", - "ec2:RunInstances", - "ec2:DescribeSubnets", - "ec2:DescribeSecurityGroups", - "ec2:DescribeLaunchTemplates", - "ec2:DescribeInstances", - "ec2:DescribeInstanceTypes", - "ec2:DescribeInstanceTypeOfferings", - "ec2:DeleteLaunchTemplate", - "ec2:CreateTags", - "ec2:CreateLaunchTemplate", - "ec2:CreateFleet", - "ec2:DescribeSpotPriceHistory", - "pricing:GetProducts" - ], - "Effect": "Allow", - "Resource": "*", - "Sid": "Karpenter" - }, - { - "Action": "ec2:TerminateInstances", - "Condition": { - "StringLike": { - "ec2:ResourceTag/karpenter.sh/nodepool": "*" - } - }, - "Effect": "Allow", - "Resource": "*", - "Sid": "ConditionalEC2Termination" - }, - { - "Effect": "Allow", - "Action": "iam:PassRole", - "Resource": 
"arn:$(params.aws-partition):iam::$(params.aws-account-id):role/KarpenterNodeRole-$(params.cluster-name)", - "Sid": "PassNodeIAMRole" - }, - { - "Effect": "Allow", - "Action": "eks:DescribeCluster", - "Resource": "arn:$(params.aws-partition):eks:$(params.aws-region):$(params.aws-account-id):cluster/$(params.cluster-name)", - "Sid": "EKSClusterEndpointLookup" - }, - { - "Sid": "AllowScopedInstanceProfileCreationActions", - "Effect": "Allow", - "Resource": "*", - "Action": [ - "iam:CreateInstanceProfile" - ], - "Condition": { - "StringEquals": { - "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" - }, - "StringLike": { - "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" - } - } - }, - { - "Sid": "AllowScopedInstanceProfileTagActions", - "Effect": "Allow", - "Resource": "*", - "Action": [ - "iam:TagInstanceProfile" - ], - "Condition": { - "StringEquals": { - "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)", - "aws:RequestTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "$(params.aws-region)" - }, - "StringLike": { - "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", - "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" - } - } - }, - { - "Sid": "AllowScopedInstanceProfileActions", - "Effect": "Allow", - "Resource": "*", - "Action": [ - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:DeleteInstanceProfile" - ], - "Condition": { - "StringEquals": { - "aws:ResourceTag/kubernetes.io/cluster/$(params.cluster-name)": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "$(params.aws-region)" - }, - "StringLike": { - "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" - } - } - }, - { - "Sid": "AllowInstanceProfileReadActions", - "Effect": "Allow", - "Resource": "*", - "Action": "iam:GetInstanceProfile" - } - ], - "Version": "2012-10-17" - } - E + envsubst < $(workspaces.source.path)karpenter-controller-role-policy-url.json > $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json + + cat $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json aws iam put-role-policy --role-name "KarpenterControllerRole-$(params.cluster-name)" \ --policy-name "KarpenterControllerPolicy-$(params.cluster-name)" \ - --policy-document file://controller-policy.json \ No newline at end of file + --policy-document file://$(workspaces.source.path)karpenter-controller-role-policy-url-modified.json \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml index 579fceb4..613dfb94 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml @@ -2,7 +2,7 @@ apiVersion: tekton.dev/v1beta1 kind: Task metadata: name: awscli-instanceprofiles - namespace: karpenter + namespace: scalability spec: description: | Creates the karpenter instance profile diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml new file mode 100644 index 00000000..eb1b318d --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml @@ -0,0 
+1,65 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-karpenter-cfn-stack + namespace: scalability +spec: + description: | + Creates the karpenter instance roles and sqs interruption queue + params: + - name: cluster-name + description: The name of the cluster + - name: karpenter-version + description: Version of Karpenter to deploy + - name: endpoint + description: Endpoint to use with EKS + - name: region + default: us-west-2 + description: The region where the cluster is in. + - name: account-id + description: The aws account the cluster is running in + workspaces: + - name: source + mountPath: /src/karpenter/ + steps: + - name: create-stack + image: alpine/k8s:1.23.7 + script: | + STACK_NAME=Karpenter-$(params.cluster-name) + STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.6.1/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml -o $(workspaces.source.path)cloudformation.yaml + + cat /src/karpenter/cloudformation.yaml + + aws eks update-cluster-config --name $(params.cluster-name) --access-config authenticationMode=API_AND_CONFIG_MAP --endpoint $(params.endpoint) + + if [[ "$STACK_STATUS" == "" ]]; then + aws cloudformation deploy \ + --stack-name "Karpenter-$(params.cluster-name)" \ + --template-file $(workspaces.source.path)cloudformation.yaml \ + --capabilities CAPABILITY_NAMED_IAM \ + --parameter-overrides "ClusterName=$(params.cluster-name)" + + aws cloudformation wait stack-create-complete --stack-name $STACK_NAME --region $(params.region) + echo "CREATED_CFN_STACK=$STACK_NAME" + else + echo "$STACK_NAME Already exists" + fi + + eksctl create iamserviceaccount \ + --name karpenter \ + --namespace karpenter \ + --cluster "$(params.cluster-name)" \ + --attach-policy-arn "arn:aws:iam::$(params.account-id):role/$(params.cluster-name)-karpenter" \ + --approve \ + --override-existing-serviceaccounts + + export AWS_EKS_ENDPOINT=$(params.endpoint) + eksctl utils associate-iam-oidc-provider --cluster "$(params.cluster-name)" --approve + + aws eks create-access-entry \ + --cluster-name "$(params.cluster-name)" \ + --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" \ + --endpoint $(params.endpoint) \ + --type EC2_LINUX + diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml index ac1d3317..178a595b 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml @@ -2,38 +2,46 @@ apiVersion: tekton.dev/v1beta1 kind: Task metadata: name: awscli-mng - namespace: karpenter + namespace: scalability spec: description: | Creates the karpenter MNG params: - name: cluster-name description: The name of the cluster - - name: node-role-arn - description: + - name: aws-account-id + description: id of the account + - name: endpoint + description: eks endpoint to use + - name: region + default: "us-west-2" + description: The region where the cluster is in. 
steps: - name: create-mng image: alpine/k8s:1.23.7 script: | SUBNET_IDS=$(aws ec2 describe-subnets \ - --filters "Name=tag:karpenter.sh/discovery,Values=$(params.cluster-name)" \ + --filters "Name=tag:aws:cloudformation:stack-name,Values=$(params.cluster-name)" \ --query 'Subnets[*].SubnetId' \ --output text) + + echo ${SUBNET_IDS} aws eks create-nodegroup \ --cluster-name $(params.cluster-name) \ --nodegroup-name karpenter-system-large \ - --node-role $(params.node-role-arn) \ + --node-role arn:aws:iam::$(params.aws-account-id):role/$(params.cluster-name)-node-role \ --instance-types r5.24xlarge \ --scaling-config minSize=2,maxSize=3,desiredSize=2 \ --subnets ${SUBNET_IDS} \ --labels dedicated=karpenter \ - --region $(params.aws-region) \ - --endpoint-url https://api.beta.us-west-2.wesley.amazonaws.com + --region $(params.region) \ + --endpoint-url $(params.endpoint) \ + --taints key=dedicated,value=karpenter,effect=NoSchedule # quick validation aws eks list-nodegroups \ - --endpoint https://api.beta.us-west-2.wesley.amazonaws.com \ - --cluster-name eks-titan-perflab-c05 \ - --region $(params.aws-region) \ No newline at end of file + --endpoint $(params.endpoint) \ + --cluster-name $(params.cluster-name) \ + --region $(params.region) \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml index 00e7e292..acff12d8 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml @@ -2,7 +2,7 @@ apiVersion: tekton.dev/v1beta1 kind: Task metadata: name: awscli-node-role - namespace: karpenter + namespace: scalability spec: description: | Creates the karpenter Node Role @@ -39,4 +39,4 @@ spec: --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - aws iam get-role --role-name KarpenterNodeRole-$(params.cluster-name) --query 'Role.[Arn]' --output text > $(results.role-arn.path) \ No newline at end of file + aws iam get-role --role-name KarpenterNodeRole-$(params.cluster-name) --query 'Role.[Arn]' --output text > $(results.node-role-arn) \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml deleted file mode 100644 index 7bbe52ec..00000000 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-securitygroups.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: awscli-securitygroups - namespace: karpenter -spec: - description: | - Creates the karpenter Subnet Role - params: - - name: cluster-name - description: The name of the cluster - - name: cfn-stack-arn - description: The arn of the stack that the security groups are created by - - name: aws-region - description: AWS region that test is running in - steps: - - name: create-role - image: alpine/k8s:1.23.7 - script: | - aws ec2 create-tags \ - --resources $(aws ec2 describe-security-groups --filter "Name=tag:aws:cloudformation:stack-id,Values=$(params.cfn-stack-arn)" --region $(params.aws-region) --output text --query 'SecurityGroupId') \ - --region $(params.aws-region) \ - --tags Key=karpenter.sh/discovery,Value=$(params.cluster-name) \ No newline at end of file diff --git 
a/tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml deleted file mode 100644 index e68c4b70..00000000 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-subnets.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: awscli-subnets - namespace: karpenter -spec: - description: | - Creates the karpenter Subnet Role - params: - - name: cluster-name - description: The name of the cluster - - name: aws-region - description: AWS region that test is running in - steps: - - name: create-role - image: alpine/k8s:1.23.7 - script: | - for SUBNET in $(aws ec2 describe-subnets --filter "Name=tag:aws:cloudformation:stack-id,Values=$(params.cfn-stack-arn)" --region $(params.aws-region) --output text --query 'SubnetId') - do - aws ec2 create-tags \ - --resources ${SUBNET} \ - --region $(params.aws-region) \ - --tags Key=karpenter.sh/discovery,Value=$(params.cluster-name) - done \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml index 92e16616..7358f87e 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml @@ -2,25 +2,56 @@ apiVersion: tekton.dev/v1beta1 kind: Task metadata: name: helm-karpenter-install - namespace: karpenter + namespace: scalability spec: description: | Install karpenter on the cluster params: - name: cluster-name description: The name of the cluster - - name: node-role-arn - description: + - name: aws-account-id + description: aws account id + - name: karpenter-ecr-repo + description: ECR repo to install karpenter + - name: karpenter-version + description: version of karpenter to install + - name: endpoint + description: eks endpoint to use + workspaces: + - name: config steps: - - name: install karpenter + - name: install-karpenter image: alpine/k8s:1.23.7 + timeout: 10m script: | - aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 953421922360.dkr.ecr.us-west-2.amazonaws.com + aws ecr get-login-password --region us-west-2 | helm registry login --username AWS --password-stdin $(params.karpenter-ecr-repo) - helm upgrade --install karpenter oci://953421922360.dkr.ecr.us-west-2.amazonaws.com/karpenter/karpenter --version ${KARPENTER_VERSION} \ + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + + aws eks describe-nodegroup --cluster-name $(params.cluster-name) --endpoint $(params.endpoint) --nodegroup-name karpenter-system-large + + kubectl get nodes -A -o yaml + + kubectl get pods -A -o wide + + kubectl get pods -n karpenter -o yaml + + # kubectl delete nodes -l dedicated=karpenter + + kubectl get deployments -A -o wide + + # helm status karpenter --namespace karpenter + + # kubectl logs karpenter-5df996fbbf-f8ghz -n karpenter -f + + # helm delete -n karpenter karpenter --wait + + # kubectl taint nodes -l dedicated=karpenter dedicated=karpenter:NoSchedule + + helm upgrade --install karpenter oci://$(params.karpenter-ecr-repo)/karpenter/karpenter --version $(params.karpenter-version) \ --namespace "karpenter" \ --create-namespace \ - --set "settings.clusterName=${CLUSTER_NAME}" \ + --set "settings.clusterName=$(params.cluster-name)" \ --set "settings.interruptionQueue=" \ --set "settings.eksControlPlane=true" \ --set-string 
"settings.awsCreateQPS=60" \ @@ -30,7 +61,7 @@ spec: --set settings.featureGates.spotToSpotConsolidation="true" \ --set settings.featureGates.disableMetricsControllers="true" \ --set settings.preferencePolicy=Ignore \ - --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=${CONTROLLER_ROLE_ARN}" \ + --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=arn:aws:iam::$(params.aws-account-id):role/KarpenterControllerRole-$(params.cluster-name)" \ --set controller.resources.requests.cpu=60 \ --set controller.resources.requests.memory=200Gi \ --set controller.resources.limits.cpu=60 \ @@ -43,11 +74,15 @@ spec: --set "tolerations[0].operator=Equal" \ --set "dnsPolicy=Default" \ --set-string "controller.env[0].name=AWS_ENDPOINT_URL_EKS" \ - --set-string "controller.env[0].value=https://api.beta.us-west-2.wesley.amazonaws.com" \ + --set-string "controller.env[0].value=$(params.endpoint)" \ --set-string "controller.env[1].name=KUBE_CLIENT_QPS" \ --set-string "controller.env[1].value=50000" \ --set-string "controller.env[2].name=KUBE_CLIENT_BURST" \ --set-string "controller.env[2].value=50000" \ --set-string "controller.env[3].name=ENABLE_PROFILING" \ --set-string "controller.env[3].value=true" \ - --wait \ No newline at end of file + --timeout 100m \ + --debug \ + --wait + + kubectl get pods -n karpenter \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml deleted file mode 100644 index e610c464..00000000 --- a/tests/tekton-resources/tasks/setup/karpenter/kubectl-create-namespace.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: kubectl-create-namespace - namespace: karpenter -spec: - description: | - Create a namespace within the cluster - workspaces: - - name: config - description: | - A workspace into which a kubeconfig file called `kubeconfig` will be written that will contain the information required to access the cluster. The `kubeconfig` will expect to use [aws-iam-authenticator](https://github.com/kubernetes-sigs/aws-iam-authenticator/) to authenticate, so in order for it to be used it must be run in a container which contains both `kubectl` and `aws-iam-authenticator`. 
- steps: - - name: create namespace - image: docker.io/weaveworks/eksctl:0.66.0 - script: | - kubeconfig create namespace karpenter --kubeconfig $(workspaces.config.path)/kubeconfig diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml index 0622bc94..cd8b6fb8 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml @@ -2,94 +2,55 @@ apiVersion: tekton.dev/v1beta1 kind: Task metadata: name: create-ec2nodeclass - namespace: karpenter + namespace: scalability spec: description: | Install karpenter on the cluster params: - name: cluster-name description: The name of the cluster - - name: node-role-arn - description: + - name: endpoint + description: eks endpoint to use + - name: karpenter-nodeclass-url + description: url of the nodeclass template to use + workspaces: + - name: source + mountPath: /src/karpenter/ steps: - name: create-ec2nodeclass image: alpine/k8s:1.23.7 script: | - CLUSTER_CA=$(aws eks describe-cluster \ + export CLUSTER_CA=$(aws eks describe-cluster \ --name $(params.cluster-name) \ - --endpoint-url https://api.beta.us-west-2.wesley.amazonaws.com \ + --endpoint-url $(params.endpoint) \ --query 'cluster.certificateAuthority.data' \ --output text) - CLUSTER_ENDPOINT=$(aws eks describe-cluster \ + export CLUSTER_ENDPOINT=$(aws eks describe-cluster \ --name $(params.cluster-name) \ - --endpoint-url https://api.beta.us-west-2.wesley.amazonaws.com \ + --endpoint-url $(params.endpoint) \ --query 'cluster.endpoint' \ --output text) + export CLUSTER_NAME=$(params.cluster-name) + + export ALIAS_VERSION=latest + echo "Cluster endpoint: ${CLUSTER_ENDPOINT}" + curl -fsSL $(params.karpenter-nodeclass-url) -o $(workspaces.source.path)ec2nodeclass.yaml + + cat $(workspaces.source.path)ec2nodeclass.yaml + + envsubst < $(workspaces.source.path)ec2nodeclass.yaml > $(workspaces.source.path)ec2nodeclass-modified.yaml + + ls $(workspaces.source.path) - cat < $(workspaces.source.path)nodepool-${AZ}.yaml + cat $(workspaces.source.path)nodepool-${AZ}.yaml + kubectl apply -f $(workspaces.source.path)nodepool-${AZ}.yaml done + kubectl get nodepool -o yaml + diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml index e69de29b..c4d03173 100644 --- a/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml +++ b/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml @@ -0,0 +1,16 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-controller-role-teardown + namespace: scalability +spec: + description: | + Creates the karpenter Controller Role + params: + - name: cluster-name + description: The name of the cluster + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + aws iam delete-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ No newline at end of file diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml index e69de29b..fdcb9558 100644 --- a/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml +++ b/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml @@ -0,0 +1,16 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: 
awscli-instanceprofiles-teardown + namespace: scalability +spec: + description: | + Creates the karpenter instance profile + params: + - name: cluster-name + description: The name of the cluster + steps: + - name: create-role + image: alpine/k8s:1.23.7 + script: | + aws iam delete-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" \ No newline at end of file diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml new file mode 100644 index 00000000..9dbbdd45 --- /dev/null +++ b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml @@ -0,0 +1,36 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-karpenter-cfn-stack-teardown + namespace: scalability +spec: + description: | + Creates the karpenter instance roles and sqs interruption queue + params: + - name: cluster-name + description: The name of the cluster + - name: karpenter-version + description: Version of Karpenter to deploy + - name: endpoint + description: Endpoint to use with EKS + - name: region + default: us-west-2 + description: The region where the cluster is in. + - name: account-id + description: The aws account the cluster is running in + steps: + - name: create-stack + image: alpine/k8s:1.23.7 + script: | + STACK_NAME=Karpenter-$(params.cluster-name) + STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) + cat ${STACK_STATUS} + + if [[ "$STACK_STATUS" == "ACTIVE" ]]; then + aws cloudformation delete-stack --stack-name ${STACK_NAME} + + aws cloudformation wait stack-delete-complete --stack-name $STACK_NAME --region $(params.region) + else + echo "$STACK_NAME Already exists" + fi + diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-mng.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-mng.yaml deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-node-role.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-node-role.yaml deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/tekton-resources/tasks/teardown/karpenter/helm-karpenter-uninstall.yaml b/tests/tekton-resources/tasks/teardown/karpenter/helm-karpenter-uninstall.yaml deleted file mode 100644 index e69de29b..00000000 From b2cd61e85dd8c7594b90323f7b8908be8f623fe1 Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 29 Jul 2025 14:16:14 -0700 Subject: [PATCH 10/13] Removing erroneously included script --- tests/assets/karpenter-no-drift-validation.sh | 58 ------------------- 1 file changed, 58 deletions(-) delete mode 100755 tests/assets/karpenter-no-drift-validation.sh diff --git a/tests/assets/karpenter-no-drift-validation.sh b/tests/assets/karpenter-no-drift-validation.sh deleted file mode 100755 index 85a1be08..00000000 --- a/tests/assets/karpenter-no-drift-validation.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -# Script to validate that an eks cluster with karpenter installed will not drift -# Validates: -# 1. Nodepools have a disruption budget of 0% set -# 2. Nodeclasses have pinned ami's -echo "Checking disruption budgets in nodepool specs..." 
-echo "-----------------------------------" - -FAILED=0 -# Get all nodepools and check their disruption budget settings -for nodepool in $(kubectl get nodepools -o json | jq -r '.items[] | {name: .metadata.name, budgets: .spec.disruption.budgets[].nodes} | @json'); do - echo $nodepool - NAME=$(echo $nodepool | jq -r '.name') - NDB=$(echo $nodepool | jq -r '.budgets') - - # Remove any % symbol and convert to number - NDB_NUM=$(echo $NDB | sed 's/%//') - - if [ "$NDB_NUM" -eq 0 ]; then - echo "✅ Disruption budget correctly set to $NDB for nodepool: $NAME" - else - echo "❌ Disruption budget too high for nodepool: $NAME (current: $NDB)" - export FAILED=1 - fi -done - -echo "Checking AMI versions in EC2NodeClass resources..." -echo "------------------------------------------------" - -# Get EC2NodeClass resources and check for @latest -for nodeclass in $(kubectl get ec2nodeclasses -o json | jq -r '.items[] | {name: .metadata.name, ami: .spec.amiFamily, amiSelector: .spec.amiSelectorTerms} | @json'); do - NAME=$(echo $nodeclass | jq -r '.name') - AMI_FAMILY=$(echo $nodeclass | jq -r '.ami') - AMI_SELECTOR=$(echo $nodeclass | jq -r '.amiSelector') - - echo "NodeClass: $NAME" - echo "AMI Family: $AMI_FAMILY" - echo "AMI Selector Terms: $AMI_SELECTOR" - - # Check if @latest is used in any selector terms - if echo "$AMI_SELECTOR" | grep -q "@latest"; then - echo "❌ WARNING: @latest version detected in NodeClass $NAME" - export FAILED=1 - else - echo "✅ No @latest version found in NodeClass $NAME" - fi - echo "------------------------------------------------" -done - -echo "-----------------------------------" -if [ $FAILED -eq 1 ]; then - echo "❌ Some nodepools or nodeclasses do not have the correct configuration" - exit 1 -else - echo "✅ All nodepools or nodeclasses have the correct configuration" - exit 0 -fi From c303454fc7d744acc97507adc6a7ec4dd9d075ed Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 29 Jul 2025 14:18:52 -0700 Subject: [PATCH 11/13] updating hardcoded version --- .../tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml index eb1b318d..6083cd45 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml @@ -27,7 +27,7 @@ spec: script: | STACK_NAME=Karpenter-$(params.cluster-name) STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) - curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v1.6.1/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml -o $(workspaces.source.path)cloudformation.yaml + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/$(params.karpenter-version)/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml -o $(workspaces.source.path)cloudformation.yaml cat /src/karpenter/cloudformation.yaml From c546b89b9891e8834ab717d34874e32a792bd14e Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 9 Sep 2025 16:42:56 -0700 Subject: [PATCH 12/13] updating budget for nodepool --- tests/assets/karpenter/nodepool.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/assets/karpenter/nodepool.yaml 
b/tests/assets/karpenter/nodepool.yaml index 0eacf187..b506a4e3 100644 --- a/tests/assets/karpenter/nodepool.yaml +++ b/tests/assets/karpenter/nodepool.yaml @@ -5,7 +5,7 @@ metadata: spec: disruption: budgets: - - nodes: 5% + - nodes: 10% consolidateAfter: 0s consolidationPolicy: WhenEmptyOrUnderutilized replicas: 0 From 47a83f336646a5c887f53735f15beab81fb38362 Mon Sep 17 00:00:00 2001 From: DerekFrank Date: Tue, 16 Sep 2025 16:40:36 -0700 Subject: [PATCH 13/13] review feedback --- tests/assets/karpenter/nodepool.yaml | 3 +- .../pipelines/eks/karpenter-ultra.yaml | 381 ++++++++++++++++++ .../karpenter/kubectl-cluster-wait.yaml | 49 --- .../generators/karpenter/kubectl-drift.yaml | 80 +++- .../kubectl-nodepool-condition-wait.yaml | 165 ++++++++ .../kubectl-nodepool-replicas-wait.yaml | 97 +++++ .../generators/karpenter/kubectl-scale.yaml | 88 +++- .../karpenter/awscli-controller-role.yaml | 116 +++++- .../karpenter/awscli-instanceprofiles.yaml | 20 +- .../karpenter/awscli-karpenter-cfn-stack.yaml | 62 ++- .../tasks/setup/karpenter/awscli-mng.yaml | 80 +++- .../setup/karpenter/awscli-node-role.yaml | 123 ++++-- .../karpenter/helm-karpenter-install.yaml | 92 ++++- .../setup/karpenter/kubectl-nodeclass.yaml | 51 ++- .../setup/karpenter/kubectl-nodepools.yaml | 93 ++++- .../karpenter/awscli-controller-role.yaml | 16 - .../karpenter/awscli-instanceprofiles.yaml | 16 - .../karpenter/awscli-karpenter-cfn-stack.yaml | 36 -- .../teardown/karpenter/awscli-karpenter.yaml | 381 ++++++++++++++++++ .../karpenter/kubectl-get-karpenter-logs.yaml | 56 +++ 20 files changed, 1740 insertions(+), 265 deletions(-) create mode 100644 tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml delete mode 100644 tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml create mode 100644 tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml create mode 100644 tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml delete mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml delete mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml delete mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml create mode 100644 tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml diff --git a/tests/assets/karpenter/nodepool.yaml b/tests/assets/karpenter/nodepool.yaml index b506a4e3..22c5229b 100644 --- a/tests/assets/karpenter/nodepool.yaml +++ b/tests/assets/karpenter/nodepool.yaml @@ -43,5 +43,4 @@ spec: - key: karpenter.k8s.aws/instance-size operator: In values: - - medium - - large \ No newline at end of file + - medium \ No newline at end of file diff --git a/tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml b/tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml new file mode 100644 index 00000000..bbaec91b --- /dev/null +++ b/tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml @@ -0,0 +1,381 @@ +kind: Pipeline +apiVersion: tekton.dev/v1 +metadata: + name: derekff-karpenter-testing + namespace: scalability +spec: + params: + - name: cluster-name + type: string + - default: "" + name: endpoint + type: string + - default: "5000" + name: desired-nodes + type: string + - default: "30" + name: pods-per-node + type: string + - default: "100" + name: nodes-per-namespace + type: string + - default: "50" + name: 
cl2-load-test-throughput + type: string + - default: kit-eks-scalability/kit-eks-5k/etcd/$(date +%s) + name: results-bucket + type: string + - default: "" + type: string + - default: 'You can monitor here - https://experimental.scalability.eks.aws.dev/#/namespaces/scalability/pipelineruns/ + ;5k node ' + name: slack-message + type: string + - default: "" + name: amp-workspace-id + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/amazon-eks-vpc.json + name: vpc-cfn-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_group_launch_template.json + name: ng-cfn-url + type: string + - name: kubernetes-version + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_service_role.json + name: service-role-cfn-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_role.json + name: node-role-cfn-url + type: string + - name: manifest-id + type: string + - default: "" + name: eksadm-s3-path + type: string + - default: 1.8.0 + name: karpenter-version + type: string + - default: karpenter + name: karpenter-namespace + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/karpenter/node-role-policy-document.json + name: karpenter-node-role-policy-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/karpenter/controller-role-policy-document.json + name: karpenter-controller-role-policy-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/karpenter/controller-role-trust-policy-document.json + name: karpenter-controller-role-trust-policy-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/nodeclass.yaml + name: karpenter-ec2nodeclass-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/nodepool.yaml + name: karpenter-nodepool-url + type: string + - default: "" + name: karpenter-ecr-repo + type: string + - default: "" + name: aws-account-id + type: string + tasks: + - name: awscli-vpc-create + params: + - name: stack-name + value: $(params.cluster-name) + - name: vpc-cfn-url + value: "$(params.vpc-cfn-url)" + taskRef: + kind: Task + name: awscli-vpc-create + - name: create-cluster-service-role + params: + - name: stack-name + value: $(params.cluster-name)-service-role + - name: role-cfn-url + value: $(params.service-role-cfn-url) + - name: role-name + value: "$(params.cluster-name)-service-role" + taskRef: + kind: Task + name: awscli-role-create + - name: create-cluster-node-role + params: + - name: stack-name + value: $(params.cluster-name)-node-role + - name: role-cfn-url + value: $(params.node-role-cfn-url) + - name: role-name + value: "$(params.cluster-name)-node-role" + taskRef: + kind: Task + name: awscli-role-create + - name: create-eks-cluster + params: + - name: cluster-name + value: $(params.cluster-name) + - name: service-role-name + value: $(params.cluster-name)-service-role + - name: endpoint + value: $(params.endpoint) + - name: vpc-stack-name + value: $(params.cluster-name) + - name: manifest-id + value: $(params.manifest-id) + - name: 
eksadm-s3-path + value: $(params.eksadm-s3-path) + - name: kubernetes-version + value: $(params.kubernetes-version) + retries: 3 + runAfter: + - create-cluster-node-role + - create-cluster-service-role + - awscli-vpc-create + taskRef: + kind: Task + name: awscli-eks-cluster-create-with-vpc-stack + workspaces: + - name: config + workspace: config + - name: create-karpenter-controller-role + params: + - name: cluster-name + value: $(params.cluster-name) + - name: aws-account-id + value: $(params.aws-account-id) + - name: endpoint + value: $(params.endpoint) + - name: karpenter-controller-role-policy-url + value: $(params.karpenter-controller-role-policy-url) + - name: karpenter-controller-role-trust-policy-url + value: $(params.karpenter-controller-role-trust-policy-url) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-controller-role + - name: create-karpenter-mng + params: + - name: cluster-name + value: $(params.cluster-name) + - name: aws-account-id + value: $(params.aws-account-id) + - name: endpoint + value: $(params.endpoint) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-mng + - name: create-karpenter-cfn + params: + - name: cluster-name + value: $(params.cluster-name) + - name: karpenter-version + value: $(params.karpenter-version) + - name: endpoint + value: $(params.endpoint) + - name: account-id + value: $(params.aws-account-id) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-karpenter-cfn-stack + - name: helm-install-karpenter + params: + - name: cluster-name + value: $(params.cluster-name) + - name: karpenter-version + value: $(params.karpenter-version) + - name: aws-account-id + value: $(params.aws-account-id) + - name: karpenter-ecr-repo + value: $(params.karpenter-ecr-repo) + - name: endpoint + value: $(params.endpoint) + runAfter: + - create-karpenter-cfn + - create-karpenter-mng + - create-karpenter-controller-role + - awscli-instance-profile + taskRef: + kind: Task + name: helm-karpenter-install + - name: get-karp-logs + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + runAfter: + - helm-install-karpenter + taskRef: + kind: Task + name: kubectl-get-karpenter-logs + - name: awscli-instance-profile + params: + - name: cluster-name + value: $(params.cluster-name) + runAfter: + - create-karpenter-cfn + - create-karpenter-mng + taskRef: + kind: Task + name: awscli-instanceprofiles + - name: create-nodeclass + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: karpenter-nodeclass-url + value: $(params.karpenter-ec2nodeclass-url) + runAfter: + - helm-install-karpenter + taskRef: + kind: Task + name: create-ec2nodeclass + - name: create-nodepools + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: karpenter-nodepool-url + value: $(params.karpenter-nodepool-url) + runAfter: + - helm-install-karpenter + taskRef: + kind: Task + name: create-nodepool + - name: scale-nodepools + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 100 + - name: nodepool + value: $(params.cluster-name) + runAfter: + - create-nodepools + taskRef: + kind: Task + name: scale-nodepool + - name: wait-for-scale + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 
100 + - name: nodepool + value: $(params.cluster-name) + runAfter: + - scale-nodepools + taskRef: + kind: Task + name: nodepool-replicas-wait + - name: drift + params: + - name: nodepool + value: $(params.cluster-name) + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + runAfter: + - wait-for-scale + taskRef: + kind: Task + name: drift-nodepool + - name: wait-for-drift + params: + - name: nodepool + value: $(params.cluster-name) + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: value + value: "True" + - name: condition + value: Drifted + - name: presence + value: false + runAfter: + - drift + taskRef: + kind: Task + name: nodepool-condition-wait + - name: scale-down + params: + - name: cluster-name + value: $(params.cluster-name) + - name: nodepool + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 0 + runAfter: + - wait-for-drift + taskRef: + kind: Task + name: scale-nodepool + - name: wait-for-scale-down + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 0 + - name: nodepool + value: $(params.cluster-name) + runAfter: + - scale-down + taskRef: + kind: Task + name: nodepool-replicas-wait + - name: uninstall-karpenter + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + runAfter: + - wait-for-scale-down + taskRef: + kind: Task + name: helm-karpenter-uninstall + finally: + - name: teardown + retries: 10 # To deal with throttling during deletion + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: slack-hook + value: $(params.slack-hook) + - name: slack-message + value: "$(params.slack-message) job completed" + - name: service-role-stack-name + value: $(params.cluster-name)-service-role + - name: node-role-stack-name + value: $(params.cluster-name)-node-role + - name: launch-template-stack-name + value: $(params.cluster-name)-launch-template + taskRef: + kind: Task + name: awscli-eks-karpenter-cluster-teardown + workspaces: + - name: source + - name: results + - name: config \ No newline at end of file diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml deleted file mode 100644 index 653e3757..00000000 --- a/tests/tekton-resources/tasks/generators/karpenter/kubectl-cluster-wait.yaml +++ /dev/null @@ -1,49 +0,0 @@ ---- -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: node-condition-wait - namespace: scalability -spec: - description: "waits for there to be no nodes with the specific condition" - params: - - name: cluster-name - description: The name of the cluster - - name: endpoint - description: eks endpoint to use - - name: aws-region - - name: initial-delay - default: 30m - - name: condition - description: condition to check - - name: value - description: value of the condition to validate - steps: - - name: drift-nodepool - image: amazon/aws-cli - script: | - sleep $(params.initial-delay) - CHECK_INTERVAL=300 - while true; do - aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) - echo "$(date): Checking node conditions..." 
- # Get nodes that still have the unwanted condition - nodes_with_condition=$(kubectl get nodes -o json | jq -r --arg type $(params.condition) --arg status $(params.value) ' - .items[] | - select(.status.conditions[] | select(.type == $type and .status == $status)) | - .metadata.name - ') - if [ -z "$nodes_with_condition" ]; then - echo "$(date): All nodes are clear of condition $(params.condition)=$(params.value)" - echo "Condition check completed successfully!" - exit 0 - else - echo "$(date): The following nodes still have $(params.condition)=$(params.value):" - echo "$nodes_with_condition" - echo "Waiting 5 minutes before next check..." - sleep $CHECK_INTERVAL - fi - done - - exit 1 - diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml index 30c70746..e7012c90 100644 --- a/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml @@ -1,22 +1,86 @@ ---- apiVersion: tekton.dev/v1beta1 kind: Task metadata: - name: nodepool-drift + name: drift-nodepool namespace: scalability spec: - description: "drift a nodepool by adding a new label to the specified nodepool" + description: | + Triggers Karpenter nodepool drift by modifying nodepool template labels. + This task connects to an EKS cluster, captures the current nodepool state, + applies a label change to force node replacement, and verifies the drift operation. + The drift process causes Karpenter to replace existing nodes with new ones + that match the updated nodepool template specification. + DOES NOT CHECK TO SEE IF ALL NODES SUCESSFULLY DRIFT. Use kubectl-nodepool-condition-wait.yaml for that params: - name: nodepool - description: Name of the nodepool to drift + description: Name of the Karpenter nodepool to drift (must exist in cluster) - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster containing the target nodepool - name: endpoint - description: eks endpoint to use + description: EKS cluster endpoint URL for kubectl configuration - name: aws-region + description: AWS region where the cluster is located (used for AZ discovery) + default: us-west-2 + - name: label-key + description: Label key to add/modify in the nodepool template + default: myLabel + - name: label-val + description: Label value to set for the specified label key + default: myValue steps: - name: drift-nodepool - image: amazon/aws-cli + image: alpine/k8s:1.30.2 script: | + echo "Starting Nodepool Drift Operation" + echo "=================================" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) - kubectl patch nodepool ${params.nodepool} --patch '{"spec": {"template": {"metadata": {"labels": {"myLabel": "myValue"}}}}}' \ No newline at end of file + echo "[SUCCESS] Successfully configured kubectl" + echo "" + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Capture cluster state before drift operation + echo "[INFO] Capturing cluster state before nodepool drift..." 
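+          # The snapshot below is informational: it records the pre-drift nodes and
+          # NodePool spec so they can be compared with the post-drift state while debugging.
+          # Once the template labels are patched further down, existing NodeClaims no longer
+          # match their NodePool template, so Karpenter marks them Drifted and replaces them.
+          # An illustrative way to watch that condition (not used by this task):
+          #   kubectl get nodeclaims -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Drifted")].status}{"\n"}{end}'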
+ echo "-----------------------------------------------------" + + echo "[INFO] Current cluster nodes:" + kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool) -o wide --show-labels + echo "" + + echo "[INFO] Current nodepool configuration:" + kubectl get nodepool -o yaml + echo "" + + echo "$AZ_LIST" | while read -r az; do + export AZ=$az + # Apply the drift-inducing label change to the nodepool + echo "[INFO] Applying label change to trigger nodepool drift..." + echo "[INFO] Patching nodepool $(params.nodepool)-${az} with label $(params.label-key)=$(params.label-val)" + + kubectl patch nodepool $(params.nodepool)-${az} --type='merge' --patch='{"spec": {"template": {"metadata": {"labels": {"$(params.label-key)": "$(params.label-val)"}}}}}' + + echo "[SUCCESS] Successfully patched nodepool $(params.nodepool)-${az}" + echo "" + + # Verify the drift operation was applied + echo "[INFO] Verifying nodepool drift configuration..." + echo "===============================================" + + echo "[INFO] Updated nodepool configuration:" + kubectl get nodepool $(params.nodepool)-${az} -o yaml + echo "" + done + + echo "===============================================" + echo "[SUCCESS] Nodepool drift operation completed" + echo "[INFO] Karpenter will now begin replacing nodes to match the new template" + echo "===============================================" diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml new file mode 100644 index 00000000..cc4b0b37 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml @@ -0,0 +1,165 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: nodepool-condition-wait + namespace: scalability +spec: + description: "waits for nodeclaims in a given nodepool to have or not have the specified condition with the given value based on presence parameter" + results: + - name: datapoint + description: Stores the result that can be consumed by other tasks (1 for success, 0 for failure) + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + default: us-west-2 + - name: initial-delay + default: 1m + - name: condition + description: condition to check (e.g., Ready, MemoryPressure, DiskPressure) + - name: presence + description: whether to check for the presence or absence of the condition with the value + default: true + - name: value + description: value of the condition to validate (e.g., True, False) + - name: nodepool + description: nodepool to check nodeclaim in. + - name: check-interval + description: interval in seconds between checks + default: "60" + - name: timeout + description: total time to wait before timing out in seconds + default: 3000 + steps: + - name: wait-for-condition + image: alpine/k8s:1.30.2 + script: | + sleep $(params.initial-delay) + CHECK_INTERVAL=$(params.check-interval) + TIMEOUT=$(params.timeout) + START_TIME=$(date +%s) + + while true; do + # Check if timeout has been reached + echo "in true" + CURRENT_TIME=$(date +%s) + echo "after current" + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + echo "after elapsed" + if [ $ELAPSED_TIME -ge $TIMEOUT ]; then + echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Nodepools did not complete within the specified timeout." 
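+            # datapoint is the task's machine-readable outcome declared in results above:
+            # downstream consumers read 1 as "wait succeeded" and 0 as "timed out / failed".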
+ echo "0" | tee $(results.datapoint.path) + exit 1 + fi + echo "updating kubeconfig" + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + if [ "$(params.presence)" = "true" ]; then + echo "$(date): Checking that ALL nodeclaims in nodepool $(params.nodepool) have condition $(params.condition)=$(params.value)..." + else + echo "$(date): Checking that NO nodeclaims in nodepool $(params.nodepool) have condition $(params.condition)=$(params.value)..." + fi + + echo "getting AZs" + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Track if all availability zones have nodeclaims meeting the condition + ALL_AZ_READY=true + TOTAL_NODECLAIMS_ALL_AZ=0 + TOTAL_READY_NODECLAIMS_ALL_AZ=0 + + # Check each availability zone + for az in $AZ_LIST; do + # Get all nodeclaims in the AZ-specific nodepool + all_nodeclaims=$(kubectl get nodeclaims -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r '.items[].metadata.name') + + if [ -z "$all_nodeclaims" ]; then + echo "$(date): AZ ${az} - No nodeclaims found in nodepool $(params.nodepool)-${az}" + continue + fi + + # Count total nodeclaims in this AZ + total_nodeclaims=$(echo "$all_nodeclaims" | wc -l) + TOTAL_NODECLAIMS_ALL_AZ=$((TOTAL_NODECLAIMS_ALL_AZ + total_nodeclaims)) + + # Get nodeclaims that have the desired condition with the specified value + nodeclaims_with_condition=$(kubectl get nodeclaims -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r --arg type $(params.condition) --arg status $(params.value) ' + .items[] | + select(.status.conditions[] | select(.type == $type and .status == $status)) | + .metadata.name + ') + + # Count nodeclaims with the desired condition in this AZ + if [ -z "$nodeclaims_with_condition" ]; then + nodeclaims_with_condition_count=0 + else + nodeclaims_with_condition_count=$(echo "$nodeclaims_with_condition" | wc -l) + fi + + TOTAL_READY_NODECLAIMS_ALL_AZ=$((TOTAL_READY_NODECLAIMS_ALL_AZ + nodeclaims_with_condition_count)) + + echo "$(date): AZ ${az} - Nodeclaims with $(params.condition)=$(params.value): $nodeclaims_with_condition_count/$total_nodeclaims" + + if [ "$(params.presence)" = "true" ]; then + # presence=true: Check if all nodeclaims have the condition + if [ "$nodeclaims_with_condition_count" -ne "$total_nodeclaims" ]; then + echo "$(date): AZ ${az} - Not all nodeclaims have $(params.condition)=$(params.value)" + ALL_AZ_READY=false + else + echo "$(date): AZ ${az} - Success! All nodeclaims have $(params.condition)=$(params.value)" + fi + else + # presence=false: Check if no nodeclaims have the condition + if [ "$nodeclaims_with_condition_count" -ne 0 ]; then + echo "$(date): AZ ${az} - Some nodeclaims still have $(params.condition)=$(params.value)" + ALL_AZ_READY=false + else + echo "$(date): AZ ${az} - Success! 
No nodeclaims have $(params.condition)=$(params.value)" + fi + fi + done + + echo "$(date): Overall status - Nodeclaims with $(params.condition)=$(params.value): $TOTAL_READY_NODECLAIMS_ALL_AZ/$TOTAL_NODECLAIMS_ALL_AZ across all AZs" + + # Check success condition based on presence parameter + if [ "$(params.presence)" = "true" ]; then + # presence=true: Exit if all availability zones have all nodeclaims meeting the condition + if [ "$ALL_AZ_READY" = "true" ] && [ "$TOTAL_NODECLAIMS_ALL_AZ" -gt 0 ]; then + echo "$(date): Success! All nodeclaims across all availability zones have $(params.condition)=$(params.value)" + echo "1" | tee $(results.datapoint.path) + exit 0 + fi + + if [ "$TOTAL_NODECLAIMS_ALL_AZ" -eq 0 ]; then + echo "$(date): No nodeclaims found in any availability zone for nodepool $(params.nodepool)" + else + echo "$(date): Waiting for remaining nodeclaims to achieve $(params.condition)=$(params.value)..." + fi + else + # presence=false: Exit if no nodeclaims have the condition with the specified value + if [ "$TOTAL_READY_NODECLAIMS_ALL_AZ" -eq 0 ] && [ "$TOTAL_NODECLAIMS_ALL_AZ" -gt 0 ]; then + echo "$(date): Success! No nodeclaims across all availability zones have $(params.condition)=$(params.value)" + echo "1" | tee $(results.datapoint.path) + exit 0 + fi + + if [ "$TOTAL_NODECLAIMS_ALL_AZ" -eq 0 ]; then + echo "$(date): No nodeclaims found in any availability zone for nodepool $(params.nodepool)" + else + echo "$(date): Waiting for nodeclaims to no longer have $(params.condition)=$(params.value)..." + fi + fi + + echo "$(date): Waiting ${CHECK_INTERVAL} seconds before next check..." + sleep $CHECK_INTERVAL + done + done + exit 1 diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml new file mode 100644 index 00000000..3c434fa0 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml @@ -0,0 +1,97 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: nodepool-replicas-wait + namespace: scalability +spec: + description: "waits for the number of ready nodes in a nodepool to equal the specified replicas count" + results: + - name: datapoint + description: Stores the result that can be consumed by other tasks (1 for success, 0 for failure) + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + default: us-west-2 + - name: initial-delay + default: 1m + - name: replicas + description: number of ready replicas in the nodepool to wait for + - name: nodepool + description: nodepool to check nodes in. + - name: check-interval + description: interval in seconds between checks + default: "60" + - name: timeout + description: total time to wait before timing out + default: 3000 + steps: + - name: wait-for-replicas + image: alpine/k8s:1.30.2 + script: | + sleep $(params.initial-delay) + CHECK_INTERVAL=$(params.check-interval) + TARGET_REPLICAS=$(params.replicas) + TIMEOUT=$(params.timeout) + START_TIME=$(date +%s) + + while true; do + # Check if timeout has been reached + CURRENT_TIME=$(date +%s) + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + + if [ $ELAPSED_TIME -ge $TIMEOUT ]; then + echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Nodepools did not complete within the specified timeout." 
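+            # Record the failed wait in the datapoint result (0 = failure, 1 = success)
+            # before exiting, mirroring the convention used by nodepool-condition-wait.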
+ echo "0" | tee $(results.datapoint.path) + exit 1 + fi + + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "$(date): Checking ready nodes in nodepool $(params.nodepool)..." + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Track if all availability zones have reached target replicas + ALL_AZ_READY=true + + # Check each availability zone + for az in $AZ_LIST; do + ready_nodes_count=$(kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r ' + [.items[] | + select(.status.conditions[] | select(.type == "Ready" and .status == "True"))] | + length + ') + + echo "$(date): AZ ${az} - Ready nodes: $ready_nodes_count, Target replicas: $TARGET_REPLICAS" + + if [ "$ready_nodes_count" -ne "$TARGET_REPLICAS" ]; then + echo "$(date): AZ ${az} - Ready nodes count ($ready_nodes_count) does not match target replicas ($TARGET_REPLICAS)" + ALL_AZ_READY=false + else + echo "$(date): AZ ${az} - Success! Ready nodes count matches target replicas ($TARGET_REPLICAS)" + fi + done + + # Exit if all availability zones have reached target replicas + if [ "$ALL_AZ_READY" = "true" ]; then + echo "$(date): All availability zones have reached target replica count. Exiting successfully." + echo "1" | tee $(results.datapoint.path) + exit 0 + fi + + echo "$(date): Not all availability zones have reached target replicas. Waiting ${CHECK_INTERVAL} seconds before next check..." + + done + sleep $CHECK_INTERVAL + done + + exit 1 diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml index 0738c994..08b621e8 100644 --- a/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml @@ -1,23 +1,93 @@ ---- apiVersion: tekton.dev/v1beta1 kind: Task metadata: - name: nodepool-scale + name: scale-nodepool namespace: scalability spec: - description: "drift a cluster by adding a new label to the specified nodepool" + description: | + Scales a Karpenter nodepool by modifying the number of replicas. + This task configures kubectl access to the EKS cluster, captures the current + cluster state for monitoring purposes, performs the scaling operation, + and verifies the scaling request was applied successfully. + DOES NOT CHECK TO SEE IF ALL NODES HAVE GONE READY. 
Use kubectl-nodepool-replicas-wait.yaml for that params: - name: replicas - description: Number of replicas to scale to + description: Number of replicas to scale the nodepool to (target replica count) - name: nodepool - description: Name of the nodepool to drift + description: Name of the Karpenter nodepool resource to scale - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster containing the nodepool - name: endpoint - description: eks endpoint to use + description: EKS cluster endpoint URL for kubectl configuration + - name: aws-region + description: AWS region where the cluster is located (used for AZ discovery) + default: us-west-2 steps: - name: scale-nodepool - image: amazon/aws-cli + image: alpine/k8s:1.30.2 script: | + echo "Starting Nodepool Scaling Task" + echo "==============================" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) - kubectl scale nodepool ${params.nodepool} --replicas $(params.replicas) \ No newline at end of file + echo "[SUCCESS] Successfully configured kubectl" + echo "" + + # Discover availability zones and scale nodepools + echo "" + echo "[INFO] Discovering availability zones in region: $(params.aws-region)" + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Capture current cluster state before scaling operation + echo "[INFO] Capturing cluster state before scaling operation..." + echo "--------------------------------------------------------" + + echo "[INFO] Current nodepool status:" + kubectl get nodepools -o wide + echo "" + + # Process each availability zone + NODEPOOL_COUNT=0 + echo "$AZ_LIST" | while read -r az; do + export AZ=$az + + echo "[INFO] Current nodepool $(params.nodepool) detailed status:" + kubectl get nodepool $(params.nodepool)-${az} -o yaml + echo "" + + echo "[INFO] Current nodepool nodes:" + kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o wide + echo "" + + # Perform the scaling operation + echo "[INFO] Scaling nodepool $(params.nodepool)-${az} to $(params.replicas) replicas..." + kubectl scale nodepool $(params.nodepool)-${az} --replicas $(params.replicas) + echo "[SUCCESS] Scaling command executed successfully" + echo "" + + echo "[INFO] Updated nodepool $(params.nodepool) detailed status:" + kubectl get nodepool $(params.nodepool)-${az} -o yaml + echo "" + done + + + # Verify the scaling operation was applied + echo "[INFO] Verifying scaling operation results..." 
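+          # This verification only shows that the new replica count was accepted on the
+          # per-AZ NodePools; node readiness is checked separately by nodepool-replicas-wait.
+          # A quick spot check of the desired count on one pool (illustrative):
+          #   kubectl get nodepool <nodepool>-<az> -o jsonpath='{.spec.replicas}'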
+ echo "=============================================" + + echo "[INFO] Updated nodepool status:" + kubectl get nodepools -o wide + echo "" + + echo "=============================================" + echo "Nodepool Scaling Operation Begun" + echo "=============================================" \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml index 83c11465..1bb28d77 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml @@ -5,66 +5,138 @@ metadata: namespace: scalability spec: description: | - Creates the karpenter Controller Role + Creates the Karpenter Controller IAM Role with necessary permissions for managing EC2 instances. + This task downloads trust and policy documents, configures OIDC integration, and creates/updates + the IAM role and policies required for Karpenter to function properly in the EKS cluster. results: - name: node-role-arn - description: Stores the controller role arn created by the task + description: Stores the controller role ARN created by the task params: - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster for which the controller role will be created - name: endpoint - description: endpoint + description: EKS cluster endpoint URL for API operations - name: aws-region - description: region + description: AWS region where the cluster and IAM resources are located default: us-west-2 - name: aws-account-id - description: account id + description: AWS account ID where the IAM role will be created - name: aws-partition - description: partition + description: AWS partition (aws, aws-cn, aws-us-gov) default: aws - name: karpenter-controller-role-trust-policy-url + description: URL of the trust policy document template for the controller role default: https://raw.githubusercontent.com/DerekFrank/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/controller-role-trust-policy-document.json - name: karpenter-controller-role-policy-url + description: URL of the IAM policy document template for the controller role default: https://raw.githubusercontent.com/DerekFrank/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/controller-role-policy-document.json workspaces: - name: source mountPath: /src/karpenter/ steps: - name: create-role - image: alpine/k8s:1.23.7 + image: alpine/k8s:1.30.2 script: | - echo "Starting controller role" - export RAW_OIDC_ENDPOINT="$(aws eks --endpoint $(params.endpoint) describe-cluster --name "$(params.cluster-name)" \ - --query "cluster.identity.oidc.issuer" --output text)" - export OIDC_ID=$(aws eks --endpoint $(params.endpoint) describe-cluster --name $(params.cluster-name) --region $(params.aws-region) --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) + echo "Starting Karpenter Controller Role Creation Task" + echo "==============================================" + + # Set up environment variables for template substitution + echo "" + echo "[INFO] Setting up environment variables..." 
export AWS_PARTITION=$(params.aws-partition) export AWS_ACCOUNT_ID=$(params.aws-account-id) export AWS_REGION=$(params.aws-region) export CLUSTER_NAME=$(params.cluster-name) - echo $RAW_OIDC_ENDPOINT - echo $OIDC_ID + # Retrieve OIDC issuer information from EKS cluster + echo "" + echo "[INFO] Retrieving OIDC issuer information from EKS cluster..." + export RAW_OIDC_ENDPOINT="$(aws eks --endpoint $(params.endpoint) describe-cluster --name "$(params.cluster-name)" \ + --query "cluster.identity.oidc.issuer" --output text)" + + if [ -z "$RAW_OIDC_ENDPOINT" ]; then + echo "[ERROR] Failed to retrieve OIDC endpoint from cluster" + exit 1 + fi + + export OIDC_ID=$(aws eks --endpoint $(params.endpoint) describe-cluster --name $(params.cluster-name) --region $(params.aws-region) --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) export OIDC_ENDPOINT=$(echo ${RAW_OIDC_ENDPOINT#*//}) - echo $OIDC_ENDPOINT + echo "[SUCCESS] Retrieved OIDC information:" + echo " - Raw OIDC Endpoint: $RAW_OIDC_ENDPOINT" + echo " - OIDC ID: $OIDC_ID" + echo " - OIDC Endpoint: $OIDC_ENDPOINT" + # Download and process trust policy document + echo "" + echo "[INFO] Downloading trust policy document from: $(params.karpenter-controller-role-trust-policy-url)" curl -fsSL $(params.karpenter-controller-role-trust-policy-url) -o $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json + echo "[INFO] Original trust policy template:" + echo "----------------------------------------" + cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json | sed 's/^/ /' + echo "----------------------------------------" + + echo "[INFO] Processing trust policy template with environment variables..." envsubst < $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json > $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json - cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json - - cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + echo "[INFO] Processed trust policy document:" + echo "----------------------------------------" + cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json | sed 's/^/ /' + echo "----------------------------------------" - aws iam create-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ - --assume-role-policy-document file://$(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + # Create or verify IAM role existence + echo "" + echo "[INFO] Checking if IAM role KarpenterControllerRole-$(params.cluster-name) exists..." + if aws iam get-role --role-name "KarpenterControllerRole-$(params.cluster-name)" >/dev/null 2>&1; then + echo "[INFO] IAM role KarpenterControllerRole-$(params.cluster-name) already exists, skipping creation" + else + echo "[INFO] Creating IAM role KarpenterControllerRole-$(params.cluster-name)..." 
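+            # The trust policy rendered above with the cluster's OIDC details is what lets
+            # the Karpenter service account assume this role; helm-karpenter-install.yaml
+            # annotates the service account with this role's ARN for IRSA.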
+ aws iam create-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ + --assume-role-policy-document file://$(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + echo "[SUCCESS] Successfully created IAM role KarpenterControllerRole-$(params.cluster-name)" + fi + # Download and process IAM policy document + echo "" + echo "[INFO] Downloading IAM policy document from: $(params.karpenter-controller-role-policy-url)" curl -fsSL $(params.karpenter-controller-role-policy-url) -o $(workspaces.source.path)karpenter-controller-role-policy-url.json + echo "[INFO] Processing IAM policy template with environment variables..." envsubst < $(workspaces.source.path)karpenter-controller-role-policy-url.json > $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json - cat $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json + echo "[INFO] Processed IAM policy document:" + echo "----------------------------------------" + cat $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json | sed 's/^/ /' + echo "----------------------------------------" + # Create or update role policy + echo "" + echo "[INFO] Checking if role policy KarpenterControllerPolicy-$(params.cluster-name) exists..." + if aws iam get-role-policy --role-name "KarpenterControllerRole-$(params.cluster-name)" --policy-name "KarpenterControllerPolicy-$(params.cluster-name)" >/dev/null 2>&1; then + echo "[INFO] Role policy KarpenterControllerPolicy-$(params.cluster-name) already exists, updating..." + else + echo "[INFO] Creating role policy KarpenterControllerPolicy-$(params.cluster-name)..." + fi + aws iam put-role-policy --role-name "KarpenterControllerRole-$(params.cluster-name)" \ --policy-name "KarpenterControllerPolicy-$(params.cluster-name)" \ - --policy-document file://$(workspaces.source.path)karpenter-controller-role-policy-url-modified.json \ No newline at end of file + --policy-document file://$(workspaces.source.path)karpenter-controller-role-policy-url-modified.json + echo "[SUCCESS] Successfully applied role policy KarpenterControllerPolicy-$(params.cluster-name)" + + # Verify the created resources + echo "" + echo "[INFO] Verifying created IAM resources..." 
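+          # The node-role-arn result declared above (meant to hold this controller role's ARN)
+          # is not written anywhere in this script; if a downstream task needs it, something
+          # along these lines would populate it:
+          #   aws iam get-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \
+          #     --query 'Role.Arn' --output text > $(results.node-role-arn.path)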
+ echo "========================================" + + echo "[INFO] IAM Role details:" + aws iam get-role --role-name "KarpenterControllerRole-$(params.cluster-name)" --query 'Role.[RoleName,Arn,CreateDate]' --output table + + echo "" + echo "[INFO] Attached role policies:" + aws iam list-role-policies --role-name "KarpenterControllerRole-$(params.cluster-name)" --output table + + echo "" + echo "==============================================" + echo "Karpenter Controller Role Creation Completed" + echo "==============================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml index 613dfb94..ef169c30 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml @@ -11,7 +11,21 @@ spec: description: The name of the cluster steps: - name: create-role - image: alpine/k8s:1.23.7 + image: alpine/k8s:1.30.2 script: | - aws iam create-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" - aws iam add-role-to-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" + # Check if the instance profile already exists + if aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" >/dev/null 2>&1; then + echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) already exists. Skipping creation..." + else + echo "Creating instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." + aws iam create-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" + fi + + # Check if the role is already added to the instance profile + EXISTING_ROLES=$(aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --query 'InstanceProfile.Roles[?RoleName==`KarpenterNodeRole-$(params.cluster-name)`].RoleName' --output text) + if [ -n "$EXISTING_ROLES" ]; then + echo "Role KarpenterNodeRole-$(params.cluster-name) is already attached to instance profile. Skipping..." + else + echo "Adding role KarpenterNodeRole-$(params.cluster-name) to instance profile..." 
+ aws iam add-role-to-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" + fi diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml index 6083cd45..924f855a 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml @@ -23,7 +23,7 @@ spec: mountPath: /src/karpenter/ steps: - name: create-stack - image: alpine/k8s:1.23.7 + image: alpine/k8s:1.30.2 script: | STACK_NAME=Karpenter-$(params.cluster-name) STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) @@ -31,7 +31,38 @@ spec: cat /src/karpenter/cloudformation.yaml - aws eks update-cluster-config --name $(params.cluster-name) --access-config authenticationMode=API_AND_CONFIG_MAP --endpoint $(params.endpoint) + UPDATE_OUTPUT=$(aws eks update-cluster-config --name $(params.cluster-name) --access-config authenticationMode=API_AND_CONFIG_MAP --endpoint $(params.endpoint)) + + echo $UPDATE_OUTPUT + + # Extract the update ID from the output + UPDATE_ID=$(echo "$UPDATE_OUTPUT" | jq -r '.update.id // empty') + + echo "Waiting for cluster config update $UPDATE_ID to complete..." + + # Wait for the update to complete + while true; do + UPDATE_STATUS=$(aws eks describe-update --name $(params.cluster-name) --update-id "$UPDATE_ID" --endpoint $(params.endpoint) --query 'update.status' --output text) + + case "$UPDATE_STATUS" in + "Successful") + echo "Cluster config update completed successfully" + break + ;; + "Failed"|"Cancelled") + echo "Cluster config update failed with status: $UPDATE_STATUS" + exit 1 + ;; + "InProgress") + echo "Update still in progress, waiting 30 seconds..." + sleep 30 + ;; + *) + echo "Unknown update status: $UPDATE_STATUS" + sleep 30 + ;; + esac + done if [[ "$STACK_STATUS" == "" ]]; then aws cloudformation deploy \ @@ -46,20 +77,21 @@ spec: echo "$STACK_NAME Already exists" fi - eksctl create iamserviceaccount \ - --name karpenter \ - --namespace karpenter \ - --cluster "$(params.cluster-name)" \ - --attach-policy-arn "arn:aws:iam::$(params.account-id):role/$(params.cluster-name)-karpenter" \ - --approve \ - --override-existing-serviceaccounts + aws eks describe-cluster --name "$(params.cluster-name)" --output text --endpoint $(params.endpoint) export AWS_EKS_ENDPOINT=$(params.endpoint) + # Check if OIDC provider is already associated + echo "Associating OIDC provider with cluster..." eksctl utils associate-iam-oidc-provider --cluster "$(params.cluster-name)" --approve - aws eks create-access-entry \ - --cluster-name "$(params.cluster-name)" \ - --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" \ - --endpoint $(params.endpoint) \ - --type EC2_LINUX - + # Check if access entry already exists + if aws eks describe-access-entry --cluster-name "$(params.cluster-name)" --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" --endpoint $(params.endpoint) >/dev/null 2>&1; then + echo "Access entry for KarpenterNodeRole already exists. Skipping creation..." + else + echo "Creating access entry for KarpenterNodeRole..." 
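+ # Follow-up check (sketch; assumes the access-entries APIs are available in the installed
+ # AWS CLI, as the describe call above already requires): listing entries after creation
+ # makes the new principal visible in the task logs.
+ # aws eks list-access-entries --cluster-name "$(params.cluster-name)" --endpoint $(params.endpoint)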
+ aws eks create-access-entry \ + --cluster-name "$(params.cluster-name)" \ + --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" \ + --endpoint $(params.endpoint) \ + --type EC2_LINUX + fi diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml index 178a595b..f7cbea11 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml @@ -5,28 +5,39 @@ metadata: namespace: scalability spec: description: | - Creates the karpenter MNG + Creates a dedicated Karpenter managed node group (MNG) for the EKS cluster. + This task creates a large-capacity node group specifically designed to host Karpenter + system components with appropriate taints and labels to ensure proper scheduling. + The node group uses r5.24xlarge instances with dedicated=karpenter taints. params: - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster where the Karpenter MNG will be created - name: aws-account-id - description: id of the account + description: AWS account ID used to construct the node role ARN - name: endpoint - description: eks endpoint to use + description: EKS cluster endpoint URL for AWS EKS CLI operations - name: region default: "us-west-2" - description: The region where the cluster is in. + description: AWS region where the EKS cluster is located steps: - name: create-mng - image: alpine/k8s:1.23.7 + image: alpine/k8s:1.30.2 script: | + echo "Starting Karpenter Managed Node Group Creation" + + # Discover subnets associated with the cluster + echo "[INFO] Discovering subnets for cluster $(params.cluster-name)..." SUBNET_IDS=$(aws ec2 describe-subnets \ --filters "Name=tag:aws:cloudformation:stack-name,Values=$(params.cluster-name)" \ --query 'Subnets[*].SubnetId' \ - --output text) - - echo ${SUBNET_IDS} - + --output text \ + --region $(params.region)) + echo "[INFO] Discovered Subnets: $SUBNET_IDS" + + # Create the Karpenter managed node group + echo "[INFO] Creating Karpenter managed node group..." + echo "" + aws eks create-nodegroup \ --cluster-name $(params.cluster-name) \ --nodegroup-name karpenter-system-large \ @@ -37,11 +48,46 @@ spec: --labels dedicated=karpenter \ --region $(params.region) \ --endpoint-url $(params.endpoint) \ - --taints key=dedicated,value=karpenter,effect=NoSchedule + --taints key=dedicated,value=karpenter,effect=NO_SCHEDULE + + # Verify the node group was created and list all node groups + echo "[INFO] Verifying node group creation..." + echo "======================================" + + NODE_GROUPS=$(aws eks list-nodegroups \ + --endpoint-url $(params.endpoint) \ + --cluster-name $(params.cluster-name) \ + --region $(params.region) \ + --query 'nodegroups' \ + --output text) + + if [ -z "$NODE_GROUPS" ]; then + echo "[WARNING] No node groups found in cluster" + else + NODE_GROUP_COUNT=$(echo $NODE_GROUPS | wc -w) + echo "[SUCCESS] Found $NODE_GROUP_COUNT node group(s) in cluster:" + echo "$NODE_GROUPS" | tr ' ' '\n' | sed 's/^/ - /' + fi + echo "" - - # quick validation - aws eks list-nodegroups \ - --endpoint $(params.endpoint) \ - --cluster-name $(params.cluster-name) \ - --region $(params.region) \ No newline at end of file + # Display detailed information about the Karpenter node group + echo "[INFO] Retrieving Karpenter node group details..." 
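+ # Optional hedged convenience: block on the standard EKS waiter so the describe call below
+ # reports a settled state instead of CREATING; skip or remove this if pipeline timing matters.
+ aws eks wait nodegroup-active \
+ --cluster-name $(params.cluster-name) \
+ --nodegroup-name karpenter-system-large \
+ --region $(params.region) \
+ --endpoint-url $(params.endpoint) || echo "[WARNING] nodegroup-active waiter did not complete; continuing"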
+ aws eks describe-nodegroup \ + --cluster-name $(params.cluster-name) \ + --nodegroup-name karpenter-system-large \ + --region $(params.region) \ + --endpoint-url $(params.endpoint) \ + --query '{ + Status: nodegroup.status, + InstanceTypes: nodegroup.instanceTypes, + ScalingConfig: nodegroup.scalingConfig, + Labels: nodegroup.labels, + Taints: nodegroup.taints, + SubnetIds: nodegroup.subnets + }' \ + --output table + echo "" + + echo "==============================================" + echo "Karpenter Managed Node Group Creation Complete" + echo "==============================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml index acff12d8..2a13f8a3 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml @@ -5,38 +5,107 @@ metadata: namespace: scalability spec: description: | - Creates the karpenter Node Role + Creates the Karpenter Node IAM Role with required policies for EKS worker nodes. + This task creates an IAM role that allows EC2 instances to assume the role and attaches + the necessary AWS managed policies for EKS worker node functionality including container + registry access, CNI networking, and Systems Manager access. results: - name: node-role-arn - description: Stores the node role arn created by the task + description: The ARN of the created Karpenter node IAM role params: - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster (used to create unique role name) steps: - name: create-role - image: alpine/k8s:1.23.7 + image: alpine/k8s:1.30.2 script: | - aws iam create-role --role-name "KarpenterNodeRole-$(params.cluster-name)" \ - --assume-role-policy-document '{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Principal": { - "Service": "ec2.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] - }' + echo "Starting Karpenter Node IAM Role Creation Task" + echo "==============================================" + + # Set role name variable for consistency + ROLE_NAME="KarpenterNodeRole-$(params.cluster-name)" + + echo "[INFO] Target role name: $ROLE_NAME" + echo "" - # Attach required policies - aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ - --policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy - aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ - --policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy - aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ - --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly - aws iam attach-role-policy --role-name "KarpenterNodeRole-$(params.cluster-name)" \ - --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - aws iam get-role --role-name KarpenterNodeRole-$(params.cluster-name) --query 'Role.[Arn]' --output text > $(results.node-role-arn) \ No newline at end of file + # Check if the IAM role already exists + echo "[INFO] Checking if IAM role already exists..." + if aws iam get-role --role-name "$ROLE_NAME" >/dev/null 2>&1; then + echo "[INFO] IAM role $ROLE_NAME already exists, skipping creation" + else + echo "[INFO] Creating new IAM role: $ROLE_NAME" + echo "[INFO] Configuring trust policy for EC2 service..." 
+ + # Create the IAM role with trust policy for EC2 + aws iam create-role --role-name "$ROLE_NAME" \ + --assume-role-policy-document '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "ec2.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }' + + echo "[SUCCESS] Successfully created IAM role: $ROLE_NAME" + fi + echo "" + + # Define required AWS managed policies for EKS worker nodes + echo "[INFO] Preparing to attach required AWS managed policies..." + POLICIES=( + "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + ) + + echo "[INFO] Required policies to attach:" + for policy in "${POLICIES[@]}"; do + echo " - $policy" + done + echo "" + + # Attach required policies to the role + POLICY_COUNT=0 + for policy in "${POLICIES[@]}"; do + POLICY_COUNT=$((POLICY_COUNT + 1)) + echo "[INFO] Processing policy $POLICY_COUNT of ${#POLICIES[@]}: $policy" + + # Check if policy is already attached to avoid duplicate attachments + if aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query "AttachedPolicies[?PolicyArn=='$policy'].PolicyArn" --output text | grep -q "$policy"; then + echo "[INFO] Policy already attached, skipping: $policy" + else + echo "[INFO] Attaching policy to role..." + aws iam attach-role-policy --role-name "$ROLE_NAME" --policy-arn "$policy" + echo "[SUCCESS] Successfully attached policy: $policy" + fi + echo "" + done + + # Retrieve and store the role ARN for use by other tasks + echo "[INFO] Retrieving role ARN for task output..." + ROLE_ARN=$(aws iam get-role --role-name "$ROLE_NAME" --query 'Role.Arn' --output text) + echo "[INFO] Role ARN: $ROLE_ARN" + + # Write ARN to results file for pipeline consumption + echo "$ROLE_ARN" > $(results.node-role-arn) + echo "[SUCCESS] Role ARN saved to task results" + echo "" + + # Verify final role configuration + echo "[INFO] Verifying final role configuration..." + echo "==========================================" + echo "[INFO] Role details:" + aws iam get-role --role-name "$ROLE_NAME" --query 'Role.{RoleName:RoleName,Arn:Arn,CreateDate:CreateDate}' --output table + echo "" + echo "[INFO] Attached policies:" + aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query 'AttachedPolicies[].{PolicyName:PolicyName,PolicyArn:PolicyArn}' --output table + echo "" + echo "==========================================" + echo "Karpenter Node IAM Role Creation Completed" + echo "==========================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml index 7358f87e..4fa8fd03 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml @@ -5,48 +5,66 @@ metadata: namespace: scalability spec: description: | - Install karpenter on the cluster + Installs Karpenter on an EKS cluster using Helm. + This task authenticates with ECR, configures kubectl, validates cluster state, + and installs Karpenter with optimized settings for large-scale workloads. 
params: - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster where Karpenter will be installed - name: aws-account-id - description: aws account id + description: AWS account ID for IAM role ARN construction - name: karpenter-ecr-repo - description: ECR repo to install karpenter + description: ECR repository URL containing the Karpenter Helm chart - name: karpenter-version - description: version of karpenter to install + description: Version of Karpenter to install (e.g., v0.32.0) - name: endpoint - description: eks endpoint to use + description: EKS cluster endpoint URL for kubectl configuration workspaces: - name: config steps: - name: install-karpenter - image: alpine/k8s:1.23.7 - timeout: 10m + image: alpine/k8s:1.30.2 script: | + echo "Starting Karpenter Installation Task" + echo "====================================" + + # Authenticate with ECR for Helm registry access + echo "[INFO] Authenticating with ECR registry..." aws ecr get-login-password --region us-west-2 | helm registry login --username AWS --password-stdin $(params.karpenter-ecr-repo) + echo "[SUCCESS] Successfully authenticated with ECR" + echo "" + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "[SUCCESS] Successfully configured kubectl" + echo "" + # Verify karpenter-system nodegroup exists + echo "[INFO] Verifying karpenter-system nodegroup..." aws eks describe-nodegroup --cluster-name $(params.cluster-name) --endpoint $(params.endpoint) --nodegroup-name karpenter-system-large + echo "" + + # Capture cluster state before installation for troubleshooting + echo "[INFO] Capturing cluster state before Karpenter installation..." + echo "----------------------------------------" - kubectl get nodes -A -o yaml + echo "[INFO] Current cluster nodes:" + kubectl get nodes -o wide + echo "" + echo "[INFO] Current pods across all namespaces:" kubectl get pods -A -o wide + echo "" - kubectl get pods -n karpenter -o yaml - - # kubectl delete nodes -l dedicated=karpenter - + echo "[INFO] Current deployments across all namespaces:" kubectl get deployments -A -o wide - - # helm status karpenter --namespace karpenter - - # kubectl logs karpenter-5df996fbbf-f8ghz -n karpenter -f - - # helm delete -n karpenter karpenter --wait - - # kubectl taint nodes -l dedicated=karpenter dedicated=karpenter:NoSchedule + echo "----------------------------------------" + echo "" + + # Install Karpenter using Helm with optimized configuration + echo "[INFO] Installing Karpenter with Helm..." + echo "" helm upgrade --install karpenter oci://$(params.karpenter-ecr-repo)/karpenter/karpenter --version $(params.karpenter-version) \ --namespace "karpenter" \ @@ -85,4 +103,34 @@ spec: --debug \ --wait - kubectl get pods -n karpenter \ No newline at end of file + echo "" + echo "[SUCCESS] Karpenter installation completed" + echo "" + + # Verify the installation + echo "[INFO] Verifying Karpenter installation..." 
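+ # Stricter readiness gate (sketch; assumes the chart creates a Deployment named "karpenter"
+ # in the "karpenter" namespace, matching the release installed above). Since helm ran with
+ # --wait, this call is expected to return quickly.
+ kubectl rollout status deployment/karpenter -n karpenter --timeout=5m || \
+ echo "[WARNING] Karpenter deployment did not report Ready within 5 minutes"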
+ echo "=========================================" + + KARPENTER_PODS=$(kubectl get pods -n karpenter --no-headers 2>/dev/null | wc -l) + echo "[SUCCESS] Found $KARPENTER_PODS Karpenter pod(s) in the cluster" + echo "" + + echo "[INFO] Current Karpenter pod status:" + kubectl get pods -n karpenter -o wide + echo "" + + echo "[INFO] Current Karpenter pod detailed:" + kubectl get pods -n karpenter -o yaml + echo "" + + echo "[INFO] Karpenter deployment details:" + kubectl get deployment -n karpenter -o wide + echo "" + + echo "[INFO] Karpenter deployment details:" + kubectl get deployment -n karpenter -o yaml + echo "" + + echo "=========================================" + echo "Karpenter Installation Completed" + echo "=========================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml index cd8b6fb8..38da850b 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml @@ -5,52 +5,81 @@ metadata: namespace: scalability spec: description: | - Install karpenter on the cluster + Creates and applies an EC2NodeClass resource for Karpenter node provisioning. + This task retrieves cluster configuration, downloads a nodeclass template, + substitutes environment variables, and applies the configuration to the cluster. params: - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster where the EC2NodeClass will be created - name: endpoint - description: eks endpoint to use + description: The AWS EKS API endpoint URL to use for cluster operations - name: karpenter-nodeclass-url - description: url of the nodeclass template to use + description: The URL of the EC2NodeClass YAML template to download and apply workspaces: - name: source mountPath: /src/karpenter/ + description: Workspace for storing downloaded and processed nodeclass files steps: - name: create-ec2nodeclass - image: alpine/k8s:1.23.7 - script: | + image: alpine/k8s:1.30.2 + script: | + echo "Starting EC2NodeClass creation process for cluster: $(params.cluster-name)" + + # Retrieve cluster certificate authority data for node authentication + echo "Fetching cluster certificate authority data..." export CLUSTER_CA=$(aws eks describe-cluster \ --name $(params.cluster-name) \ --endpoint-url $(params.endpoint) \ --query 'cluster.certificateAuthority.data' \ --output text) + echo "Successfully retrieved cluster CA data" - + # Retrieve cluster API endpoint for node communication + echo "Fetching cluster API endpoint..." 
export CLUSTER_ENDPOINT=$(aws eks describe-cluster \ --name $(params.cluster-name) \ --endpoint-url $(params.endpoint) \ --query 'cluster.endpoint' \ --output text) + echo "Cluster endpoint retrieved: ${CLUSTER_ENDPOINT}" + # Set cluster name for template substitution export CLUSTER_NAME=$(params.cluster-name) + echo "Using cluster name: ${CLUSTER_NAME}" + # Set AMI alias version for node instances export ALIAS_VERSION=latest + echo "Using AMI alias version: ${ALIAS_VERSION}" - echo "Cluster endpoint: ${CLUSTER_ENDPOINT}" - + # Download the EC2NodeClass template from the specified URL + echo "Downloading EC2NodeClass template from: $(params.karpenter-nodeclass-url)" curl -fsSL $(params.karpenter-nodeclass-url) -o $(workspaces.source.path)ec2nodeclass.yaml + echo "Template downloaded successfully to $(workspaces.source.path)ec2nodeclass.yaml" + # Display the original template for verification + echo "Original EC2NodeClass template content:" cat $(workspaces.source.path)ec2nodeclass.yaml + # Substitute environment variables in the template + echo "Performing environment variable substitution in template..." envsubst < $(workspaces.source.path)ec2nodeclass.yaml > $(workspaces.source.path)ec2nodeclass-modified.yaml + echo "Environment variable substitution completed" - ls $(workspaces.source.path) - + # Display the processed template with substituted values + echo "Processed EC2NodeClass configuration:" cat $(workspaces.source.path)ec2nodeclass-modified.yaml + # Update kubeconfig to authenticate with the target cluster + echo "Updating kubeconfig for cluster access..." aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "Kubeconfig updated successfully" + # Apply the EC2NodeClass configuration to the cluster + echo "Applying EC2NodeClass configuration to cluster..." kubectl apply -f $(workspaces.source.path)ec2nodeclass-modified.yaml + echo "EC2NodeClass applied successfully" + # Verify the EC2NodeClass was created and display its configuration + echo "Retrieving and displaying created EC2NodeClass resources:" kubectl get ec2nodeclass -o yaml + echo "EC2NodeClass creation process completed successfully" diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml index f438b3ff..5f3eb805 100644 --- a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml +++ b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml @@ -5,39 +5,108 @@ metadata: namespace: scalability spec: description: | - Install karpenter on the cluster + Creates Karpenter NodePool resources for each availability zone in the specified AWS region. + This task downloads a nodepool template, customizes it for each AZ, and applies it to the cluster. 
params: - name: cluster-name - description: The name of the cluster + description: The name of the EKS cluster where nodepools will be created - name: endpoint - description: eks endpoint to use + description: EKS cluster endpoint URL for kubectl configuration - name: aws-region - description: aws region to use + description: AWS region where the cluster is located (used for AZ discovery) default: us-west-2 - name: karpenter-nodepool-url - description: url of the nodeclass template to use + description: URL of the nodepool YAML template to download and customize workspaces: - name: source mountPath: /src/karpenter/ steps: - name: create-nodepools - image: alpine/k8s:1.23.7 + image: alpine/k8s:1.30.2 script: | + echo "Starting Karpenter NodePool Creation Task" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + if [ $? -eq 0 ]; then + echo "[SUCCESS] Successfully configured kubectl" + else + echo "[ERROR] Failed to configure kubectl" + exit 1 + fi + # Set cluster name environment variable for template substitution export CLUSTER_NAME=$(params.cluster-name) + + # Download the nodepool template + echo "" + echo "[INFO] Downloading nodepool template from: $(params.karpenter-nodepool-url)" curl -fsSL $(params.karpenter-nodepool-url) -o $(workspaces.source.path)nodepool.yaml + # Display the downloaded template for verification + echo "" + echo "[INFO] Downloaded nodepool template content:" + echo "----------------------------------------" cat $(workspaces.source.path)nodepool.yaml + echo "----------------------------------------" + # Discover availability zones and create nodepools + echo "" + echo "[INFO] Discovering availability zones in region: $(params.aws-region)" + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) - aws ec2 describe-availability-zones --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]' | while read -r az; do + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Process each availability zone + NODEPOOL_COUNT=0 + echo "$AZ_LIST" | while read -r az; do + if [ -z "$az" ]; then + continue + fi + + NODEPOOL_COUNT=$((NODEPOOL_COUNT + 1)) export AZ=$az - echo ${AZ} - envsubst < $(workspaces.source.path)nodepool.yaml > $(workspaces.source.path)nodepool-${AZ}.yaml - cat $(workspaces.source.path)nodepool-${AZ}.yaml - kubectl apply -f $(workspaces.source.path)nodepool-${AZ}.yaml + + echo "[INFO] Creating nodepool for availability zone: $az" + + # Generate AZ-specific nodepool configuration + echo "[INFO] Generating nodepool configuration for $az..." + envsubst < $(workspaces.source.path)nodepool.yaml > $(workspaces.source.path)nodepool-${az}.yaml + + # Display the generated configuration + echo "[INFO] Generated nodepool configuration for $az:" + echo "----------------------------------------" + cat $(workspaces.source.path)nodepool-${az}.yaml | sed 's/^/ /' + echo "----------------------------------------" + + # Apply the nodepool configuration + echo "[INFO] Applying nodepool configuration for $az..." + kubectl apply -f $(workspaces.source.path)nodepool-${az}.yaml + echo "" done + # Verify the created nodepools + echo "[INFO] Verifying created nodepools..." 
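+ # Per-NodePool readiness sketch (assumes the NodePool status publishes a "Ready" condition,
+ # as current Karpenter APIs do); failures here are non-fatal.
+ kubectl get nodepool -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' || true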
+ echo "==================================" + + NODEPOOL_LIST=$(kubectl get nodepool --no-headers 2>/dev/null | wc -l) + echo "[SUCCESS] Found $NODEPOOL_LIST nodepool(s) in the cluster" + echo "" + echo "[INFO] Current nodepool status:" + kubectl get nodepool -o wide + echo "" + echo "----------------------------------------" + echo "[INFO] Detailed nodepool configuration:" kubectl get nodepool -o yaml - + echo "----------------------------------------" + + echo "" + echo "==========================================" + echo "Karpenter NodePool Creation Completed" + echo "==========================================" diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml deleted file mode 100644 index c4d03173..00000000 --- a/tests/tekton-resources/tasks/teardown/karpenter/awscli-controller-role.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: awscli-controller-role-teardown - namespace: scalability -spec: - description: | - Creates the karpenter Controller Role - params: - - name: cluster-name - description: The name of the cluster - steps: - - name: create-role - image: alpine/k8s:1.23.7 - script: | - aws iam delete-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ No newline at end of file diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml deleted file mode 100644 index fdcb9558..00000000 --- a/tests/tekton-resources/tasks/teardown/karpenter/awscli-instanceprofiles.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: awscli-instanceprofiles-teardown - namespace: scalability -spec: - description: | - Creates the karpenter instance profile - params: - - name: cluster-name - description: The name of the cluster - steps: - - name: create-role - image: alpine/k8s:1.23.7 - script: | - aws iam delete-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" \ No newline at end of file diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml deleted file mode 100644 index 9dbbdd45..00000000 --- a/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter-cfn-stack.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: awscli-karpenter-cfn-stack-teardown - namespace: scalability -spec: - description: | - Creates the karpenter instance roles and sqs interruption queue - params: - - name: cluster-name - description: The name of the cluster - - name: karpenter-version - description: Version of Karpenter to deploy - - name: endpoint - description: Endpoint to use with EKS - - name: region - default: us-west-2 - description: The region where the cluster is in. 
- - name: account-id - description: The aws account the cluster is running in - steps: - - name: create-stack - image: alpine/k8s:1.23.7 - script: | - STACK_NAME=Karpenter-$(params.cluster-name) - STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) - cat ${STACK_STATUS} - - if [[ "$STACK_STATUS" == "ACTIVE" ]]; then - aws cloudformation delete-stack --stack-name ${STACK_NAME} - - aws cloudformation wait stack-delete-complete --stack-name $STACK_NAME --region $(params.region) - else - echo "$STACK_NAME Already exists" - fi - diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml new file mode 100644 index 00000000..aecdd5a3 --- /dev/null +++ b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml @@ -0,0 +1,381 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-eks-karpenter-cluster-teardown + namespace: scalability +spec: + description: | + Teardown an EKS cluster. + This Task can be used to teardown an EKS cluster with mng in an AWS account. + params: + - name: cluster-name + description: The name of the EKS cluster which will be teared down. + - name: region + default: us-west-2 + description: The region where the cluster is in. + - name: endpoint + default: "" + - name: namespace-count + description: The number of namespaces for EKS Pod Identity test. + default: "0" + - name: slack-hook + default: "" + - name: slack-message + default: "Job is completed" + - name: service-role-stack-name + - name: node-role-stack-name + - name: launch-template-stack-name + steps: + - name: terminate-cluster-instances + image: alpine/k8s:1.30.2 + script: | + #!/bin/bash + set -e + + echo "$(date): Starting EC2 instance termination for cluster $(params.cluster-name)..." + + # Find all EC2 instances that belong to the cluster using the aws:eks:cluster-name tag + echo "$(date): Finding instances with tag aws:eks:cluster-name=$(params.cluster-name)..." + INSTANCE_IDS=$(aws ec2 describe-instances \ + --region $(params.region) \ + --filters "Name=tag:aws:eks:cluster-name,Values=$(params.cluster-name)" "Name=instance-state-name,Values=running,pending,stopping,stopped" \ + --query 'Reservations[*].Instances[*].InstanceId' \ + --output text) + + if [ -z "$INSTANCE_IDS" ]; then + echo "$(date): No instances found with tag aws:eks:cluster-name=$(params.cluster-name)" + echo "$(date): Instance termination completed - no instances to terminate" + exit 0 + fi + + # Convert to array and count instances + INSTANCE_ARRAY=($INSTANCE_IDS) + INSTANCE_COUNT=${#INSTANCE_ARRAY[@]} + + echo "$(date): Found $INSTANCE_COUNT instances to terminate: $INSTANCE_IDS" + + # List instance details for logging + echo "$(date): Instance details:" + aws ec2 describe-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS \ + --query 'Reservations[*].Instances[*].[InstanceId,InstanceType,State.Name,LaunchTime]' \ + --output table + + # Terminate all instances belonging to the cluster + echo "$(date): Terminating instances..." + aws ec2 terminate-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS + + echo "$(date): Termination request sent for all instances" + + # Wait for all instances to be terminated + echo "$(date): Waiting for all instances to be terminated..." 
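+ # The hand-rolled polling loop below exists to give per-iteration logging; if the default
+ # waiter timing were acceptable, the equivalent one-liner would be (sketch):
+ # aws ec2 wait instance-terminated --region $(params.region) --instance-ids $INSTANCE_IDS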
+ TIMEOUT=600 # 10 minutes timeout + CHECK_INTERVAL=15 # Check every 15 seconds + START_TIME=$(date +%s) + + while true; do + # Check if timeout has been reached + CURRENT_TIME=$(date +%s) + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + + if [ $ELAPSED_TIME -ge $TIMEOUT ]; then + echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Some instances may still be terminating." + # List remaining instances for debugging + REMAINING_INSTANCES=$(aws ec2 describe-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS \ + --query 'Reservations[*].Instances[?State.Name!=`terminated`].InstanceId' \ + --output text 2>/dev/null || echo "") + if [ -n "$REMAINING_INSTANCES" ] && [ "$REMAINING_INSTANCES" != "None" ]; then + echo "$(date): Instances still not terminated: $REMAINING_INSTANCES" + fi + exit 1 + fi + + # Check instance states + RUNNING_INSTANCES=$(aws ec2 describe-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS \ + --query 'Reservations[*].Instances[?State.Name!=`terminated`].InstanceId' \ + --output text 2>/dev/null || echo "") + + if [ -z "$RUNNING_INSTANCES" ] || [ "$RUNNING_INSTANCES" = "None" ]; then + echo "$(date): Success! All instances have been terminated" + break + else + RUNNING_COUNT=$(echo "$RUNNING_INSTANCES" | wc -w) + echo "$(date): Still waiting for $RUNNING_COUNT instances to be terminated: $RUNNING_INSTANCES" + echo "$(date): Waiting ${CHECK_INTERVAL} seconds before next check..." + sleep $CHECK_INTERVAL + fi + done + + echo "$(date): EC2 instance termination completed successfully" + - name: delete-cluster + image: alpine/k8s:1.23.7 + script: | + set +e + ENDPOINT_FLAG="" + if [ -n "$(params.endpoint)" ]; then + ENDPOINT_FLAG="--endpoint $(params.endpoint)" + fi + + for i in `aws eks list-nodegroups --cluster-name $(params.cluster-name) $ENDPOINT_FLAG --region $(params.region) | jq -r '.nodegroups[]'`; + do + aws eks delete-nodegroup --nodegroup-name $i --cluster-name $(params.cluster-name) $ENDPOINT_FLAG --region $(params.region); + aws eks wait nodegroup-deleted --nodegroup-name $i --cluster-name $(params.cluster-name) $ENDPOINT_FLAG --region $(params.region); + done; + echo "Starting to delete cluster..." + aws eks delete-cluster --name $(params.cluster-name) --region $(params.region) $ENDPOINT_FLAG + echo "Waiting for cluster to be deleted..." + aws eks wait cluster-deleted --name $(params.cluster-name) --region $(params.region) $ENDPOINT_FLAG + echo "Cluster is deleted..." + + for i in $(seq 1 $(params.namespace-count)); do + PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i + PIA_ROLE_EXISTS=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.RoleName' --output text 2>/dev/null) + if [ "$PIA_ROLE_EXISTS" == "$PIA_ROLE_NAME" ]; then + # Detach all attached managed policies + aws iam list-attached-role-policies --role-name "$PIA_ROLE_NAME" \ + --query 'AttachedPolicies[*].PolicyArn' --output json | jq -r '.[]' | while read -r policy_arn; do + echo "Detaching managed policy: $policy_arn" + aws iam detach-role-policy --role-name "$PIA_ROLE_NAME" --policy-arn "$policy_arn" + done + # Delete all inline policies + aws iam list-role-policies --role-name "$PIA_ROLE_NAME" \ + --query 'PolicyNames' --output json | jq -r '.[]' | while read -r policy_name; do + echo "Deleting inline policy: $policy_name" + aws iam delete-role-policy --role-name "$PIA_ROLE_NAME" --policy-name "$policy_name" + done + # Delete role + aws iam delete-role --role-name $PIA_ROLE_NAME + echo "Role $PIA_ROLE_NAME deleted successfully." 
+ else + echo "Role $PIA_ROLE_NAME does not exist, no action needed." + fi + done + - name: delete-karpenter-role + image: alpine/k8s:1.30.2 + script: | + # Check if the instance profile exists before attempting to delete + if aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" >/dev/null 2>&1; then + echo "Found instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." + + # Check if the role is attached to the instance profile and remove it + ATTACHED_ROLES=$(aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --query 'InstanceProfile.Roles[?RoleName==`KarpenterNodeRole-$(params.cluster-name)`].RoleName' --output text) + if [ -n "$ATTACHED_ROLES" ]; then + echo "Removing role KarpenterNodeRole-$(params.cluster-name) from instance profile..." + aws iam remove-role-from-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" + echo "Role KarpenterNodeRole-$(params.cluster-name) removed from instance profile successfully." + else + echo "Role KarpenterNodeRole-$(params.cluster-name) is not attached to instance profile. Skipping role removal..." + fi + + echo "Deleting instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." + aws iam delete-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" + echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) deleted successfully." + else + echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) does not exist. Skipping deletion..." + fi + - name: delete-karpenter-controller-role + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter Controller Role Teardown Task" + echo "===============================================" + + ROLE_NAME="KarpenterControllerRole-$(params.cluster-name)" + POLICY_NAME="KarpenterControllerPolicy-$(params.cluster-name)" + + # Check if the IAM role exists before attempting to delete + echo "" + echo "[INFO] Checking if IAM role $ROLE_NAME exists..." + if aws iam get-role --role-name "$ROLE_NAME" >/dev/null 2>&1; then + echo "[INFO] IAM role $ROLE_NAME found. Proceeding with cleanup..." + + # First, remove any attached inline policies + echo "" + echo "[INFO] Checking for attached inline policies..." + if aws iam get-role-policy --role-name "$ROLE_NAME" --policy-name "$POLICY_NAME" >/dev/null 2>&1; then + echo "[INFO] Removing inline policy $POLICY_NAME from role $ROLE_NAME..." + aws iam delete-role-policy --role-name "$ROLE_NAME" --policy-name "$POLICY_NAME" + echo "[SUCCESS] Successfully removed inline policy $POLICY_NAME" + else + echo "[INFO] No inline policy $POLICY_NAME found on role $ROLE_NAME" + fi + + # List and detach any managed policies (if any exist) + echo "" + echo "[INFO] Checking for attached managed policies..." + ATTACHED_POLICIES=$(aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query 'AttachedPolicies[].PolicyArn' --output text) + + if [ -n "$ATTACHED_POLICIES" ] && [ "$ATTACHED_POLICIES" != "None" ]; then + echo "[INFO] Found attached managed policies. Detaching them..." 
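+ # iam delete-role (further below) is rejected while managed policies, inline policies, or
+ # instance profile memberships remain, hence this detach pass. If deletion still failed, a
+ # quick check for lingering instance profile membership would be (sketch):
+ # aws iam list-instance-profiles-for-role --role-name "$ROLE_NAME" \
+ # --query 'InstanceProfiles[].InstanceProfileName' --output text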
+ for policy_arn in $ATTACHED_POLICIES; do + echo "[INFO] Detaching managed policy: $policy_arn" + aws iam detach-role-policy --role-name "$ROLE_NAME" --policy-arn "$policy_arn" + echo "[SUCCESS] Successfully detached policy: $policy_arn" + done + else + echo "[INFO] No managed policies attached to role $ROLE_NAME" + fi + + # Now delete the role + echo "" + echo "[INFO] Deleting IAM role $ROLE_NAME..." + aws iam delete-role --role-name "$ROLE_NAME" + echo "[SUCCESS] IAM role $ROLE_NAME deleted successfully." + + else + echo "[INFO] IAM role $ROLE_NAME does not exist. Skipping deletion..." + fi + + echo "" + echo "===============================================" + echo "Karpenter Controller Role Teardown Completed" + echo "===============================================" + - name: delete-stacks + image: alpine/k8s:1.30.2 + script: | + #!/bin/bash + set -e + + echo "$(date): Starting CloudFormation stack deletion process..." + + # Define the stacks to delete in order + STACKS=( + "$(params.cluster-name)-node-role" + "$(params.cluster-name)-service-role" + "$(params.cluster-name)" + ) + + # Function to check if a stack exists and get its status + check_stack_status() { + local stack_name=$1 + aws cloudformation describe-stacks \ + --stack-name "$stack_name" \ + --region $(params.region) \ + --query 'Stacks[0].StackStatus' \ + --output text 2>/dev/null || echo "STACK_NOT_FOUND" + } + + # Function to delete a stack if it exists and is in a valid state + delete_stack_if_exists() { + local stack_name=$1 + echo "$(date): Processing stack: $stack_name" + + local stack_status=$(check_stack_status "$stack_name") + echo "$(date): Stack $stack_name status: $stack_status" + + case "$stack_status" in + "STACK_NOT_FOUND") + echo "$(date): Stack $stack_name does not exist. Skipping..." + return 0 + ;; + "CREATE_COMPLETE"|"UPDATE_COMPLETE"|"UPDATE_ROLLBACK_COMPLETE"|"ROLLBACK_COMPLETE") + echo "$(date): Stack $stack_name is in a valid state for deletion. Proceeding..." + ;; + "DELETE_IN_PROGRESS") + echo "$(date): Stack $stack_name is already being deleted. Waiting for completion..." + aws cloudformation wait stack-delete-complete \ + --stack-name "$stack_name" \ + --region $(params.region) + echo "$(date): Stack $stack_name deletion completed." + return 0 + ;; + "DELETE_COMPLETE") + echo "$(date): Stack $stack_name is already deleted. Skipping..." + return 0 + ;; + *) + echo "$(date): Stack $stack_name is in state $stack_status, which is not valid for deletion. Skipping..." + return 0 + ;; + esac + + # Delete the stack + echo "$(date): Initiating deletion of stack $stack_name..." + aws cloudformation delete-stack \ + --stack-name "$stack_name" \ + --region $(params.region) + + # Wait for deletion to complete + echo "$(date): Waiting for stack $stack_name deletion to complete..." + aws cloudformation wait stack-delete-complete \ + --stack-name "$stack_name" \ + --region $(params.region) + + echo "$(date): Stack $stack_name deleted successfully." + } + + # Delete each stack + for stack_name in "${STACKS[@]}"; do + delete_stack_if_exists "$stack_name" + echo "" + done + + echo "$(date): CloudFormation stack deletion process completed successfully." 
+ - name: awscli-delete-asg + image: alpine/k8s:1.23.7 + script: | + #!/bin/bash + set -e + aws sts get-caller-identity + # Stack ids for self managed node groups will have pattern -nodes- + STACK_IDS=$(aws cloudformation describe-stacks \ + --region $(params.region) \ + --query 'Stacks[?contains(StackName, `'$(params.cluster-name)'-nodes-`)].StackName' \ + --output text) + + if [ -z "$STACK_IDS" ]; then + echo "No stacks found matching pattern: $(params.cluster-name)-nodes-" + exit 0 + fi + + echo "Found stacks to delete: $STACK_IDS" + # Delete each stack and wait for completion + for stack_name in $STACK_IDS; do + echo "Deleting stack: $stack_name" + + # Delete the stack + aws cloudformation delete-stack \ + --region $(params.region) \ + --stack-name "$stack_name" + + echo "Waiting for stack deletion to complete..." + + # Wait for deletion to complete + aws cloudformation wait stack-delete-complete \ + --region $(params.region) \ + --stack-name "$stack_name" + + echo "Stack $stack_name deleted successfully!" + done + + echo "All matching stacks have been deleted!" + - name: teardown-eks-role-stack + image: alpine/k8s:1.30.2 + script: | + aws cloudformation delete-stack --stack-name $(params.service-role-stack-name) + aws cloudformation delete-stack --stack-name $(params.launch-template-stack-name) + # wait for the launch-template stack to be completely deleted to avoid race-conditions. + echo "waiting for launch-template stack deletion..." + aws cloudformation wait stack-delete-complete --stack-name $(params.launch-template-stack-name) + STACK_STATUS=$(aws cloudformation describe-stacks --stack-name $(params.node-role-stack-name) --query 'Stacks[0].StackStatus' --output text || echo "STACK_NOT_FOUND") + echo $STACK_STATUS + if [ "$STACK_STATUS" == "DELETE_FAILED" ]; then + echo "Stack is in DELETE_FAILED state, using FORCE_DELETE_STACK" + aws cloudformation delete-stack --stack-name $(params.node-role-stack-name) --deletion-mode FORCE_DELETE_STACK + else + echo "Normal stack deletion" + aws cloudformation delete-stack --stack-name $(params.node-role-stack-name) + fi + - name: send-slack-notification + image: alpine/k8s:1.23.7 + script: | + if [ -n "$(params.slack-hook)" ]; then + curl -H "Content-type: application/json" --data '{"Message": "$(params.slack-message)"}' -X POST $(params.slack-hook) + fi diff --git a/tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml b/tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml new file mode 100644 index 00000000..e093f423 --- /dev/null +++ b/tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml @@ -0,0 +1,56 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: kubectl-get-karpenter-logs + namespace: scalability +spec: + description: "Watch logs from both karpenter pods continually until they are deleted, writing logs to stdout" + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + description: AWS region where the cluster is located + default: us-west-2 + - name: namespace + description: Namespace where karpenter is installed + default: karpenter + steps: + - name: get-karpenter-logs + image: alpine/k8s:1.30.2 + script: | + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) --region $(params.aws-region) + + echo "Finding karpenter pods to watch logs..." 
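+ # Pod discovery below assumes the chart's standard app.kubernetes.io/name=karpenter label.
+ # An equivalent ad-hoc one-liner (sketch) would be:
+ # kubectl logs -n $(params.namespace) -l app.kubernetes.io/name=karpenter -f --prefix --max-log-requests=10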
+ + # Get all karpenter pods + karpenter_pods=$(kubectl get pods -n $(params.namespace) -l app.kubernetes.io/name=karpenter -o jsonpath='{.items[*].metadata.name}') + + if [ -z "$karpenter_pods" ]; then + echo "No karpenter pods found in namespace $(params.namespace)" + echo "Checking if namespace exists..." + kubectl get namespace $(params.namespace) || echo "Namespace $(params.namespace) does not exist" + echo "Listing all pods in karpenter namespace (if it exists)..." + kubectl get pods -n $(params.namespace) || echo "Could not list pods in namespace $(params.namespace)" + exit 1 + fi + + echo "Found karpenter pods: $karpenter_pods" + + # Start watching logs for each pod in background + for pod in $karpenter_pods; do + echo "==========================================" + echo "Starting to watch logs for pod: $pod" + echo "==========================================" + + # Follow logs continuously - will exit when pod is deleted + kubectl logs "$pod" -n $(params.namespace) -f & + done + + # Wait for all background log processes + # This will continue until all kubectl logs processes exit (when pods are deleted) + wait + + echo "All karpenter pods have been deleted - log watching completed"
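
A hedged usage sketch (the cluster name and endpoint below are illustrative placeholders, not values defined by this patch): once the Task above is applied in the scalability namespace, it can be exercised ad hoc with the Tekton CLI, for example:

    tkn task start kubectl-get-karpenter-logs -n scalability \
      -p cluster-name=my-cluster \
      -p endpoint=https://eks.us-west-2.amazonaws.com \
      -p aws-region=us-west-2 \
      --showlog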