diff --git a/tests/assets/karpenter/controller-role-policy-document.json b/tests/assets/karpenter/controller-role-policy-document.json new file mode 100644 index 00000000..18f11f94 --- /dev/null +++ b/tests/assets/karpenter/controller-role-policy-document.json @@ -0,0 +1,112 @@ +{ + "Statement": [ + { + "Action": [ + "ssm:GetParameter", + "ec2:DescribeImages", + "ec2:RunInstances", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeInstances", + "ec2:DescribeInstanceTypes", + "ec2:DescribeInstanceTypeOfferings", + "ec2:DeleteLaunchTemplate", + "ec2:CreateTags", + "ec2:CreateLaunchTemplate", + "ec2:CreateFleet", + "ec2:DescribeSpotPriceHistory", + "pricing:GetProducts" + ], + "Effect": "Allow", + "Resource": "*", + "Sid": "Karpenter" + }, + { + "Action": "ec2:TerminateInstances", + "Condition": { + "StringLike": { + "ec2:ResourceTag/karpenter.sh/nodepool": "*" + } + }, + "Effect": "Allow", + "Resource": "*", + "Sid": "ConditionalEC2Termination" + }, + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}", + "Sid": "PassNodeIAMRole" + }, + { + "Effect": "Allow", + "Action": "eks:DescribeCluster", + "Resource": "arn:${AWS_PARTITION}:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", + "Sid": "EKSClusterEndpointLookup" + }, + { + "Sid": "AllowScopedInstanceProfileCreationActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:CreateInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", + "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" + }, + "StringLike": { + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowScopedInstanceProfileTagActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:TagInstanceProfile" + ], + "Condition": { + "StringEquals": { + 
"aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", + "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}", + "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", + "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowScopedInstanceProfileActions", + "Effect": "Allow", + "Resource": "*", + "Action": [ + "iam:AddRoleToInstanceProfile", + "iam:RemoveRoleFromInstanceProfile", + "iam:DeleteInstanceProfile" + ], + "Condition": { + "StringEquals": { + "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", + "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" + } + } + }, + { + "Sid": "AllowInstanceProfileReadActions", + "Effect": "Allow", + "Resource": "*", + "Action": "iam:GetInstanceProfile" + } + ], + "Version": "2012-10-17" +} \ No newline at end of file diff --git a/tests/assets/karpenter/controller-role-trust-policy-document.json b/tests/assets/karpenter/controller-role-trust-policy-document.json new file mode 100644 index 00000000..18b6e23b --- /dev/null +++ b/tests/assets/karpenter/controller-role-trust-policy-document.json @@ -0,0 +1,18 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:oidc-provider/${OIDC_ENDPOINT}" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "${OIDC_ENDPOINT}:aud": "sts.amazonaws.com", + "${OIDC_ENDPOINT}:sub": "system:serviceaccount:karpenter:karpenter" + } + } + } + ] +} \ No newline at end of file diff --git a/tests/assets/karpenter/node-role-policy-document.json b/tests/assets/karpenter/node-role-policy-document.json new file mode 100644 index 00000000..19859682 --- /dev/null +++ 
b/tests/assets/karpenter/node-role-policy-document.json @@ -0,0 +1,12 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "ec2.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} \ No newline at end of file diff --git a/tests/assets/karpenter/nodeclass.yaml b/tests/assets/karpenter/nodeclass.yaml new file mode 100644 index 00000000..d11a1cf9 --- /dev/null +++ b/tests/assets/karpenter/nodeclass.yaml @@ -0,0 +1,60 @@ +apiVersion: karpenter.k8s.aws/v1 +kind: EC2NodeClass +metadata: + name: default +spec: + amiFamily: Custom + instanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" + amiSelectorTerms: + - alias: "al2023@${ALIAS_VERSION}" + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: "${CLUSTER_NAME}" + - tags: + aws:cloudformation:stack-name: "${CLUSTER_NAME}" + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: "${CLUSTER_NAME}" + - tags: + aws:cloudformation:stack-name: "${CLUSTER_NAME}" + - tags: + kubernetes.io/cluster/${CLUSTER_NAME}: owned + kubelet: + maxPods: 110 + systemReserved: + cpu: 100m + memory: 100Mi + ephemeral-storage: 1Gi + kubeReserved: + cpu: 100m + memory: 100Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 5% + nodefs.available: 10% + nodefs.inodesFree: 10% + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: application/node.eks.aws + + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + cluster: + name: ${CLUSTER_NAME} + apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint + certificateAuthority: ${CLUSTER_CA} + cidr: "172.20.0.0/16" + kubelet: + config: + nodeStatusReportFrequency: "60m" + nodeLeaseDurationSeconds: 60 + maxPods: 110 + clusterDNS: ["172.20.0.10"] + flags: + - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool + - --register-with-taints=karpenter.sh/unregistered:NoExecute + --BOUNDARY-- \ No 
newline at end of file diff --git a/tests/assets/karpenter/nodepool.yaml b/tests/assets/karpenter/nodepool.yaml new file mode 100644 index 00000000..22c5229b --- /dev/null +++ b/tests/assets/karpenter/nodepool.yaml @@ -0,0 +1,46 @@ +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: ${CLUSTER_NAME}-${AZ} +spec: + disruption: + budgets: + - nodes: 10% + consolidateAfter: 0s + consolidationPolicy: WhenEmptyOrUnderutilized + replicas: 0 + template: + spec: + expireAfter: 720h + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: default + requirements: + - key: topology.kubernetes.io/zone + operator: In + values: + - ${AZ} + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: kubernetes.io/os + operator: In + values: + - linux + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - key: node.kubernetes.io/instance-category + operator: In + values: + - c + - m + - r + - t + - key: karpenter.k8s.aws/instance-size + operator: In + values: + - medium \ No newline at end of file diff --git a/tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml b/tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml new file mode 100644 index 00000000..bbaec91b --- /dev/null +++ b/tests/tekton-resources/pipelines/eks/karpenter-ultra.yaml @@ -0,0 +1,382 @@ +kind: Pipeline +apiVersion: tekton.dev/v1 +metadata: + name: derekff-karpenter-testing + namespace: scalability +spec: + params: + - name: cluster-name + type: string + - default: "" + name: endpoint + type: string + - default: "5000" + name: desired-nodes + type: string + - default: "30" + name: pods-per-node + type: string + - default: "100" + name: nodes-per-namespace + type: string + - default: "50" + name: cl2-load-test-throughput + type: string + - default: kit-eks-scalability/kit-eks-5k/etcd/$(date +%s) + name: results-bucket + type: string + - default: "" + name: slack-hook + type: string + - default: 'You can monitor here - 
https://experimental.scalability.eks.aws.dev/#/namespaces/scalability/pipelineruns/ + ;5k node ' + name: slack-message + type: string + - default: "" + name: amp-workspace-id + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/amazon-eks-vpc.json + name: vpc-cfn-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_group_launch_template.json + name: ng-cfn-url + type: string + - name: kubernetes-version + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_service_role.json + name: service-role-cfn-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_role.json + name: node-role-cfn-url + type: string + - name: manifest-id + type: string + - default: "" + name: eksadm-s3-path + type: string + - default: 1.8.0 + name: karpenter-version + type: string + - default: karpenter + name: karpenter-namespace + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/karpenter/node-role-policy-document.json + name: karpenter-node-role-policy-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/karpenter/controller-role-policy-document.json + name: karpenter-controller-role-policy-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/karpenter/controller-role-trust-policy-document.json + name: karpenter-controller-role-trust-policy-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/nodeclass.yaml + name: karpenter-ec2nodeclass-url + type: string + - default: 
https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/nodepool.yaml + name: karpenter-nodepool-url + type: string + - default: "" + name: karpenter-ecr-repo + type: string + - default: "" + name: aws-account-id + type: string + tasks: + - name: awscli-vpc-create + params: + - name: stack-name + value: $(params.cluster-name) + - name: vpc-cfn-url + value: "$(params.vpc-cfn-url)" + taskRef: + kind: Task + name: awscli-vpc-create + - name: create-cluster-service-role + params: + - name: stack-name + value: $(params.cluster-name)-service-role + - name: role-cfn-url + value: $(params.service-role-cfn-url) + - name: role-name + value: "$(params.cluster-name)-service-role" + taskRef: + kind: Task + name: awscli-role-create + - name: create-cluster-node-role + params: + - name: stack-name + value: $(params.cluster-name)-node-role + - name: role-cfn-url + value: $(params.node-role-cfn-url) + - name: role-name + value: "$(params.cluster-name)-node-role" + taskRef: + kind: Task + name: awscli-role-create + - name: create-eks-cluster + params: + - name: cluster-name + value: $(params.cluster-name) + - name: service-role-name + value: $(params.cluster-name)-service-role + - name: endpoint + value: $(params.endpoint) + - name: vpc-stack-name + value: $(params.cluster-name) + - name: manifest-id + value: $(params.manifest-id) + - name: eksadm-s3-path + value: $(params.eksadm-s3-path) + - name: kubernetes-version + value: $(params.kubernetes-version) + retries: 3 + runAfter: + - create-cluster-node-role + - create-cluster-service-role + - awscli-vpc-create + taskRef: + kind: Task + name: awscli-eks-cluster-create-with-vpc-stack + workspaces: + - name: config + workspace: config + - name: create-karpenter-controller-role + params: + - name: cluster-name + value: $(params.cluster-name) + - name: aws-account-id + value: $(params.aws-account-id) + - name: endpoint + value: $(params.endpoint) + - name: 
karpenter-controller-role-policy-url + value: $(params.karpenter-controller-role-policy-url) + - name: karpenter-controller-role-trust-policy-url + value: $(params.karpenter-controller-role-trust-policy-url) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-controller-role + - name: create-karpenter-mng + params: + - name: cluster-name + value: $(params.cluster-name) + - name: aws-account-id + value: $(params.aws-account-id) + - name: endpoint + value: $(params.endpoint) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-mng + - name: create-karpenter-cfn + params: + - name: cluster-name + value: $(params.cluster-name) + - name: karpenter-version + value: $(params.karpenter-version) + - name: endpoint + value: $(params.endpoint) + - name: account-id + value: $(params.aws-account-id) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-karpenter-cfn-stack + - name: helm-install-karpenter + params: + - name: cluster-name + value: $(params.cluster-name) + - name: karpenter-version + value: $(params.karpenter-version) + - name: aws-account-id + value: $(params.aws-account-id) + - name: karpenter-ecr-repo + value: $(params.karpenter-ecr-repo) + - name: endpoint + value: $(params.endpoint) + runAfter: + - create-karpenter-cfn + - create-karpenter-mng + - create-karpenter-controller-role + - awscli-instance-profile + taskRef: + kind: Task + name: helm-karpenter-install + - name: get-karp-logs + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + runAfter: + - helm-install-karpenter + taskRef: + kind: Task + name: kubectl-get-karpenter-logs + - name: awscli-instance-profile + params: + - name: cluster-name + value: $(params.cluster-name) + runAfter: + - create-karpenter-cfn + - create-karpenter-mng + taskRef: + kind: Task + name: awscli-instanceprofiles + - name: create-nodeclass + params: + - name: cluster-name + value: $(params.cluster-name) + - 
name: endpoint + value: $(params.endpoint) + - name: karpenter-nodeclass-url + value: $(params.karpenter-ec2nodeclass-url) + runAfter: + - helm-install-karpenter + taskRef: + kind: Task + name: create-ec2nodeclass + - name: create-nodepools + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: karpenter-nodepool-url + value: $(params.karpenter-nodepool-url) + runAfter: + - helm-install-karpenter + taskRef: + kind: Task + name: create-nodepool + - name: scale-nodepools + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 100 + - name: nodepool + value: $(params.cluster-name) + runAfter: + - create-nodepools + taskRef: + kind: Task + name: scale-nodepool + - name: wait-for-scale + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 100 + - name: nodepool + value: $(params.cluster-name) + runAfter: + - scale-nodepools + taskRef: + kind: Task + name: nodepool-replicas-wait + - name: drift + params: + - name: nodepool + value: $(params.cluster-name) + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + runAfter: + - wait-for-scale + taskRef: + kind: Task + name: drift-nodepool + - name: wait-for-drift + params: + - name: nodepool + value: $(params.cluster-name) + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: value + value: "True" + - name: condition + value: Drifted + - name: presence + value: false + runAfter: + - drift + taskRef: + kind: Task + name: nodepool-condition-wait + - name: scale-down + params: + - name: cluster-name + value: $(params.cluster-name) + - name: nodepool + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 0 + runAfter: + - wait-for-drift + 
taskRef: + kind: Task + name: scale-nodepool + - name: wait-for-scale-down + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: replicas + value: 0 + - name: nodepool + value: $(params.cluster-name) + runAfter: + - scale-down + taskRef: + kind: Task + name: nodepool-replicas-wait + - name: uninstall-karpenter + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + runAfter: + - wait-for-scale-down + taskRef: + kind: Task + name: helm-karpenter-uninstall + finally: + - name: teardown + retries: 10 # To deal with throttling during deletion + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: slack-hook + value: $(params.slack-hook) + - name: slack-message + value: "$(params.slack-message) job completed" + - name: service-role-stack-name + value: $(params.cluster-name)-service-role + - name: node-role-stack-name + value: $(params.cluster-name)-node-role + - name: launch-template-stack-name + value: $(params.cluster-name)-launch-template + taskRef: + kind: Task + name: awscli-eks-karpenter-cluster-teardown + workspaces: + - name: source + - name: results + - name: config \ No newline at end of file diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml new file mode 100644 index 00000000..e7012c90 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-drift.yaml @@ -0,0 +1,86 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: drift-nodepool + namespace: scalability +spec: + description: | + Triggers Karpenter nodepool drift by modifying nodepool template labels. + This task connects to an EKS cluster, captures the current nodepool state, + applies a label change to force node replacement, and verifies the drift operation. 
+ The drift process causes Karpenter to replace existing nodes with new ones + that match the updated nodepool template specification. + DOES NOT CHECK TO SEE IF ALL NODES SUCCESSFULLY DRIFT. Use kubectl-nodepool-condition-wait.yaml for that + params: + - name: nodepool + description: Name of the Karpenter nodepool to drift (must exist in cluster) + - name: cluster-name + description: The name of the EKS cluster containing the target nodepool + - name: endpoint + description: EKS cluster endpoint URL for kubectl configuration + - name: aws-region + description: AWS region where the cluster is located (used for AZ discovery) + default: us-west-2 + - name: label-key + description: Label key to add/modify in the nodepool template + default: myLabel + - name: label-val + description: Label value to set for the specified label key + default: myValue + steps: + - name: drift-nodepool + image: alpine/k8s:1.30.2 + script: | + echo "Starting Nodepool Drift Operation" + echo "=================================" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "[SUCCESS] Successfully configured kubectl" + echo "" + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Capture cluster state before drift operation + echo "[INFO] Capturing cluster state before nodepool drift..." 
+ echo "-----------------------------------------------------" + + echo "[INFO] Current cluster nodes:" + kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool) -o wide --show-labels + echo "" + + echo "[INFO] Current nodepool configuration:" + kubectl get nodepool -o yaml + echo "" + + echo "$AZ_LIST" | while read -r az; do + export AZ=$az + # Apply the drift-inducing label change to the nodepool + echo "[INFO] Applying label change to trigger nodepool drift..." + echo "[INFO] Patching nodepool $(params.nodepool)-${az} with label $(params.label-key)=$(params.label-val)" + + kubectl patch nodepool $(params.nodepool)-${az} --type='merge' --patch='{"spec": {"template": {"metadata": {"labels": {"$(params.label-key)": "$(params.label-val)"}}}}}' + + echo "[SUCCESS] Successfully patched nodepool $(params.nodepool)-${az}" + echo "" + + # Verify the drift operation was applied + echo "[INFO] Verifying nodepool drift configuration..." + echo "===============================================" + + echo "[INFO] Updated nodepool configuration:" + kubectl get nodepool $(params.nodepool)-${az} -o yaml + echo "" + done + + echo "===============================================" + echo "[SUCCESS] Nodepool drift operation completed" + echo "[INFO] Karpenter will now begin replacing nodes to match the new template" + echo "===============================================" diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml new file mode 100644 index 00000000..cc4b0b37 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-condition-wait.yaml @@ -0,0 +1,165 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: nodepool-condition-wait + namespace: scalability +spec: + description: "waits for nodeclaims in a given nodepool to have or not have the specified condition with the given value based on 
presence parameter" + results: + - name: datapoint + description: Stores the result that can be consumed by other tasks (1 for success, 0 for failure) + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + default: us-west-2 + - name: initial-delay + default: 1m + - name: condition + description: condition to check (e.g., Ready, MemoryPressure, DiskPressure) + - name: presence + description: whether to check for the presence or absence of the condition with the value + default: true + - name: value + description: value of the condition to validate (e.g., True, False) + - name: nodepool + description: nodepool to check nodeclaim in. + - name: check-interval + description: interval in seconds between checks + default: "60" + - name: timeout + description: total time to wait before timing out in seconds + default: 3000 + steps: + - name: wait-for-condition + image: alpine/k8s:1.30.2 + script: | + sleep $(params.initial-delay) + CHECK_INTERVAL=$(params.check-interval) + TIMEOUT=$(params.timeout) + START_TIME=$(date +%s) + + while true; do + # Check if timeout has been reached + echo "in true" + CURRENT_TIME=$(date +%s) + echo "after current" + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + echo "after elapsed" + if [ $ELAPSED_TIME -ge $TIMEOUT ]; then + echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Nodepools did not complete within the specified timeout." + echo "0" | tee $(results.datapoint.path) + exit 1 + fi + echo "updating kubeconfig" + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + if [ "$(params.presence)" = "true" ]; then + echo "$(date): Checking that ALL nodeclaims in nodepool $(params.nodepool) have condition $(params.condition)=$(params.value)..." + else + echo "$(date): Checking that NO nodeclaims in nodepool $(params.nodepool) have condition $(params.condition)=$(params.value)..." 
+ fi + + echo "getting AZs" + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Track if all availability zones have nodeclaims meeting the condition + ALL_AZ_READY=true + TOTAL_NODECLAIMS_ALL_AZ=0 + TOTAL_READY_NODECLAIMS_ALL_AZ=0 + + # Check each availability zone + for az in $AZ_LIST; do + # Get all nodeclaims in the AZ-specific nodepool + all_nodeclaims=$(kubectl get nodeclaims -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r '.items[].metadata.name') + + if [ -z "$all_nodeclaims" ]; then + echo "$(date): AZ ${az} - No nodeclaims found in nodepool $(params.nodepool)-${az}" + continue + fi + + # Count total nodeclaims in this AZ + total_nodeclaims=$(echo "$all_nodeclaims" | wc -l) + TOTAL_NODECLAIMS_ALL_AZ=$((TOTAL_NODECLAIMS_ALL_AZ + total_nodeclaims)) + + # Get nodeclaims that have the desired condition with the specified value + nodeclaims_with_condition=$(kubectl get nodeclaims -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r --arg type $(params.condition) --arg status $(params.value) ' + .items[] | + select(.status.conditions[] | select(.type == $type and .status == $status)) | + .metadata.name + ') + + # Count nodeclaims with the desired condition in this AZ + if [ -z "$nodeclaims_with_condition" ]; then + nodeclaims_with_condition_count=0 + else + nodeclaims_with_condition_count=$(echo "$nodeclaims_with_condition" | wc -l) + fi + + TOTAL_READY_NODECLAIMS_ALL_AZ=$((TOTAL_READY_NODECLAIMS_ALL_AZ + nodeclaims_with_condition_count)) + + echo "$(date): AZ ${az} - Nodeclaims with $(params.condition)=$(params.value): $nodeclaims_with_condition_count/$total_nodeclaims" + + if [ "$(params.presence)" = "true" ]; then + # presence=true: Check if all nodeclaims have 
the condition + if [ "$nodeclaims_with_condition_count" -ne "$total_nodeclaims" ]; then + echo "$(date): AZ ${az} - Not all nodeclaims have $(params.condition)=$(params.value)" + ALL_AZ_READY=false + else + echo "$(date): AZ ${az} - Success! All nodeclaims have $(params.condition)=$(params.value)" + fi + else + # presence=false: Check if no nodeclaims have the condition + if [ "$nodeclaims_with_condition_count" -ne 0 ]; then + echo "$(date): AZ ${az} - Some nodeclaims still have $(params.condition)=$(params.value)" + ALL_AZ_READY=false + else + echo "$(date): AZ ${az} - Success! No nodeclaims have $(params.condition)=$(params.value)" + fi + fi + done + + echo "$(date): Overall status - Nodeclaims with $(params.condition)=$(params.value): $TOTAL_READY_NODECLAIMS_ALL_AZ/$TOTAL_NODECLAIMS_ALL_AZ across all AZs" + + # Check success condition based on presence parameter + if [ "$(params.presence)" = "true" ]; then + # presence=true: Exit if all availability zones have all nodeclaims meeting the condition + if [ "$ALL_AZ_READY" = "true" ] && [ "$TOTAL_NODECLAIMS_ALL_AZ" -gt 0 ]; then + echo "$(date): Success! All nodeclaims across all availability zones have $(params.condition)=$(params.value)" + echo "1" | tee $(results.datapoint.path) + exit 0 + fi + + if [ "$TOTAL_NODECLAIMS_ALL_AZ" -eq 0 ]; then + echo "$(date): No nodeclaims found in any availability zone for nodepool $(params.nodepool)" + else + echo "$(date): Waiting for remaining nodeclaims to achieve $(params.condition)=$(params.value)..." + fi + else + # presence=false: Exit if no nodeclaims have the condition with the specified value + if [ "$TOTAL_READY_NODECLAIMS_ALL_AZ" -eq 0 ] && [ "$TOTAL_NODECLAIMS_ALL_AZ" -gt 0 ]; then + echo "$(date): Success! 
No nodeclaims across all availability zones have $(params.condition)=$(params.value)" + echo "1" | tee $(results.datapoint.path) + exit 0 + fi + + if [ "$TOTAL_NODECLAIMS_ALL_AZ" -eq 0 ]; then + echo "$(date): No nodeclaims found in any availability zone for nodepool $(params.nodepool)" + else + echo "$(date): Waiting for nodeclaims to no longer have $(params.condition)=$(params.value)..." + fi + fi + + echo "$(date): Waiting ${CHECK_INTERVAL} seconds before next check..." + sleep $CHECK_INTERVAL + done + + exit 1 diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml new file mode 100644 index 00000000..3c434fa0 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-nodepool-replicas-wait.yaml @@ -0,0 +1,97 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: nodepool-replicas-wait + namespace: scalability +spec: + description: "waits for the number of ready nodes in a nodepool to equal the specified replicas count" + results: + - name: datapoint + description: Stores the result that can be consumed by other tasks (1 for success, 0 for failure) + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + default: us-west-2 + - name: initial-delay + default: 1m + - name: replicas + description: number of ready replicas in the nodepool to wait for + - name: nodepool + description: nodepool to check nodes in. 
+ - name: check-interval + description: interval in seconds between checks + default: "60" + - name: timeout + description: total time to wait before timing out + default: 3000 + steps: + - name: wait-for-replicas + image: alpine/k8s:1.30.2 + script: | + sleep $(params.initial-delay) + CHECK_INTERVAL=$(params.check-interval) + TARGET_REPLICAS=$(params.replicas) + TIMEOUT=$(params.timeout) + START_TIME=$(date +%s) + + while true; do + # Check if timeout has been reached + CURRENT_TIME=$(date +%s) + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + + if [ $ELAPSED_TIME -ge $TIMEOUT ]; then + echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Nodepools did not complete within the specified timeout." + echo "0" | tee $(results.datapoint.path) + exit 1 + fi + + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "$(date): Checking ready nodes in nodepool $(params.nodepool)..." + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Track if all availability zones have reached target replicas + ALL_AZ_READY=true + + # Check each availability zone + for az in $AZ_LIST; do + ready_nodes_count=$(kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o json | jq -r ' + [.items[] | + select(.status.conditions[] | select(.type == "Ready" and .status == "True"))] | + length + ') + + echo "$(date): AZ ${az} - Ready nodes: $ready_nodes_count, Target replicas: $TARGET_REPLICAS" + + if [ "$ready_nodes_count" -ne "$TARGET_REPLICAS" ]; then + echo "$(date): AZ ${az} - Ready nodes count ($ready_nodes_count) does not match target replicas ($TARGET_REPLICAS)" + ALL_AZ_READY=false + else + echo "$(date): AZ ${az} - Success! 
Ready nodes count matches target replicas ($TARGET_REPLICAS)" + fi + done + + # Exit if all availability zones have reached target replicas + if [ "$ALL_AZ_READY" = "true" ]; then + echo "$(date): All availability zones have reached target replica count. Exiting successfully." + echo "1" | tee $(results.datapoint.path) + exit 0 + fi + + echo "$(date): Not all availability zones have reached target replicas. Waiting ${CHECK_INTERVAL} seconds before next check..." + + sleep $CHECK_INTERVAL + done + + + exit 1 diff --git a/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml new file mode 100644 index 00000000..08b621e8 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/karpenter/kubectl-scale.yaml @@ -0,0 +1,93 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: scale-nodepool + namespace: scalability +spec: + description: | + Scales a Karpenter nodepool by modifying the number of replicas. + This task configures kubectl access to the EKS cluster, captures the current + cluster state for monitoring purposes, performs the scaling operation, + and verifies the scaling request was applied successfully. + DOES NOT CHECK TO SEE IF ALL NODES HAVE GONE READY. 
Use kubectl-nodepool-replicas-wait.yaml for that + params: + - name: replicas + description: Number of replicas to scale the nodepool to (target replica count) + - name: nodepool + description: Name of the Karpenter nodepool resource to scale + - name: cluster-name + description: The name of the EKS cluster containing the nodepool + - name: endpoint + description: EKS cluster endpoint URL for kubectl configuration + - name: aws-region + description: AWS region where the cluster is located (used for AZ discovery) + default: us-west-2 + steps: + - name: scale-nodepool + image: alpine/k8s:1.30.2 + script: | + echo "Starting Nodepool Scaling Task" + echo "==============================" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "[SUCCESS] Successfully configured kubectl" + echo "" + + # Discover availability zones and scale nodepools + echo "" + echo "[INFO] Discovering availability zones in region: $(params.aws-region)" + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Capture current cluster state before scaling operation + echo "[INFO] Capturing cluster state before scaling operation..." 
+ echo "--------------------------------------------------------" + + echo "[INFO] Current nodepool status:" + kubectl get nodepools -o wide + echo "" + + # Process each availability zone + NODEPOOL_COUNT=0 + echo "$AZ_LIST" | while read -r az; do + export AZ=$az + + echo "[INFO] Current nodepool $(params.nodepool) detailed status:" + kubectl get nodepool $(params.nodepool)-${az} -o yaml + echo "" + + echo "[INFO] Current nodepool nodes:" + kubectl get nodes -l karpenter.sh/nodepool=$(params.nodepool)-${az} -o wide + echo "" + + # Perform the scaling operation + echo "[INFO] Scaling nodepool $(params.nodepool)-${az} to $(params.replicas) replicas..." + kubectl scale nodepool $(params.nodepool)-${az} --replicas $(params.replicas) + echo "[SUCCESS] Scaling command executed successfully" + echo "" + + echo "[INFO] Updated nodepool $(params.nodepool) detailed status:" + kubectl get nodepool $(params.nodepool)-${az} -o yaml + echo "" + done + + + # Verify the scaling operation was applied + echo "[INFO] Verifying scaling operation results..." + echo "=============================================" + + echo "[INFO] Updated nodepool status:" + kubectl get nodepools -o wide + echo "" + + echo "=============================================" + echo "Nodepool Scaling Operation Begun" + echo "=============================================" \ No newline at end of file diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml new file mode 100644 index 00000000..1bb28d77 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-controller-role.yaml @@ -0,0 +1,142 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-controller-role + namespace: scalability +spec: + description: | + Creates the Karpenter Controller IAM Role with necessary permissions for managing EC2 instances. 
+ This task downloads trust and policy documents, configures OIDC integration, and creates/updates + the IAM role and policies required for Karpenter to function properly in the EKS cluster. + results: + - name: node-role-arn + description: Stores the controller role ARN created by the task + params: + - name: cluster-name + description: The name of the EKS cluster for which the controller role will be created + - name: endpoint + description: EKS cluster endpoint URL for API operations + - name: aws-region + description: AWS region where the cluster and IAM resources are located + default: us-west-2 + - name: aws-account-id + description: AWS account ID where the IAM role will be created + - name: aws-partition + description: AWS partition (aws, aws-cn, aws-us-gov) + default: aws + - name: karpenter-controller-role-trust-policy-url + description: URL of the trust policy document template for the controller role + default: https://raw.githubusercontent.com/DerekFrank/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/controller-role-trust-policy-document.json + - name: karpenter-controller-role-policy-url + description: URL of the IAM policy document template for the controller role + default: https://raw.githubusercontent.com/DerekFrank/kubernetes-iteration-toolkit/refs/heads/main/tests/assets/karpenter/controller-role-policy-document.json + workspaces: + - name: source + mountPath: /src/karpenter/ + steps: + - name: create-role + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter Controller Role Creation Task" + echo "==============================================" + + # Set up environment variables for template substitution + echo "" + echo "[INFO] Setting up environment variables..." 
+ export AWS_PARTITION=$(params.aws-partition) + export AWS_ACCOUNT_ID=$(params.aws-account-id) + export AWS_REGION=$(params.aws-region) + export CLUSTER_NAME=$(params.cluster-name) + + # Retrieve OIDC issuer information from EKS cluster + echo "" + echo "[INFO] Retrieving OIDC issuer information from EKS cluster..." + export RAW_OIDC_ENDPOINT="$(aws eks --endpoint $(params.endpoint) describe-cluster --name "$(params.cluster-name)" \ + --query "cluster.identity.oidc.issuer" --output text)" + + if [ -z "$RAW_OIDC_ENDPOINT" ]; then + echo "[ERROR] Failed to retrieve OIDC endpoint from cluster" + exit 1 + fi + + export OIDC_ID=$(aws eks --endpoint $(params.endpoint) describe-cluster --name $(params.cluster-name) --region $(params.aws-region) --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5) + export OIDC_ENDPOINT=$(echo ${RAW_OIDC_ENDPOINT#*//}) + + echo "[SUCCESS] Retrieved OIDC information:" + echo " - Raw OIDC Endpoint: $RAW_OIDC_ENDPOINT" + echo " - OIDC ID: $OIDC_ID" + echo " - OIDC Endpoint: $OIDC_ENDPOINT" + + # Download and process trust policy document + echo "" + echo "[INFO] Downloading trust policy document from: $(params.karpenter-controller-role-trust-policy-url)" + curl -fsSL $(params.karpenter-controller-role-trust-policy-url) -o $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json + + echo "[INFO] Original trust policy template:" + echo "----------------------------------------" + cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json | sed 's/^/ /' + echo "----------------------------------------" + + echo "[INFO] Processing trust policy template with environment variables..." 
+ envsubst < $(workspaces.source.path)karpenter-controller-role-trust-policy-url.json > $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + + echo "[INFO] Processed trust policy document:" + echo "----------------------------------------" + cat $(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json | sed 's/^/ /' + echo "----------------------------------------" + + # Create or verify IAM role existence + echo "" + echo "[INFO] Checking if IAM role KarpenterControllerRole-$(params.cluster-name) exists..." + if aws iam get-role --role-name "KarpenterControllerRole-$(params.cluster-name)" >/dev/null 2>&1; then + echo "[INFO] IAM role KarpenterControllerRole-$(params.cluster-name) already exists, skipping creation" + else + echo "[INFO] Creating IAM role KarpenterControllerRole-$(params.cluster-name)..." + aws iam create-role --role-name "KarpenterControllerRole-$(params.cluster-name)" \ + --assume-role-policy-document file://$(workspaces.source.path)karpenter-controller-role-trust-policy-url-modified.json + echo "[SUCCESS] Successfully created IAM role KarpenterControllerRole-$(params.cluster-name)" + fi + + # Download and process IAM policy document + echo "" + echo "[INFO] Downloading IAM policy document from: $(params.karpenter-controller-role-policy-url)" + curl -fsSL $(params.karpenter-controller-role-policy-url) -o $(workspaces.source.path)karpenter-controller-role-policy-url.json + + echo "[INFO] Processing IAM policy template with environment variables..." 
+ envsubst < $(workspaces.source.path)karpenter-controller-role-policy-url.json > $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json + + echo "[INFO] Processed IAM policy document:" + echo "----------------------------------------" + cat $(workspaces.source.path)karpenter-controller-role-policy-url-modified.json | sed 's/^/ /' + echo "----------------------------------------" + + # Create or update role policy + echo "" + echo "[INFO] Checking if role policy KarpenterControllerPolicy-$(params.cluster-name) exists..." + if aws iam get-role-policy --role-name "KarpenterControllerRole-$(params.cluster-name)" --policy-name "KarpenterControllerPolicy-$(params.cluster-name)" >/dev/null 2>&1; then + echo "[INFO] Role policy KarpenterControllerPolicy-$(params.cluster-name) already exists, updating..." + else + echo "[INFO] Creating role policy KarpenterControllerPolicy-$(params.cluster-name)..." + fi + + aws iam put-role-policy --role-name "KarpenterControllerRole-$(params.cluster-name)" \ + --policy-name "KarpenterControllerPolicy-$(params.cluster-name)" \ + --policy-document file://$(workspaces.source.path)karpenter-controller-role-policy-url-modified.json + echo "[SUCCESS] Successfully applied role policy KarpenterControllerPolicy-$(params.cluster-name)" + + # Verify the created resources + echo "" + echo "[INFO] Verifying created IAM resources..." 
+ echo "========================================" + + echo "[INFO] IAM Role details:" + aws iam get-role --role-name "KarpenterControllerRole-$(params.cluster-name)" --query 'Role.[RoleName,Arn,CreateDate]' --output table + + echo "" + echo "[INFO] Attached role policies:" + aws iam list-role-policies --role-name "KarpenterControllerRole-$(params.cluster-name)" --output table + + echo "" + echo "==============================================" + echo "Karpenter Controller Role Creation Completed" + echo "==============================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml new file mode 100644 index 00000000..ef169c30 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-instanceprofiles.yaml @@ -0,0 +1,31 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-instanceprofiles + namespace: scalability +spec: + description: | + Creates the karpenter instance profile + params: + - name: cluster-name + description: The name of the cluster + steps: + - name: create-role + image: alpine/k8s:1.30.2 + script: | + # Check if the instance profile already exists + if aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" >/dev/null 2>&1; then + echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) already exists. Skipping creation..." + else + echo "Creating instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." 
+ aws iam create-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" + fi + + # Check if the role is already added to the instance profile + EXISTING_ROLES=$(aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --query 'InstanceProfile.Roles[?RoleName==`KarpenterNodeRole-$(params.cluster-name)`].RoleName' --output text) + if [ -n "$EXISTING_ROLES" ]; then + echo "Role KarpenterNodeRole-$(params.cluster-name) is already attached to instance profile. Skipping..." + else + echo "Adding role KarpenterNodeRole-$(params.cluster-name) to instance profile..." + aws iam add-role-to-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" + fi diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml new file mode 100644 index 00000000..924f855a --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-karpenter-cfn-stack.yaml @@ -0,0 +1,97 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-karpenter-cfn-stack + namespace: scalability +spec: + description: | + Creates the karpenter instance roles and sqs interruption queue + params: + - name: cluster-name + description: The name of the cluster + - name: karpenter-version + description: Version of Karpenter to deploy + - name: endpoint + description: Endpoint to use with EKS + - name: region + default: us-west-2 + description: The region where the cluster is in. 
+ - name: account-id + description: The aws account the cluster is running in + workspaces: + - name: source + mountPath: /src/karpenter/ + steps: + - name: create-stack + image: alpine/k8s:1.30.2 + script: | + STACK_NAME=Karpenter-$(params.cluster-name) + STACK_STATUS=$(aws cloudformation describe-stacks --query 'Stacks[?StackName==`'${STACK_NAME}'`].StackStatus' --output text --region $(params.region)) + curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/$(params.karpenter-version)/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml -o $(workspaces.source.path)cloudformation.yaml + + cat /src/karpenter/cloudformation.yaml + + UPDATE_OUTPUT=$(aws eks update-cluster-config --name $(params.cluster-name) --access-config authenticationMode=API_AND_CONFIG_MAP --endpoint $(params.endpoint)) + + echo $UPDATE_OUTPUT + + # Extract the update ID from the output + UPDATE_ID=$(echo "$UPDATE_OUTPUT" | jq -r '.update.id // empty') + + echo "Waiting for cluster config update $UPDATE_ID to complete..." + + # Wait for the update to complete + while true; do + UPDATE_STATUS=$(aws eks describe-update --name $(params.cluster-name) --update-id "$UPDATE_ID" --endpoint $(params.endpoint) --query 'update.status' --output text) + + case "$UPDATE_STATUS" in + "Successful") + echo "Cluster config update completed successfully" + break + ;; + "Failed"|"Cancelled") + echo "Cluster config update failed with status: $UPDATE_STATUS" + exit 1 + ;; + "InProgress") + echo "Update still in progress, waiting 30 seconds..." 
+ sleep 30 + ;; + *) + echo "Unknown update status: $UPDATE_STATUS" + sleep 30 + ;; + esac + done + + if [[ "$STACK_STATUS" == "" ]]; then + aws cloudformation deploy \ + --stack-name "Karpenter-$(params.cluster-name)" \ + --template-file $(workspaces.source.path)cloudformation.yaml \ + --capabilities CAPABILITY_NAMED_IAM \ + --parameter-overrides "ClusterName=$(params.cluster-name)" + + aws cloudformation wait stack-create-complete --stack-name $STACK_NAME --region $(params.region) + echo "CREATED_CFN_STACK=$STACK_NAME" + else + echo "$STACK_NAME Already exists" + fi + + aws eks describe-cluster --name "$(params.cluster-name)" --output text --endpoint $(params.endpoint) + + export AWS_EKS_ENDPOINT=$(params.endpoint) + # Check if OIDC provider is already associated + echo "Associating OIDC provider with cluster..." + eksctl utils associate-iam-oidc-provider --cluster "$(params.cluster-name)" --approve + + # Check if access entry already exists + if aws eks describe-access-entry --cluster-name "$(params.cluster-name)" --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" --endpoint $(params.endpoint) >/dev/null 2>&1; then + echo "Access entry for KarpenterNodeRole already exists. Skipping creation..." + else + echo "Creating access entry for KarpenterNodeRole..." 
+ aws eks create-access-entry \ + --cluster-name "$(params.cluster-name)" \ + --principal-arn "arn:aws:iam::$(params.account-id):role/KarpenterNodeRole-$(params.cluster-name)" \ + --endpoint $(params.endpoint) \ + --type EC2_LINUX + fi diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml new file mode 100644 index 00000000..f7cbea11 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-mng.yaml @@ -0,0 +1,93 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-mng + namespace: scalability +spec: + description: | + Creates a dedicated Karpenter managed node group (MNG) for the EKS cluster. + This task creates a large-capacity node group specifically designed to host Karpenter + system components with appropriate taints and labels to ensure proper scheduling. + The node group uses r5.24xlarge instances with dedicated=karpenter taints. + params: + - name: cluster-name + description: The name of the EKS cluster where the Karpenter MNG will be created + - name: aws-account-id + description: AWS account ID used to construct the node role ARN + - name: endpoint + description: EKS cluster endpoint URL for AWS EKS CLI operations + - name: region + default: "us-west-2" + description: AWS region where the EKS cluster is located + steps: + - name: create-mng + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter Managed Node Group Creation" + + # Discover subnets associated with the cluster + echo "[INFO] Discovering subnets for cluster $(params.cluster-name)..." + SUBNET_IDS=$(aws ec2 describe-subnets \ + --filters "Name=tag:aws:cloudformation:stack-name,Values=$(params.cluster-name)" \ + --query 'Subnets[*].SubnetId' \ + --output text \ + --region $(params.region)) + echo "[INFO] Discovered Subnets: $SUBNET_IDS" + + # Create the Karpenter managed node group + echo "[INFO] Creating Karpenter managed node group..." 
+ echo "" + + aws eks create-nodegroup \ + --cluster-name $(params.cluster-name) \ + --nodegroup-name karpenter-system-large \ + --node-role arn:aws:iam::$(params.aws-account-id):role/$(params.cluster-name)-node-role \ + --instance-types r5.24xlarge \ + --scaling-config minSize=2,maxSize=3,desiredSize=2 \ + --subnets ${SUBNET_IDS} \ + --labels dedicated=karpenter \ + --region $(params.region) \ + --endpoint-url $(params.endpoint) \ + --taints key=dedicated,value=karpenter,effect=NO_SCHEDULE + + # Verify the node group was created and list all node groups + echo "[INFO] Verifying node group creation..." + echo "======================================" + + NODE_GROUPS=$(aws eks list-nodegroups \ + --endpoint-url $(params.endpoint) \ + --cluster-name $(params.cluster-name) \ + --region $(params.region) \ + --query 'nodegroups' \ + --output text) + + if [ -z "$NODE_GROUPS" ]; then + echo "[WARNING] No node groups found in cluster" + else + NODE_GROUP_COUNT=$(echo $NODE_GROUPS | wc -w) + echo "[SUCCESS] Found $NODE_GROUP_COUNT node group(s) in cluster:" + echo "$NODE_GROUPS" | tr ' ' '\n' | sed 's/^/ - /' + fi + echo "" + + # Display detailed information about the Karpenter node group + echo "[INFO] Retrieving Karpenter node group details..." 
+ aws eks describe-nodegroup \ + --cluster-name $(params.cluster-name) \ + --nodegroup-name karpenter-system-large \ + --region $(params.region) \ + --endpoint-url $(params.endpoint) \ + --query '{ + Status: nodegroup.status, + InstanceTypes: nodegroup.instanceTypes, + ScalingConfig: nodegroup.scalingConfig, + Labels: nodegroup.labels, + Taints: nodegroup.taints, + SubnetIds: nodegroup.subnets + }' \ + --output table + echo "" + + echo "==============================================" + echo "Karpenter Managed Node Group Creation Complete" + echo "==============================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml new file mode 100644 index 00000000..2a13f8a3 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/awscli-node-role.yaml @@ -0,0 +1,111 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-node-role + namespace: scalability +spec: + description: | + Creates the Karpenter Node IAM Role with required policies for EKS worker nodes. + This task creates an IAM role that allows EC2 instances to assume the role and attaches + the necessary AWS managed policies for EKS worker node functionality including container + registry access, CNI networking, and Systems Manager access. 
+ results: + - name: node-role-arn + description: The ARN of the created Karpenter node IAM role + params: + - name: cluster-name + description: The name of the EKS cluster (used to create unique role name) + steps: + - name: create-role + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter Node IAM Role Creation Task" + echo "==============================================" + + # Set role name variable for consistency + ROLE_NAME="KarpenterNodeRole-$(params.cluster-name)" + + echo "[INFO] Target role name: $ROLE_NAME" + echo "" + + # Check if the IAM role already exists + echo "[INFO] Checking if IAM role already exists..." + if aws iam get-role --role-name "$ROLE_NAME" >/dev/null 2>&1; then + echo "[INFO] IAM role $ROLE_NAME already exists, skipping creation" + else + echo "[INFO] Creating new IAM role: $ROLE_NAME" + echo "[INFO] Configuring trust policy for EC2 service..." + + # Create the IAM role with trust policy for EC2 + aws iam create-role --role-name "$ROLE_NAME" \ + --assume-role-policy-document '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "ec2.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }' + + echo "[SUCCESS] Successfully created IAM role: $ROLE_NAME" + fi + echo "" + + # Define required AWS managed policies for EKS worker nodes + echo "[INFO] Preparing to attach required AWS managed policies..." 
+ POLICIES=( + "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + ) + + echo "[INFO] Required policies to attach:" + for policy in "${POLICIES[@]}"; do + echo " - $policy" + done + echo "" + + # Attach required policies to the role + POLICY_COUNT=0 + for policy in "${POLICIES[@]}"; do + POLICY_COUNT=$((POLICY_COUNT + 1)) + echo "[INFO] Processing policy $POLICY_COUNT of ${#POLICIES[@]}: $policy" + + # Check if policy is already attached to avoid duplicate attachments + if aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query "AttachedPolicies[?PolicyArn=='$policy'].PolicyArn" --output text | grep -q "$policy"; then + echo "[INFO] Policy already attached, skipping: $policy" + else + echo "[INFO] Attaching policy to role..." + aws iam attach-role-policy --role-name "$ROLE_NAME" --policy-arn "$policy" + echo "[SUCCESS] Successfully attached policy: $policy" + fi + echo "" + done + + # Retrieve and store the role ARN for use by other tasks + echo "[INFO] Retrieving role ARN for task output..." + ROLE_ARN=$(aws iam get-role --role-name "$ROLE_NAME" --query 'Role.Arn' --output text) + echo "[INFO] Role ARN: $ROLE_ARN" + + # Write ARN to results file for pipeline consumption + echo "$ROLE_ARN" > $(results.node-role-arn) + echo "[SUCCESS] Role ARN saved to task results" + echo "" + + # Verify final role configuration + echo "[INFO] Verifying final role configuration..." 
+ echo "==========================================" + echo "[INFO] Role details:" + aws iam get-role --role-name "$ROLE_NAME" --query 'Role.{RoleName:RoleName,Arn:Arn,CreateDate:CreateDate}' --output table + echo "" + echo "[INFO] Attached policies:" + aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query 'AttachedPolicies[].{PolicyName:PolicyName,PolicyArn:PolicyArn}' --output table + echo "" + echo "==========================================" + echo "Karpenter Node IAM Role Creation Completed" + echo "==========================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml new file mode 100644 index 00000000..4fa8fd03 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/helm-karpenter-install.yaml @@ -0,0 +1,136 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: helm-karpenter-install + namespace: scalability +spec: + description: | + Installs Karpenter on an EKS cluster using Helm. + This task authenticates with ECR, configures kubectl, validates cluster state, + and installs Karpenter with optimized settings for large-scale workloads. 
+ params: + - name: cluster-name + description: The name of the EKS cluster where Karpenter will be installed + - name: aws-account-id + description: AWS account ID for IAM role ARN construction + - name: karpenter-ecr-repo + description: ECR repository URL containing the Karpenter Helm chart + - name: karpenter-version + description: Version of Karpenter to install (e.g., v0.32.0) + - name: endpoint + description: EKS cluster endpoint URL for kubectl configuration + workspaces: + - name: config + steps: + - name: install-karpenter + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter Installation Task" + echo "====================================" + + # Authenticate with ECR for Helm registry access + echo "[INFO] Authenticating with ECR registry..." + aws ecr get-login-password --region us-west-2 | helm registry login --username AWS --password-stdin $(params.karpenter-ecr-repo) + echo "[SUCCESS] Successfully authenticated with ECR" + echo "" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "[SUCCESS] Successfully configured kubectl" + echo "" + + # Verify karpenter-system nodegroup exists + echo "[INFO] Verifying karpenter-system nodegroup..." + aws eks describe-nodegroup --cluster-name $(params.cluster-name) --endpoint $(params.endpoint) --nodegroup-name karpenter-system-large + echo "" + + # Capture cluster state before installation for troubleshooting + echo "[INFO] Capturing cluster state before Karpenter installation..." 
+ echo "----------------------------------------" + + echo "[INFO] Current cluster nodes:" + kubectl get nodes -o wide + echo "" + + echo "[INFO] Current pods across all namespaces:" + kubectl get pods -A -o wide + echo "" + + echo "[INFO] Current deployments across all namespaces:" + kubectl get deployments -A -o wide + echo "----------------------------------------" + echo "" + + # Install Karpenter using Helm with optimized configuration + echo "[INFO] Installing Karpenter with Helm..." + echo "" + + helm upgrade --install karpenter oci://$(params.karpenter-ecr-repo)/karpenter/karpenter --version $(params.karpenter-version) \ + --namespace "karpenter" \ + --create-namespace \ + --set "settings.clusterName=$(params.cluster-name)" \ + --set "settings.interruptionQueue=" \ + --set "settings.eksControlPlane=true" \ + --set-string "settings.awsCreateQPS=60" \ + --set "settings.featureGates.disableMetricsControllers=true" \ + --set "settings.featureGates.nodeRepair=true" \ + --set settings.featureGates.reservedCapacity="true" \ + --set settings.featureGates.spotToSpotConsolidation="true" \ + --set settings.featureGates.disableMetricsControllers="true" \ + --set settings.preferencePolicy=Ignore \ + --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=arn:aws:iam::$(params.aws-account-id):role/KarpenterControllerRole-$(params.cluster-name)" \ + --set controller.resources.requests.cpu=60 \ + --set controller.resources.requests.memory=200Gi \ + --set controller.resources.limits.cpu=60 \ + --set controller.resources.limits.memory=200Gi \ + --set "controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key=dedicated" \ + --set "controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator=In" \ + --set 
"controller.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]=karpenter" \ + --set "tolerations[0].key=dedicated" \ + --set "tolerations[0].value=karpenter" \ + --set "tolerations[0].operator=Equal" \ + --set "dnsPolicy=Default" \ + --set-string "controller.env[0].name=AWS_ENDPOINT_URL_EKS" \ + --set-string "controller.env[0].value=$(params.endpoint)" \ + --set-string "controller.env[1].name=KUBE_CLIENT_QPS" \ + --set-string "controller.env[1].value=50000" \ + --set-string "controller.env[2].name=KUBE_CLIENT_BURST" \ + --set-string "controller.env[2].value=50000" \ + --set-string "controller.env[3].name=ENABLE_PROFILING" \ + --set-string "controller.env[3].value=true" \ + --timeout 100m \ + --debug \ + --wait + + echo "" + echo "[SUCCESS] Karpenter installation completed" + echo "" + + # Verify the installation + echo "[INFO] Verifying Karpenter installation..." + echo "=========================================" + + KARPENTER_PODS=$(kubectl get pods -n karpenter --no-headers 2>/dev/null | wc -l) + echo "[SUCCESS] Found $KARPENTER_PODS Karpenter pod(s) in the cluster" + echo "" + + echo "[INFO] Current Karpenter pod status:" + kubectl get pods -n karpenter -o wide + echo "" + + echo "[INFO] Current Karpenter pod detailed:" + kubectl get pods -n karpenter -o yaml + echo "" + + echo "[INFO] Karpenter deployment details:" + kubectl get deployment -n karpenter -o wide + echo "" + + echo "[INFO] Karpenter deployment details:" + kubectl get deployment -n karpenter -o yaml + echo "" + + echo "=========================================" + echo "Karpenter Installation Completed" + echo "=========================================" diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml new file mode 100644 index 00000000..38da850b --- /dev/null +++ 
b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodeclass.yaml @@ -0,0 +1,85 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: create-ec2nodeclass + namespace: scalability +spec: + description: | + Creates and applies an EC2NodeClass resource for Karpenter node provisioning. + This task retrieves cluster configuration, downloads a nodeclass template, + substitutes environment variables, and applies the configuration to the cluster. + params: + - name: cluster-name + description: The name of the EKS cluster where the EC2NodeClass will be created + - name: endpoint + description: The AWS EKS API endpoint URL to use for cluster operations + - name: karpenter-nodeclass-url + description: The URL of the EC2NodeClass YAML template to download and apply + workspaces: + - name: source + mountPath: /src/karpenter/ + description: Workspace for storing downloaded and processed nodeclass files + steps: + - name: create-ec2nodeclass + image: alpine/k8s:1.30.2 + script: | + echo "Starting EC2NodeClass creation process for cluster: $(params.cluster-name)" + + # Retrieve cluster certificate authority data for node authentication + echo "Fetching cluster certificate authority data..." + export CLUSTER_CA=$(aws eks describe-cluster \ + --name $(params.cluster-name) \ + --endpoint-url $(params.endpoint) \ + --query 'cluster.certificateAuthority.data' \ + --output text) + echo "Successfully retrieved cluster CA data" + + # Retrieve cluster API endpoint for node communication + echo "Fetching cluster API endpoint..." 
+ export CLUSTER_ENDPOINT=$(aws eks describe-cluster \ + --name $(params.cluster-name) \ + --endpoint-url $(params.endpoint) \ + --query 'cluster.endpoint' \ + --output text) + echo "Cluster endpoint retrieved: ${CLUSTER_ENDPOINT}" + + # Set cluster name for template substitution + export CLUSTER_NAME=$(params.cluster-name) + echo "Using cluster name: ${CLUSTER_NAME}" + + # Set AMI alias version for node instances + export ALIAS_VERSION=latest + echo "Using AMI alias version: ${ALIAS_VERSION}" + + # Download the EC2NodeClass template from the specified URL + echo "Downloading EC2NodeClass template from: $(params.karpenter-nodeclass-url)" + curl -fsSL $(params.karpenter-nodeclass-url) -o $(workspaces.source.path)ec2nodeclass.yaml + echo "Template downloaded successfully to $(workspaces.source.path)ec2nodeclass.yaml" + + # Display the original template for verification + echo "Original EC2NodeClass template content:" + cat $(workspaces.source.path)ec2nodeclass.yaml + + # Substitute environment variables in the template + echo "Performing environment variable substitution in template..." + envsubst < $(workspaces.source.path)ec2nodeclass.yaml > $(workspaces.source.path)ec2nodeclass-modified.yaml + echo "Environment variable substitution completed" + + # Display the processed template with substituted values + echo "Processed EC2NodeClass configuration:" + cat $(workspaces.source.path)ec2nodeclass-modified.yaml + + # Update kubeconfig to authenticate with the target cluster + echo "Updating kubeconfig for cluster access..." + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + echo "Kubeconfig updated successfully" + + # Apply the EC2NodeClass configuration to the cluster + echo "Applying EC2NodeClass configuration to cluster..." 
+ kubectl apply -f $(workspaces.source.path)ec2nodeclass-modified.yaml + echo "EC2NodeClass applied successfully" + + # Verify the EC2NodeClass was created and display its configuration + echo "Retrieving and displaying created EC2NodeClass resources:" + kubectl get ec2nodeclass -o yaml + echo "EC2NodeClass creation process completed successfully" diff --git a/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml new file mode 100644 index 00000000..5f3eb805 --- /dev/null +++ b/tests/tekton-resources/tasks/setup/karpenter/kubectl-nodepools.yaml @@ -0,0 +1,112 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: create-nodepool + namespace: scalability +spec: + description: | + Creates Karpenter NodePool resources for each availability zone in the specified AWS region. + This task downloads a nodepool template, customizes it for each AZ, and applies it to the cluster. + params: + - name: cluster-name + description: The name of the EKS cluster where nodepools will be created + - name: endpoint + description: EKS cluster endpoint URL for kubectl configuration + - name: aws-region + description: AWS region where the cluster is located (used for AZ discovery) + default: us-west-2 + - name: karpenter-nodepool-url + description: URL of the nodepool YAML template to download and customize + workspaces: + - name: source + mountPath: /src/karpenter/ + steps: + - name: create-nodepools + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter NodePool Creation Task" + + # Configure kubectl to connect to the EKS cluster + echo "[INFO] Configuring kubectl for cluster access..." + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) + if [ $? 
-eq 0 ]; then + echo "[SUCCESS] Successfully configured kubectl" + else + echo "[ERROR] Failed to configure kubectl" + exit 1 + fi + + # Set cluster name environment variable for template substitution + export CLUSTER_NAME=$(params.cluster-name) + + # Download the nodepool template + echo "" + echo "[INFO] Downloading nodepool template from: $(params.karpenter-nodepool-url)" + curl -fsSL $(params.karpenter-nodepool-url) -o $(workspaces.source.path)nodepool.yaml + + # Display the downloaded template for verification + echo "" + echo "[INFO] Downloaded nodepool template content:" + echo "----------------------------------------" + cat $(workspaces.source.path)nodepool.yaml + echo "----------------------------------------" + + # Discover availability zones and create nodepools + echo "" + echo "[INFO] Discovering availability zones in region: $(params.aws-region)" + + # Get list of availability zones + AZ_LIST=$(aws ec2 describe-availability-zones --region $(params.aws-region) --query 'AvailabilityZones[].ZoneName' --output json | jq -r '.[]') + AZ_COUNT=$(echo "$AZ_LIST" | wc -l) + + echo "[INFO] Found $AZ_COUNT availability zones:" + echo "$AZ_LIST" | sed 's/^/ - /' + echo "" + + # Process each availability zone + NODEPOOL_COUNT=0 + echo "$AZ_LIST" | while read -r az; do + if [ -z "$az" ]; then + continue + fi + + NODEPOOL_COUNT=$((NODEPOOL_COUNT + 1)) + export AZ=$az + + echo "[INFO] Creating nodepool for availability zone: $az" + + # Generate AZ-specific nodepool configuration + echo "[INFO] Generating nodepool configuration for $az..." 
+ envsubst < $(workspaces.source.path)nodepool.yaml > $(workspaces.source.path)nodepool-${az}.yaml + + # Display the generated configuration + echo "[INFO] Generated nodepool configuration for $az:" + echo "----------------------------------------" + cat $(workspaces.source.path)nodepool-${az}.yaml | sed 's/^/ /' + echo "----------------------------------------" + + # Apply the nodepool configuration + echo "[INFO] Applying nodepool configuration for $az..." + kubectl apply -f $(workspaces.source.path)nodepool-${az}.yaml + echo "" + done + + # Verify the created nodepools + echo "[INFO] Verifying created nodepools..." + echo "==================================" + + NODEPOOL_LIST=$(kubectl get nodepool --no-headers 2>/dev/null | wc -l) + echo "[SUCCESS] Found $NODEPOOL_LIST nodepool(s) in the cluster" + echo "" + echo "[INFO] Current nodepool status:" + kubectl get nodepool -o wide + echo "" + echo "----------------------------------------" + echo "[INFO] Detailed nodepool configuration:" + kubectl get nodepool -o yaml + echo "----------------------------------------" + + echo "" + echo "==========================================" + echo "Karpenter NodePool Creation Completed" + echo "==========================================" diff --git a/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml new file mode 100644 index 00000000..aecdd5a3 --- /dev/null +++ b/tests/tekton-resources/tasks/teardown/karpenter/awscli-karpenter.yaml @@ -0,0 +1,381 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: awscli-eks-karpenter-cluster-teardown + namespace: scalability +spec: + description: | + Tear down an EKS cluster. + This Task can be used to tear down an EKS cluster and its managed node groups in an AWS account. + params: + - name: cluster-name + description: The name of the EKS cluster which will be torn down. 
+ - name: region + default: us-west-2 + description: The region where the cluster is located. + - name: endpoint + default: "" + - name: namespace-count + description: The number of namespaces for EKS Pod Identity test. + default: "0" + - name: slack-hook + default: "" + - name: slack-message + default: "Job is completed" + - name: service-role-stack-name + - name: node-role-stack-name + - name: launch-template-stack-name + steps: + - name: terminate-cluster-instances + image: alpine/k8s:1.30.2 + script: | + #!/bin/bash + set -e + + echo "$(date): Starting EC2 instance termination for cluster $(params.cluster-name)..." + + # Find all EC2 instances that belong to the cluster using the aws:eks:cluster-name tag + echo "$(date): Finding instances with tag aws:eks:cluster-name=$(params.cluster-name)..." + INSTANCE_IDS=$(aws ec2 describe-instances \ + --region $(params.region) \ + --filters "Name=tag:aws:eks:cluster-name,Values=$(params.cluster-name)" "Name=instance-state-name,Values=running,pending,stopping,stopped" \ + --query 'Reservations[*].Instances[*].InstanceId' \ + --output text) + + if [ -z "$INSTANCE_IDS" ]; then + echo "$(date): No instances found with tag aws:eks:cluster-name=$(params.cluster-name)" + echo "$(date): Instance termination completed - no instances to terminate" + exit 0 + fi + + # Convert to array and count instances + INSTANCE_ARRAY=($INSTANCE_IDS) + INSTANCE_COUNT=${#INSTANCE_ARRAY[@]} + + echo "$(date): Found $INSTANCE_COUNT instances to terminate: $INSTANCE_IDS" + + # List instance details for logging + echo "$(date): Instance details:" + aws ec2 describe-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS \ + --query 'Reservations[*].Instances[*].[InstanceId,InstanceType,State.Name,LaunchTime]' \ + --output table + + # Terminate all instances belonging to the cluster + echo "$(date): Terminating instances..." 
+ aws ec2 terminate-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS + + echo "$(date): Termination request sent for all instances" + + # Wait for all instances to be terminated + echo "$(date): Waiting for all instances to be terminated..." + TIMEOUT=600 # 10 minutes timeout + CHECK_INTERVAL=15 # Check every 15 seconds + START_TIME=$(date +%s) + + while true; do + # Check if timeout has been reached + CURRENT_TIME=$(date +%s) + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + + if [ $ELAPSED_TIME -ge $TIMEOUT ]; then + echo "$(date): Timeout reached after ${ELAPSED_TIME} seconds. Some instances may still be terminating." + # List remaining instances for debugging + REMAINING_INSTANCES=$(aws ec2 describe-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS \ + --query 'Reservations[*].Instances[?State.Name!=`terminated`].InstanceId' \ + --output text 2>/dev/null || echo "") + if [ -n "$REMAINING_INSTANCES" ] && [ "$REMAINING_INSTANCES" != "None" ]; then + echo "$(date): Instances still not terminated: $REMAINING_INSTANCES" + fi + exit 1 + fi + + # Check instance states + RUNNING_INSTANCES=$(aws ec2 describe-instances \ + --region $(params.region) \ + --instance-ids $INSTANCE_IDS \ + --query 'Reservations[*].Instances[?State.Name!=`terminated`].InstanceId' \ + --output text 2>/dev/null || echo "") + + if [ -z "$RUNNING_INSTANCES" ] || [ "$RUNNING_INSTANCES" = "None" ]; then + echo "$(date): Success! All instances have been terminated" + break + else + RUNNING_COUNT=$(echo "$RUNNING_INSTANCES" | wc -w) + echo "$(date): Still waiting for $RUNNING_COUNT instances to be terminated: $RUNNING_INSTANCES" + echo "$(date): Waiting ${CHECK_INTERVAL} seconds before next check..." 
+ sleep $CHECK_INTERVAL + fi + done + + echo "$(date): EC2 instance termination completed successfully" + - name: delete-cluster + image: alpine/k8s:1.23.7 + script: | + set +e + ENDPOINT_FLAG="" + if [ -n "$(params.endpoint)" ]; then + ENDPOINT_FLAG="--endpoint $(params.endpoint)" + fi + + for i in `aws eks list-nodegroups --cluster-name $(params.cluster-name) $ENDPOINT_FLAG --region $(params.region) | jq -r '.nodegroups[]'`; + do + aws eks delete-nodegroup --nodegroup-name $i --cluster-name $(params.cluster-name) $ENDPOINT_FLAG --region $(params.region); + aws eks wait nodegroup-deleted --nodegroup-name $i --cluster-name $(params.cluster-name) $ENDPOINT_FLAG --region $(params.region); + done; + echo "Starting to delete cluster..." + aws eks delete-cluster --name $(params.cluster-name) --region $(params.region) $ENDPOINT_FLAG + echo "Waiting for cluster to be deleted..." + aws eks wait cluster-deleted --name $(params.cluster-name) --region $(params.region) $ENDPOINT_FLAG + echo "Cluster is deleted..." 
+ + for i in $(seq 1 $(params.namespace-count)); do + PIA_ROLE_NAME=$(params.cluster-name)-pia-role-$i + PIA_ROLE_EXISTS=$(aws iam get-role --role-name $PIA_ROLE_NAME --query 'Role.RoleName' --output text 2>/dev/null) + if [ "$PIA_ROLE_EXISTS" == "$PIA_ROLE_NAME" ]; then + # Detach all attached managed policies + aws iam list-attached-role-policies --role-name "$PIA_ROLE_NAME" \ + --query 'AttachedPolicies[*].PolicyArn' --output json | jq -r '.[]' | while read -r policy_arn; do + echo "Detaching managed policy: $policy_arn" + aws iam detach-role-policy --role-name "$PIA_ROLE_NAME" --policy-arn "$policy_arn" + done + # Delete all inline policies + aws iam list-role-policies --role-name "$PIA_ROLE_NAME" \ + --query 'PolicyNames' --output json | jq -r '.[]' | while read -r policy_name; do + echo "Deleting inline policy: $policy_name" + aws iam delete-role-policy --role-name "$PIA_ROLE_NAME" --policy-name "$policy_name" + done + # Delete role + aws iam delete-role --role-name $PIA_ROLE_NAME + echo "Role $PIA_ROLE_NAME deleted successfully." + else + echo "Role $PIA_ROLE_NAME does not exist, no action needed." + fi + done + - name: delete-karpenter-role + image: alpine/k8s:1.30.2 + script: | + # Check if the instance profile exists before attempting to delete + if aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" >/dev/null 2>&1; then + echo "Found instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." + + # Check if the role is attached to the instance profile and remove it + ATTACHED_ROLES=$(aws iam get-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --query 'InstanceProfile.Roles[?RoleName==`KarpenterNodeRole-$(params.cluster-name)`].RoleName' --output text) + if [ -n "$ATTACHED_ROLES" ]; then + echo "Removing role KarpenterNodeRole-$(params.cluster-name) from instance profile..." 
+ aws iam remove-role-from-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" --role-name "KarpenterNodeRole-$(params.cluster-name)" + echo "Role KarpenterNodeRole-$(params.cluster-name) removed from instance profile successfully." + else + echo "Role KarpenterNodeRole-$(params.cluster-name) is not attached to instance profile. Skipping role removal..." + fi + + echo "Deleting instance profile KarpenterNodeInstanceProfile-$(params.cluster-name)..." + aws iam delete-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-$(params.cluster-name)" + echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) deleted successfully." + else + echo "Instance profile KarpenterNodeInstanceProfile-$(params.cluster-name) does not exist. Skipping deletion..." + fi + - name: delete-karpenter-controller-role + image: alpine/k8s:1.30.2 + script: | + echo "Starting Karpenter Controller Role Teardown Task" + echo "===============================================" + + ROLE_NAME="KarpenterControllerRole-$(params.cluster-name)" + POLICY_NAME="KarpenterControllerPolicy-$(params.cluster-name)" + + # Check if the IAM role exists before attempting to delete + echo "" + echo "[INFO] Checking if IAM role $ROLE_NAME exists..." + if aws iam get-role --role-name "$ROLE_NAME" >/dev/null 2>&1; then + echo "[INFO] IAM role $ROLE_NAME found. Proceeding with cleanup..." + + # First, remove any attached inline policies + echo "" + echo "[INFO] Checking for attached inline policies..." + if aws iam get-role-policy --role-name "$ROLE_NAME" --policy-name "$POLICY_NAME" >/dev/null 2>&1; then + echo "[INFO] Removing inline policy $POLICY_NAME from role $ROLE_NAME..." 
+ aws iam delete-role-policy --role-name "$ROLE_NAME" --policy-name "$POLICY_NAME" + echo "[SUCCESS] Successfully removed inline policy $POLICY_NAME" + else + echo "[INFO] No inline policy $POLICY_NAME found on role $ROLE_NAME" + fi + + # List and detach any managed policies (if any exist) + echo "" + echo "[INFO] Checking for attached managed policies..." + ATTACHED_POLICIES=$(aws iam list-attached-role-policies --role-name "$ROLE_NAME" --query 'AttachedPolicies[].PolicyArn' --output text) + + if [ -n "$ATTACHED_POLICIES" ] && [ "$ATTACHED_POLICIES" != "None" ]; then + echo "[INFO] Found attached managed policies. Detaching them..." + for policy_arn in $ATTACHED_POLICIES; do + echo "[INFO] Detaching managed policy: $policy_arn" + aws iam detach-role-policy --role-name "$ROLE_NAME" --policy-arn "$policy_arn" + echo "[SUCCESS] Successfully detached policy: $policy_arn" + done + else + echo "[INFO] No managed policies attached to role $ROLE_NAME" + fi + + # Now delete the role + echo "" + echo "[INFO] Deleting IAM role $ROLE_NAME..." + aws iam delete-role --role-name "$ROLE_NAME" + echo "[SUCCESS] IAM role $ROLE_NAME deleted successfully." + + else + echo "[INFO] IAM role $ROLE_NAME does not exist. Skipping deletion..." + fi + + echo "" + echo "===============================================" + echo "Karpenter Controller Role Teardown Completed" + echo "===============================================" + - name: delete-stacks + image: alpine/k8s:1.30.2 + script: | + #!/bin/bash + set -e + + echo "$(date): Starting CloudFormation stack deletion process..." 
+ + # Define the stacks to delete in order + STACKS=( + "$(params.cluster-name)-node-role" + "$(params.cluster-name)-service-role" + "$(params.cluster-name)" + ) + + # Function to check if a stack exists and get its status + check_stack_status() { + local stack_name=$1 + aws cloudformation describe-stacks \ + --stack-name "$stack_name" \ + --region $(params.region) \ + --query 'Stacks[0].StackStatus' \ + --output text 2>/dev/null || echo "STACK_NOT_FOUND" + } + + # Function to delete a stack if it exists and is in a valid state + delete_stack_if_exists() { + local stack_name=$1 + echo "$(date): Processing stack: $stack_name" + + local stack_status=$(check_stack_status "$stack_name") + echo "$(date): Stack $stack_name status: $stack_status" + + case "$stack_status" in + "STACK_NOT_FOUND") + echo "$(date): Stack $stack_name does not exist. Skipping..." + return 0 + ;; + "CREATE_COMPLETE"|"UPDATE_COMPLETE"|"UPDATE_ROLLBACK_COMPLETE"|"ROLLBACK_COMPLETE") + echo "$(date): Stack $stack_name is in a valid state for deletion. Proceeding..." + ;; + "DELETE_IN_PROGRESS") + echo "$(date): Stack $stack_name is already being deleted. Waiting for completion..." + aws cloudformation wait stack-delete-complete \ + --stack-name "$stack_name" \ + --region $(params.region) + echo "$(date): Stack $stack_name deletion completed." + return 0 + ;; + "DELETE_COMPLETE") + echo "$(date): Stack $stack_name is already deleted. Skipping..." + return 0 + ;; + *) + echo "$(date): Stack $stack_name is in state $stack_status, which is not valid for deletion. Skipping..." + return 0 + ;; + esac + + # Delete the stack + echo "$(date): Initiating deletion of stack $stack_name..." + aws cloudformation delete-stack \ + --stack-name "$stack_name" \ + --region $(params.region) + + # Wait for deletion to complete + echo "$(date): Waiting for stack $stack_name deletion to complete..." 
+ aws cloudformation wait stack-delete-complete \ + --stack-name "$stack_name" \ + --region $(params.region) + + echo "$(date): Stack $stack_name deleted successfully." + } + + # Delete each stack + for stack_name in "${STACKS[@]}"; do + delete_stack_if_exists "$stack_name" + echo "" + done + + echo "$(date): CloudFormation stack deletion process completed successfully." + - name: awscli-delete-asg + image: alpine/k8s:1.23.7 + script: | + #!/bin/bash + set -e + aws sts get-caller-identity + # Stack ids for self managed node groups will have pattern -nodes- + STACK_IDS=$(aws cloudformation describe-stacks \ + --region $(params.region) \ + --query 'Stacks[?contains(StackName, `'$(params.cluster-name)'-nodes-`)].StackName' \ + --output text) + + if [ -z "$STACK_IDS" ]; then + echo "No stacks found matching pattern: $(params.cluster-name)-nodes-" + exit 0 + fi + + echo "Found stacks to delete: $STACK_IDS" + # Delete each stack and wait for completion + for stack_name in $STACK_IDS; do + echo "Deleting stack: $stack_name" + + # Delete the stack + aws cloudformation delete-stack \ + --region $(params.region) \ + --stack-name "$stack_name" + + echo "Waiting for stack deletion to complete..." + + # Wait for deletion to complete + aws cloudformation wait stack-delete-complete \ + --region $(params.region) \ + --stack-name "$stack_name" + + echo "Stack $stack_name deleted successfully!" + done + + echo "All matching stacks have been deleted!" + - name: teardown-eks-role-stack + image: alpine/k8s:1.30.2 + script: | + aws cloudformation delete-stack --stack-name $(params.service-role-stack-name) + aws cloudformation delete-stack --stack-name $(params.launch-template-stack-name) + # wait for the launch-template stack to be completely deleted to avoid race-conditions. + echo "waiting for launch-template stack deletion..." 
+ aws cloudformation wait stack-delete-complete --stack-name $(params.launch-template-stack-name) + STACK_STATUS=$(aws cloudformation describe-stacks --stack-name $(params.node-role-stack-name) --query 'Stacks[0].StackStatus' --output text || echo "STACK_NOT_FOUND") + echo $STACK_STATUS + if [ "$STACK_STATUS" == "DELETE_FAILED" ]; then + echo "Stack is in DELETE_FAILED state, using FORCE_DELETE_STACK" + aws cloudformation delete-stack --stack-name $(params.node-role-stack-name) --deletion-mode FORCE_DELETE_STACK + else + echo "Normal stack deletion" + aws cloudformation delete-stack --stack-name $(params.node-role-stack-name) + fi + - name: send-slack-notification + image: alpine/k8s:1.23.7 + script: | + if [ -n "$(params.slack-hook)" ]; then + curl -H "Content-type: application/json" --data '{"Message": "$(params.slack-message)"}' -X POST $(params.slack-hook) + fi diff --git a/tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml b/tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml new file mode 100644 index 00000000..e093f423 --- /dev/null +++ b/tests/tekton-resources/tasks/teardown/karpenter/kubectl-get-karpenter-logs.yaml @@ -0,0 +1,56 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: kubectl-get-karpenter-logs + namespace: scalability +spec: + description: "Watch logs from both karpenter pods continually until they are deleted, writing logs to stdout" + params: + - name: cluster-name + description: The name of the cluster + - name: endpoint + description: eks endpoint to use + - name: aws-region + description: AWS region where the cluster is located + default: us-west-2 + - name: namespace + description: Namespace where karpenter is installed + default: karpenter + steps: + - name: get-karpenter-logs + image: alpine/k8s:1.30.2 + script: | + aws eks update-kubeconfig --name $(params.cluster-name) --endpoint $(params.endpoint) --region $(params.aws-region) + + echo "Finding karpenter 
pods to watch logs..." + + # Get all karpenter pods + karpenter_pods=$(kubectl get pods -n $(params.namespace) -l app.kubernetes.io/name=karpenter -o jsonpath='{.items[*].metadata.name}') + + if [ -z "$karpenter_pods" ]; then + echo "No karpenter pods found in namespace $(params.namespace)" + echo "Checking if namespace exists..." + kubectl get namespace $(params.namespace) || echo "Namespace $(params.namespace) does not exist" + echo "Listing all pods in karpenter namespace (if it exists)..." + kubectl get pods -n $(params.namespace) || echo "Could not list pods in namespace $(params.namespace)" + exit 1 + fi + + echo "Found karpenter pods: $karpenter_pods" + + # Start watching logs for each pod in background + for pod in $karpenter_pods; do + echo "==========================================" + echo "Starting to watch logs for pod: $pod" + echo "==========================================" + + # Follow logs continuously - will exit when pod is deleted + kubectl logs "$pod" -n $(params.namespace) -f & + done + + # Wait for all background log processes + # This will continue until all kubectl logs processes exit (when pods are deleted) + wait + + echo "All karpenter pods have been deleted - log watching completed"