On GCP use BackendConfig to set the backend timeout and other improvements (kubeflow#513)

* On GCP use BackendConfig to set the backend timeout and other improvements.

* As of GKE 1.13 we can now set the backend timeout using a BackendConfig
  https://cloud.google.com/kubernetes-engine/docs/how-to/configure-backend-service

* BasicAuth doesn't have a BackendConfig, so we need to add one.

  * In BasicAuth we are still using Ambassador to handle the auth.

* Improve the setup_backend.sh script for IAP

  * This script only updates the IAP JWT audience; there's no reason
    for it to check whether the backend timeout is set.

  * Make the script run forever to avoid confusing container restarts.

* Related to kubeflow#4269 backend timeout not set
* Related to kubeflow#4043 crash looping of iap enabler and backend_updater pods.

* Update tests.

* Update Ambassador test.
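A minimal sketch of the BackendConfig wiring described above (resource and port names here are illustrative, not the repo's actual manifests): the BackendConfig sets the timeout, and the Service annotation binds it to a named port.

```yaml
apiVersion: cloud.google.com/v1beta1
kind: BackendConfig
metadata:
  name: my-backendconfig
spec:
  # Long timeout because Jupyter uses websockets.
  timeoutSec: 3600
---
apiVersion: v1
kind: Service
metadata:
  name: my-service
  annotations:
    # Binds the "http" named port of this Service to the BackendConfig above.
    beta.cloud.google.com/backend-config: '{"ports": {"http": "my-backendconfig"}}'
spec:
  ports:
  - name: http
    port: 80
    targetPort: 8080
  selector:
    app: my-app
```

Per the GKE docs linked above, this requires GKE 1.13+; the annotation key is `beta.cloud.google.com/backend-config` for the v1beta1 API.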
jlewi authored and k8s-ci-robot committed Oct 15, 2019
1 parent 9e78376 commit 288ae85
Showing 17 changed files with 929 additions and 834 deletions.
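The "run forever" change described in the commit message can be sketched as a small reconcile loop. This is a hypothetical sketch, not the repo's actual script; `set_health_check` here is a stand-in for the real gcloud/kubectl reconciliation steps.

```shell
#!/usr/bin/env bash
# Sketch of the "run forever" pattern: reconcile periodically instead of
# exiting, so users don't see confusing container restarts.

set_health_check() {
  # Placeholder for the real gcloud/kubectl reconciliation logic.
  echo "reconciling health check"
}

# Run the reconcile step every $1 seconds; an optional $2 bounds the
# number of iterations (0 or unset means loop forever).
reconcile_loop() {
  local interval="$1"
  local max_iters="${2:-0}"
  local i=0
  while true; do
    set_health_check
    i=$((i + 1))
    if [ "${max_iters}" -gt 0 ] && [ "${i}" -ge "${max_iters}" ]; then
      break
    fi
    sleep "${interval}"
  done
}

# Demo: two iterations with no delay.
reconcile_loop 0 2
```

Bounding the iterations is only for demonstration; the deployed container would call the loop with no bound and a one-hour interval.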
3 changes: 3 additions & 0 deletions common/ambassador/base/service.yaml
Original file line number Diff line number Diff line change
@@ -17,6 +17,9 @@ spec:
apiVersion: v1
kind: Service
metadata:
annotations:
# Ambassador is only used on GCP with basic auth.
beta.cloud.google.com/backend-config: '{"ports": {"ambassador":"basicauth-backendconfig"}}'
labels:
service: ambassador
name: ambassador
7 changes: 7 additions & 0 deletions gcp/basic-auth-ingress/base/backend-config.yaml
@@ -0,0 +1,7 @@
apiVersion: cloud.google.com/v1beta1
kind: BackendConfig
metadata:
name: basicauth-backendconfig
spec:
# Jupyter uses websockets so we want to increase the timeout.
timeoutSec: 3600
115 changes: 62 additions & 53 deletions gcp/basic-auth-ingress/base/config-map.yaml
@@ -3,8 +3,8 @@ data:
update_backend.sh: |
#!/bin/bash
#
# A simple shell script to configure the backend timeouts and health checks by using gcloud.
# A simple shell script to configure the health checks by using gcloud.
set -x
[ -z ${NAMESPACE} ] && echo Error NAMESPACE must be set && exit 1
[ -z ${SERVICE} ] && echo Error SERVICE must be set && exit 1
[ -z ${INGRESS_NAME} ] && echo Error INGRESS_NAME must be set && exit 1
@@ -15,58 +15,67 @@ data:
exit 1
fi
# Activate the service account, allow 5 retries
for i in {1..5}; do gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} && break || sleep 10; done
NODE_PORT=$(kubectl --namespace=${NAMESPACE} get svc ${SERVICE} -o jsonpath='{.spec.ports[0].nodePort}')
echo node port is ${NODE_PORT}
while [[ -z ${BACKEND_NAME} ]]; do
BACKENDS=$(kubectl --namespace=${NAMESPACE} get ingress ${INGRESS_NAME} -o jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/backends}')
echo "fetching backends info with ${INGRESS_NAME}: ${BACKENDS}"
BACKEND_NAME=$(echo $BACKENDS | grep -o "k8s-be-${NODE_PORT}--[0-9a-z]\+")
echo "backend name is ${BACKEND_NAME}"
sleep 2
done
while [[ -z ${BACKEND_SERVICE} ]];
do BACKEND_SERVICE=$(gcloud --project=${PROJECT} compute backend-services list --filter=name~k8s-be-${NODE_PORT}- --uri);
echo "Waiting for the backend-services resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
done
while [[ -z ${HEALTH_CHECK_URI} ]];
do HEALTH_CHECK_URI=$(gcloud compute --project=${PROJECT} health-checks list --filter=name~${BACKEND_NAME} --uri);
echo "Waiting for the healthcheck resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
set_health_check() {
# Activate the service account, allow 5 retries
if [[ ! -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then
# TODO(jlewi): As of 0.7 we should always be using workload identity. We can remove it post 0.7.0 once we have workload identity
# fully working
# Activate the service account, allow 5 retries
for i in {1..5}; do gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} && break || sleep 10; done
fi
# For debugging print out what account we are using
gcloud auth list
NODE_PORT=$(kubectl --namespace=${NAMESPACE} get svc ${SERVICE} -o jsonpath='{.spec.ports[0].nodePort}')
echo node port is ${NODE_PORT}
while [[ -z ${BACKEND_NAME} ]]; do
BACKENDS=$(kubectl --namespace=${NAMESPACE} get ingress ${INGRESS_NAME} -o jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/backends}')
echo "fetching backends info with ${INGRESS_NAME}: ${BACKENDS}"
BACKEND_NAME=$(echo $BACKENDS | grep -o "k8s-be-${NODE_PORT}--[0-9a-z]\+")
echo "backend name is ${BACKEND_NAME}"
sleep 2
done
while [[ -z ${BACKEND_SERVICE} ]];
do BACKEND_SERVICE=$(gcloud --project=${PROJECT} compute backend-services list --filter=name~k8s-be-${NODE_PORT}- --uri);
echo "Waiting for the backend-services resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
done
while [[ -z ${HEALTH_CHECK_URI} ]];
do HEALTH_CHECK_URI=$(gcloud compute --project=${PROJECT} health-checks list --filter=name~${BACKEND_NAME} --uri);
echo "Waiting for the healthcheck resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
done
echo health check URI is ${HEALTH_CHECK_URI}
# Since we create the envoy-ingress ingress object before creating the envoy
# deployment object, healthcheck will not be configured correctly in the GCP
# load balancer. It will default the healthcheck request path to a value of
# / instead of the intended /healthz.
# Manually update the healthcheck request path to /healthz
if [[ ${HEALTHCHECK_PATH} ]]; then
echo Running health checks update ${HEALTH_CHECK_URI} with ${HEALTHCHECK_PATH}
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=${HEALTHCHECK_PATH}
else
echo Running health checks update ${HEALTH_CHECK_URI} with /healthz
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=/healthz
fi
if [[ ${USE_ISTIO} ]]; then
# Create the route so healthcheck can pass
kubectl apply -f /var/envoy-config/healthcheck_route.yaml
fi
}
while true; do
set_health_check
echo "Backend updated successfully. Waiting 1 hour before updating again."
sleep 3600
done
echo health check URI is ${HEALTH_CHECK_URI}
# Since we create the envoy-ingress ingress object before creating the envoy
# deployment object, healthcheck will not be configured correctly in the GCP
# load balancer. It will default the healthcheck request path to a value of
# / instead of the intended /healthz.
# Manually update the healthcheck request path to /healthz
if [[ ${HEALTHCHECK_PATH} ]]; then
echo Running health checks update ${HEALTH_CHECK_URI} with ${HEALTHCHECK_PATH}
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=${HEALTHCHECK_PATH}
else
echo Running health checks update ${HEALTH_CHECK_URI} with /healthz
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=/healthz
fi
if [[ ${USE_ISTIO} ]]; then
# Create the route so healthcheck can pass
kubectl apply -f /var/envoy-config/healthcheck_route.yaml
fi
# Since JupyterHub uses websockets we want to increase the backend timeout
echo Increasing backend timeout for JupyterHub
gcloud --project=${PROJECT} compute backend-services update --global ${BACKEND_SERVICE} --timeout=3600
echo "Backend updated successfully. Waiting 1 hour before updating again."
sleep 3600
kind: ConfigMap
metadata:
name: envoy-config
1 change: 1 addition & 0 deletions gcp/basic-auth-ingress/base/kustomization.yaml
@@ -1,6 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- backend-config.yaml
- cloud-endpoint.yaml
- cluster-role-binding.yaml
- cluster-role.yaml
2 changes: 2 additions & 0 deletions gcp/iap-ingress/base/backend-config.yaml
@@ -3,6 +3,8 @@ kind: BackendConfig
metadata:
name: iap-backendconfig
spec:
# Jupyter uses websockets so we want to increase the timeout.
timeoutSec: 3600
iap:
enabled: true
oauthclientCredentials:
139 changes: 68 additions & 71 deletions gcp/iap-ingress/base/config-map.yaml
@@ -48,7 +48,8 @@ data:
setup_backend.sh: |
#!/usr/bin/env bash
#
# A simple shell script to configure the backend timeouts and health checks by using gcloud.
# A simple shell script to configure the JWT audience used with ISTIO
set -x
[ -z ${NAMESPACE} ] && echo Error NAMESPACE must be set && exit 1
[ -z ${SERVICE} ] && echo Error SERVICE must be set && exit 1
[ -z ${INGRESS_NAME} ] && echo Error INGRESS_NAME must be set && exit 1
@@ -66,9 +67,15 @@ data:
fi
# Activate the service account
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}
if [ ! -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]; then
# As of 0.7.0 we should be using workload identity and never setting GOOGLE_APPLICATION_CREDENTIALS.
# We keep this for backwards compatibility; it can be removed later.
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}
fi
# Print out the config for debugging
gcloud config list
gcloud auth list
NODE_PORT=$(kubectl --namespace=${NAMESPACE} get svc ${SERVICE} -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
echo "node port is ${NODE_PORT}"
@@ -110,30 +117,15 @@ data:
echo "Clearing lock on service annotation"
kubectl patch svc "${SERVICE}" -p "{\"metadata\": { \"annotations\": {\"backendlock\": \"\" }}}"
checkBackend() {
# created by init container.
. /var/shared/healthz.env
# If node port or backend id change, so does the JWT audience.
CURR_NODE_PORT=$(kubectl --namespace=${NAMESPACE} get svc ${SERVICE} -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
read -ra toks <<<"$(gcloud compute --project=${PROJECT} backend-services list --filter=name~k8s-be-${CURR_NODE_PORT}- --format='value(id,timeoutSec)')"
CURR_BACKEND_ID="${toks[0]}"
CURR_BACKEND_TIMEOUT="${toks[1]}"
[[ "$BACKEND_ID" == "$CURR_BACKEND_ID" && "${CURR_BACKEND_TIMEOUT}" -eq 3600 ]]
}
# Verify configuration every 10 seconds.
# Loop for ever; we don't want to exit because restarting the container leads users to think there might be a problem
while true; do
if ! checkBackend; then
echo "$(date) WARN: Backend check failed, restarting container."
exit 1
fi
sleep 10
sleep 3600
done
update_backend.sh: |
#!/bin/bash
#
# A simple shell script to configure the backend timeouts and health checks by using gcloud.
# A simple shell script to configure the health checks by using gcloud.
set -x
[ -z ${NAMESPACE} ] && echo Error NAMESPACE must be set && exit 1
[ -z ${SERVICE} ] && echo Error SERVICE must be set && exit 1
@@ -145,58 +137,63 @@ data:
exit 1
fi
# Activate the service account, allow 5 retries
for i in {1..5}; do gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} && break || sleep 10; done
NODE_PORT=$(kubectl --namespace=${NAMESPACE} get svc ${SERVICE} -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
echo node port is ${NODE_PORT}
while [[ -z ${BACKEND_NAME} ]]; do
BACKENDS=$(kubectl --namespace=${NAMESPACE} get ingress ${INGRESS_NAME} -o jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/backends}')
echo "fetching backends info with ${INGRESS_NAME}: ${BACKENDS}"
BACKEND_NAME=$(echo $BACKENDS | grep -o "k8s-be-${NODE_PORT}--[0-9a-z]\+")
echo "backend name is ${BACKEND_NAME}"
sleep 2
done
while [[ -z ${BACKEND_SERVICE} ]];
do BACKEND_SERVICE=$(gcloud --project=${PROJECT} compute backend-services list --filter=name~k8s-be-${NODE_PORT}- --uri);
echo "Waiting for the backend-services resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
done
if [[ ! -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then
# TODO(jlewi): As of 0.7 we should always be using workload identity. We can remove it post 0.7.0 once we have workload identity
# fully working
# Activate the service account, allow 5 retries
for i in {1..5}; do gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} && break || sleep 10; done
fi
set_health_check () {
NODE_PORT=$(kubectl --namespace=${NAMESPACE} get svc ${SERVICE} -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
echo node port is ${NODE_PORT}
while [[ -z ${BACKEND_NAME} ]]; do
BACKENDS=$(kubectl --namespace=${NAMESPACE} get ingress ${INGRESS_NAME} -o jsonpath='{.metadata.annotations.ingress\.kubernetes\.io/backends}')
echo "fetching backends info with ${INGRESS_NAME}: ${BACKENDS}"
BACKEND_NAME=$(echo $BACKENDS | grep -o "k8s-be-${NODE_PORT}--[0-9a-z]\+")
echo "backend name is ${BACKEND_NAME}"
sleep 2
done
while [[ -z ${BACKEND_SERVICE} ]];
do BACKEND_SERVICE=$(gcloud --project=${PROJECT} compute backend-services list --filter=name~k8s-be-${NODE_PORT}- --uri);
echo "Waiting for the backend-services resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
done
while [[ -z ${HEALTH_CHECK_URI} ]];
do HEALTH_CHECK_URI=$(gcloud compute --project=${PROJECT} health-checks list --filter=name~${BACKEND_NAME} --uri);
echo "Waiting for the healthcheck resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
done
echo health check URI is ${HEALTH_CHECK_URI}
# Since we create the envoy-ingress ingress object before creating the envoy
# deployment object, healthcheck will not be configured correctly in the GCP
# load balancer. It will default the healthcheck request path to a value of
# / instead of the intended /healthz.
# Manually update the healthcheck request path to /healthz
if [[ ${HEALTHCHECK_PATH} ]]; then
# This is basic auth
echo Running health checks update ${HEALTH_CHECK_URI} with ${HEALTHCHECK_PATH}
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=${HEALTHCHECK_PATH}
else
# /healthz/ready is the health check path for istio-ingressgateway
echo Running health checks update ${HEALTH_CHECK_URI} with /healthz/ready
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=/healthz/ready
# We need the nodeport for istio-ingressgateway status-port
STATUS_NODE_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="status-port")].nodePort}')
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --port=${STATUS_NODE_PORT}
fi
}
while [[ -z ${HEALTH_CHECK_URI} ]];
do HEALTH_CHECK_URI=$(gcloud compute --project=${PROJECT} health-checks list --filter=name~${BACKEND_NAME} --uri);
echo "Waiting for the healthcheck resource PROJECT=${PROJECT} NODEPORT=${NODE_PORT} SERVICE=${SERVICE}...";
sleep 2;
while true; do
set_health_check
echo "Backend updated successfully. Waiting 1 hour before updating again."
sleep 3600
done
echo health check URI is ${HEALTH_CHECK_URI}
# Since we create the envoy-ingress ingress object before creating the envoy
# deployment object, healthcheck will not be configured correctly in the GCP
# load balancer. It will default the healthcheck request path to a value of
# / instead of the intended /healthz.
# Manually update the healthcheck request path to /healthz
if [[ ${HEALTHCHECK_PATH} ]]; then
# This is basic auth
echo Running health checks update ${HEALTH_CHECK_URI} with ${HEALTHCHECK_PATH}
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=${HEALTHCHECK_PATH}
else
# /healthz/ready is the health check path for istio-ingressgateway
echo Running health checks update ${HEALTH_CHECK_URI} with /healthz/ready
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --request-path=/healthz/ready
# We need the nodeport for istio-ingressgateway status-port
STATUS_NODE_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="status-port")].nodePort}')
gcloud --project=${PROJECT} compute health-checks update http ${HEALTH_CHECK_URI} --port=${STATUS_NODE_PORT}
fi
# Since JupyterHub uses websockets we want to increase the backend timeout
echo Increasing backend timeout for JupyterHub
gcloud --project=${PROJECT} compute backend-services update --global ${BACKEND_SERVICE} --timeout=3600
echo "Backend updated successfully. Waiting 1 hour before updating again."
sleep 3600
kind: ConfigMap
metadata:
name: envoy-config
3 changes: 3 additions & 0 deletions tests/ambassador-base_test.go
@@ -129,6 +129,9 @@ spec:
apiVersion: v1
kind: Service
metadata:
annotations:
# Ambassador is only used on GCP with basic auth.
beta.cloud.google.com/backend-config: '{"ports": {"ambassador":"basicauth-backendconfig"}}'
labels:
service: ambassador
name: ambassador
