# MLOps for Spark MLlib with Vertex AI Pipelines - Part 1
In this notebook, we create a custom Docker image for Serverless Spark.

In [1]:
import os
import random
import shutil
from pathlib import Path as path
from typing import NamedTuple

from google.cloud import aiplatform as vertex_ai
from google_cloud_pipeline_components import aiplatform as vertex_ai_components
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import (Artifact, ClassificationMetrics, Condition, Input,
                        Metrics, Output, component)

## 1. Install packages for Vertex AI capabilities of interest
This is a one-time activity.

In [2]:
# One-time dependency install (currently disabled). Uncomment the line below to
# install the pinned package versions into the user site.
#!pip3 install --user --upgrade google-cloud-aiplatform==1.11.0 kfp==1.8.11 google-cloud-pipeline-components==1.0.1 --quiet --no-warn-conflicts

# Automatically restart kernel after installs
# NOTE: the restart logic below is wrapped in a bare string literal, so it never
# executes; as the last expression of the cell, the string is simply echoed as
# the cell's output.
"""
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
"""


'\nif not os.getenv("IS_TESTING"):\n    # Automatically restart kernel after installs\n    import IPython\n\n    app = IPython.Application.instance()\n    app.kernel.do_shutdown(True)\n'

## 2. Initialize variables

In [3]:
import os

PROJECT_ID = ""
PROJECT_NBR = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = project_id_output[0]
    print("Project ID: ", PROJECT_ID)
    
    project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
    PROJECT_NBR = project_nbr_output[0]
    print("Project Number: ", PROJECT_NBR)

Project ID:  s8s-spark-ml-mlops
Project Number:  974925525028


In [4]:
# Make the detected project the active project for subsequent gcloud commands.
!gcloud config set project $PROJECT_ID

Updated property [core/project].


In [5]:
CODE_BUCKET_URI = "gs://s8s_code_bucket-" + PROJECT_NBR + "/"  
REGION = "us-central1"  
GCR_REPO_NM = "s8s-spark-" + PROJECT_NBR
UMSA = !gcloud config list account --format "value(core.account)"

In [6]:
# Local directory that serves as the Docker build context.
LOCAL_SCRATCH_DIR = path("docker-build-scratch")

# Image coordinates, assembled as gcr.io/<project>/<name>:<tag>.
DOCKER_IMAGE_NM = "s8s-sparkml-serve"
DOCKER_IMAGE_TAG = "1.0.3"
DOCKER_IMAGE_FQN = "gcr.io/{}/{}:{}".format(PROJECT_ID, DOCKER_IMAGE_NM, DOCKER_IMAGE_TAG)

## 3. Create custom container image for Serverless Spark

#### 3.1. Create a local scratch directory (if it does not exist) and clean up any previously created artifacts

In [7]:
# Create the scratch directory with mode 777; -p creates missing parents and
# makes the command a no-op when the directory already exists.
!mkdir -m 777 -p $LOCAL_SCRATCH_DIR

In [8]:
# List the scratch directory (including hidden entries) to see leftovers from earlier runs.
!ls -al $LOCAL_SCRATCH_DIR

total 99960
drwxrwxrwx  3 jupyter jupyter     4096 Jul 27 03:54 .
drwxr-xr-x 12 jupyter jupyter     4096 Jul 27 04:12 ..
drwxr-xr-x  2 jupyter jupyter     4096 Jul 25 18:47 .ipynb_checkpoints
-rw-r--r--  1 jupyter jupyter     3075 Jul 27 03:54 Dockerfile
-rw-r--r--  1 jupyter jupyter 66709754 Jul 21  2021 Miniconda3-py39_4.10.3-Linux-x86_64.sh
-rw-r--r--  1 jupyter jupyter    10332 Jul 26 17:04 pipeline_1032.json
-rw-r--r--  1 jupyter jupyter    22920 Jul 26 22:51 pipeline_1544.json
-rw-r--r--  1 jupyter jupyter    13731 Jul 26 21:28 pipeline_1891.json
-rw-r--r--  1 jupyter jupyter    10373 Jul 25 20:41 pipeline_2265.json
-rw-r--r--  1 jupyter jupyter    10352 Jul 26 17:13 pipeline_2426.json
-rw-r--r--  1 jupyter jupyter    10332 Jul 26 16:49 pipeline_2458.json
-rw-r--r--  1 jupyter jupyter    13748 Jul 26 20:11 pipeline_3517.json
-rw-r--r--  1 jupyter jupyter    13731 Jul 26 21:32 pipeline_4152.json
-rw-r--r--  1 jupyter jupyter    10332 Jul 26 16:46 pipeline_6117.json
-rw-r--r--  1 j

In [9]:
# Empty the scratch directory with pathlib/shutil rather than `rm -rf <dir>/*`.
# With the shell form, IPython leaves `$LOCAL_SCRATCH_DIR` untouched when the Python
# variable is undefined (e.g. this cell is run on a fresh kernel), and the shell then
# expands the command to `rm -rf /*`. Here an undefined name raises NameError instead.
# As with the shell glob, hidden entries (names starting with ".") are left in place.
if LOCAL_SCRATCH_DIR.is_dir():
    # Materialize the listing first so entries are not removed mid-iteration.
    for scratch_entry in list(LOCAL_SCRATCH_DIR.iterdir()):
        if scratch_entry.name.startswith("."):
            continue
        if scratch_entry.is_dir() and not scratch_entry.is_symlink():
            shutil.rmtree(scratch_entry)
        else:
            scratch_entry.unlink()

#### 3.2. Download the jars/files to be baked into the container image

In [10]:
# Fetch the two files the Dockerfile COPYs into the image:
# the Spark BigQuery connector jar and the Miniconda installer.
!gsutil cp gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar $LOCAL_SCRATCH_DIR/
!wget -P $LOCAL_SCRATCH_DIR https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh

Copying gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar...
- [1 files][ 33.8 MiB/ 33.8 MiB]                                                
Operation completed over 1 objects/33.8 MiB.                                     
--2022-07-27 13:56:01--  https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 66709754 (64M) [application/x-sh]
Saving to: ‘docker-build-scratch/Miniconda3-py39_4.10.3-Linux-x86_64.sh’


2022-07-27 13:56:02 (70.9 MB/s) - ‘docker-build-scratch/Miniconda3-py39_4.10.3-Linux-x86_64.sh’ saved [66709754/66709754]



#### 3.3. Create Dockerfile and persist to local scratch directory

In [11]:
# Remove any Dockerfile left in the scratch directory before it is regenerated.
!rm -rf $LOCAL_SCRATCH_DIR/Dockerfile

In [12]:
# Dockerfile for the custom Serverless Spark container image, kept as a Python string.
# NOTE: this is a regular (non-raw) string, so every backslash-newline below is
# consumed by Python as a line continuation: each multi-line RUN command is
# written to the Dockerfile as one long line.
dockerFileContent = """
# Debian 11 is recommended.
FROM debian:11-slim

# Suppress interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# (Required) Install utilities required by Spark scripts.
RUN apt update && apt install -y procps tini

# (Optional) Add extra jars.
ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}"

# (Optional) Install and configure Miniconda3.
ENV CONDA_HOME=/opt/miniconda3
ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
ENV PATH=${CONDA_HOME}/bin:${PATH}
COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \
  && ${CONDA_HOME}/bin/conda config --system --set always_yes True \
  && ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
  && ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
  && ${CONDA_HOME}/bin/conda config --system --set channel_priority strict

# (Optional) Install Conda packages.
#
# The following packages are installed in the default image, it is strongly
# recommended to include all of them.
#
# Use mamba to install packages quickly.
RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \
    && ${CONDA_HOME}/bin/mamba install \
      conda \
      cython \
      fastavro \
      fastparquet \
      gcsfs \
      google-cloud-bigquery-storage \
      google-cloud-bigquery[pandas] \
      google-cloud-bigtable \
      google-cloud-container \
      google-cloud-datacatalog \
      google-cloud-dataproc \
      google-cloud-datastore \
      google-cloud-language \
      google-cloud-logging \
      google-cloud-monitoring \
      google-cloud-pubsub \
      google-cloud-redis \
      google-cloud-spanner \
      google-cloud-speech \
      google-cloud-storage \
      google-cloud-texttospeech \
      google-cloud-translate \
      google-cloud-vision \
      koalas \
      matplotlib \
      mleap \
      nltk \
      numba \
      numpy \
      openblas \
      orc \
      pandas \
      pyarrow \
      pysal \
      pytables \
      python \
      regex \
      requests \
      rtree \
      scikit-image \
      scikit-learn \
      scipy \
      seaborn \
      sqlalchemy \
      sympy \
      virtualenv

# (Optional) Add extra Python modules.
#ENV PYTHONPATH=/opt/python/packages
#RUN mkdir -p "${PYTHONPATH}"
#COPY test_util.py "${PYTHONPATH}"

# (Optional) Install R and R libraries.
RUN apt update \
  && apt install -y gnupg \
  && apt-key adv --no-tty \
      --keyserver "hkp://keyserver.ubuntu.com:80" \
      --recv-keys 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 \
  && echo "deb http://cloud.r-project.org/bin/linux/debian bullseye-cran40/" \
      >/etc/apt/sources.list.d/cran-r.list \
  && apt update \
  && apt install -y \
      libopenblas-base \
      libssl-dev \
      r-base \
      r-base-dev \
      r-recommended \
      r-cran-blob

ENV R_HOME=/usr/lib/R

# (Required) Create the 'spark' group/user.
# The GID and UID must be 1099. Home directory is required.
RUN groupadd -g 1099 spark
RUN useradd -u 1099 -g 1099 -d /home/spark -m spark
USER spark

"""

# Persist the Dockerfile into the scratch directory that is used as the build context.
# The `with` block closes the file on exit, so no explicit close() call is needed;
# the encoding is pinned so the written bytes do not depend on the locale.
with open(LOCAL_SCRATCH_DIR / "Dockerfile", "w", encoding="utf-8") as f:
    f.write(dockerFileContent)

In [13]:
# Check that the Dockerfile, the connector jar, and the Miniconda installer are all in the build context.
!ls -al $LOCAL_SCRATCH_DIR

total 99720
drwxrwxrwx  3 jupyter jupyter     4096 Jul 27 13:56 .
drwxr-xr-x 12 jupyter jupyter     4096 Jul 27 04:12 ..
drwxr-xr-x  2 jupyter jupyter     4096 Jul 25 18:47 .ipynb_checkpoints
-rw-r--r--  1 jupyter jupyter     3075 Jul 27 13:56 Dockerfile
-rw-r--r--  1 jupyter jupyter 66709754 Jul 21  2021 Miniconda3-py39_4.10.3-Linux-x86_64.sh
-rw-r--r--  1 jupyter jupyter 35385228 Jul 27 13:56 spark-bigquery-with-dependencies_2.12-0.22.2.jar


#### 3.4. Build the image and push it to the container registry (gcr.io)

In [14]:
# Alternative (disabled): submit the scratch directory to Cloud Build instead of
# building locally with docker. Uncomment to use.
#!gcloud builds submit --tag $DOCKER_IMAGE_FQN $LOCAL_SCRATCH_DIR --machine-type=N1_HIGHCPU_32 --timeout=900s --verbosity=info

In [15]:
!gcloud auth configure-docker us-central1-docker.pkg.dev -q


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


In [16]:
# Show the fully qualified image name that the build and push cells use.
print(DOCKER_IMAGE_FQN)

gcr.io/s8s-spark-ml-mlops/s8s-sparkml-serve:1.0.3


In [17]:
# Build the image - started at 1:50 PM
!cd $LOCAL_SCRATCH_DIR && docker build . --progress=tty -f Dockerfile -t $DOCKER_IMAGE_FQN

Sending build context to Docker daemon  102.1MB
Step 1/18 : FROM debian:11-slim
 ---> f8f4b4b67518
Step 2/18 : ENV DEBIAN_FRONTEND=noninteractive
 ---> Using cache
 ---> 72e2721130e4
Step 3/18 : RUN apt update && apt install -y procps tini
 ---> Using cache
 ---> 5c583e67cb2b
Step 4/18 : ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
 ---> Using cache
 ---> 2b07abc94b1a
Step 5/18 : ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
 ---> Using cache
 ---> a6241159484d
Step 6/18 : RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
 ---> Using cache
 ---> 26fbf7739ffa
Step 7/18 : COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}"
 ---> Using cache
 ---> 38b8b6757804
Step 8/18 : ENV CONDA_HOME=/opt/miniconda3
 ---> Using cache
 ---> e2c5e418255f
Step 9/18 : ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
 ---> Using cache
 ---> 69df30d56c7f
Step 10/18 : ENV PATH=${CONDA_HOME}/bin:${PATH}
 ---> Using cache
 ---> a7d5a470069b
Step 11/18 : COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
 --

In [18]:
# Push the tagged image to its registry. This requires an image tagged
# DOCKER_IMAGE_FQN to exist locally, i.e. the build must have completed.
!docker push $DOCKER_IMAGE_FQN

The push refers to repository [gcr.io/s8s-spark-ml-mlops/s8s-sparkml-serve]
An image does not exist locally with the tag: gcr.io/s8s-spark-ml-mlops/s8s-sparkml-serve


In [20]:
# List local images to check whether the expected repository:tag is present.
!docker image list

REPOSITORY                                                     TAG       IMAGE ID       CREATED         SIZE
gcr.io/s8s-spark-ml-mlops/dataproc_serverless_custom_runtime   1.0.2     36bbe54c9077   4 days ago      6.71GB
gcr.io/s8s-spark-ml-mlops/dataproc_serverless_custom_runtime   1.0.0     275372c9c459   4 days ago      6.71GB
<none>                                                         <none>    89dfecaa665e   4 days ago      5.34GB
debian                                                         11-slim   f8f4b4b67518   2 weeks ago     80.4MB
gcr.io/inverting-proxy/agent                                   <none>    fe507176d0e6   17 months ago   1.73GB
