# Sample using Python SDK for Dataproc Serverless Spark
This notebook can be run in a Vertex AI Workbench user-managed notebook with a plain Python kernel.<br>
Docs: https://cloud.google.com/python/docs/reference/dataproc/latest/google.cloud.dataproc_v1.types.Batch <br>
BQ Connector jar versions: https://github.com/GoogleCloudDataproc/spark-bigquery-connector <br>
Dataproc Serverless Spark runtime versions: https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-versions

In [15]:
# Optional setup: uncomment and run once if the google-cloud-dataproc client library is not already installed.
#!python3 -m pip install google-cloud-dataproc

In [16]:
from google.cloud import dataproc_v1 as dataproc
import random
import string

In [17]:
# 1. Get project and UMSA details dynamically
# Each `name = !command` line runs a shell command and captures its output as a
# list of lines; element [0] is the first captured line.

# Project ID from the local gcloud configuration (stderr is discarded via 2>/dev/null).
project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = project_id_output[0]
print("Project ID: ", PROJECT_ID)
    
    
# Project number looked up from the project ID.
# NOTE(review): unlike the first command, stderr is not redirected here or below —
# confirm that a gcloud warning cannot end up as the first captured line.
project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
PROJECT_NBR = project_nbr_output[0]
print("Project Number: ", PROJECT_NBR)
    
# Account from the gcloud configuration; it is later set as the batch's service account.
umsa_output = !gcloud config list account --format "value(core.account)"
UMSA_FQN = umsa_output[0]
print("UMSA FQN: ", UMSA_FQN)

# Set the discovered project ID as the core/project property of the gcloud configuration.
!gcloud config set project $PROJECT_ID

Project ID:  gcp-scalable-ml-workshop
Project Number:  569379262211
UMSA FQN:  s8s-lab-sa@gcp-scalable-ml-workshop.iam.gserviceaccount.com
Updated property [core/project].


In [18]:
# 2. Variables
# Settings used by the cells below. Names of per-project resources are built
# by appending the project number.

# Region and networking
GCP_REGION = "us-central1"
SPARK_SERVERLESS_SUBNET = "spark-snet"
NETWORK_TAG = "dataproc"

# Versions: Serverless Spark runtime, custom container image tag, and the
# suffix/version part of the BigQuery connector package coordinates
SERVERLESS_SPARK_RUNTIME_VERSION = "1.0.23"
CUSTOM_CONTAINER_IMAGE_TAG = "1.0.0"
BQ_CONNECTOR_COORDS = "2.12:0.25.2"

# Per-project resources: code bucket, metastore service, history server cluster
CODE_BUCKET = f"s8s_code_bucket-{PROJECT_NBR}"
DPMS_NAME = f"s8s-dpms-{PROJECT_NBR}"
PERSISTENT_HISTORY_SERVER_NM = f"s8s-sphs-{PROJECT_NBR}"

In [19]:
# Generate a batch ID with a random numeric suffix
# The suffix doubles as the pipeline ID; the number of digits is set by
# randomizerCharLength.
randomizerCharLength = 10
PIPELINE_ID = "".join(
    random.choices(string.digits, k=randomizerCharLength)
)
BATCH_ID = f"customer-churn-model-preprocessing-{PIPELINE_ID}"
print(BATCH_ID)

customer-churn-model-preprocessing-4243706688


In [20]:
# 3. Create a client
# The batch controller client is pointed at the Dataproc endpoint for GCP_REGION.
client = dataproc.BatchControllerClient(
    client_options={"api_endpoint": f"{GCP_REGION}-dataproc.googleapis.com:443"}
)

In [21]:
# 4. Initialize request argument(s)
# Start from an empty Batch message; later cells assign its fields.
batch = dataproc.Batch()

In [22]:
# 5. App specifics
# ....
# Main Python file: entry point of the PySpark job
batch.pyspark_batch.main_python_file_uri = f"gs://{CODE_BUCKET}/pyspark/preprocessing.py"
# Dependencies: additional Python files supplied alongside the main file
batch.pyspark_batch.python_file_uris = [
    f"gs://{CODE_BUCKET}/pyspark/common_utils.py",
]

# Optional inputs, left disabled.
# NOTE(review): python_file_uris above is assigned a list; presumably these
# fields also expect a list of URIs rather than a single string — confirm
# against the PySparkBatch reference before enabling.
# Jars (optional)
#batch.pyspark_batch.jar_file_uris = f"gs://{CODE_BUCKET}/jars/"
# Files (optional)
#batch.pyspark_batch.file_uris = f"gs://{CODE_BUCKET}/code/files/"
# Archives (optional)
#batch.pyspark_batch.archive_uris = f"gs://{CODE_BUCKET}/archives/"

# Spark Application Args (optional): passed to the main Python file
batch.pyspark_batch.args = [
    f"--pipelineID={PIPELINE_ID}",
    f"--projectID={PROJECT_ID}",
    f"--projectNbr={PROJECT_NBR}",
    f"--displayPrintStatements={True}",
]

In [23]:
# 6. Runtime conf
# ....
# Serverless Spark runtime version
batch.runtime_config.version = f"{SERVERLESS_SPARK_RUNTIME_VERSION}"
# Custom container image, tagged with CUSTOM_CONTAINER_IMAGE_TAG
batch.runtime_config.container_image = (
    f"gcr.io/{PROJECT_ID}/customer_churn_image:{CUSTOM_CONTAINER_IMAGE_TAG}"
)
# Spark property naming the BigQuery connector package to load
batch.runtime_config.properties = {
    "spark.jars.packages": f"com.google.cloud.spark:spark-bigquery-with-dependencies_{BQ_CONNECTOR_COORDS}",
}

In [24]:
# 7. Env execution conf
# Docs: https://cloud.google.com/python/docs/reference/dataproc/latest/google.cloud.dataproc_v1.types.ExecutionConfig
# Service account for the batch
batch.environment_config.execution_config.service_account = f"{UMSA_FQN}"
# Subnetwork, given as a project/region-qualified path
batch.environment_config.execution_config.subnetwork_uri = (
    f"projects/{PROJECT_ID}/regions/{GCP_REGION}/subnetworks/{SPARK_SERVERLESS_SUBNET}"
)
# Network tags, supplied as a list
batch.environment_config.execution_config.network_tags = [
    f"{NETWORK_TAG}",
]

In [25]:
# 8. Env peripherals conf
# Metastore service, referenced by its full resource path
batch.environment_config.peripherals_config.metastore_service = (
    f"projects/{PROJECT_ID}/locations/{GCP_REGION}/services/{DPMS_NAME}"
)
# Spark history server config pointing at the persistent history server cluster
PHS = dataproc.SparkHistoryServerConfig(
    dataproc_cluster=f"projects/{PROJECT_ID}/regions/{GCP_REGION}/clusters/{PERSISTENT_HISTORY_SERVER_NM}"
)
batch.environment_config.peripherals_config.spark_history_server_config = PHS

In [26]:
# 9. Create a request
# Combine the configured batch with its parent (project + location) and batch ID.
request = dataproc.CreateBatchRequest(
    parent=f"projects/{PROJECT_ID}/locations/{GCP_REGION}",
    batch=batch,
    batch_id=f"{BATCH_ID}",
)

In [27]:
# 10. Submit batch
# Send the request through the client; the returned operation is resolved in the next cell.
operation = client.create_batch(request=request)

In [28]:
# Wait for the operation returned by create_batch and keep its result.
print("Waiting for operation to complete...")
response = operation.result()

Waiting for operation to complete...


In [29]:
# 11. Handle the response
# Print the result obtained from the completed operation.
print(response)

name: "projects/gcp-scalable-ml-workshop/locations/us-central1/batches/customer-churn-model-preprocessing-4243706688"
uuid: "342d5a0e-a66f-449c-b1ef-1c3bf4f25aa1"
create_time {
  seconds: 1670025001
  nanos: 910665000
}
pyspark_batch {
  main_python_file_uri: "gs://s8s_code_bucket-569379262211/pyspark/preprocessing.py"
  args: "--pipelineID=4243706688"
  args: "--projectID=gcp-scalable-ml-workshop"
  args: "--projectNbr=569379262211"
  args: "--displayPrintStatements=True"
  python_file_uris: "gs://s8s_code_bucket-569379262211/pyspark/common_utils.py"
}
runtime_info {
  endpoints {
    key: "Spark History Server"
    value: "https://torvjlsgyjb73jwsyuujcs3pei-dot-us-central1.dataproc.googleusercontent.com/sparkhistory/?eventLogDirFilter=342d5a0e-a66f-449c-b1ef-1c3bf4f25aa1"
  }
  output_uri: "gs://dataproc-staging-us-central1-569379262211-osxvqskd/google-cloud-dataproc-metainfo/7707c3b2-98c7-4d99-aabc-343d3341c871/jobs/srvls-batch-342d5a0e-a66f-449c-b1ef-1c3bf4f25aa1/driveroutput"
}
st