In [None]:
# imports
import google.cloud.aiplatform as aiplatform
from datetime import datetime
from typing import NamedTuple

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, Metrics, Model, Output, component)

In [None]:
# Project-level settings. Replace PROJECT_ID with a real project before running;
# the bucket name and URI are derived from it.
PROJECT_ID = "your-project"
REGION = "us-central1"
BUCKET_NAME = "bkt-" + PROJECT_ID
BUCKET_URI = "gs://" + BUCKET_NAME

In [None]:
#####################################################################
#
# spark program
#
#####################################################################

In [None]:
%%writefile wordcount.py

"""A PySpark program that counts the number of words in Shakespeare."""

import argparse
import sys
from pyspark.sql import SparkSession

def run(argv=None):
    """Count word occurrences in a text file with Spark and save the counts.

    Args:
        argv: Command-line style argument list. ``--input`` and ``--output``
            are recognized; any other entries are ignored because parsing
            uses ``parse_known_args``. ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default=' ',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
                        help='Output file to write results to.')

    known_args, _ = parser.parse_known_args(argv)

    spark = SparkSession.builder.appName("wordcount").getOrCreate()
    try:
        sc = spark.sparkContext
        # split() with no separator splits on any run of whitespace and never
        # yields empty strings; split(" ") produced '' tokens for consecutive,
        # leading or trailing spaces, and those were counted as words.
        words = sc.textFile(known_args.input).flatMap(lambda line: line.split())
        word_counts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
        word_counts.saveAsTextFile(known_args.output)
    finally:
        # Stop the session even when the job fails.
        spark.stop()


if __name__ == '__main__':
    # Drop the script path; only the actual arguments are parsed.
    run(sys.argv[1:])

In [None]:
#####################################################################
#
# dask program
#
#####################################################################

In [None]:
%%writefile daskcompute.py

"""A dask program that runs on dataproc serverless."""

# dask
import dask.array as da
import time

def run(argv=None):
    """Sum a large random dask array and print the total and elapsed seconds.

    Args:
        argv: Accepted for signature compatibility; not used.
    """
    started = time.time()
    # 1,000,000 x 1,000,000 random array, split into 1000 x 1000 chunks.
    matrix = da.random.random((1000000, 1000000), chunks=(1000, 1000))
    total = matrix.sum().compute()
    print(f"ADAM: {total!s}")
    # Elapsed wall-clock time covers graph construction, the compute call and
    # the first print above.
    print(f"ADAM: {time.time() - started!s}")


if __name__ == "__main__":
    run()

In [None]:
# Copy the dask script to Cloud Storage; the pipeline below passes this URI as
# main_python_file_uri.
# NOTE: despite the "WC" (wordcount) in the name, GCS_WC_PY points at
# daskcompute.py; wordcount.py is not uploaded by this cell.
GCS_WC_PY = BUCKET_URI + "/daskcompute.py"
! gsutil cp daskcompute.py $GCS_WC_PY

# daskcompute.py never reads its arguments, so these values do not affect the
# job itself. They must still be defined: the ARGS list in the next cell
# references both names.
GCS_WC_OUT = BUCKET_URI + "/wc_out/"
GCS_WC_IN = "gs://dataproc-datasets-us-central1/shakespeare/all-lines.txt"

In [None]:
# Per-run identifiers: the timestamp is embedded in the batch id.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/dataproc_pyspark"
BATCH_ID = f"ap-dask-dataproc-serverless-{TIMESTAMP}"

# Arguments forwarded to the main Python file of the batch.
ARGS = ["--input", GCS_WC_IN, "--output", GCS_WC_OUT]

In [None]:
# Pipeline definition: a single DataprocPySparkBatchOp step that runs the
# Python file at `main_python_file_uri`.
@dsl.pipeline(
    name="dataproc-pyspark-with-dask",
    description="An example pipeline that uses DataprocPySparkBatchOp for running a PySpark batch workload.",
)
def pipeline(
    batch_id: str = BATCH_ID,
    project_id: str = PROJECT_ID,
    location: str = REGION,
    main_python_file_uri: str = GCS_WC_PY,
    #service_account: str = SERVICE_ACCOUNT,
    args: list = ARGS,
):
    """Run one Dataproc PySpark batch.

    Args:
        batch_id: Identifier passed to the batch op as ``batch_id``.
        project_id: Project passed to the batch op as ``project``.
        location: Region passed to the batch op as ``location``.
        main_python_file_uri: URI of the Python file the batch executes.
        args: Argument list forwarded to the main Python file.
    """
    # Imported here rather than in the notebook's import cell, so the
    # dependency is only needed when the pipeline function is evaluated.
    from google_cloud_pipeline_components.experimental.dataproc import \
        DataprocPySparkBatchOp

    # The op's outputs are not consumed by any later step, hence `_`.
    _ = DataprocPySparkBatchOp(
        project=project_id,
        location=location,
        batch_id=batch_id,
        main_python_file_uri=main_python_file_uri,
        # Left disabled: SERVICE_ACCOUNT is not defined anywhere in this notebook.
        #service_account=service_account,
        args=args,
    )


# Compile the pipeline function into a job spec on local disk.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json")

# Build the Vertex AI job from the compiled spec.
# NOTE: this rebinds `pipeline` from the decorated function to the job object;
# the name is kept because the following cell calls `pipeline.run()`.
pipeline = aiplatform.PipelineJob(
    display_name="pipeline",
    template_path="pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
    # Pin the job to the notebook's project/region instead of relying on
    # whatever defaults the SDK picks up from the environment.
    project=PROJECT_ID,
    location=REGION,
)

In [None]:
# Submit the job. At this point `pipeline` is the aiplatform.PipelineJob
# created in the previous cell, not the @dsl.pipeline function.
pipeline.run()