In [12]:
# PARAMETERIZED VARIABLES
# Values expected to change from run to run. Resources that live in the same
# GCP project are derived from PROJECT_ID so the project name is written once
# and cannot drift between the image tag and the BigQuery table names.
PROJECT_ID = "project-for-sda-development"
PYSPARK_SCRIPT = "gs://vertex-testing-poc-123/generic_lookalike/pyspark_script.py"
CUSTOM_IMAGE_TAG = f"asia-southeast2-docker.pkg.dev/{PROJECT_ID}/dataproc-image/my-image:1.0.1"

# Each key/value pair is expanded into a `--<key> <value>` argument pair for the
# PySpark script (see SCRIPT_ARG). Multi-valued entries are encoded as
# comma-separated strings; how the script splits them is not visible here.
PARAMETERS = {
    "source_table": f"{PROJECT_ID}.dataset_test.audience_table",
    "target_table": f"{PROJECT_ID}.dataset_test.generic_lookalike_test",
    "feature_list": "total_arpu,data_usage_in_mb,total_topups",
    "msisdn_list": "1,2,3,4,5,6,7,8,9,10,27",
}

In [41]:
from datetime import datetime

# Infrastructure settings shared by the Vertex AI pipeline and the Dataproc batch.
REGION = "asia-southeast2"
SERVICE_ACCOUNT = "serviceaccountnyavertex@project-for-sda-development.iam.gserviceaccount.com"
# The region segment follows REGION so the subnetwork and the job location
# cannot disagree. The project segment stays literal so this cell remains
# runnable on its own.
SUBNETWORK_URI = f"projects/project-for-sda-development/regions/{REGION}/subnetworks/default"

# Cloud Storage locations, all derived from the single bucket name.
BUCKET_NAME = "vertex-testing-poc-123"
BUCKET_URI = f"gs://{BUCKET_NAME}"
# Bucket-relative path: note there is no "gs://" scheme on this one.
BUCKET_TEMP = f"{BUCKET_NAME}/generic_lookalike/temp"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/generic_pyspark"

# Local wall-clock time (naive datetime.now()), fixed at the moment this cell
# runs; BATCH_ID only changes when this cell is executed again.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BATCH_ID = f"generic-pyspark-{TIMESTAMP}"

In [20]:
import google.cloud.aiplatform as aiplatform
from kfp import dsl
from kfp.v2 import compiler

In [21]:
# Set the SDK-wide defaults (project, region, staging bucket) used by the
# aiplatform calls made later in this notebook.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [22]:
# Flatten PARAMETERS into a command-line style list:
# {"a": "1", "b": "2"} -> ["--a", "1", "--b", "2"] (dict order is preserved).
SCRIPT_ARG = [
    token
    for name, setting in PARAMETERS.items()
    for token in (f"--{name}", setting)
]

In [23]:
import tempfile
# Local scratch directory; the compiled pipeline spec is written here before
# it is handed to PipelineJob. The value is machine-specific.
tmpdir = tempfile.gettempdir()
tmpdir  # bare expression so the notebook echoes the resolved path

'/var/folders/s4/8_vd44gd01569g443f_jqg1h0000gn/T'

In [42]:
# Pipeline definition with a single step: one DataprocPySparkBatchOp task.
#
# Parameters (supplied via PipelineJob parameter_values unless defaulted):
#   main_python_file_uri  URI of the PySpark entry-point script.
#   args                  Argument list forwarded to the op as `args`.
#   project_id            Forwarded to the op as `project`.
#   container_image       Forwarded to the op as `container_image`.
#   batch_id              Forwarded to the op; defaults to BATCH_ID as it was
#                         when this `def` executed.
#   location              Forwarded to the op; defaults to REGION.
#   service_account       Forwarded to the op; defaults to SERVICE_ACCOUNT.
#   bucket                Accepted but never read by the body.
#
# The subnetwork is taken from the module-level SUBNETWORK_URI, not from a
# pipeline parameter.
#
# NOTE(review): documented with comments instead of a docstring on purpose;
# KFP may copy a docstring into the compiled spec - confirm before converting.
@dsl.pipeline(name="generic-lookalike-pyspark")
def pipeline(
    main_python_file_uri: str,
    args: list,
    project_id: str,
    container_image: str,
    batch_id: str = BATCH_ID,
    location: str = REGION,
    service_account: str = SERVICE_ACCOUNT,
    bucket: str = BUCKET_TEMP,
):
    # Imported here, as in the original cell, so the component library is only
    # loaded when the pipeline function is actually traced.
    from google_cloud_pipeline_components.v1.dataproc import (
        DataprocPySparkBatchOp,
    )

    # Instantiating the op is what adds the step; the task handle is not used.
    DataprocPySparkBatchOp(
        project=project_id,
        location=location,
        batch_id=batch_id,
        main_python_file_uri=main_python_file_uri,
        args=args,
        container_image=container_image,
        service_account=service_account,
        subnetwork_uri=SUBNETWORK_URI,
    )


# Compile the pipeline function to a local JSON spec, then submit it to
# Vertex AI Pipelines and block until the run reaches a terminal state.
pipeline_package_path = f"{tmpdir}/pipeline.json"
compiler.Compiler().compile(pipeline_func=pipeline, package_path=pipeline_package_path)

# NOTE: from here on `pipeline` refers to the submitted PipelineJob, not the
# pipeline function defined above (the name is kept for backward compatibility).
pipeline = aiplatform.PipelineJob(
    display_name="generic-lookalike-pyspark",
    template_path=pipeline_package_path,
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False,
    parameter_values={
        "main_python_file_uri": PYSPARK_SCRIPT,
        "args": SCRIPT_ARG,
        "project_id": PROJECT_ID,
        "container_image": CUSTOM_IMAGE_TAG,
        # The default baked into the compiled spec is BATCH_ID, which is only
        # recomputed when the config cell is re-run. Re-running this cell alone
        # would resubmit the same Dataproc batch ID, so a fresh one is generated
        # at submission time using the same "generic-pyspark-<timestamp>" format.
        "batch_id": f"generic-pyspark-{datetime.now().strftime('%Y%m%d%H%M%S')}",
    },
)

pipeline.run()



Creating PipelineJob
PipelineJob created. Resource name: projects/960061420307/locations/asia-southeast2/pipelineJobs/generic-lookalike-pyspark-20221215155649
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/960061420307/locations/asia-southeast2/pipelineJobs/generic-lookalike-pyspark-20221215155649')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/asia-southeast2/pipelines/runs/generic-lookalike-pyspark-20221215155649?project=960061420307
PipelineJob projects/960061420307/locations/asia-southeast2/pipelineJobs/generic-lookalike-pyspark-20221215155649 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/960061420307/locations/asia-southeast2/pipelineJobs/generic-lookalike-pyspark-20221215155649 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/960061420307/locations/asia-southeast2/pipelineJobs/generic-lookalike-pyspark-20221215155649 current state:
PipelineState.PIPELINE_S