In [None]:
#####################################################################
#
# setup
#
#####################################################################

In [1]:
# imports required to run this notebook
import sys
from datetime import datetime
from typing import NamedTuple

from google.cloud import aiplatform as vertex
from google_cloud_pipeline_components.experimental import vertex_notification_email as gcc_exp

import kfp
from kfp.v2 import dsl, compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Metrics, Output, OutputPath, component)

In [2]:
# Notebook configuration: set the project and region for your environment.
PROJECT_ID = "your-project-id"
REGION = "us-central1"

# Cloud Storage locations, all derived from the project ID.
BUCKET_NAME = f"bkt-{PROJECT_ID}-vpipelines"
BUCKET_PATH = f"gs://{BUCKET_NAME}"
PIPELINE_ROOT = f"{BUCKET_PATH}/pipeline_root"
PIPELINE_DATA = f"{BUCKET_PATH}/data"

# Timestamp (YYYYmmddHHMMSS) captured when this cell is executed.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

In [None]:
#####################################################################
# BEGIN Vertex AI Pipelines
#####################################################################

In [None]:
#####################################################################
#
# create individual pipeline components, then specify the pipeline
#
#####################################################################

In [3]:
# Download BigQuery data and convert to CSV
@component(
    packages_to_install=["google-cloud-bigquery", "pandas", "pyarrow", "db-dtypes"],
    base_image="python:3.9",
    output_component_file="component_create_dataset.yaml"
)
def get_dataframe(
    bq_table: str,
    output_data_path: OutputPath("Dataset")
):
    """Export a BigQuery table to a shuffled CSV file.

    Args:
        bq_table: Table identifier parsed by ``TableReference.from_string``
            (e.g. ``project.dataset.table``).
        output_data_path: File path, supplied by the pipeline runtime, that
            the CSV is written to.

    Raises:
        KeyError: If the ``CLOUD_ML_PROJECT_ID`` environment variable is not set.
    """
    # Imports live inside the function because the component body is executed
    # on its own inside the container image.
    import os

    from google.cloud import bigquery

    # Project for the BigQuery client comes from the container environment
    # (expected to be provided by the Vertex AI runtime).
    project_number = os.environ["CLOUD_ML_PROJECT_ID"]
    bqclient = bigquery.Client(project=project_number)

    # Read the whole table; no row filter or column selection is applied.
    table = bigquery.TableReference.from_string(bq_table)
    rows = bqclient.list_rows(table)
    dataframe = rows.to_dataframe(create_bqstorage_client=True)

    # Shuffle all rows with a fixed seed so the row order is reproducible.
    dataframe = dataframe.sample(frac=1, random_state=2)

    # index=False: do not write the DataFrame index. Otherwise the original
    # row position is saved as an unnamed first column, and any consumer that
    # treats every non-label column as a feature would train on it.
    dataframe.to_csv(output_data_path, index=False)

In [29]:
# Create a component to train a Scikit-learn model
@component(
    # The PyPI distribution is "scikit-learn"; the "sklearn" name is a
    # deprecated placeholder package whose installation fails.
    packages_to_install=["scikit-learn", "pandas", "numpy", "joblib", "db-dtypes"],
    base_image="python:3.9",
    output_component_file="component_train_model.yaml",
)
def sklearn_train(
    dataset: Input[Dataset],
    metrics: Output[Metrics],
    model: Output[Model]
):
    """Cross-validate and train a gradient boosting classifier on a CSV dataset.

    Args:
        dataset: Input artifact whose ``path`` points to a CSV file. The CSV
            must contain a ``label`` column; every other column is used as a
            feature.
        metrics: Output artifact that receives the framework name, dataset
            size and cross-validation accuracy mean / standard deviation.
        model: Output artifact. The fitted model is saved with joblib at
            ``model.path + ".joblib"`` and ``model.uri`` is updated to match.

    Raises:
        KeyError: If the CSV has no ``label`` column.
    """
    # Imports live inside the function because the component body is executed
    # on its own inside the container image.
    from joblib import dump
    from numpy import mean, std
    import pandas as pd
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import RepeatedStratifiedKFold
    from sklearn.model_selection import cross_val_score

    # Split the frame into the target column and the remaining feature columns.
    df = pd.read_csv(dataset.path)
    labels = df.pop("label").tolist()
    data = df.values.tolist()

    # cross validation: 10 folds repeated 3 times; error_score='raise' makes a
    # failing fold abort the step instead of being recorded as NaN
    skmodel = GradientBoostingClassifier()
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(skmodel, data, labels, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

    # log cv metrics
    metrics.log_metric("framework", "Scikit Learn")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("CV_accuracy_mean", mean(n_scores))
    metrics.log_metric("CV_accuracy_stdev", std(n_scores))

    # fit a fresh model on the whole dataset
    skmodel = GradientBoostingClassifier()
    skmodel.fit(data, labels)

    # Save with a ".joblib" suffix, then point the artifact URI at that file.
    # The dump must happen before the URI changes so both refer to the same
    # location.
    dump(skmodel, model.path + ".joblib")
    model.uri = model.uri + ".joblib"

In [None]:
#####################################################################
#
# define the pipeline
#
#####################################################################

In [33]:
# define a pipeline
# Pipeline: export the BigQuery table to CSV, then train a model on it.
# The function arguments are the inputs the pipeline needs to run.
@dsl.pipeline(name="my-pipeline-name", description="my pipeline description")
def my_pipeline(
    bq_table: str = "",
    output_data_path: str = "data.csv",
    project_id: str = PROJECT_ID,
    region: str = REGION
):
    # Exit task: e-mails the pipeline status to the recipients below.
    recipients = ["adampilz@google.com"]
    email_on_exit = gcc_exp.VertexNotificationEmailOp(recipients=recipients)

    # The exit handler runs the notification task when the wrapped steps exit.
    with dsl.ExitHandler(email_on_exit):

        # Step 1: export the source table.
        export_step = get_dataframe(bq_table=bq_table)

        # Step 2: train on the exported dataset.
        train_step = sklearn_train(dataset=export_step.output)

        # Custom machine size; remove the two calls below to fall back to the
        # default machine type.
        train_step.set_cpu_limit('96')
        train_step.set_memory_limit('624G')

In [None]:
#####################################################################
#
# compile and run the pipeline
#
#####################################################################

In [34]:
# Compile the pipeline function into a job specification file on local disk.
my_package_path = "my_vertex_pipeline_specification_file.json"
compiler.Compiler().compile(
    pipeline_func=my_pipeline,
    package_path=my_package_path,
)

In [None]:
# Runtime parameters passed to the pipeline; keys correspond to the arguments
# of my_pipeline.
pipeline_params = {"bq_table": "ap-alto-ml-1000.my_ds.tabular_binary_class_even_split_slim"}

# Initialise the SDK with both project and region. Passing REGION explicitly
# makes the job follow the configured region instead of the SDK default.
vertex.init(project=PROJECT_ID, location=REGION)

# Build the job from the compiled specification. Caching is disabled so every
# step is re-executed on each run.
job = vertex.PipelineJob(
    display_name="my-pipeline-job-name",
    template_path=my_package_path,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=pipeline_params,
    enable_caching=False,
)

# Submit the pipeline job.
job.run()