## First Time Installation

Install the Vertex AI SDK for Python and the other dependencies at the pinned versions below. Run this cell only the first time you use a new notebook instance.

In [1]:
# ! pip3 install $USER google-cloud-aiplatform==1.16.0 --upgrade
# ! pip3 install $USER google-cloud-bigquery==2.34.4 --upgrade
# ! pip3 install $USER google-cloud-bigquery-storage==2.13.2 --upgrade
# ! pip3 install $USER google-cloud-storage==1.44.0 --upgrade
# ! pip3 install $USER kfp==1.8.13 --upgrade
# ! pip3 install $USER google-cloud-pipeline-components==1.0.14 --upgrade
# ! pip3 install $USER icecream==2.1.1 --upgrade
# ! pip3 install $USER pandas-gbq==0.15.0 --upgrade
# ! pip3 install $USER google-cloud-secret-manager --upgrade
# ! pip3 install $USER google-cloud-pubsub==2.13.4 --upgrade

# import os
# if not os.getenv("IS_TESTING"):
#     # Automatically restart kernel after installs
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

# # Check versions
# ! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# ! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Load your pipeline components and configs

In [None]:
%load_ext autoreload
%autoreload 2

# Related Python packages
from icecream import ic
from kfp.v2 import dsl

# Components
from ml_components.dataimport import (
    get_rundates,
    get_period,
    get_import_query,
    bq_query_no_return,
    bq_query_to_dataframe,
    get_previous_dataset_from_model_league,
    
)
from ml_components.datacheck import (
    printing,
    grand_drift_check,
    collect_check_result,
)
from ml_components.datapreproc import (
    data_preprocess,
    data_split,
)
from ml_components.modelling import (
    get_champion_model,
    candidate_generation_train,
    candidate_generation_prediction,
    ranking,
)
from ml_components.outbound import (
    generate_bq_table_from_gsc,
    update_model_league,
)
from ml_components.alert import (
    push_slack_notification,
)
from ml_components.pipelinehelper import (
    func_op,
    save_pipeline,
    run_pipeline,
)

# Configs
from config import (
    SERVICE_ACCOUNT,
    PROJECT_ID,
    REGION,
    RUNNER,
    PIPELINE_NAME,
    BUCKET_NAME,
    PIPELINE_ROOT,
    USE_VAIEXP,
    PARAMETER_VALUES,
    parameter_checks
)
# Call the `parameter_checks` helper imported from `config` before the pipeline
# is defined. NOTE(review): presumably validates the config values imported
# above and fails fast on a bad setup -- confirm in config.py.
parameter_checks()

## Define your pipeline
```
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def your_pipeline(
    parameter_value: data_type,
):
    ops_1 = func_op(
        func=function,
        _component_human_name='your ops 1 label',
        base_image='python:3.7', #optional
        function_arg=parameter_value,
    )
    
    ops_2 = func_op(
        func=function,
        _component_human_name='your ops 2 label',
        packages_to_install=[
            'pandas==1.3.3'
        ],
        function_arg=ops_1.outputs['output'],
    )
```

#### Choose your desired GCP pre-built image for your node(s)
- https://cloud.google.com/deep-learning-containers/docs/choosing-container
- `! gcloud container images list --repository="gcr.io/deeplearning-platform-release"`
- You can also simply use a plain `python:3.x` image if you prefer.

In [3]:
# grand_pipeline: KFP pipeline definition for the recommender use case.
#
# The function body only *declares* tasks (each `func_op(...)` call wraps one
# function imported from `ml_components` into a pipeline task) and wires their
# outputs together; nothing is executed here. Stages, in declaration order:
#   1. Rundate        - get_rundates
#   2. Import data    - get_period, get_import_query, three bq_query_to_dataframe
#                       tasks (user / history / catalog) and two
#                       get_previous_dataset_from_model_league tasks
#   3. Preprocessing  - data_preprocess on the history dataset, then data_split
#   4. Drift check    - grand_drift_check for user and catalog, merged by
#                       collect_check_result
#   5. Modelling      - only when collect_check_result outputs
#                       retrain_model == 'true': export datasets with
#                       generate_bq_table_from_gsc, get_champion_model,
#                       candidate_generation_train / _prediction, ranking
#   6. Outbound       - update_model_league (also inside the condition block)
# Finally, caching is switched off for the tasks listed in `nodes_no_cache`.
#
# Pipeline parameters and where they are forwarded:
#   bucket_name                      -> candidate_generation_train
#   project_id, project_name, runner -> the BigQuery / model-league components
#   region                           -> passed as `location` to those components
#   job_id, commit_short_sha,
#   mlops_topic                      -> update_model_league
#   run_date                         -> get_rundates
#   seed, train_size                 -> data_split
#   embedding_dimension              -> candidate_generation_train
#   catalog_col, user_id_col         -> candidate generation (train + prediction)
#                                       and ranking
#   product_score, r_model_params    -> ranking
#   cg_model_params                  -> candidate generation (train + prediction)
#   drop_columns, numerical_columns  -> data_preprocess
#   feature_importance_dict_str,
#   *_partition_threshold,
#   category_threshold, delta        -> both grand_drift_check tasks
#   is_endpoint                      -> ranking and update_model_league
# NOTE(review): several parameters are typed `str` although their names suggest
# structured content (e.g. drop_columns, *_model_params) -- presumably
# serialized values decoded inside the components; confirm in ml_components.
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def grand_pipeline(
    bucket_name: str,
    project_id: str,
    project_name: str,
    region: str,
    job_id: str,
    run_date: str,
    seed: int,
    embedding_dimension: int,
    train_size: float,
    catalog_col: str,
    user_id_col: str,
    product_score: str,
    drop_columns: str,
    numerical_columns: str,
    feature_importance_dict_str: str,
    numerical_drift_partition_threshold: float,
    numerical_importance_partition_threshold: float,
    categorical_drift_partition_threshold: float,
    categorical_importance_partition_threshold: float,
    category_threshold: int,
    delta: int,
    cg_model_params: str,
    r_model_params: str,
    is_endpoint: bool,
    mlops_topic: str,
    runner: str,
    commit_short_sha: str,
):
    ########################################################################################################################
    ########################################################################################################################
    ################################################### Ops Declaration ####################################################
    ########################################################################################################################
    ########################################################################################################################

    
    ###########################################################
    ######################### Rundate #########################
    ###########################################################

    # Derive the run dates used by the rest of the pipeline from `run_date`.
    rundates_op = func_op(
        func=get_rundates,
        _component_human_name='get_rundates',
        base_image='python:3.7',
        packages_to_install=['pandas==1.3.3'],
        run_date=run_date,
    )

    # Two outputs are reused below: `usage_run_date` drives the import period,
    # `src_run_dt` is attached to the exported tables and the model league update.
    usage_run_date = rundates_op.outputs['usage_run_date']
    src_run_dt = rundates_op.outputs['src_run_dt']

    ###########################################################
    ####################### Import Data #######################
    ###########################################################

    # Produces `train_start_date` / `train_end_date` consumed by the query builder.
    period_op = func_op(
        func=get_period,
        _component_human_name='get_period',
        base_image='python:3.7',
        packages_to_install=['pandas==1.3.3'],
        run_date=usage_run_date,
    )

    # Get queries
    # (one task emits all three queries: user_query, history_query, catalog_query)
    import_query_op = func_op(
        func=get_import_query,
        _component_human_name='get_import_query',
        base_image='python:3.7',
        start_date=period_op.outputs['train_start_date'],
        end_date=period_op.outputs['train_end_date'],
    )

    # Get datasets using queries
    # The same component is instantiated three times; only the display name and
    # the query output it consumes differ.
    user_dataset_op = func_op(
        func=bq_query_to_dataframe,
        _component_human_name='get_user_dataset',
        base_image='python:3.7',
        project_id=project_id,
        cpu_limit='2',
        memory_limit='32G',
        query=import_query_op.outputs['user_query'],
    )

    history_dataset_op = func_op(
        func=bq_query_to_dataframe,
        _component_human_name='get_history_dataset',
        base_image='python:3.7',
        project_id=project_id,
        cpu_limit='2',
        memory_limit='32G',
        query=import_query_op.outputs['history_query'],
    )
    catalog_dataset_op = func_op(
        func=bq_query_to_dataframe,
        _component_human_name='get_catalog_dataset',
        base_image='python:3.7',
        project_id=project_id,
        cpu_limit='2',
        memory_limit='32G',
        query=import_query_op.outputs['catalog_query'],
    )

    # Get previous datasets using model league
    # These provide the `dataset_q` side of the drift checks below; the two
    # tasks differ only in `dataset_type` ('user' vs 'catalog').
    prev_user_dataset_op = func_op(
        func=get_previous_dataset_from_model_league,
        _component_human_name='get_prev_user_dataset',
        base_image='python:3.7',
        packages_to_install=['pandas-gbq==0.15.0'],
        cpu_limit='2',
        memory_limit='32G',
        project_id=project_id,
        project_name=project_name,
        dataset_type='user',
        runner=runner,
        location=region,
    )
    prev_catalog_dataset_op = func_op(
        func=get_previous_dataset_from_model_league,
        _component_human_name='get_prev_catalog_dataset',
        base_image='python:3.7',
        packages_to_install=['pandas-gbq==0.15.0'],
        cpu_limit='2',
        memory_limit='32G',
        project_id=project_id,
        project_name=project_name,
        dataset_type='catalog',
        runner=runner,
        location=region,
    )


    ###########################################################
    #################### Data Preprocessing ###################
    ###########################################################

    # Only the history dataset is preprocessed and split; the user and catalog
    # datasets are passed to later tasks as loaded.
    history_data_preprocess_op = func_op(
        func=data_preprocess,
        _component_human_name='history_data_preprocess',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        cpu_limit='4',
        memory_limit='32G',
        drop_columns=drop_columns,
        numerical_columns=numerical_columns,
        dataset=history_dataset_op.outputs['dataset'],
    )

    # Emits `train_dataset` and `test_dataset`, both consumed in the modelling block.
    data_split_op = func_op(
        func=data_split,
        _component_human_name='data_split',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        packages_to_install=[
            'imbalanced-learn==0.9.0',
        ],
        cpu_limit='4',
        memory_limit='32G',
        seed=seed,
        train_size=train_size,
        processed_dataset=history_data_preprocess_op.outputs['processed_dataset'],
    )


    ###########################################################
    #################### Data Drift Check #####################
    ###########################################################

    # Each check compares the freshly imported dataset (`dataset_p`) with the
    # previous one from the model league (`dataset_q`). Both tasks receive the
    # same thresholds; only the datasets and `mode` differ.
    user_drift_check_op = func_op(
        func=grand_drift_check,
        _component_human_name='user_drift_check',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        cpu_limit='4',
        memory_limit='32G',
        project_name=project_name,
        dataset_p=user_dataset_op.outputs['dataset'],
        dataset_q=prev_user_dataset_op.outputs['previous_dataset'],
        feature_importance_dict_str=feature_importance_dict_str,
        numerical_drift_partition_threshold=numerical_drift_partition_threshold,
        numerical_importance_partition_threshold=numerical_importance_partition_threshold,
        categorical_drift_partition_threshold=categorical_drift_partition_threshold,
        categorical_importance_partition_threshold=categorical_importance_partition_threshold,
        category_threshold=category_threshold,
        delta=delta,
        mode='user',
    )
    catalog_drift_check_op = func_op(
        func=grand_drift_check,
        _component_human_name='catalog_drift_check',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        cpu_limit='4',
        memory_limit='32G',
        project_name=project_name,
        dataset_p=catalog_dataset_op.outputs['dataset'],
        dataset_q=prev_catalog_dataset_op.outputs['previous_dataset'],
        feature_importance_dict_str=feature_importance_dict_str,
        numerical_drift_partition_threshold=numerical_drift_partition_threshold,
        numerical_importance_partition_threshold=numerical_importance_partition_threshold,
        categorical_drift_partition_threshold=categorical_drift_partition_threshold,
        categorical_importance_partition_threshold=categorical_importance_partition_threshold,
        category_threshold=category_threshold,
        delta=delta,
        mode='catalog',
    )
    # Merges the two `drift_status` outputs into the single `retrain_model`
    # output that gates the modelling block.
    collect_check_result_op = func_op(
        func=collect_check_result,
        _component_human_name='collect_check_result',
        base_image='python:3.7',
        cpu_limit='2',
        memory_limit='16G',
        user_drift_check=user_drift_check_op.outputs['drift_status'],
        catalog_drift_check=catalog_drift_check_op.outputs['drift_status'],
    )

    ###########################################################
    ######################## Modelling ########################
    ###########################################################

    # Every task declared inside this block (including the Outbound section) is
    # conditional on `retrain_model` being the string 'true'.
    with dsl.Condition(
        collect_check_result_op.outputs['retrain_model'] == 'true',
        name="modelling"
    ):
    # Disabled Slack alert, kept for reference. As written it could not simply be
    # re-enabled: it refers to `data_check_result_op` and `failure_webhook_config`,
    # neither of which is defined in this function.
    #     data_check_result_fail_op = func_op(
    #         func=push_slack_notification,
    #         _component_human_name='data_check_result_fail',
    #         base_image='python:3.7',
    #         packages_to_install=['google-cloud-secret-manager'],
    #         job_id=job_id,
    #         src_run_dt=src_run_dt,
    #         text=data_check_result_op.outputs['alert_msg'],
    #         channel='#your-project-name-internal',
    #         webhook_config_str=failure_webhook_config,
    #         runner=runner,
    #     )

        # Export the user, catalog and train datasets with the same component;
        # the resulting `bq_path` outputs are recorded by update_model_league.
        # The three tasks differ only in `table_name` and `dataset_to_save`.
        user_dataset_to_bq_op = func_op(
            func=generate_bq_table_from_gsc,
            _component_human_name='user_dataset_to_bq',
            base_image='python:3.7',
            cpu_limit='2',
            memory_limit='8G',
            retry=3,
            project_id=project_id,
            project_name=project_name,
            dataset_id='MLOPS_TRAIN_DATASET',
            runner=runner,
            table_name='user',
            src_run_dt=src_run_dt,
            dataset_format='PARQUET',
            location=region,
            dataset_to_save=user_dataset_op.outputs['dataset'],
        )
        catalog_dataset_to_bq_op = func_op(
            func=generate_bq_table_from_gsc,
            _component_human_name='catalog_dataset_to_bq',
            base_image='python:3.7',
            cpu_limit='2',
            memory_limit='8G',
            retry=3,
            project_id=project_id,
            project_name=project_name,
            dataset_id='MLOPS_TRAIN_DATASET',
            runner=runner,
            table_name='catalog',
            src_run_dt=src_run_dt,
            dataset_format='PARQUET',
            location=region,
            dataset_to_save=catalog_dataset_op.outputs['dataset'],
        )
        train_dataset_to_bq_op = func_op(
            func=generate_bq_table_from_gsc,
            _component_human_name='train_dataset_to_bq',
            base_image='python:3.7',
            cpu_limit='2',
            memory_limit='8G',
            retry=3,
            project_id=project_id,
            project_name=project_name,
            dataset_id='MLOPS_TRAIN_DATASET',
            runner=runner,
            table_name='train',
            src_run_dt=src_run_dt,
            dataset_format='PARQUET',
            location=region,
            dataset_to_save=data_split_op.outputs['train_dataset'],
        )

        # Supplies the current champion's artefacts (is_cold_start,
        # embedding_dimension, weight path, unique-id datasets) to training.
        model_champion_op = func_op(
            func=get_champion_model,
            _component_human_name='get_champion_model',
            base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
            packages_to_install=[
                'pandas-gbq==0.15.0',
            ],
            project_id=project_id,
            project_name=project_name,
            runner=runner,
            location=region,
        )

        # Training receives both the requested `embedding_dimension` and the
        # champion's; its `best_model` / `chosen_*` outputs are what downstream
        # tasks consume.
        candidate_generation_train_op = func_op(
            func=candidate_generation_train,
            _component_human_name='candidate_generation_train',
            base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
            packages_to_install=[
                'tensorflow==2.9.1',
                'tensorflow-recommenders==0.6.0',
                'scann==1.2.7',
            ],
            cpu_limit='4',
            memory_limit='32G',
            is_cold_start=model_champion_op.outputs['is_cold_start'],
            bucket_name=bucket_name,
            model_params=cg_model_params,
            user_id_col=user_id_col,
            catalog_col=catalog_col,
            embedding_dimension=embedding_dimension,
            champion_embedding_dimension=model_champion_op.outputs['embedding_dimension'],
            cg_model_weight_path=model_champion_op.outputs['cg_model_weight_path'],
            catalog_dataset=catalog_dataset_op.outputs['dataset'],
            train_dataset=data_split_op.outputs['train_dataset'],
            test_dataset=data_split_op.outputs['test_dataset'],
            champion_unique_catalog_ids_dataset=model_champion_op.outputs['champion_unique_catalog_ids_dataset'],
            champion_unique_user_ids_dataset=model_champion_op.outputs['champion_unique_user_ids_dataset'],

        )

        # Uses the index object produced by training together with the user
        # dataset; outputs `cg_results_dataset` for the ranking task.
        candidate_generation_prediction_op = func_op(
            func=candidate_generation_prediction,
            _component_human_name='candidate_generation_prediction',
            base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
            packages_to_install=[
                'tensorflow==2.9.1',
                'tensorflow-recommenders==0.6.0',
                'scann==1.2.7',
                'joblib==1.1.0',
                'tqdm==4.63.0',
            ],
            cpu_limit='4',
            memory_limit='32G',
            user_id_col=user_id_col,
            catalog_col=catalog_col,
            model_params=cg_model_params,
            user_dataset=user_dataset_op.outputs['dataset'],
            cg_index_object=candidate_generation_train_op.outputs['cg_index_object'],
        )

        # Ranking consumes the candidate-generation results plus the train/test
        # split, and uses its own parameter set (`r_model_params`).
        ranking_op = func_op(
            func=ranking,
            _component_human_name='ranking',
            base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
            packages_to_install=[
                'tensorflow==2.9.1',
                'tensorflow-recommenders==0.6.0',
                'joblib==1.1.0',
                'tqdm==4.63.0',
            ],
            cpu_limit='4',
            memory_limit='32G',
            is_endpoint=is_endpoint,
            best_model=candidate_generation_train_op.outputs['best_model'],
            model_params=r_model_params,
            user_id_col=user_id_col,
            catalog_col=catalog_col,
            product_score=product_score,
            chosen_embedding_dimension=candidate_generation_train_op.outputs['chosen_embedding_dimension'],
            chosen_unique_catalog_ids_dataset=candidate_generation_train_op.outputs['chosen_unique_catalog_ids_dataset'],
            chosen_unique_user_ids_dataset=candidate_generation_train_op.outputs['chosen_unique_user_ids_dataset'],
            train_dataset=data_split_op.outputs['train_dataset'],
            test_dataset=data_split_op.outputs['test_dataset'],
            cg_results_dataset=candidate_generation_prediction_op.outputs['cg_results_dataset'],
        )


        ###########################################################
        ######################## Outbound #########################
        ###########################################################

        # Final task of the conditional branch: receives the exported table
        # paths, the chosen model artefacts and the run metadata.
        # NOTE(review): google-cloud-pubsub is pinned to 2.13.0 here, while the
        # commented install cell at the top of the notebook pins 2.13.4 --
        # confirm which version is intended.
        update_model_league_op = func_op(
            func=update_model_league,
            _component_human_name='update_model_league',
            base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
            packages_to_install=[
                'pandas-gbq==0.15.0',
                'google-cloud-pubsub==2.13.0',
            ],
            cpu_limit='4',
            memory_limit='16G',
            project_id=project_id,
            project_name=project_name,
            location=region,
            job_id=job_id,
            src_run_dt=src_run_dt,
            user_bq_path=user_dataset_to_bq_op.outputs['bq_path'],
            catalog_bq_path=catalog_dataset_to_bq_op.outputs['bq_path'],
            train_bq_path=train_dataset_to_bq_op.outputs['bq_path'],
            chosen_embedding_dimension=candidate_generation_train_op.outputs['chosen_embedding_dimension'],
            chosen_model=candidate_generation_train_op.outputs['best_model'],
            runner=runner,
            commit_short_sha=commit_short_sha,
            is_cold_start=model_champion_op.outputs['is_cold_start'],
            is_endpoint=is_endpoint,
            mlops_topic=mlops_topic,
            chosen_unique_catalog_ids_dataset=candidate_generation_train_op.outputs['chosen_unique_catalog_ids_dataset'],
            chosen_unique_user_ids_dataset=candidate_generation_train_op.outputs['chosen_unique_user_ids_dataset'],
            cg_model_object=candidate_generation_train_op.outputs['cg_model_object'],
            cg_index_object=candidate_generation_train_op.outputs['cg_index_object'],
            r_model_object=ranking_op.outputs['r_model_object'],
        )


    ########################################################################################################################
    ########################################################################################################################
    ##################################################### Ops Caching ######################################################
    ########################################################################################################################
    ########################################################################################################################

    # Tasks listed here get caching explicitly disabled. The commented-out names
    # act as a manual toggle: uncomment a task to disable caching for it too.
    # Active entries are the run-date task and the tasks that read from or write
    # to the model league.
    nodes_no_cache = [
        rundates_op,

#         period_op,
#         import_query_op,

#         user_dataset_op,
#         history_dataset_op,
#         catalog_dataset_op,

#         history_data_preprocess_op,
#         data_split_op,

        prev_user_dataset_op,
        prev_catalog_dataset_op,

#         user_drift_check_op,
#         catalog_drift_check_op,
#         collect_check_result_op,

#         user_dataset_to_bq_op,
#         catalog_dataset_to_bq_op,
#         train_dataset_to_bq_op,

        model_champion_op,

#         candidate_generation_train_op,
#         candidate_generation_prediction_op,
#         ranking_op,

        update_model_league_op,
    ]
    
    # The emptiness guard is redundant for the loop itself; it only makes the
    # "nothing selected" case explicit.
    if nodes_no_cache:
        for node in nodes_no_cache:
            node.set_caching_options(enable_caching=False)

## Save your pipeline

In [4]:
# Hand `grand_pipeline` to the project's save_pipeline helper (the recorded
# output of this cell shows a JSON pipeline spec being uploaded to GCS).
# The three values it returns are the inputs of the run_pipeline cell.
(TEMPLATE_PATH, JOB_ID, DISPLAY_NAME) = save_pipeline(
    bucket_name=BUCKET_NAME,
    pipeline_name=PIPELINE_NAME,
    pipeline=grand_pipeline,
)



pipeline_spec/usecase-recsys-dev-2022-08-11-t-11-04-29.json uploaded to GCS.
/tmp/usecase-recsys-dev-2022-08-11-t-11-04-29.json removed in local.


## Run your pipeline (submitted from this notebook to Vertex AI)

In [5]:
# Submit the saved template through the project's run_pipeline helper.
# Every argument comes either from `config` or from the save_pipeline cell.
run_pipeline(
    # What to run (produced by save_pipeline, plus the configured parameters)
    template_path=TEMPLATE_PATH,
    job_id=JOB_ID,
    display_name=DISPLAY_NAME,
    parameter_values=PARAMETER_VALUES,
    # Where to run it and under which identity
    project_id=PROJECT_ID,
    location=REGION,
    service_account=SERVICE_ACCOUNT,
    staging_bucket=BUCKET_NAME,
    pipeline_root=PIPELINE_ROOT,
    # Optional switches
    # enable_caching=False,
    use_vaiexp=USE_VAIEXP,
)

Creating PipelineJob
PipelineJob created. Resource name: projects/751015570376/locations/us-central1/pipelineJobs/usecase-recsys-dev-2022-08-11-t-11-04-29
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/751015570376/locations/us-central1/pipelineJobs/usecase-recsys-dev-2022-08-11-t-11-04-29')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/usecase-recsys-dev-2022-08-11-t-11-04-29?project=751015570376
Associating projects/751015570376/locations/us-central1/pipelineJobs/usecase-recsys-dev-2022-08-11-t-11-04-29 to Experiment: usecase-recsys-dev-experiment
