## First Time Installation

Install the Vertex AI SDK for Python and the other required packages (pinned to the versions listed below). This is only needed the first time you use a new notebook instance.

In [None]:
# ! pip3 install $USER google-cloud-aiplatform==1.15.1 --upgrade
# ! pip3 install $USER google-cloud-bigquery==2.34.4 --upgrade
# ! pip3 install $USER google-cloud-bigquery-storage==2.13.2 --upgrade
# ! pip3 install $USER google-cloud-storage==1.44.0 --upgrade
# ! pip3 install $USER kfp==1.8.13 --upgrade
# ! pip3 install $USER google-cloud-pipeline-components==1.0.14 --upgrade
# ! pip3 install $USER icecream==2.1.1 --upgrade
# ! pip3 install $USER pandas-gbq==0.15.0 --upgrade
# ! pip3 install $USER google-cloud-secret-manager --upgrade
# ! pip3 install $USER google-cloud-pubsub==2.13.4 --upgrade

# import os
# if not os.getenv("IS_TESTING"):
#     # Automatically restart kernel after installs
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

# # Check versions
# ! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# ! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Load your pipeline components and configs

In [None]:
%load_ext autoreload
%autoreload 2

# Related Python packages
from icecream import ic
from kfp.v2 import dsl

# Components
from ml_components.dataimport import (
    get_rundates,
    get_period,
    get_import_query,
    bq_query_no_return,
    bq_query_to_dataframe,
    get_previous_dataset_from_model_league,
    
)
from ml_components.datacheck import (
    printing,
    grand_drift_check,
)
from ml_components.datapreproc import (
    data_preprocess,
)
from ml_components.modelling import (
    get_champion_model,
)
from ml_components.prediction import (
    model_predictor,
)
from ml_components.outbound import (
    generate_bq_table_from_gsc,
    update_model_league,
    export_prediction,
)
from ml_components.alert import (
    push_slack_notification,
)
from ml_components.pipelinehelper import (
    func_op,
    save_pipeline,
    run_pipeline,
)

# Configs
from config import (
    SERVICE_ACCOUNT,
    PROJECT_ID,
    REGION,
    RUNNER,
    PIPELINE_NAME,
    BUCKET_NAME,
    PRED_PIPELINE_ROOT as PIPELINE_ROOT,
    USE_VAIEXP,
    PARAMETER_VALUES,
    parameter_checks
)
# Run the config module's parameter checks right after the configs are imported,
# before the pipeline is defined.
# NOTE(review): presumably validates the imported config values — confirm in config.py.
parameter_checks()

## Define your pipeline
```
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def your_pipeline(
    parameter_value: data_type,
):
    ops_1 = func_op(
        func=function,
        _component_human_name='your ops 1 label',
        base_image='python:3.7', #optional
        function_arg=parameter_value,
    )
    
    ops_2 = func_op(
        func=function,
        _component_human_name='your ops 2 label',
        packages_to_install=[
            'pandas==1.3.3'
        ],
        function_arg=ops_1.outputs['output'],
    )
```

#### Choose your desired GCP pre-built image for your node(s)
- https://cloud.google.com/deep-learning-containers/docs/choosing-container
- `! gcloud container images list --repository="gcr.io/deeplearning-platform-release"`
- You can also simply use a plain `python:3.x` image if you prefer

In [None]:
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def grand_pipeline(
    bucket_name: str,
    project_id: str,
    project_name: str,
    region: str,
    job_id: str,
    run_date: str,
    seed: int,
    train_size: float,
    target_column: str,
    drop_columns: str,
    auto_balance: str,
    feature_importance_dict_str: str,
    numerical_drift_partition_threshold: float,
    numerical_importance_partition_threshold: float,
    categorical_drift_partition_threshold: float,
    categorical_importance_partition_threshold: float,
    category_threshold: int,
    delta: int,
    model_params: str,
    is_endpoint: bool,
    mlops_topic: str,
    runner: str,
    commit_short_sha: str,
):
    """Declare the ops of the prediction ('pred') pipeline and their wiring.

    Every step is created with ``func_op``. Following the template shown in
    this notebook, ``func`` is the component function, ``_component_human_name``
    is the step label, ``base_image`` / ``packages_to_install`` / ``cpu_limit``
    / ``memory_limit`` / ``retry`` configure the step, and the remaining
    keyword arguments are handed to the component function.

    Wiring, as established by the ``.outputs[...]`` references below:

    * ``get_rundates`` -> ``usage_run_date``, ``src_run_dt``
    * ``get_period`` (``usage_run_date``) -> ``pred_dt``
    * ``get_import_query`` (``pred_dt``) -> ``query``
    * ``bq_query_to_dataframe`` (``query``) -> ``dataset``
    * ``get_previous_dataset_from_model_league`` -> ``previous_dataset``
    * ``data_preprocess`` (``dataset``) -> ``processed_dataset``
    * ``grand_drift_check`` (``processed_dataset``, ``previous_dataset``)
    * ``get_champion_model`` -> ``model_champion_object``, ``is_cold_start``
    * ``model_predictor`` (``model_champion_object``, ``processed_dataset``)
      -> ``prediction_dataset``
    * ``generate_bq_table_from_gsc`` (``prediction_dataset``) -> ``bq_path``
    * ``update_model_league`` (``bq_path``, ``is_cold_start``,
      ``model_champion_object``)
    * ``export_prediction`` (``prediction_dataset``)

    Finally, caching is switched off for the ops listed in ``nodes_no_cache``.

    Args:
        bucket_name: Forwarded to the preprocessing and drift-check ops.
        project_id: Forwarded to the previous-dataset, dataset-import,
            champion-model, BigQuery-table, model-league-update and export ops.
        project_name: Forwarded to the previous-dataset, drift-check,
            champion-model, BigQuery-table and model-league-update ops.
        region: Passed as ``location`` to the previous-dataset, champion-model,
            BigQuery-table and model-league-update ops.
        job_id: Forwarded to the model-league-update op.
        run_date: Input of the ``get_rundates`` op.
        seed: Not referenced in this pipeline body.
        train_size: Not referenced in this pipeline body.
        target_column: Forwarded to the preprocessing and predictor ops.
        drop_columns: Forwarded to the preprocessing op.
        auto_balance: Not referenced in this pipeline body.
        feature_importance_dict_str: Forwarded to the drift-check op.
        numerical_drift_partition_threshold: Forwarded to the drift-check op.
        numerical_importance_partition_threshold: Forwarded to the drift-check op.
        categorical_drift_partition_threshold: Forwarded to the drift-check op.
        categorical_importance_partition_threshold: Forwarded to the drift-check op.
        category_threshold: Forwarded to the preprocessing and drift-check ops.
        delta: Forwarded to the drift-check op.
        model_params: Not referenced in this pipeline body.
        is_endpoint: Forwarded to the model-league-update op.
        mlops_topic: Forwarded to the model-league-update op.
        runner: Forwarded to the previous-dataset, champion-model,
            BigQuery-table, model-league-update and export ops.
        commit_short_sha: Forwarded to the model-league-update op.

    NOTE(review): ``seed``, ``train_size``, ``auto_balance`` and
    ``model_params`` are presumably kept so this signature matches a shared
    parameter set (``PARAMETER_VALUES``) — confirm before removing them.
    """
    ########################################################################################################################
    ########################################################################################################################
    ################################################### Ops Declaration ####################################################
    ########################################################################################################################
    ########################################################################################################################

    
    ###########################################################
    ######################### Rundate #########################
    ###########################################################

    # Derive the two date values used by the rest of the pipeline from run_date.
    rundates_op = func_op(
        func=get_rundates,
        _component_human_name='get_rundates',
        base_image='python:3.7',
        packages_to_install=['pandas==1.3.3'],
        run_date=run_date,
    )

    # usage_run_date feeds get_period; src_run_dt is passed to the drift-check,
    # BigQuery-table and model-league-update ops.
    usage_run_date = rundates_op.outputs['usage_run_date']
    src_run_dt = rundates_op.outputs['src_run_dt']

    ###########################################################
    ####################### Import Data #######################
    ###########################################################

    # Only the 'pred_dt' output of this op is consumed below.
    period_op = func_op(
        func=get_period,
        _component_human_name='get_period',
        base_image='python:3.7',
        packages_to_install=['pandas==1.3.3'],
        run_date=usage_run_date,
    )

    # Get queries
    import_pred_data_query_op = func_op(
        func=get_import_query,
        _component_human_name='get_pred_import_query',
        base_image='python:3.7',
        datestr=period_op.outputs['pred_dt'],
    )

    # Get previous datasets using model league
    # (its 'previous_dataset' output is the dataset_q input of the drift check)
    prev_pred_dataset_op = func_op(
        func=get_previous_dataset_from_model_league,
        _component_human_name='get_prev_pred_dataset',
        base_image='python:3.7',
        packages_to_install=['pandas-gbq==0.15.0'],
        cpu_limit='2',
        memory_limit='32G',
        project_id=project_id,
        project_name=project_name,
        dataset_type='pred',
        runner=runner,
        location=region,
    )

    # Get datasets using queries
    pred_dataset_op = func_op(
        func=bq_query_to_dataframe,
        _component_human_name='get_pred_dataset',
        base_image='python:3.7',
        project_id=project_id,
        cpu_limit='2',
        memory_limit='32G',
        query=import_pred_data_query_op.outputs['query'],
    )


    ###########################################################
    #################### Data Preprocessing ###################
    ###########################################################

    # The 'processed_dataset' output is shared by the drift check and the predictor.
    pred_data_preprocess_op = func_op(
        func=data_preprocess,
        _component_human_name='pred_data_preprocess',
        base_image='python:3.7',
        packages_to_install=['pandas==1.3.3'],
        cpu_limit='4',
        memory_limit='32G',
        bucket_name=bucket_name,
        category_threshold=category_threshold,
        drop_columns=drop_columns,
        target_column=target_column,
        dataset=pred_dataset_op.outputs['dataset'],
    )


    ###########################################################
    #################### Data Drift Check #####################
    ###########################################################

    # Compares the freshly preprocessed dataset (dataset_p) with the previous
    # prediction dataset (dataset_q). No output of this op is consumed by any
    # other op in this function.
    pred_drift_check_op = func_op(
        func=grand_drift_check,
        _component_human_name='pred_drift_check',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        cpu_limit='4',
        memory_limit='32G',
        bucket_name=bucket_name,
        src_run_dt=src_run_dt,
        project_name=project_name,
        dataset_p=pred_data_preprocess_op.outputs['processed_dataset'],
        dataset_q=prev_pred_dataset_op.outputs['previous_dataset'],
        feature_importance_dict_str=feature_importance_dict_str,
        numerical_drift_partition_threshold=numerical_drift_partition_threshold,
        numerical_importance_partition_threshold=numerical_importance_partition_threshold,
        categorical_drift_partition_threshold=categorical_drift_partition_threshold,
        categorical_importance_partition_threshold=categorical_importance_partition_threshold,
        category_threshold=category_threshold,
        delta=delta,
        mode='pred',
    )


    ###########################################################
    ####################### Prediction ########################
    ###########################################################

    # Outputs used below: 'model_champion_object' and 'is_cold_start'.
    # NOTE(review): 'xgboost' is installed without a version pin here and in
    # model_predictor — confirm whether that is intentional.
    model_champion_op = func_op(
        func=get_champion_model,
        _component_human_name='get_champion_model',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        packages_to_install=[
            'xgboost',
            'pandas-gbq==0.15.0',
        ],
        project_id=project_id,
        project_name=project_name,
        runner=runner,
        location=region,
    )

    # Feeds the champion model object and the preprocessed dataset to the predictor.
    model_predictor_op = func_op(
        func=model_predictor,
        _component_human_name='model_predictor',
        base_image='gcr.io/deeplearning-platform-release/sklearn-cpu',
        packages_to_install=[
            'xgboost',
        ],
        cpu_limit='4',
        memory_limit='32G',
        target_column=target_column,
        model_champion_object=model_champion_op.outputs['model_champion_object'],
        processed_dataset=pred_data_preprocess_op.outputs['processed_dataset'],
    )

    ###########################################################
    ######################## Outbound #########################
    ###########################################################

    # Only op in this pipeline that sets `retry`; its 'bq_path' output is
    # consumed by the model-league update below.
    prediction_dataset_to_bq_op = func_op(
        func=generate_bq_table_from_gsc,
        _component_human_name='prediction_dataset_to_bq',
        base_image='python:3.7',
        cpu_limit='4',
        memory_limit='32G',
        retry=3,
        project_id=project_id,
        project_name=project_name,
        dataset_id='MLOPS_PREDICTION_DATASET',
        runner=runner,
        table_name='prediction',
        src_run_dt=src_run_dt,
        dataset_format='PARQUET',
        location=region,
        dataset_to_save=model_predictor_op.outputs['prediction_dataset'],
    )

    # Runs update_model_league in 'pred' mode for the champion model, using the
    # BigQuery path produced above.
    pred_update_model_league_op = func_op(
        func=update_model_league,
        _component_human_name='pred_update_model_league',
        base_image='python:3.7',
        packages_to_install=['pandas-gbq==0.15.0'],
        cpu_limit='4',
        memory_limit='16G',
        project_id=project_id,
        project_name=project_name,
        location=region,
        job_id=job_id,
        src_run_dt=src_run_dt,
        bq_path=prediction_dataset_to_bq_op.outputs['bq_path'],
        choose_model='champion',
        runner=runner,
        commit_short_sha=commit_short_sha,
        is_cold_start=model_champion_op.outputs['is_cold_start'],
        is_endpoint=is_endpoint,
        mlops_topic=mlops_topic,
        update_mode='pred',
        model_object=model_champion_op.outputs['model_champion_object'],
    )

    # Consumes the predictor output directly; it has no data dependency on the
    # BigQuery-table or model-league-update ops.
    # NOTE(review): 'pandas' is unpinned here while other ops pin
    # 'pandas==1.3.3' — confirm whether this is intentional.
    export_prediction_op = func_op(
        func=export_prediction,
        _component_human_name='export_prediction',
        base_image='python:3.7',
        packages_to_install=[
            'pandas',
            'pandas-gbq==0.15.0',
        ],
        project_id=project_id,
        runner=runner,
        prediction_dataset=model_predictor_op.outputs['prediction_dataset'],
    )

    # ########################
    # #### Failing points ####
    # ########################

    # NOTE(review): the disabled example below refers to `data_check_result_op`
    # and `webhook_config`, neither of which is defined in this function; both
    # must be provided before it can be re-enabled.

    # with dsl.Condition(
    #     data_check_result_op.outputs['alert_msg'] != '',
    #     name="failing_point_1"
    # ):
    #     data_quality_check_result_fail_op = func_op(
    #         func=push_slack_notification,
    #         _component_human_name='data_quality_check_result_fail',
    #         base_image='python:3.7',
    #         packages_to_install=['google-cloud-secret-manager'],
    #         job_id=job_id,
    #         src_run_dt=src_run_dt,
    #         text=data_check_result_op.outputs['alert_msg'],
    #         channel='#your-project-name-internal',
    #         webhook_config_str=webhook_config,
    #         runner=runner,
    #     )


    ########################################################################################################################
    ########################################################################################################################
    ##################################################### Ops Caching ######################################################
    ########################################################################################################################
    ########################################################################################################################

    # Ops listed here get caching disabled; commented-out entries keep whatever
    # caching behaviour applies by default. Uncomment an entry to disable
    # caching for that op as well.
    nodes_no_cache = [
        rundates_op,

#         period_op,
#         import_pred_data_query_op,

#         pred_dataset_op,
#         pred_data_preprocess_op,

        prev_pred_dataset_op,

#         pred_drift_check_op,

#         data_check_result_op,

        model_champion_op,

#         model_predictor_op,

#         export_prediction_op,
        pred_update_model_league_op
    ]
    
    # The emptiness guard is redundant (looping over an empty list is a no-op)
    # but harmless if every entry above gets commented out.
    if nodes_no_cache:
        for node in nodes_no_cache:
            node.set_caching_options(enable_caching=False)

## Save your pipeline

In [None]:
# Call save_pipeline for grand_pipeline in 'pred' mode and keep the three values
# it returns; all three are passed to run_pipeline in the next cell.
# NOTE(review): presumably this compiles the pipeline into a template stored
# under BUCKET_NAME — confirm in ml_components.pipelinehelper.
TEMPLATE_PATH, JOB_ID, DISPLAY_NAME = save_pipeline(
    pipeline=grand_pipeline,
    pipeline_name=PIPELINE_NAME,
    bucket_name=BUCKET_NAME,
    mode='pred',
)

## Run your pipeline locally

In [None]:
# Start a run of the saved template, combining the values imported from config
# with TEMPLATE_PATH / JOB_ID / DISPLAY_NAME returned by save_pipeline.
# PIPELINE_ROOT is config's PRED_PIPELINE_ROOT (see the import alias above).
run_pipeline(
    project_id=PROJECT_ID,
    staging_bucket=BUCKET_NAME,
    location=REGION,
    display_name=DISPLAY_NAME,
    template_path=TEMPLATE_PATH,
    job_id=JOB_ID,
    pipeline_root=PIPELINE_ROOT,
    service_account=SERVICE_ACCOUNT,
    parameter_values=PARAMETER_VALUES,
    # NOTE(review): uncommenting the next line presumably disables caching for
    # the whole run — confirm that run_pipeline accepts this argument.
    # enable_caching=False,
    use_vaiexp=USE_VAIEXP,
)