## First Time Installation

Install the latest version of the Vertex AI SDK for Python. Run this only the first time you use a new notebook instance.

In [1]:
# import os

# # Google Cloud Notebook
# if os.path.exists("/opt/deeplearning/metadata/env_version"):
#     USER_FLAG = "--user"
# else:
#     USER_FLAG = ""

# ! pip3 install --upgrade google-cloud-aiplatform $USER_FLAG
# ! pip3 install -U google-cloud-storage $USER_FLAG
# ! pip3 install $USER_FLAG kfp google-cloud-pipeline-components --upgrade
# ! pip3 install $USER_FLAG icecream==2.1.1 --upgrade
# ! pip3 install $USER_FLAG pandas-gbq==0.15.0 --upgrade
# ! pip3 install $USER_FLAG google-cloud-secret-manager --upgrade

# if not os.getenv("IS_TESTING"):
#     # Automatically restart kernel after installs
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

# # Check versions
# ! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# ! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Load your pipeline components and configs

In [2]:
%load_ext autoreload
%autoreload 2

# Related Python packages
from icecream import ic
from kfp.v2 import dsl

# Components
from ml_components.dataimport import (
    get_rundates,
    get_period,
    get_import_query,
    bq_query_no_return,
    bq_query_to_dataframe,
    get_previous_dataset_from_model_league,
    
)
from ml_components.datacheck import (
    printing,
    grand_drift_check,
)
from ml_components.datapreproc import (
    data_preprocess,
    data_split,
)
from ml_components.modelling import (
    model_trainer,
    get_champion_model,
    model_evaluator,
)
from ml_components.outbound import (
    generate_bq_table_from_gsc,
    update_model_league,
)
from ml_components.alert import (
    push_slack_notification,
)
from ml_components.pipelinehelper import (
    func_op,
    save_pipeline,
    run_pipeline,
)

# Configs
from config import (
    SERVICE_ACCOUNT,
    PROJECT_ID,
    REGION,
    RUNNER,
    PIPELINE_NAME,
    BUCKET_NAME,
    TRAIN_PIPELINE_ROOT as PIPELINE_ROOT,
    PARAMETER_VALUES,
    parameter_checks
)
# Sanity-check the imported configuration before building the pipeline.
# Per the recorded output of this cell, the helper echoes key settings
# (service account, project, region, pipeline name) and reports whether
# BUCKET_NAME exists.
parameter_checks()

ic| SERVICE_ACCOUNT: '751015570376-compute@developer.gserviceaccount.com'
ic| PROJECT_ID: 'airasia-gaexport'
ic| REGION: 'us-central1'
ic| PIPELINE_NAME: 'fraudy-classify-vaip-dev'


BUCKET_NAME exists : gs://fraudy-classify-vaip-dev


## Define your pipeline
```
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def your_pipeline(
    parameter_value: data_type,
):
    ops_1 = func_op(
        func=function,
        _component_human_name='your ops 1 label',
        base_image='python:3.7', #optional
        function_arg=parameter_value,
    )
    
    ops_2 = func_op(
        func=function,
        _component_human_name='your ops 2 label',
        packages_to_install=[
            'pandas==1.3.3'
        ],
        function_arg=ops_1.outputs['output'],
    )
```

#### Choose your desired GCP pre-built image for your node(s)
- https://cloud.google.com/deep-learning-containers/docs/choosing-container
- `! gcloud container images list --repository="gcr.io/deeplearning-platform-release"`
- You can also simply use a plain `python:3.x` image if you prefer

In [3]:
@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
)
def grand_pipeline(
    bucket_name: str,
    project_id: str,
    project_name: str,
    region: str,
    job_id: str,
    run_date: str,
    seed: int,
    train_size: float,
    target_column: str,
    drop_columns: str,
    auto_balance: str,
    feature_importance_dict_str: str,
    numerical_drift_partition_threshold: float,
    numerical_importance_partition_threshold: float,
    categorical_drift_partition_threshold: float,
    categorical_importance_partition_threshold: float,
    category_threshold: int,
    delta: int,
    model_params: str,
    is_endpoint: bool,
    mlops_topic: str,
    runner: str,
):
    # Training pipeline. Stages, in order:
    #   1. resolve run dates and the training period,
    #   2. build the import query and load the training dataset,
    #   3. preprocess and split the data,
    #   4. drift-check the new train split against the previous training
    #      dataset fetched via the model league,
    #   5. conditionally: export the train split to BigQuery, train a model,
    #      evaluate it against the champion model, update the model league.
    #
    # NOTE: documented with comments rather than a docstring so the compiled
    # pipeline spec is left untouched (KFP may pick up the function docstring
    # as pipeline metadata -- TODO confirm).

    # Container images shared by the ops below (hoisted so an image bump is a
    # one-line change).
    python_image = 'python:3.7'
    sklearn_image = 'gcr.io/deeplearning-platform-release/sklearn-cpu'

    ########################################################################################################################
    ########################################################################################################################
    ################################################### Ops Declaration ####################################################
    ########################################################################################################################
    ########################################################################################################################

    ###########################################################
    ######################### Rundate #########################
    ###########################################################

    rundates_op = func_op(
        func=get_rundates,
        _component_human_name='get_rundates',
        base_image=python_image,
        packages_to_install=['pandas==1.3.3'],
        run_date=run_date,
    )

    usage_run_date = rundates_op.outputs['usage_run_date']
    src_run_dt = rundates_op.outputs['src_run_dt']

    ###########################################################
    ####################### Import Data #######################
    ###########################################################

    period_op = func_op(
        func=get_period,
        _component_human_name='get_period',
        base_image=python_image,
        packages_to_install=['pandas==1.3.3'],
        run_date=usage_run_date,
    )

    # Build the import query for the training period
    import_train_data_query_op = func_op(
        func=get_import_query,
        _component_human_name='get_train_import_query',
        base_image=python_image,
        datestr=period_op.outputs['train_dt'],
    )

    # Previous training dataset (looked up via the model league); used as the
    # reference side of the drift check below
    prev_train_dataset_op = func_op(
        func=get_previous_dataset_from_model_league,
        _component_human_name='get_prev_train_dataset',
        base_image=python_image,
        packages_to_install=['pandas-gbq==0.15.0'],
        cpu_limit='2',
        memory_limit='32G',
        project_id=project_id,
        project_name=project_name,
        dataset_type='train',
        runner=runner,
        location=region,
    )

    # Current training dataset, loaded with the query built above
    train_dataset_op = func_op(
        func=bq_query_to_dataframe,
        _component_human_name='get_train_dataset',
        base_image=python_image,
        project_id=project_id,
        cpu_limit='2',
        memory_limit='32G',
        query=import_train_data_query_op.outputs['query'],
    )

    ###########################################################
    #################### Data Preprocessing ###################
    ###########################################################

    train_data_preprocess_op = func_op(
        func=data_preprocess,
        _component_human_name='train_data_preprocess',
        base_image=python_image,
        packages_to_install=['pandas==1.3.3'],
        cpu_limit='4',
        memory_limit='32G',
        drop_columns=drop_columns,
        dataset=train_dataset_op.outputs['dataset'],
    )

    data_split_op = func_op(
        func=data_split,
        _component_human_name='data_split',
        base_image=sklearn_image,
        packages_to_install=[
            'imbalanced-learn==0.9.0',
        ],
        cpu_limit='4',
        memory_limit='32G',
        auto_balance=auto_balance,
        category_threshold=category_threshold,
        seed=seed,
        train_size=train_size,
        target_column=target_column,
        processed_dataset=train_data_preprocess_op.outputs['processed_dataset'],
    )

    ###########################################################
    #################### Data Drift Check #####################
    ###########################################################

    # dataset_p is the new train split, dataset_q the previous training dataset
    train_drift_check_op = func_op(
        func=grand_drift_check,
        _component_human_name='train_drift_check',
        base_image=sklearn_image,
        cpu_limit='4',
        memory_limit='32G',
        bucket_name=bucket_name,
        src_run_dt=src_run_dt,
        project_name=project_name,
        dataset_p=data_split_op.outputs['train_dataset'],
        dataset_q=prev_train_dataset_op.outputs['previous_dataset'],
        feature_importance_dict_str=feature_importance_dict_str,
        numerical_drift_partition_threshold=numerical_drift_partition_threshold,
        numerical_importance_partition_threshold=numerical_importance_partition_threshold,
        categorical_drift_partition_threshold=categorical_drift_partition_threshold,
        categorical_importance_partition_threshold=categorical_importance_partition_threshold,
        category_threshold=category_threshold,
        delta=delta,
        mode='train',
    )

    ###########################################################
    ######################## Modelling ########################
    ###########################################################

    # Everything inside this block runs only when the drift check outputs
    # drift_status == 'false'.
    # NOTE(review): the previous comment here read "If got drift", which
    # contradicts the 'false' literal -- confirm the intended polarity against
    # grand_drift_check before relying on either reading.
    with dsl.Condition(
        train_drift_check_op.outputs['drift_status'] == 'false',
        name="modelling"
    ):
        # Disabled: Slack alert for a failed data check. The names it reads
        # (data_check_result_op, failure_webhook_config) are not defined in
        # this function, so it cannot simply be uncommented.
        # data_check_result_fail_op = func_op(
        #     func=push_slack_notification,
        #     _component_human_name='data_check_result_fail',
        #     base_image='python:3.7',
        #     packages_to_install=['google-cloud-secret-manager'],
        #     job_id=job_id,
        #     src_run_dt=src_run_dt,
        #     text=data_check_result_op.outputs['alert_msg'],
        #     channel='#your-project-name-internal',
        #     webhook_config_str=failure_webhook_config,
        #     runner=runner,
        # )

        train_dataset_to_bq_op = func_op(
            func=generate_bq_table_from_gsc,
            _component_human_name='train_dataset_to_bq',
            base_image=python_image,
            cpu_limit='2',
            memory_limit='8G',
            retry=3,
            project_id=project_id,
            project_name=project_name,
            dataset_id='MLOPS_TRAIN_DATASET',
            runner=runner,
            table_name='train',
            src_run_dt=src_run_dt,
            dataset_format='PARQUET',
            location=region,
            dataset_to_save=data_split_op.outputs['train_dataset'],
        )

        # NOTE(review): 'xgboost' is unpinned here and in the two ops below,
        # unlike every other package in this pipeline; pin a version for
        # reproducible runs.
        model_trainer_op = func_op(
            func=model_trainer,
            _component_human_name='model_trainer',
            base_image=sklearn_image,
            packages_to_install=[
                'xgboost',
            ],
            cpu_limit='4',
            memory_limit='16G',
            seed=seed,
            model_params=model_params,
            x_train_dataset=data_split_op.outputs['x_train_dataset'],
            x_val_dataset=data_split_op.outputs['x_val_dataset'],
            y_train_dataset=data_split_op.outputs['y_train_dataset'],
            y_val_dataset=data_split_op.outputs['y_val_dataset'],
        )

        model_champion_op = func_op(
            func=get_champion_model,
            _component_human_name='get_champion_model',
            base_image=sklearn_image,
            packages_to_install=[
                'xgboost',
                'pandas-gbq==0.15.0',
            ],
            project_id=project_id,
            project_name=project_name,
            runner=runner,
            location=region,
        )

        # Receives both the freshly trained model and the champion model
        model_evaluator_op = func_op(
            func=model_evaluator,
            _component_human_name='model_evaluator',
            base_image=sklearn_image,
            packages_to_install=[
                'xgboost',
                'pandas-gbq==0.15.0',
                'seaborn==0.11.2',
                'shap==0.40.0',
            ],
            cpu_limit='4',
            memory_limit='16G',
            is_cold_start=model_champion_op.outputs['is_cold_start'],
            model_object=model_trainer_op.outputs['model_object'],
            model_champion_object=model_champion_op.outputs['model_champion_object'],
            x_train_dataset=data_split_op.outputs['x_train_dataset'],
            x_val_dataset=data_split_op.outputs['x_val_dataset'],
            y_val_dataset=data_split_op.outputs['y_val_dataset'],
        )

        ###########################################################
        ######################## Outbound #########################
        ###########################################################

        train_update_model_league_op = func_op(
            func=update_model_league,
            _component_human_name='train_update_model_league',
            base_image=sklearn_image,
            packages_to_install=[
                'pandas-gbq==0.15.0',
                'google-cloud-pubsub==2.13.0',
            ],
            cpu_limit='4',
            memory_limit='16G',
            project_id=project_id,
            project_name=project_name,
            location=region,
            job_id=job_id,
            src_run_dt=src_run_dt,
            bq_path=train_dataset_to_bq_op.outputs['bq_path'],
            choose_model=model_evaluator_op.outputs['best_model'],
            runner=runner,
            is_cold_start=model_champion_op.outputs['is_cold_start'],
            is_endpoint=is_endpoint,
            mlops_topic=mlops_topic,
            update_mode='train',
            model_object=model_trainer_op.outputs['model_object'],
        )

    ########################################################################################################################
    ########################################################################################################################
    ##################################################### Ops Caching ######################################################
    ########################################################################################################################
    ########################################################################################################################

    # Ops listed here have caching switched off. Commented-out entries are
    # left in place as toggles: uncomment one to disable caching for that op.
    nodes_no_cache = [
        rundates_op,

        # period_op,
        # import_train_data_query_op,

        # train_dataset_op,
        # train_data_preprocess_op,
        # data_split_op,

        prev_train_dataset_op,

        # train_drift_check_op,

        # data_check_result_op,

        model_champion_op,

        # model_trainer_op,
        # model_evaluator_op,

        train_update_model_league_op,
    ]

    # Iterating an empty list is a no-op, so no emptiness guard is needed.
    for node in nodes_no_cache:
        node.set_caching_options(enable_caching=False)

## Save your pipeline

In [4]:
# Save the spec for `grand_pipeline` through the project helper. The three
# returned values are passed on to `run_pipeline` in the next cell.
# Per the recorded output of this cell, a spec JSON is uploaded to GCS and
# the local copy under /tmp is removed afterwards.
TEMPLATE_PATH, JOB_ID, DISPLAY_NAME = save_pipeline(
    pipeline=grand_pipeline,
    pipeline_name=PIPELINE_NAME,
    bucket_name=BUCKET_NAME,
    mode='train',
)



train_pipeline_spec/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10.json uploaded to GCS.
/tmp/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10.json removed in local.


## Run your pipeline from this notebook (submits a Vertex AI PipelineJob)

In [None]:
# Submit the saved template using the values returned by `save_pipeline`
# and the imported config. Per the recorded output of this cell, this
# creates a PipelineJob and then repeatedly prints its current state.
run_pipeline(
    project_id=PROJECT_ID,
    staging_bucket=BUCKET_NAME,
    location=REGION,
    display_name=DISPLAY_NAME,
    template_path=TEMPLATE_PATH,
    job_id=JOB_ID,
    pipeline_root=PIPELINE_ROOT,
    service_account=SERVICE_ACCOUNT,
    parameter_values=PARAMETER_VALUES,
    # Optional switch, left disabled; presumably turns caching off for every
    # op in the run -- confirm in ml_components.pipelinehelper.
    # disable_caching_all=True,
)

Creating PipelineJob
PipelineJob created. Resource name: projects/751015570376/locations/us-central1/pipelineJobs/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/751015570376/locations/us-central1/pipelineJobs/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10?project=751015570376
PipelineJob projects/751015570376/locations/us-central1/pipelineJobs/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/751015570376/locations/us-central1/pipelineJobs/fraudy-classify-vaip-dev-train-2022-06-17-t-15-02-10 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/751015570376/locations/us-central1/pipelineJobs/fraudy-classify-vaip-dev-train-2022-06-17-t-