In [None]:
# PARAMETERIZED VARIABLES
# NOTE(review): the "{{...}}" values look like template placeholders that are
# substituted before the notebook is executed — confirm with the tool that
# renders this notebook. Keep the literals byte-identical.

# Project passed to aip.init() and to the pipeline's `project_id` parameter.
PROJECT_ID = "{{PROJECT_ID}}"
# Passed to the pipeline as `target_table_fullname` (table the scores are written to).
TARGET_TABLE = "{{TARGET_TABLE}}"
# Passed to the pipeline as `model_uri`.
MODEL_URI = "{{MODEL_URI}}"
# Passed to the pipeline as `scaler_uri`; the literal string 'None' is
# converted to None by the job-submission cell.
SCALER_URI = "{{SCALER_URI}}"

In [1]:
from datetime import datetime

# --- Run configuration -------------------------------------------------------
# BigQuery table the pipeline reads its audience from.
AUDIENCE_TABLE = "dataset_test.audience_table"
# Location passed to aip.init().
REGION = "asia-southeast2"
# Second-resolution timestamp, used to make the job display name unique.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

# Staging bucket and the pipeline root folder inside it.
BUCKET_NAME = "vertex-testing-poc-123"
BUCKET_URI = "gs://" + BUCKET_NAME
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/creditscoring_test"

In [3]:
import google.cloud.aiplatform as aip

from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)

In [4]:
# Initialise the Vertex AI SDK once for the whole notebook session.
aip.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [1]:
import tempfile
# Directory that receives the compiled component YAML files and the compiled
# pipeline JSON referenced by the cells below.
tmpdir = tempfile.gettempdir()

In [5]:
# query from BQ

@component(
    base_image='python:3.9',
    packages_to_install=[
        "pandas==1.5.2", "google-cloud-bigquery==2.34.4", "pyarrow==10.0.1"
    ],
    output_component_file=tmpdir+"/query_from_bq.yaml"
)
def query_from_bq(
    data: Output[Dataset], table_full_name: str,
    project_id: str, partition_month: str = "2022-10-01"
):
    """Export one partition of a BigQuery table to a parquet dataset artifact.

    Args:
        data: Output artifact; the query result is written to ``data.path``
            as parquet, without the DataFrame index.
        table_full_name: Table to read, interpolated into the FROM clause.
        project_id: Project used to create the BigQuery client.
        partition_month: Value matched against the ``partition_month``
            column, formatted ``YYYY-MM-DD``. Defaults to ``"2022-10-01"``,
            the value that used to be hard-coded in the query.

    Raises:
        ValueError: If ``partition_month`` is not a valid ISO date.
    """
    # Imports live inside the function because the component body is executed
    # on its own inside the container image.
    from datetime import date

    from google.cloud import bigquery

    # The value is interpolated into the SQL text below, so accept nothing
    # but a strict YYYY-MM-DD date (raises ValueError otherwise).
    date.fromisoformat(partition_month)

    client = bigquery.Client(project_id)
    sql = f"""SELECT 
    *
FROM `{table_full_name}` 
WHERE (partition_month = '{partition_month}')"""
    df = client.query(sql).to_dataframe()

    df.to_parquet(data.path, index=False)

In [6]:
# preprocessing

@component(
    base_image='python:3.9',
    packages_to_install=[
        "pandas==1.5.2", "pyarrow==10.0.1", "scikit-learn==1.1.3", "joblib==1.2.0", "gcsfs==2022.11.0"
    ],
    output_component_file=tmpdir+"/preprocess.yaml"
)
def preprocess(
    data: Input[Dataset], preprocessed_data: Output[Dataset],
    scaler_uri: str = None
):
    """Select the model columns, fill missing values and optionally scale them.

    Args:
        data: Input artifact; a parquet file read from ``data.path``.
        preprocessed_data: Output artifact; the result is written to
            ``preprocessed_data.path`` as parquet.
        scaler_uri: Optional location of a joblib-serialised scaler, opened
            through gcsfs. When falsy, no scaling is applied.
    """
    import pandas as pd
    import gcsfs, joblib

    id_column = 'msisdn'
    feature_columns = [
        'total_arpu', 'data_usage_in_mb',
        'data_usage_duration',
        'total_topups',
        'ARPU_1m',
        'number_of_topups_1m',
        'total_topups_1m',
        'total_usage_GB_1m',
        'daily_GB_consumption_rate_1m',
        'number_of_topups_2m',
        'total_topups_2m',
        'total_usage_GB_2m'
    ]

    df_data = pd.read_parquet(data.path)
    # fillna() returns a new frame. The previous inplace=True call mutated a
    # column subset of df_data, which is chained assignment and triggers
    # pandas' SettingWithCopyWarning.
    df_preprocessed = df_data[[id_column] + feature_columns].fillna(0)

    if scaler_uri:
        fs = gcsfs.GCSFileSystem()
        with fs.open(scaler_uri, "rb") as f:
            # NOTE(review): joblib.load unpickles the file and can execute
            # arbitrary code; only point scaler_uri at trusted artifacts.
            scaler = joblib.load(f)

        # The scaler only sees the feature columns; the identifier is dropped
        # from the scaled output.
        X_scaled = scaler.transform(df_preprocessed[feature_columns])
        df_preprocessed = pd.DataFrame(X_scaled, columns=feature_columns)

    # NOTE(review): without a scaler the output still contains 'msisdn',
    # whereas the scaled output does not. Confirm the downstream model
    # expects that before relying on the no-scaler path.
    df_preprocessed.to_parquet(preprocessed_data.path)

In [7]:
# model inference

@component(
    base_image='python:3.9',
    packages_to_install=[
        "pandas==1.5.2", "pyarrow==10.0.1", "scikit-learn==1.1.3", "joblib==1.2.0", "gcsfs==2022.11.0"
    ],
    output_component_file=tmpdir+"/predict.yaml"
)
def predict(
    model_uri: str, preprocessed_data: Input[Dataset], 
    original_data: Input[Dataset],
    prediction_result: Output[Dataset]
):
    """Score the preprocessed rows and attach the scores to the original rows.

    Args:
        model_uri: Location of a joblib-serialised model, opened through gcsfs.
        preprocessed_data: Input artifact (parquet) handed to ``model.predict``.
        original_data: Input artifact (parquet) that receives a ``score`` column.
        prediction_result: Output artifact; the scored frame is written to
            ``prediction_result.path`` as parquet.
    """
    import pandas as pd
    import gcsfs, joblib

    # Load the estimator first, then the two frames it is applied to.
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only point model_uri at trusted artifacts.
    gcs = gcsfs.GCSFileSystem()
    with gcs.open(model_uri, "rb") as model_file:
        estimator = joblib.load(model_file)

    features = pd.read_parquet(preprocessed_data.path)
    scored = pd.read_parquet(original_data.path)

    # Scores are attached in the order model.predict returns them.
    scored['score'] = estimator.predict(features)

    scored.to_parquet(prediction_result.path)

In [8]:
# write to BQ

@component(
    base_image='python:3.9',
    # "datetime" was removed from this list: the component never imports it,
    # and on PyPI that name resolves to an unpinned third-party distribution,
    # not the standard-library module.
    packages_to_install=[
        "pandas==1.5.2", "google-cloud-bigquery==2.34.4", "pyarrow==10.0.1"
    ],
    output_component_file=tmpdir+"/write_to_bq.yaml"
)
def write_to_bq(
    data: Input[Dataset], table_full_name: str,
    project_id: str, write_mode: str
):
    """Load a parquet dataset artifact into a BigQuery table.

    Args:
        data: Input artifact; a parquet file read from ``data.path``.
        table_full_name: Destination table handed to the load job.
        project_id: Project used to create the BigQuery client.
        write_mode: Passed as the load job's ``write_disposition``
            (the pipeline uses "WRITE_APPEND" / "WRITE_TRUNCATE").
    """
    import pandas as pd
    from google.cloud import bigquery
    client = bigquery.Client(project_id)

    df_to_store = pd.read_parquet(data.path)

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        write_disposition=write_mode,
    )

    job = client.load_table_from_dataframe(
        df_to_store,
        table_full_name,
        job_config=job_config
    )  # Make an API request.
    # Block until the load finishes so that a failed load fails the step.
    job.result()

In [9]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="creditscoring-sample",
)
def pipeline(
    model_uri: str,
    target_table_fullname: str,
    audience_table_fullname: str,
    project_id: str,
    scaler_uri: str = None,
    write_mode: str = "WRITE_APPEND"
):
    """Batch-scoring pipeline: query -> preprocess -> predict -> write back.

    Args:
        model_uri: Forwarded to the ``predict`` step.
        target_table_fullname: Table the scored rows are written to.
        audience_table_fullname: Table the audience rows are read from.
        project_id: Project used by both BigQuery steps.
        scaler_uri: Optional scaler location forwarded to ``preprocess``.
        write_mode: Write disposition forwarded to ``write_to_bq``.
    """
    # Step 1: pull the audience rows out of BigQuery.
    query_task = query_from_bq(
        project_id=project_id,
        table_full_name=audience_table_fullname,
    )
    raw_rows = query_task.outputs["data"]

    # Step 2: build the model inputs from the raw rows.
    preprocess_task = preprocess(data=raw_rows, scaler_uri=scaler_uri)

    # Step 3: score; the raw rows are passed along so the score can be
    # attached to them.
    predict_task = predict(
        model_uri=model_uri,
        preprocessed_data=preprocess_task.outputs["preprocessed_data"],
        original_data=raw_rows,
    )

    # Step 4: persist the scored rows.
    write_to_bq(
        data=predict_task.outputs["prediction_result"],
        table_full_name=target_table_fullname,
        project_id=project_id,
        write_mode=write_mode,
    )

In [15]:
from kfp.v2 import compiler

# Compile the pipeline definition to the JSON spec that the job-submission
# cell loads from the same path.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=f"{tmpdir}/creditscoring_sample_pipeline.json",
)

In [11]:
# Display name is made unique per run by the timestamp suffix.
DISPLAY_NAME = f"creditscoring_test_{TIMESTAMP}"

job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=f"{tmpdir}/creditscoring_sample_pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        "project_id": PROJECT_ID,
        "target_table_fullname": TARGET_TABLE,
        "model_uri": MODEL_URI,
        # The templated value arrives as text, so the string 'None' stands
        # for "no scaler".
        "scaler_uri": SCALER_URI if SCALER_URI != 'None' else None,
        "audience_table_fullname": AUDIENCE_TABLE,
        "write_mode": "WRITE_TRUNCATE",
    },
)
# sync=False: do not block the notebook cell while the pipeline runs.
job.run(sync=False)

Creating PipelineJob
PipelineJob created. Resource name: projects/960061420307/locations/asia-southeast2/pipelineJobs/creditscoring-sample-20221209072212
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/960061420307/locations/asia-southeast2/pipelineJobs/creditscoring-sample-20221209072212')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/asia-southeast2/pipelines/runs/creditscoring-sample-20221209072212?project=960061420307
PipelineJob projects/960061420307/locations/asia-southeast2/pipelineJobs/creditscoring-sample-20221209072212 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/960061420307/locations/asia-southeast2/pipelineJobs/creditscoring-sample-20221209072212 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/960061420307/locations/asia-southeast2/pipelineJobs/creditscoring-sample-20221209072212 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob proje