![tracker](https://us-central1-vertex-ai-mlops-369716.cloudfunctions.net/pixel-tracking?path=statmike%2Fvertex-ai-mlops%2FDev%2Fnew&file=sklearn-test.ipynb)
<!--- header table --->
<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/statmike/vertex-ai-mlops/blob/main/Dev/new/sklearn-test.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo">
      <br>Run in<br>Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https%3A%2F%2Fraw.githubusercontent.com%2Fstatmike%2Fvertex-ai-mlops%2Fmain%2FDev%2Fnew%2Fsklearn-test.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo">
      <br>Run in<br>Colab Enterprise
    </a>
  </td>      
  <td style="text-align: center">
    <a href="https://github.com/statmike/vertex-ai-mlops/blob/main/Dev/new/sklearn-test.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo">
      <br>View on<br>GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/statmike/vertex-ai-mlops/main/Dev/new/sklearn-test.ipynb">
      <img width="32px" src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo">
      <br>Open in<br>Vertex AI Workbench
    </a>
  </td>
</table>

# Scikit-Learn Workflow

A new template workflow for [scikit-Learn](https://scikit-learn.org/stable/index.html) model training and serving workflows in Vertex AI.

**Prerequisites:**
-  [01 - BigQuery - Table Data Source](../01%20-%20Data%20Sources/01%20-%20BigQuery%20-%20Table%20Data%20Source.ipynb)

---
## Colab Setup

To run this notebook in Colab run the cells in this section.  Otherwise, skip this section.

This cell will authenticate to GCP (follow prompts in the popup).

In [1]:
PROJECT_ID = 'statmike-mlops-349915' # replace with project ID

In [2]:
try:
    from google.colab import auth
    auth.authenticate_user()
    !gcloud config set project {PROJECT_ID}
    print('Colab authorized to GCP')
except Exception:
    print('Not a Colab Environment')
    pass

Not a Colab Environment


---
## Installs

The list `packages` contains tuples of package import names and install names.  If the import name is not found then the install name is used to install quitely for the current user.

In [665]:
# tuples of (import name, install name, min_version)
packages = [
    ('google.cloud.aiplatform', 'google-cloud-aiplatform'),
    ('google.cloud.bigquery', 'google-cloud-bigquery'),
    ('google.cloud.storage', 'google-cloud-storage'),
    ('google.cloud.devtools', 'google-cloud-build'),
    ('bigframes', 'bigframes'),
    ('kfp', 'kfp'),
    ('google.cloud.artifactregistry_v1', 'google-cloud-artifact-registry'),
    ('google_cloud_pipeline_components', 'google-cloud-pipeline-components'),
    ('skl2onnx', 'skl2onnx'),
    ('onnxruntime', 'onnxruntime')
]

import importlib
install = False
for package in packages:
    if not importlib.util.find_spec(package[0]):
        print(f'installing package {package[1]}')
        install = True
        !pip install {package[1]} -U -q --user
    elif len(package) == 3:
        if importlib.metadata.version(package[0]) < package[2]:
            print(f'updating package {package[1]}')
            install = True
            !pip install {package[1]} -U -q --user

## API Enablement

In [77]:
!gcloud services enable aiplatform.googleapis.com
!gcloud services enable artifactregistry.googleapis.com
!gcloud services enable cloudbuild.googleapis.com

### Restart Kernel (If Installs Occured)

After a kernel restart the code submission can start with the next cell after this one.

In [5]:
if install:
    import IPython
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

---
## Setup

Inputs

In [6]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'statmike-mlops-349915'

In [7]:
REGION = 'us-central1'
EXPERIMENT = 'sklearn-workflow'
SERIES = 'dev'

# gcs bucket
GCS_BUCKET = PROJECT_ID

# source data
BQ_PROJECT = PROJECT_ID
BQ_DATASET = 'fraud'
BQ_TABLE = 'fraud_prepped'

Packages

In [666]:
import os
import sklearn.ensemble
import pickle
import importlib
import time
from datetime import datetime
from google.cloud import aiplatform
from google.cloud import bigquery
from google.cloud import artifactregistry_v1
from google.cloud.devtools import cloudbuild_v1
from google.cloud import storage
import bigframes.pandas as bpd

import skl2onnx
import onnxruntime

import kfp

Clients

In [84]:
# vertex ai clients
aiplatform.init(project = PROJECT_ID, location = REGION)

# artifact registry client
ar_client = artifactregistry_v1.ArtifactRegistryClient()

# gcs storage client
gcs = storage.Client(project = PROJECT_ID)
bucket = gcs.bucket(GCS_BUCKET)

# bigquery clients
bq = bigquery.Client(project = PROJECT_ID)
bpd.options.bigquery.project = PROJECT_ID

# cloud build client:
cb_client = cloudbuild_v1.CloudBuildClient()

parameters:

In [10]:
DIR = f"temp/{EXPERIMENT}"
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

In [11]:
RUN_NAME = f'run-{TIMESTAMP}'

In [12]:
SERVICE_ACCOUNT = !gcloud config list --format='value(core.account)' 
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]
SERVICE_ACCOUNT

'1026793852137-compute@developer.gserviceaccount.com'

environment:

In [13]:
if not os.path.exists(DIR):
    os.makedirs(DIR)

---
## Data Source

In [14]:
data = bq.query(f'SELECT * FROM {BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}').to_dataframe()

In [15]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,transaction_id,splits
0,35337,1.092844,-0.01323,1.359829,2.731537,-0.707357,0.873837,-0.79613,0.437707,0.39677,...,-0.167647,0.027557,0.592115,0.219695,0.03697,0.010984,0.0,0,a1b10547-d270-48c0-b902-7a0f735dadc7,TEST
1,60481,1.238973,0.035226,0.063003,0.641406,-0.260893,-0.580097,0.049938,-0.034733,0.405932,...,-0.057718,0.104983,0.537987,0.589563,-0.046207,-0.006212,0.0,0,814c62c8-ade4-47d5-bf83-313b0aafdee5,TEST
2,139587,1.870539,0.211079,0.224457,3.889486,-0.380177,0.249799,-0.577133,0.179189,-0.120462,...,0.180776,-0.060226,-0.228979,0.080827,0.009868,-0.036997,0.0,0,d08a1bfa-85c5-4f1b-9537-1c5a93e6afd0,TEST
3,162908,-3.368339,-1.980442,0.153645,-0.159795,3.847169,-3.516873,-1.209398,-0.292122,0.760543,...,-1.171627,0.214333,-0.159652,-0.060883,1.294977,0.120503,0.0,0,802f3307-8e5a-4475-b795-5d5d8d7d0120,TEST
4,165236,2.180149,0.218732,-2.637726,0.348776,1.063546,-1.249197,0.942021,-0.547652,-0.087823,...,-0.176957,0.563779,0.730183,0.707494,-0.131066,-0.090428,0.0,0,c8a5b93a-1598-4689-80be-4f9f5df0b8ce,TEST


---
## Model Training: Local

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html

In [125]:
train_x = data.loc[data['splits']=='TRAIN', ~data.columns.isin(['transaction_id', 'splits'])]
train_y = train_x.pop('Class').astype('int')

In [126]:
classifier = sklearn.ensemble.HistGradientBoostingClassifier().fit(train_x, train_y)

In [127]:
classifier.score(train_x, train_y)

0.9987327951732212

In [128]:
classifier.predict(train_x[0:5]), train_y[0:5].values

(array([0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0]))

In [129]:
classifier.predict_proba(train_x[0:5])

array([[9.99305131e-01, 6.94868540e-04],
       [9.99305131e-01, 6.94868540e-04],
       [9.99305131e-01, 6.94868540e-04],
       [9.99338476e-01, 6.61524475e-04],
       [9.99338476e-01, 6.61524475e-04]])

In [130]:
classifier.classes_

array([0, 1])

In [131]:
with open(f'{DIR}/model.pkl','wb') as f:
    pickle.dump(classifier, f)

In [132]:
with open(f'{DIR}/model.pkl','rb') as f:
    classifier_import = pickle.load(f)

In [133]:
classifier_import.predict_proba(train_x[0:5])

array([[9.99305131e-01, 6.94868540e-04],
       [9.99305131e-01, 6.94868540e-04],
       [9.99305131e-01, 6.94868540e-04],
       [9.99338476e-01, 6.61524475e-04],
       [9.99338476e-01, 6.61524475e-04]])

In [134]:
predictions = [
    dict(
        classes = list(classifier.classes_),
        scores = list(val)
    )
    for val in list(classifier.predict_proba(train_x[0:5]))
]
predictions

[{'classes': [0, 1], 'scores': [0.999305131460341, 0.000694868539659074]},
 {'classes': [0, 1], 'scores': [0.999305131460341, 0.000694868539659074]},
 {'classes': [0, 1], 'scores': [0.999305131460341, 0.000694868539659074]},
 {'classes': [0, 1], 'scores': [0.9993384755254646, 0.0006615244745354223]},
 {'classes': [0, 1], 'scores': [0.9993384755254646, 0.0006615244745354223]}]

In [137]:
[dict(classes = list(classifier.classes_), scores = p) for p in classifier.predict_proba(train_x[0:5]).tolist()]

[{'classes': [0, 1], 'scores': [0.999305131460341, 0.000694868539659074]},
 {'classes': [0, 1], 'scores': [0.999305131460341, 0.000694868539659074]},
 {'classes': [0, 1], 'scores': [0.999305131460341, 0.000694868539659074]},
 {'classes': [0, 1], 'scores': [0.9993384755254646, 0.0006615244745354223]},
 {'classes': [0, 1], 'scores': [0.9993384755254646, 0.0006615244745354223]}]

---
## Model Training: Vertex AI Training Custom Job
-https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-python_vertex_ai_sdk
-https://cloud.google.com/vertex-ai/docs/training/exporting-model-artifacts#scikit-learn

In [104]:
%%writefile {DIR}/train.py
# imports
from google.cloud import bigquery
import sklearn.ensemble
import argparse
import pickle
import os
import logging

# setup logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

# import argument to local variables
parser = argparse.ArgumentParser()
parser.add_argument('--project_id', dest = 'PROJECT_ID', type=str)
parser.add_argument('--bq_project', dest = 'BQ_PROJECT', type=str)
parser.add_argument('--bq_dataset', dest = 'BQ_DATASET', type=str)
parser.add_argument('--bq_table', dest = 'BQ_TABLE', type=str)
args = parser.parse_args()
logging.info('Finished parsing input parameters.')

# bigquery client
bq = bigquery.Client(project = args.PROJECT_ID)

# download data
data = bq.query(f'SELECT * FROM {args.BQ_PROJECT}.{args.BQ_DATASET}.{args.BQ_TABLE}').to_dataframe()
logging.info('Read data from BQ.')

# prepare training data
train_x = data.loc[data['splits']=='TRAIN', ~data.columns.isin(['transaction_id', 'splits'])]
train_y = train_x.pop('Class').astype('int')
logging.info('Prepared training data.')

# fit model
classifier = sklearn.ensemble.HistGradientBoostingClassifier().fit(train_x, train_y)
logging.info('Model training complete.')

# Use predefined environment variable to establish model directory
storage_path = f"/gcs/{os.environ['AIP_MODEL_DIR'][5:]}" + 'model.pkl'
os.makedirs(os.path.dirname(storage_path), exist_ok=True)

# output the model save files directly to GCS destination
with open(storage_path,'wb') as f:
    pickle.dump(classifier, f)
logging.info('Model saved to GCS.')

Overwriting temp/sklearn-workflow/train.py


https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#scikit-learn

In [105]:
CMDARGS = [
    "--project_id=" + PROJECT_ID,
    "--bq_project=" + BQ_PROJECT,
    "--bq_dataset=" + BQ_DATASET,
    "--bq_table=" + BQ_TABLE
]

TRAIN_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/sklearn-cpu.1-0:latest'
TRAIN_COMPUTE = 'n1-standard-4'
URI = f"gs://{GCS_BUCKET}/{SERIES}/{EXPERIMENT}"

In [106]:
customJob = aiplatform.CustomJob.from_local_script(
    display_name = f'{SERIES}_{EXPERIMENT}_{TIMESTAMP}',
    script_path = f'{DIR}/train.py',
    container_uri = TRAIN_IMAGE,
    args = CMDARGS,
    requirements = ['db-dtypes', 'google-cloud-bigquery'],
    replica_count = 1,
    machine_type = TRAIN_COMPUTE,
    accelerator_count = 0,
    base_output_dir = f"{URI}/models/{TIMESTAMP}",
    staging_bucket = f"{URI}/models/{TIMESTAMP}",
    labels = {'series' : f'{SERIES}', 'experiment' : f'{EXPERIMENT}'}
)

Training script copied to:
gs://statmike-mlops-349915/dev/sklearn-workflow/models/20240225203750/aiplatform-2024-02-25-21:56:30.388-aiplatform_custom_trainer_script-0.1.tar.gz.


In [107]:
customJob.run(
    service_account = SERVICE_ACCOUNT
)

Creating CustomJob
CustomJob created. Resource name: projects/1026793852137/locations/us-central1/customJobs/1880269818137935872
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/1026793852137/locations/us-central1/customJobs/1880269818137935872')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1880269818137935872?project=1026793852137
CustomJob projects/1026793852137/locations/us-central1/customJobs/1880269818137935872 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/1026793852137/locations/us-central1/customJobs/1880269818137935872 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/1026793852137/locations/us-central1/customJobs/1880269818137935872 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/1026793852137/locations/us-central1/customJobs/1880269818137935872 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/1026793852137/locations/us-central1/customJobs

In [108]:
customJob.display_name

'dev_sklearn-workflow_20240225203750'

In [109]:
customJob.resource_name

'projects/1026793852137/locations/us-central1/customJobs/1880269818137935872'

Create hyperlinks to job here:

In [110]:
job_link = f"https://console.cloud.google.com/vertex-ai/locations/{REGION}/training/{customJob.resource_name.split('/')[-1]}/cpu?cloudshell=false&project={PROJECT_ID}"
print(f'Review the Custom Job here:\n{job_link}')

Review the Custom Job here:
https://console.cloud.google.com/vertex-ai/locations/us-central1/training/1880269818137935872/cpu?cloudshell=false&project=statmike-mlops-349915


---
## Register Model: Vertex AI Model Registry

- https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_training_job

**NOTE:** It was very important to use a serving container with the same version of scikit-learn as the training container.  When serving was set to 1.3 after training with 1.0 the following error happens on serving:
> `ModuleNotFoundError: No module named 'sklearn.ensemble._hist_gradient_boosting.loss'"`

In [123]:
DEPLOY_IMAGE = 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest'
DEPLOY_COMPUTE = 'n1-standard-4'

In [164]:
upload_model = True
try:
    model = aiplatform.Model(
        project = PROJECT_ID,
        location = REGION,
        model_name = f'model_{SERIES}_{EXPERIMENT}'
    )
    print('Model already in registry')
    if RUN_NAME in model.version_aliases:
        upload_model = False
        print("This version already loaded, no action taken.")
    else:
        ('Loading model as new default version.')
        parent_model = model.resource_name
except Exception:
    print('This is a new model, creating in model registry')
    parent_model = ''

if upload_model:
    print('Uploading Model now...')
    model = aiplatform.Model.upload(
        display_name = f'{SERIES}_{EXPERIMENT}',
        model_id = f'model_{SERIES}_{EXPERIMENT}',
        parent_model =  parent_model,
        serving_container_image_uri = DEPLOY_IMAGE,
        artifact_uri = f"{URI}/models/{TIMESTAMP}/model",
        is_default_version = True,
        version_aliases = [RUN_NAME],
        version_description = RUN_NAME
    )

This is a new model, creating in model registry
Uploading Model now...
Creating Model
Create Model backing LRO: projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow/operations/489564183696769024
Model created. Resource name: projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@1
To use this Model in another session:
model = aiplatform.Model('projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@1')


In [165]:
model.versioned_resource_name

'projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@1'

---
## Model Serving: Online with Vertex AI Prediction Endpoints

- https://cloud.google.com/vertex-ai/docs/general/deployment
- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Endpoint

### Create/Retrieve Endpoint

In [17]:
endpoints = aiplatform.Endpoint.list(filter = f"display_name={SERIES}")
if endpoints:
    endpoint = endpoints[0]
    print(f'Endpoint Exists: {endpoint.resource_name}')
else:
    endpoint = aiplatform.Endpoint.create(
        display_name = SERIES
    )
    print('Endpoint Created: ')
    
print(f'Review the Endpoint in the Console:\nhttps://console.cloud.google.com/vertex-ai/locations/{REGION}/endpoints/{endpoint.name}?project={PROJECT_ID}')

Endpoint Exists: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240
Review the Endpoint in the Console:
https://console.cloud.google.com/vertex-ai/locations/us-central1/endpoints/8609714806183690240?project=statmike-mlops-349915


In [18]:
endpoint.display_name

'dev'

In [19]:
endpoint.traffic_split

{'8337304203332419584': 100}

In [20]:
deployed_models = endpoint.list_models()
#deployed_models

### Deploy Model To Endpoint

In [170]:
endpoint.deploy(
    model = model,
    deployed_model_display_name = model.display_name,
    traffic_percentage = 100,
    machine_type = DEPLOY_COMPUTE,
    min_replica_count = 1,  
    max_replica_count = 1
) 

Deploying Model projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow to Endpoint : projects/1026793852137/locations/us-central1/endpoints/8609714806183690240
Deploy Endpoint model backing LRO: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240/operations/240740304284549120
Endpoint model deployed. Resource name: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240


In [181]:
endpoint.traffic_split

{'8337304203332419584': 100}

In [173]:
endpoint.list_models()

[id: "8337304203332419584"
 model: "projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow"
 display_name: "dev_sklearn-workflow"
 create_time {
   seconds: 1708909118
   nanos: 730301000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "n1-standard-4"
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 model_version_id: "1"]

In [175]:
model.resource_name

'projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow'

In [177]:
model.version_id

'1'

In [178]:
for deployed_model in endpoint.list_models():
    if deployed_model.id in endpoint.traffic_split:
        print(f"Model {deployed_model.display_name} with version {deployed_model.model_version_id} has traffic = {endpoint.traffic_split[deployed_model.id]}")
    else:
        endpoint.undeploy(deployed_model_id = deployed_model.id)
        print(f"Undeploying {deployed_model.display_name} with version {deployed_model.model_version_id} because it has no traffic.")

Model dev_sklearn-workflow with version 1 has traffic = 100


In [179]:
endpoint.traffic_split

{'8337304203332419584': 100}

### Get Predictions
- https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions

In [189]:
test_x = data.loc[data['splits']=='TEST', ~data.columns.isin(['transaction_id', 'splits'])]
test_y = test_x.pop('Class').astype('int')

instances = test_x.values.tolist()

In [190]:
test_x.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [191]:
instances[0]

[35337,
 1.0928441854981998,
 -0.0132303486713432,
 1.35982868199426,
 2.7315370965921004,
 -0.707357349219652,
 0.8738370029866129,
 -0.7961301510622031,
 0.437706509544851,
 0.39676985012996396,
 0.587438102569443,
 -0.14979756231827498,
 0.29514781622888103,
 -1.30382621882143,
 -0.31782283120234495,
 -2.03673231037199,
 0.376090905274179,
 -0.30040350116459497,
 0.433799615590844,
 -0.145082264348681,
 -0.240427548108996,
 0.0376030733329398,
 0.38002620963091405,
 -0.16764742731151097,
 0.0275573495476881,
 0.59211469704354,
 0.219695164116351,
 0.0369695108704894,
 0.010984441006191,
 0.0]

In [107]:
endpoint.predict(instances = instances[0:1])

Prediction(predictions=[0.0], deployed_model_id='8337304203332419584', metadata=None, model_version_id='1', model_resource_name='projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow', explanations=None)

In [108]:
test_y.ne(0).idxmax()

53

In [109]:
endpoint.predict(instances = instances[53:54])

Prediction(predictions=[1.0], deployed_model_id='8337304203332419584', metadata=None, model_version_id='1', model_resource_name='projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow', explanations=None)

## Model Serving: Batch With Vertex AI Prediction Batch Jobs

- https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#bigquery
- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_batch_predict

Here, we want to filter to rows with `splits = EVAL` and exclude columns.  This will use the gapic version of the API:
- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.types.BatchPredictionJob
- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1.services.job_service.JobServiceClient#google_cloud_aiplatform_v1_services_job_service_JobServiceClient_create_batch_prediction_job

In [110]:
client_options = {"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
jobs_client = aiplatform.gapic.JobServiceClient(client_options = client_options)

In [209]:
batch_prediction_job = aiplatform.gapic.BatchPredictionJob(
    display_name = f'{SERIES}_{EXPERIMENT}',
    model = model.versioned_resource_name,
    input_config = dict(
        instances_format = 'bigquery',
        bigquery_source = dict(input_uri = f'bq://{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}')
    ),
    output_config = dict(
        predictions_format = 'bigquery',
        bigquery_destination = dict(output_uri = f'bq://{BQ_PROJECT}.{BQ_DATASET}')
    ),
    dedicated_resources = dict(
        machine_spec = dict(machine_type = DEPLOY_COMPUTE),
        starting_replica_count = 10,
        max_replica_count = 10
    ),
    instance_config = dict(
        instance_type = 'array',
        included_fields = list(train_x.columns),
        #excluded_fields = ['Class', 'splits', 'transaction_id']
    )
)

In [210]:
bqBatchJob = jobs_client.create_batch_prediction_job(
    parent = f'projects/{PROJECT_ID}/locations/{REGION}',
    batch_prediction_job = batch_prediction_job
)

In [215]:
state = jobs_client.get_batch_prediction_job(
    name = bqBatchJob.name
).state
state

<JobState.JOB_STATE_RUNNING: 3>

In [235]:
state, state.value

(<JobState.JOB_STATE_RUNNING: 3>, 3)

In [228]:
state._member_names_

['JOB_STATE_UNSPECIFIED',
 'JOB_STATE_QUEUED',
 'JOB_STATE_PENDING',
 'JOB_STATE_RUNNING',
 'JOB_STATE_SUCCEEDED',
 'JOB_STATE_FAILED',
 'JOB_STATE_CANCELLING',
 'JOB_STATE_CANCELLED',
 'JOB_STATE_PAUSED',
 'JOB_STATE_EXPIRED',
 'JOB_STATE_UPDATING',
 'JOB_STATE_PARTIALLY_SUCCEEDED']

In [238]:
while state.value <= 3:
    print('Checking again in 1 Minute...')
    time.sleep(60)
    state = jobs_client.get_batch_prediction_job(
        name = bqBatchJob.name
    ).state

Checking again in 1 Minute...


In [239]:
state.name

'JOB_STATE_SUCCEEDED'

In [242]:
bqBatchJob = jobs_client.get_batch_prediction_job(
    name = bqBatchJob.name
)
bqBatchJob.output_info

bigquery_output_dataset: "bq://statmike-mlops-349915.fraud"
bigquery_output_table: "predictions_2024_02_26T08_04_51_229Z_720"

In [279]:
str(bqBatchJob.end_time - bqBatchJob.start_time)

'0:13:48.626896'

In [243]:
bq_table = f"{bqBatchJob.output_info.bigquery_output_dataset.split('bq://')[-1]}.{bqBatchJob.output_info.bigquery_output_table}"
bq_table

'statmike-mlops-349915.fraud.predictions_2024_02_26T08_04_51_229Z_720'

In [244]:
batch_predictions = bq.query(f'''
SELECT *
FROM `{bq_table}`
''').to_dataframe()

In [250]:
batch_predictions.loc[(batch_predictions['prediction'] == '0') & (batch_predictions['Class'] == 1)].head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24,V25,V26,V27,V28,Amount,Class,transaction_id,splits,prediction
143692,140293,0.951025,3.252926,-5.039105,4.632411,3.014501,-1.34957,0.98094,-1.819539,-2.099049,...,-1.185942,-1.286177,0.000365,0.169662,0.108276,0.77,1,d29d9996-2d90-4ad6-9cd9-e744424b81e0,TRAIN,0
143693,54846,-2.986466,-0.000891,0.605887,0.338338,0.685448,-1.581954,0.504206,-0.233403,0.636768,...,0.355065,0.448552,0.19349,1.214588,-0.013923,1.79,1,b949175e-d019-4637-b02f-8d0f8cc6c51c,TRAIN,0
143694,87202,-0.41982,-1.155978,-2.092516,2.78675,0.736297,-0.167292,1.600027,-0.117427,-0.796954,...,0.516131,-0.602941,-0.305024,-0.021363,0.129096,451.27,1,d0cfc4bf-40be-468e-b80b-403e5219136d,TRAIN,0
143695,139107,-4.6665,-3.95232,0.206094,5.153525,5.229469,0.93904,-0.635033,-0.704506,-0.234786,...,-0.759673,-0.502304,0.630639,-0.51388,0.729526,22.47,1,7a960a18-0a2f-4351-a148-7978c2bb36b0,TRAIN,0
143696,102318,-1.020632,1.496959,-4.490937,1.836727,0.627318,-2.735569,-1.546274,0.459822,-0.682741,...,-0.1555,0.412166,-0.22008,0.392338,-0.020089,22.04,1,f3b8a1b7-0000-4231-b541-52996821a6ad,TRAIN,0


---
# Challenge: Customize Prediction Response

The prediction responses only include the predicted value of the classification.  If we want the probability for each class then we need to customize the prediction response.  

This means a customized container to accompany the model that will format and respond with the desired prediction format. There are several ways to accomplish this custom container build:
- Build it following the requirements here: https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements
- Use a prebuilt container with options for customization like NVIDIA Triton Inference Server: https://cloud.google.com/vertex-ai/docs/predictions/using-nvidia-triton
- Use a helper to build a container like Custom Prediction Routines: https://cloud.google.com/vertex-ai/docs/predictions/custom-prediction-routines


## Setup Artifact Registry

[Artifact registry](https://cloud.google.com/artifact-registry/docs) organizes artifacts with repositories.  Each repository contains packages and is designated to hold a partifcular format of package: Docker images, Python Packages and [others](https://cloud.google.com/artifact-registry/docs/supported-formats#package).  There is even a registry type specifically for [Kubeflow pipeline templates](https://cloud.google.com/artifact-registry/docs/kfp?hl=en).

### List Repositories

This may be empty if no repositories have been created for this project

In [79]:
for repo in ar_client.list_repositories(parent = f'projects/{PROJECT_ID}/locations/{REGION}'):
    print(repo.name)

projects/statmike-mlops-349915/locations/us-central1/repositories/gcf-artifacts
projects/statmike-mlops-349915/locations/us-central1/repositories/statmike-mlops-349915
projects/statmike-mlops-349915/locations/us-central1/repositories/statmike-mlops-349915-docker
projects/statmike-mlops-349915/locations/us-central1/repositories/statmike-mlops-349915-python


### Create Docker Image Repository

Create an Artifact Registry Repository to hold Docker Images created by this notebook.  First, check to see if it is already created by a previous run and retrieve it if it has.  Otherwise, create!

In [80]:
docker_repo = None
for repo in ar_client.list_repositories(parent = f'projects/{PROJECT_ID}/locations/{REGION}'):
    if f'{PROJECT_ID}' == repo.name.split('/')[-1]:
        docker_repo = repo
        print(f'Retrieved existing repo: {docker_repo.name}')

if not docker_repo:
    operation = ar_client.create_repository(
        request = artifactregistry_v1.CreateRepositoryRequest(
            parent = f'projects/{PROJECT_ID}/locations/{REGION}',
            repository_id = f'{PROJECT_ID}',
            repository = artifactregistry_v1.Repository(
                description = f'A repository for the {SERIES} series that holds docker images.',
                name = f'{PROJECT_ID}',
                format_ = artifactregistry_v1.Repository.Format.DOCKER,
                labels = {'series': SERIES}
            )
        )
    )
    print('Creating Repository ...')
    docker_repo = operation.result()
    print(f'Completed creating repo: {docker_repo.name}')

Retrieved existing repo: projects/statmike-mlops-349915/locations/us-central1/repositories/statmike-mlops-349915


In [81]:
docker_repo.name, docker_repo.format_.name

('projects/statmike-mlops-349915/locations/us-central1/repositories/statmike-mlops-349915',
 'DOCKER')

In [82]:
REPOSITORY = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{docker_repo.name.split('/')[-1]}"

In [83]:
REPOSITORY

'us-central1-docker.pkg.dev/statmike-mlops-349915/statmike-mlops-349915'

---
## Model Serving: Custom Container

It is really not all that hard with Python!

For this example [FastAPI](https://fastapi.tiangolo.com/) is used.

### Create Application Files

```
|__ Dockerfile
|__ requirements.txt
|__ app
    |__ main.py
    |__ prestart.sh
```

In [94]:
if not os.path.exists(DIR + '/custom/SRC/app'):
    os.makedirs(DIR + '/custom/SRC/app')

In [432]:
%%writefile {DIR}/custom/SRC/app/main.py
from fastapi import FastAPI, Request
import json
import os
import numpy as np
import pickle
from google.cloud import storage

app = FastAPI()
gcs_client = storage.Client()

# Download the model file from Cloud Storage bucket
with open("model.pkl", 'wb') as model_f:
    gcs_client.download_blob_to_file(
            f"{os.environ['AIP_STORAGE_URI']}/model.pkl", model_f
        )
    
# Load the scikit-learn model
with open('model.pkl','rb') as f:
    _model = pickle.load(f)

classes = [str(c) for c in list(_model.classes_)]
    
# Define function for health route
@app.get(os.environ['AIP_HEALTH_ROUTE'], status_code=200)
def health():
    return {}

# Define function for prediction route
@app.post(os.environ['AIP_PREDICT_ROUTE'])
async def predict(request: Request):
    # await the request
    body = await request.json()
    
    # parse the request
    instances = body["instances"]
    inputs = np.asarray(instances)
    
    # get predicted probabilities
    predictions = _model.predict_proba(inputs).tolist()
    
    # format predictions:
    preds = [dict(classes = classes, scores = p) for p in predictions]
    
    # following requires a prediction_schema so the model understands the output format:
    #return {"predictions": preds}
    # this returns just the predicted probabilities:
    return {"predictions": predictions}

Overwriting temp/sklearn-workflow/custom/SRC/app/main.py


In [433]:
%%writefile {DIR}/custom/SRC/app/__init__.py
# init file

Overwriting temp/sklearn-workflow/custom/SRC/app/__init__.py


In [434]:
%%writefile {DIR}/custom/SRC/requirements.txt
numpy
scikit-learn==1.0
google-cloud-storage

Overwriting temp/sklearn-workflow/custom/SRC/requirements.txt


In [435]:
%%writefile {DIR}/custom/SRC/app/prestart.sh
#!/bin/bash
export PORT=$AIP_HTTP_PORT

Overwriting temp/sklearn-workflow/custom/SRC/app/prestart.sh


In [436]:
%%writefile {DIR}/custom/SRC/Dockerfile
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9

COPY ./app /app

COPY ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
  && pip install --no-cache-dir -r requirements.txt

Overwriting temp/sklearn-workflow/custom/SRC/Dockerfile


In [437]:
bucket.blob(f'{SERIES}/{EXPERIMENT}/custom/SRC/app/main.py').upload_from_filename(f'{DIR}/custom/SRC/app/main.py')
bucket.blob(f'{SERIES}/{EXPERIMENT}/custom/SRC/app/__init__.py').upload_from_filename(f'{DIR}/custom/SRC/app/__init__.py')
bucket.blob(f'{SERIES}/{EXPERIMENT}/custom/SRC/requirements.txt').upload_from_filename(f'{DIR}/custom/SRC/requirements.txt')
bucket.blob(f'{SERIES}/{EXPERIMENT}/custom/SRC/app/prestart.sh').upload_from_filename(f'{DIR}/custom/SRC/app/prestart.sh')
bucket.blob(f'{SERIES}/{EXPERIMENT}/custom/SRC/Dockerfile').upload_from_filename(f'{DIR}/custom/SRC/Dockerfile')

### Build Application Container

Use the Cloud Build client to construct and run the build instructions. Here the files collected in GCS are copied to the build instance, then the Docker build is run in the folder with the `Dockerfile`. The resulting image is pushed to Artifact Registry (setup above).

In [438]:
# setup the build config with empty list of steps - these will be added sequentially
build = cloudbuild_v1.Build(
    steps = []
)
# retrieve the source
build.steps.append(
    {
        'name': 'gcr.io/cloud-builders/gsutil',
        'args': ['cp', '-r', f'gs://{GCS_BUCKET}/{SERIES}/{EXPERIMENT}/custom/SRC/*', '/workspace']
    }
)
# docker build
build.steps.append(
    {
        'name': 'gcr.io/cloud-builders/docker',
        'args': ['build', '-t', f'{REPOSITORY}/{EXPERIMENT}_custom', '/workspace']
    }    
)
# docker push
build.images = [f"{REPOSITORY}/{EXPERIMENT}_custom"]

In [439]:
build

steps {
  name: "gcr.io/cloud-builders/gsutil"
  args: "cp"
  args: "-r"
  args: "gs://statmike-mlops-349915/dev/sklearn-workflow/custom/SRC/*"
  args: "/workspace"
}
steps {
  name: "gcr.io/cloud-builders/docker"
  args: "build"
  args: "-t"
  args: "us-central1-docker.pkg.dev/statmike-mlops-349915/statmike-mlops-349915/sklearn-workflow_custom"
  args: "/workspace"
}
images: "us-central1-docker.pkg.dev/statmike-mlops-349915/statmike-mlops-349915/sklearn-workflow_custom"

In [440]:
operation = cb_client.create_build(
    project_id = PROJECT_ID,
    build = build
)

In [441]:
response = operation.result()
response.status, response.artifacts

(<Status.SUCCESS: 3>,
 images: "us-central1-docker.pkg.dev/statmike-mlops-349915/statmike-mlops-349915/sklearn-workflow_custom")

In [442]:
print(f"Review the Custom Container with Artifact Registry in the Google Cloud Console:\nhttps://console.cloud.google.com/artifacts/docker/{PROJECT_ID}/{REGION}/{PROJECT_ID}?project={PROJECT_ID}")

Review the Custom Container with Artifact Registry in the Google Cloud Console:
https://console.cloud.google.com/artifacts/docker/statmike-mlops-349915/us-central1/statmike-mlops-349915?project=statmike-mlops-349915


### Upload Model as New Version

In [443]:
model.uri

'gs://statmike-mlops-349915/dev/sklearn-workflow/models/20240225203750/model'

In [387]:
prediction_schema = f"""title: TabularClassification
description: 'Classification results.'
type: object
properties:
  classes:
    type: array
    items:
      type: string
      enum:
      - '0'
      - '1'
    description: 'The class being classified, contains all possible values'
  scores:
    type: array
    items:
      type: number
      format: float
      minimum: 0.0
      maximum: 1.0
    description: 'The model''s confidence in each class being correct, higher value means higher confidence.  The N-th score corresponds to the N-th class in classes'
x-batchpredict-csv-classification-labels: classes
x-batchpredict-csv-classification-scores: scores
x-target-column-name: Class      
"""
bucket.blob(f"{model.uri.split(GCS_BUCKET + '/')[-1]}/prediction.yaml").upload_from_string(prediction_schema)

In [444]:
custom_model = aiplatform.Model.upload(
    display_name = f'{SERIES}_{EXPERIMENT}',
    model_id = f'model_{SERIES}_{EXPERIMENT}',
    parent_model =  model.resource_name,
    #prediction_schema_uri = f'{model.uri}/prediction.yaml',
    serving_container_image_uri = f"{REPOSITORY}/{EXPERIMENT}_custom",
    serving_container_predict_route="/predict",
    serving_container_health_route="/health",
    artifact_uri = model.uri,
    is_default_version = True,
    version_aliases = ['custom-container'],
    version_description = 'custom_container'
)

Creating Model
Create Model backing LRO: projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow/operations/7980520411060240384
Model created. Resource name: projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@14
To use this Model in another session:
model = aiplatform.Model('projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@14')


In [445]:
custom_model.to_dict()

{'name': 'projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@14',
 'displayName': 'dev_sklearn-workflow',
 'predictSchemata': {},
 'metadata': None,
 'containerSpec': {'imageUri': 'us-central1-docker.pkg.dev/statmike-mlops-349915/statmike-mlops-349915/sklearn-workflow_custom',
  'predictRoute': '/predict',
  'healthRoute': '/health'},
 'supportedDeploymentResourcesTypes': ['DEDICATED_RESOURCES'],
 'supportedInputStorageFormats': ['jsonl',
  'bigquery',
  'csv',
  'tf-record',
  'tf-record-gzip',
  'file-list'],
 'supportedOutputStorageFormats': ['jsonl', 'bigquery'],
 'createTime': '2024-02-26T00:58:28.590188Z',
 'updateTime': '2024-03-15T17:06:55.757726Z',
 'etag': 'AMEw9yMyXAWjCun4UUJlFNmS7LE3hd3taZ4JRdXZZqkuGnpZw3YanG7Lccw881RgcUhI',
 'supportedExportFormats': [{'id': 'custom-trained',
   'exportableContents': ['ARTIFACT', 'IMAGE']}],
 'artifactUri': 'gs://statmike-mlops-349915/dev/sklearn-workflow/models/20240225203750/model',
 'versionId': '14',
 'versi

### Deploy New Version To Vertex AI Prediction Endpoint

In [446]:
endpoint.deploy(
    model = custom_model,
    deployed_model_display_name = custom_model.display_name,
    traffic_percentage = 100,
    machine_type = DEPLOY_COMPUTE,
    min_replica_count = 1,  
    max_replica_count = 1
) 

Deploying Model projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow to Endpoint : projects/1026793852137/locations/us-central1/endpoints/8609714806183690240
Deploy Endpoint model backing LRO: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240/operations/833448539911618560
Endpoint model deployed. Resource name: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240


In [447]:
endpoint.traffic_split

{'5365983980430557184': 100}

In [448]:
endpoint.list_models()

[dedicated_resources {
   machine_spec {
     machine_type: "n1-standard-4"
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 id: "2426822273617887232"
 model: "projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow"
 model_version_id: "13"
 display_name: "dev_sklearn-workflow"
 create_time {
   seconds: 1710467845
   nanos: 785726000
 },
 dedicated_resources {
   machine_spec {
     machine_type: "n1-standard-4"
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 id: "5365983980430557184"
 model: "projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow"
 model_version_id: "14"
 display_name: "dev_sklearn-workflow"
 create_time {
   seconds: 1710522537
   nanos: 93696000
 }]

In [449]:
custom_model.resource_name

'projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow'

In [450]:
custom_model.version_id

'14'

In [451]:
for deployed_model in endpoint.list_models():
    if deployed_model.id in endpoint.traffic_split:
        print(f"Model {deployed_model.display_name} with version {deployed_model.model_version_id} has traffic = {endpoint.traffic_split[deployed_model.id]}")
    else:
        endpoint.undeploy(deployed_model_id = deployed_model.id)
        print(f"Undeploying {deployed_model.display_name} with version {deployed_model.model_version_id} because it has no traffic.")

Undeploying Endpoint model: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240
Undeploy Endpoint model backing LRO: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240/operations/5333670467561586688
Endpoint model undeployed. Resource name: projects/1026793852137/locations/us-central1/endpoints/8609714806183690240
Undeploying dev_sklearn-workflow with version 13 because it has no traffic.
Model dev_sklearn-workflow with version 14 has traffic = 100


In [452]:
endpoint.traffic_split

{'5365983980430557184': 100}

### Get Predictions

In [453]:
endpoint.predict(instances = instances[0:2])

Prediction(predictions=[[0.9994812225587977, 0.000518777441202357], [0.9994363774143662, 0.0005636225856337875]], deployed_model_id='5365983980430557184', metadata=None, model_version_id='14', model_resource_name='projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow', explanations=None)

### Get Batch Predictions

In [454]:
client_options = {"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
jobs_client = aiplatform.gapic.JobServiceClient(client_options = client_options)

In [457]:
batch_prediction_job = aiplatform.gapic.BatchPredictionJob(
    display_name = f'{SERIES}_{EXPERIMENT}',
    model = custom_model.versioned_resource_name,
    input_config = dict(
        instances_format = 'bigquery',
        bigquery_source = dict(input_uri = f'bq://{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}')
    ),
    output_config = dict(
        predictions_format = 'bigquery',
        bigquery_destination = dict(output_uri = f'bq://{BQ_PROJECT}.{BQ_DATASET}')
    ),
    dedicated_resources = dict(
        machine_spec = dict(machine_type = DEPLOY_COMPUTE),
        starting_replica_count = 10,
        max_replica_count = 10
    ),
    instance_config = dict(
        instance_type = 'array',
        included_fields = list(train_x.columns),
        #excluded_fields = ['Class', 'splits', 'transaction_id']
    )
)

In [458]:
bqBatchJob = jobs_client.create_batch_prediction_job(
    parent = f'projects/{PROJECT_ID}/locations/{REGION}',
    batch_prediction_job = batch_prediction_job
)

In [459]:
state = jobs_client.get_batch_prediction_job(
    name = bqBatchJob.name
).state
state, state.value, state.name

(<JobState.JOB_STATE_RUNNING: 3>, 3, 'JOB_STATE_RUNNING')

In [460]:
state._member_names_

['JOB_STATE_UNSPECIFIED',
 'JOB_STATE_QUEUED',
 'JOB_STATE_PENDING',
 'JOB_STATE_RUNNING',
 'JOB_STATE_SUCCEEDED',
 'JOB_STATE_FAILED',
 'JOB_STATE_CANCELLING',
 'JOB_STATE_CANCELLED',
 'JOB_STATE_PAUSED',
 'JOB_STATE_EXPIRED',
 'JOB_STATE_UPDATING',
 'JOB_STATE_PARTIALLY_SUCCEEDED']

In [461]:
while state.value <= 3:
    print('Checking again in 1 Minute...')
    time.sleep(60)
    state = jobs_client.get_batch_prediction_job(
        name = bqBatchJob.name
    ).state
    print(f'State {state.value}: {state.name}')

Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
State 3: JOB_STATE_RUNNING
Checking again in 1 Minute...
S

In [462]:
bqBatchJob = jobs_client.get_batch_prediction_job(
    name = bqBatchJob.name
)
bqBatchJob.output_info

bigquery_output_dataset: "bq://statmike-mlops-349915.fraud"
bigquery_output_table: "predictions_2024_03_15T10_58_12_223Z_830"

In [463]:
str(bqBatchJob.end_time - bqBatchJob.start_time)

'0:16:20.704409'

In [572]:
bqBatchJob.output_info.bigquery_output_table

'predictions_2024_03_15T10_58_12_223Z_830'

In [464]:
bq_table = f"{bqBatchJob.output_info.bigquery_output_dataset.split('bq://')[-1]}.{bqBatchJob.output_info.bigquery_output_table}"
bq_table

'statmike-mlops-349915.fraud.predictions_2024_03_15T10_58_12_223Z_830'

In [479]:
batch_predictions = bq.query(f'''
SELECT * EXCEPT(prediction),
    ARRAY(SELECT * FROM UNNEST(SPLIT(SUBSTR(prediction, 2, LENGTH(prediction) - 2)))) AS prediction
FROM `{bq_table}`
''').to_dataframe()

In [480]:
batch_predictions.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24,V25,V26,V27,V28,Amount,Class,transaction_id,splits,prediction
0,93879,-13.086519,7.352148,-18.256576,10.648505,-11.731476,-3.659167,-14.873658,8.810473,-5.418204,...,0.519952,-0.743909,-0.167808,-2.4983,-0.711066,30.31,1,385c59d4-c5cd-40f5-93e9-3435791a7209,TRAIN,"[0.0, 1.0]"
1,93879,-13.086519,7.352148,-18.256576,10.648505,-11.731476,-3.659167,-14.873658,8.810473,-5.418204,...,0.519952,-0.743909,-0.167808,-2.4983,-0.711066,30.31,1,21f430df-d4e5-421b-9952-634fa9c8de56,TRAIN,"[0.0, 1.0]"
2,55311,-6.159607,1.468713,-6.850888,5.174706,-2.986704,-1.795054,-6.545072,2.621236,-3.60587,...,-0.568731,0.582825,-0.042583,0.95113,0.158996,0.83,1,618287b3-d508-4601-9454-52746324014a,TRAIN,"[0.0, 1.0]"
3,55614,-7.347955,2.397041,-7.572356,5.177819,-2.854838,-1.795239,-8.783235,0.437157,-3.740598,...,-0.503722,-0.310933,-0.163986,1.197895,0.378187,0.83,1,e5486500-92f9-4aab-b37e-109f677ca5ca,TRAIN,"[0.0, 1.0]"
4,67150,-1.824295,0.403327,-1.994122,2.756558,-3.139064,0.408185,-1.209045,1.095634,-1.447225,...,-0.145493,0.049326,0.831065,0.332421,0.252713,489.71,1,1fea87d8-9a38-416d-ac96-b9dc6d36af80,TRAIN,"[0.0, 1.0]"


In [483]:
batch_predictions['prediction'].iloc[100]

array(['0.9994436978921893', ' 0.000556302107810608'], dtype=object)

---
## Model Serving: Batch With BigQuery ML

### Convert Model To ONNX

- Using [sklearn-onnx](https://onnx.ai/sklearn-onnx/)
    - All the available data types in [the source](https://github.com/onnx/sklearn-onnx/blob/main/skl2onnx/common/data_types.py)
    - more on zipmap option [here](https://onnx.ai/sklearn-onnx/auto_tutorial/plot_dbegin_options_zipmap.html)

In [732]:
bucket = gcs.bucket(GCS_BUCKET)
blob = bucket.blob(custom_model.uri.split(f'{GCS_BUCKET}/')[-1] + '/model.pkl')

In [733]:
local_custom_model = pickle.loads(blob.download_as_string())

In [734]:
local_custom_model.predict(train_x[0:50])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [735]:
initial_type = [('float_array', skl2onnx.common.data_types.FloatTensorType([None, train_x.shape[1]]))]

In [737]:
onnx_custom_model = skl2onnx.convert_sklearn(
    local_custom_model,
    initial_types = initial_type,
    target_opset=12,
    options = {id(local_custom_model): {'zipmap': False}}
)

In [738]:
bucket.blob(custom_model.uri.split(f'{GCS_BUCKET}/')[-1] + '/model.onnx').upload_from_string(onnx_custom_model.SerializeToString())

### Local Test of ONNX Model

- With [onnxruntime](https://onnxruntime.ai/)

In [739]:
local_onnx = onnxruntime.InferenceSession(onnx_custom_model.SerializeToString())

In [740]:
train_x.iloc[0:1].to_numpy().astype(np.float32)

array([[ 2.8120000e+03, -6.3340300e-01,  9.6361601e-01,  2.4949455e+00,
         2.0990510e+00, -4.0433067e-01,  2.3586158e-01, -7.9319049e-03,
         2.1144152e-01, -2.0981681e-01,  3.0829760e-01, -1.2049923e+00,
        -4.7470781e-01, -6.5406358e-01, -4.7459912e-01, -4.2841780e-01,
         5.3665149e-01, -3.8065460e-01,  2.8650539e-02, -6.8796945e-01,
        -1.7498475e-01,  1.4675528e-02,  1.6278177e-02, -6.1462473e-02,
         3.5519636e-01, -1.7908551e-01, -1.0694742e-01, -2.1503925e-01,
         5.0697796e-02,  0.0000000e+00]], dtype=float32)

In [741]:
local_onnx.run(None, {'float_array': train_x.iloc[0:1].to_numpy().astype(np.float32)})

[array([0], dtype=int64), array([[9.994363e-01, 5.636811e-04]], dtype=float32)]

### Import Model With BigQuery ML

In [742]:
query = f"""
CREATE OR REPLACE MODEL `{BQ_PROJECT}.{BQ_DATASET}.{SERIES}-{EXPERIMENT}-onnx`
    OPTIONS(
        MODEL_TYPE = 'ONNX',
        MODEL_PATH = '{custom_model.uri}/*'
    )
"""
print(query)


CREATE OR REPLACE MODEL `statmike-mlops-349915.fraud.dev-sklearn-workflow-onnx`
    OPTIONS(
        MODEL_TYPE = 'ONNX',
        MODEL_PATH = 'gs://statmike-mlops-349915/dev/sklearn-workflow/models/20240225203750/model/*'
    )



In [743]:
job = bq.query(query = query)
job.result()
(job.ended-job.started).total_seconds()

6.693

### Get Predictions

In [745]:
query = f"""
SELECT *
FROM ML.PREDICT (MODEL `{BQ_PROJECT}.{BQ_DATASET}.{SERIES}-{EXPERIMENT}-onnx`,(
    SELECT ARRAY[{', '.join(list(train_x.columns))}] as float_array
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
    WHERE splits = 'TEST'
    LIMIT 1)
  )
"""
pred = bq.query(query = query).to_dataframe()
pred

Unnamed: 0,label,probabilities,float_array
0,0,"[0.9994812607765198, 0.0005187392234802246]","[35337.0, 1.0928441854981998, -0.0132303486713..."


---
## Model Evaluations: With SDK

- https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.Model#google_cloud_aiplatform_Model_evaluate

In [640]:
custom_model.uri

'gs://statmike-mlops-349915/dev/sklearn-workflow/models/20240225203750/model'

In [652]:
bqjob = bq.query(f'''
CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_prediction_input` AS
SELECT Class, {', '.join(list(train_x.columns))}
FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
''')
bqjob.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f47a77c49a0>

In [653]:
evaluateJob = custom_model.evaluate(
    prediction_type = 'classification',
    target_field_name = 'Class',
    bigquery_source_uri = f'bq://{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_prediction_input',
    bigquery_destination_output_uri = f'bq://{BQ_PROJECT}.{BQ_DATASET}',
    class_labels = [0, 1],
    prediction_label_column = '',
    prediction_score_column = 'prediction',
    staging_bucket = f'{custom_model.uri}/evaluations'
)

Created PipelineJob for your Model Evaluation. View it in the console: https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/evaluation-classification-pipeline-20240315235632?project=1026793852137


In [654]:
evaluateJob.wait()

PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/evaluation-classification-pipeline-20240315235632 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/evaluation-classification-pipeline-20240315235632 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/evaluation-classification-pipeline-20240315235632 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/evaluation-classification-pipeline-20240315235632 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/evaluation-classification-pipeline-20240315235632 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/evaluation-classification-pipeline-20240315235632 current state:
Pipel

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [model-batch-predict].; Job (project_id = statmike-mlops-349915, job_id = 6105911600336601088) is failed due to the above error.; Failed to handle the job: {project_number = 1026793852137, job_id = 6105911600336601088}"


In [None]:
evaluation = evaluateJob.get_model_evaluation()
evaluation.metrics

---
## Model Evaluations: With Pipeline Components

Evaluations for the batch prediction job run above.  It could also be included in the pipeline using the [ModelBatchPredictOp](https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.8.0/api/v1/batch_predict_job.html).

- https://cloud.google.com/vertex-ai/docs/pipelines/components-introduction
- https://cloud.google.com/vertex-ai/docs/pipelines/use-components#use_an_importer_node
- https://cloud.google.com/vertex-ai/docs/evaluation/introduction?
- https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.8.0/api/v1/model_evaluation.html

In [615]:
kfp.__version__

'2.7.0'

In [617]:
import google_cloud_pipeline_components
google_cloud_pipeline_components.__version__

'2.10.0'

- https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/_implementation/model/__init__.py

In [746]:
from google_cloud_pipeline_components._implementation.model.get_model.component import get_vertex_model as GetVertexModelOp

Check out the new component, ModelGetOp: https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-2.10.0/api/v1/model.html#v1.model.ModelGetOp

In [747]:
from google_cloud_pipeline_components.v1.model import ModelGetOp

In [648]:
import kfp
from google_cloud_pipeline_components.types import artifact_types
#from google_cloud_pipeline_components.v1.model_evaluation import (ModelEvaluationClassificationOp, ModelImportEvaluationOp)

In [630]:
@kfp.dsl.pipeline(
    name = f'{SERIES}-{EXPERIMENT}-evaluation',
    description = 'A simple pipeline for model evaluation',
    pipeline_root = f'gs://{GCS_BUCKET}/{SERIES}/{EXPERIMENT}/pipeline_root'
)
def eval_pipeline(
    project: str,
    region: str,
    root_dir: str,
    model_name: str,
    bq_project: str,
    bq_dataset: str,
    bq_table: str,
    class_labels: list,
    var_target: str  
):
    vertex_model = kfp.dsl.importer(
        artifact_uri = f'https://{region}-aiplatform.googleapis.com/v1/{model_name}',
        artifact_class = artifact_types.VertexModel,
        metadata = dict(
            model_resource_name = model_name
        )
    ).set_display_name('Get Vertex AI Model')

    bq_preds_table = kfp.dsl.importer(
        artifact_uri = f'bq://{bq_project}.{bq_dataset}.{bq_table}',
        artifact_class = artifact_types.BQTable,
        metadata = dict(
            projectId = bq_project,
            datasetId = bq_dataset,
            tableId = bq_table
        )
    ).set_display_name('Get BigQuery Table of Predictions')
    
    # Run the evaluation based on prediction type
    evaluation = ModelEvaluationClassificationOp(
        project = project,
        location = region,
        class_labels = class_labels,
        predictions_format = 'bigquery',
        predictions_bigquery_source = bq_preds_table.output,
        prediction_label_column = '',
        prediction_score_column = 'prediction',
        target_field_name = var_target,
        ground_truth_format = 'bigquery',
        ground_truth_bigquery_source = f'bq://{bq_project}.{bq_dataset}.{bq_table}',
        model = vertex_model.output
    )

In [631]:
kfp.compiler.Compiler().compile(
    pipeline_func = eval_pipeline,
    package_path = f'{DIR}/pipeline/eval.yaml'
)

In [632]:
parameters = dict(
    project = PROJECT_ID,
    region = REGION,
    root_dir = f'gs://{GCS_BUCKET}/{SERIES}/{EXPERIMENT}/pipelines/eval',
    model_name = custom_model.versioned_resource_name,
    bq_project = BQ_PROJECT,
    bq_dataset = BQ_DATASET,
    bq_table = bqBatchJob.output_info.bigquery_output_table,
    class_labels = [0, 1],
    var_target = 'Class'
)

In [633]:
pipeline_job = aiplatform.PipelineJob(
    display_name = f"eval-pipeline",
    template_path = f"{DIR}/pipeline/eval.yaml",
    parameter_values = parameters,
    pipeline_root = f'gs://{GCS_BUCKET}/{SERIES}/{EXPERIMENT}/pipelines/eval',
    enable_caching = False # True (enabled), False (disable), None (defer to component level caching) 
)

In [634]:
response = pipeline_job.submit(
    service_account = SERVICE_ACCOUNT
)

Creating PipelineJob
PipelineJob created. Resource name: projects/1026793852137/locations/us-central1/pipelineJobs/dev-sklearn-workflow-evaluation-20240315215438
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/1026793852137/locations/us-central1/pipelineJobs/dev-sklearn-workflow-evaluation-20240315215438')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/dev-sklearn-workflow-evaluation-20240315215438?project=1026793852137


In [635]:
print(f'The Dashboard can be viewed here:\n{pipeline_job._dashboard_uri()}')

The Dashboard can be viewed here:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/dev-sklearn-workflow-evaluation-20240315215438?project=1026793852137


In [636]:
pipeline_job.wait()

PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/dev-sklearn-workflow-evaluation-20240315215438 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/dev-sklearn-workflow-evaluation-20240315215438 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/1026793852137/locations/us-central1/pipelineJobs/dev-sklearn-workflow-evaluation-20240315215438 current state:
PipelineState.PIPELINE_STATE_RUNNING


RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [model-evaluation-classification].; Job (project_id = statmike-mlops-349915, job_id = 1201491606130130944) is failed due to the above error.; Failed to handle the job: {project_number = 1026793852137, job_id = 1201491606130130944}"


In [None]:
aiplatform.get_pipeline_df(pipeline = f'{SERIES}-{EXPERIMENT}-evaluation')

In [None]:
tasks = {task.task_name: task for task in pipeline_job.task_details}

In [None]:
for task in tasks:
  print(task, tasks[task].task_name, tasks[task].state)

In [None]:
tasks['importer']

---
## Model Registry: Optional Additions

In [255]:
model.to_dict()

{'name': 'projects/1026793852137/locations/us-central1/models/model_dev_sklearn-workflow@1',
 'displayName': 'dev_sklearn-workflow',
 'predictSchemata': {},
 'metadata': None,
 'containerSpec': {'imageUri': 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest'},
 'supportedDeploymentResourcesTypes': ['DEDICATED_RESOURCES'],
 'supportedInputStorageFormats': ['jsonl',
  'bigquery',
  'csv',
  'tf-record',
  'tf-record-gzip',
  'file-list'],
 'supportedOutputStorageFormats': ['jsonl', 'bigquery'],
 'createTime': '2024-02-26T00:58:28.590188Z',
 'updateTime': '2024-02-26T00:58:29.560475Z',
 'etag': 'AMEw9yOee46PG34bICm1RvqIgYGKCF0MPG7B3VMJNHf4v2v6L8YkI-MSiR4L3DY75Cbr',
 'supportedExportFormats': [{'id': 'custom-trained',
   'exportableContents': ['ARTIFACT']}],
 'artifactUri': 'gs://statmike-mlops-349915/dev/sklearn-workflow/models/20240225203750/model',
 'versionId': '1',
 'versionAliases': ['run-20240225203750', 'default'],
 'versionDescription': 'run-20240225203750',
 'versionC

In [270]:

# example model from BQML
test = aiplatform.Model(
    project = PROJECT_ID,
    location = REGION,
    model_name = 'bqml_bqml_random-forest'
)
test.to_dict()

{'name': 'projects/1026793852137/locations/us-central1/models/bqml_bqml_random-forest',
 'displayName': 'bqml_bqml_random-forest',
 'supportedDeploymentResourcesTypes': ['DEDICATED_RESOURCES'],
 'supportedInputStorageFormats': ['jsonl',
  'bigquery',
  'csv',
  'tf-record',
  'tf-record-gzip',
  'file-list'],
 'supportedOutputStorageFormats': ['jsonl', 'bigquery'],
 'createTime': '2023-06-27T12:30:07.561556Z',
 'updateTime': '2023-06-27T12:32:06.390904Z',
 'etag': 'AMEw9yOim4p6j_hCCCHgN_pxHj9ot-6hpRtHMNxJJrOHhbkPklVRulB6UzcmEv5yVKNX',
 'labels': {'experiment': 'random-forest', 'series': 'bqml'},
 'explanationSpec': {'parameters': {'sampledShapleyAttribution': {'pathCount': 5}},
  'metadata': {'inputs': {'V3': {},
    'V18': {},
    'V21': {},
    'V13': {},
    'V11': {},
    'Amount': {},
    'V8': {},
    'V15': {},
    'V27': {},
    'V22': {},
    'V4': {},
    'V24': {},
    'Time': {},
    'V17': {},
    'V16': {},
    'V9': {},
    'V1': {},
    'V14': {},
    'V10': {},
    'V5

In [256]:
# example model from Training Pipeline (not a Vertex Pipeline)
test = aiplatform.Model(
    project = PROJECT_ID,
    location = REGION,
    model_name = 'model_05_05f'
)
test.to_dict()

{'name': 'projects/1026793852137/locations/us-central1/models/model_05_05f',
 'displayName': '05_05f',
 'predictSchemata': {},
 'metadata': None,
 'trainingPipeline': 'projects/1026793852137/locations/us-central1/trainingPipelines/6528349671444709376',
 'containerSpec': {'imageUri': 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-12:latest'},
 'supportedDeploymentResourcesTypes': ['DEDICATED_RESOURCES',
  'SHARED_RESOURCES'],
 'supportedInputStorageFormats': ['jsonl',
  'bigquery',
  'csv',
  'tf-record',
  'tf-record-gzip',
  'file-list'],
 'supportedOutputStorageFormats': ['jsonl', 'bigquery'],
 'createTime': '2022-09-27T19:21:47.583324Z',
 'updateTime': '2024-02-22T21:13:55.966054Z',
 'etag': 'AMEw9yObhb_VzCdWLiEzm7nRwKtvJG_ZDxWv_Yc7GywHp_M3phooJkxLMLMoJtkK5fQE',
 'labels': {'run_name': 'run-20240222211353',
  'experiment_name': 'experiment-05-05f-tf-classification-dnn',
  'series': '05',
  'experiment': '05f'},
 'supportedExportFormats': [{'id': 'custom-trained',
   'exportableCo

In [431]:
# example autoML model
test = aiplatform.Model(
    project = PROJECT_ID,
    location = REGION,
    model_name = '3955644813528793088'
)
test.to_dict()

{'name': 'projects/1026793852137/locations/us-central1/models/3955644813528793088',
 'displayName': '02c_fraud_20230825120638',
 'predictSchemata': {'instanceSchemaUri': 'https://storage.googleapis.com/caip-tenant-c9fb6c93-0c9f-41d7-96df-f5ebf0a7575c/schema/predict/instance.yaml?GoogleAccessId=service-1026793852137@gcp-sa-aiplatform.iam.gserviceaccount.com&Expires=1710524139&Signature=fUD9YFFTWD86GADAuN%252Fu2sYIeDYqBdpqiqHIqTWj1V8TjEm%252FeyhVN40nR1otASw1rBTwe1iPYt8gRTnxOc32cMWcKQGSWTgDaXUDJ5ktQaYD%252FOVQiUZ1hp25UzOIYkWKa1gEVMlDpXkyo30u5bjuGG20kd1VR5uceUUQ31%252FmTDd993%252BYz3V3JZWWRwL5hfu0%252B%252Bm%252FCkMotjaabrX1zKNl98wagbrNnuymNyk5wVuzqnHVuBStkDcL290RKcfPZHKL%252BtlLQT4KiIoOTpAcAa3Rho24pzIw4GufxfwVcOWD%252BgKKm7EDEcWi9Mse5iYsyM77Q2vjTrtIH1GycubLFISprg%253D%253D',
  'predictionSchemaUri': 'https://storage.googleapis.com/caip-tenant-c9fb6c93-0c9f-41d7-96df-f5ebf0a7575c/schema/predict/prediction.yaml?GoogleAccessId=service-1026793852137@gcp-sa-aiplatform.iam.gserviceaccount.com&E

In [264]:
test.to_dict()['predictSchemata']['instanceSchemaUri']

'https://storage.googleapis.com/caip-tenant-c9fb6c93-0c9f-41d7-96df-f5ebf0a7575c/schema/predict/instance.yaml?GoogleAccessId=service-1026793852137@gcp-sa-aiplatform.iam.gserviceaccount.com&Expires=1708976819&Signature=PKHI%252Bv6b5EBvMjEBGEJEkAJFPxHnAzoofOa5Ac23gHcN8QShcu%252BD53FR7qkUf7sDX7DHBNqecWORycOVrYsKLA0gMTbRE2BatdSl8p6pWJymfUTuiiur%252BiOolSQAZkDBXC3Yt0D8JtqYr%252BYiyo60QTKtOVoFf3uyKFiR0%252BMB3g8aralZGixuf8Ty1g3xPfYeby2ljRGAYk4GUxSqmbWgssq2zXXKLs5ORNgRchEb0k9ZlogHphc8ZskB2Zl%252FpeC%252BjNGfq2hxNyQzQo7IPYq98LToeVAOV14F6uN3bp8y1HiQxYdJwImSj%252FNFRYeWBMW6IM74TQNzzTW8ynDj6eKYdA%253D%253D'

In [267]:
test.to_dict()['predictSchemata']['predictionSchemaUri']

'https://storage.googleapis.com/caip-tenant-c9fb6c93-0c9f-41d7-96df-f5ebf0a7575c/schema/predict/prediction.yaml?GoogleAccessId=service-1026793852137@gcp-sa-aiplatform.iam.gserviceaccount.com&Expires=1708976819&Signature=JXZzs4r6dQ3wsuP77DlxfrAD8CJ6MDdWHVs8Mb6rFPDnIWoVEvffM3SPsE1VCMwaHZzLdhII1jKD7nD7vXtlpxTToBPPRLldhpEEqRVtlo%252Fv7CDTmzS5kk%252FiL8cnW179rlyI%252FLX7SigRfebrLMMZMI%252BwaPLeNPTc%252FYwPy0TwXJT23SVVaroEsODwvINZ4QYNMP%252F9ZYk1G%252BK0MQFiNVTTHVB3DMHJ9y8Gj1Yw5sECwUfxQ%252FQzTEEVA9v2jNhPP1IBxdkC3%252F9aUrvGRtT85h0M4IGajJP23KSpYw0CFl4aLRMYNzNKXr3zl3XwHZ5wuBPPRipRvZAp6zzUc3Vv38ajRg%253D%253D'

In [269]:
test.to_dict()['metadataSchemaUri']

'https://storage.googleapis.com/google-cloud-aiplatform/schema/model/metadata/automl_tabular_1.0.0.yaml'