### Prerequisites:
-  01 -  BigQuery - Table Data Source

---
## Setup

In [34]:
# Ask gcloud for the active project; the `!` capture yields the command output as a list of lines.
project = !gcloud config get-value project
# NOTE(review): assumes the first captured line is the project id — confirm gcloud prints nothing before it.
PROJECT_ID = project[0]
PROJECT_ID

'fifth-sprite-402605'

In [35]:
# Region, data name and notebook id used to name resources and paths in this notebook.
REGION, DATANAME, NOTEBOOK = "us-central1", "fraud", "02c"

# Resources: machine type handed to the pipeline for model deployment.
DEPLOY_COMPUTE = "n1-standard-1"

# Model Training: target column, plus columns to keep out of the feature set.
VAR_TARGET = "Class"
VAR_OMIT = "transaction_id"  # add more variables to the string with space delimiters

In [36]:
from google.cloud import aiplatform
from datetime import datetime
import kfp
from kfp.v2 import compiler
#import kfp.v2.dsl as dsl
#import google_cloud_pipeline_components as gcc_aip
from google_cloud_pipeline_components.v1.dataset import TabularDatasetCreateOp
from google_cloud_pipeline_components.v1.automl.training_job import AutoMLTabularTrainingJobRunOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp, ModelDeployOp

from google.cloud import bigquery
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value
import json
import numpy as np

Set client objects:

In [37]:
# Initialise the Vertex AI SDK for this notebook's project and region.
aiplatform.init(project=PROJECT_ID, location=REGION)
# Pin the BigQuery client to the same project: later queries reference the dataset
# without a project qualifier, so they resolve against the client's default project.
# Without this, the client falls back to whatever project the ambient credentials imply,
# which can differ from PROJECT_ID used for the pipeline's bq:// source.
bq = bigquery.Client(project=PROJECT_ID)

Set parameters:

In [38]:
# Second-resolution timestamp used to make names and GCS paths unique per run.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
# The bucket name is taken to be the project id.
BUCKET = PROJECT_ID
# GCS prefix for this notebook's model/pipeline files.
URI = "gs://" + "/".join([BUCKET, DATANAME, "models", NOTEBOOK])
# Local working directory for files produced by this notebook.
DIR = "temp/" + NOTEBOOK

## Authenticate to GCP
- Note: Open a shell and run the commands below:


```
sudo rm -rf ~/.gsutil
sudo rm -rf ~/.config/gcloud
git pull
sudo apt install unzip 
unzip -n fifth-sprite-402605-f2105f411b52.zip
gcloud auth login --cred-file=fifth-sprite-402605-f2105f411b52.json
gcloud init --console-only
gcloud auth application-default login
gcloud config set project PROJECT_ID
gcloud auth list
```

In [39]:
# Capture the account of the active gcloud configuration; `!` capture yields a list of output lines.
SERVICE_ACCOUNT = !gcloud config list --format='value(core.account)' 
# Keep only the first line as the account string.
SERVICE_ACCOUNT = SERVICE_ACCOUNT[0]
SERVICE_ACCOUNT

'117917517031-compute@developer.gserviceaccount.com'

In [40]:
# Show the IAM roles bound to SERVICE_ACCOUNT in PROJECT_ID (one role per row).
!gcloud projects get-iam-policy $PROJECT_ID --filter="bindings.members:$SERVICE_ACCOUNT" --format='table(bindings.role)' --flatten="bindings[].members"

ROLE
roles/editor
roles/owner
roles/storage.objectAdmin


>Note: If the resulting list is missing [roles/storage.objectAdmin](https://cloud.google.com/storage/docs/access-control/iam-roles) then [revisit the setup notebook](../00%20-%20Setup/00%20-%20Environment%20Setup.ipynb#permissions) and add this permission to the service account with the provided instructions.

Clean environment:

In [41]:
# Recreate the local working directory so it starts empty on every run.
!rm -rf {DIR}
!mkdir -p {DIR}

---
## Pipeline (KFP) Definition
- Flow
    - Create Vertex AI Dataset from link to BigQuery table
    - Create Vertex AI AutoML Tabular Training Job
    - Create Endpoint and Deploy trained model
    
Define a Job:
- Consider Weighting
- Model Type
- Optimization Objective

In [53]:
# Resource limits applied to the dataset-creation and training steps below.
CPU_LIMIT = "1"  # vCPUs
MEMORY_LIMIT = "1G"


# Pipeline: BigQuery table -> Vertex AI tabular dataset -> AutoML tabular training
# -> endpoint creation -> deployment of the trained model to that endpoint.
#
# Parameters:
#   project        - project passed to the dataset, training and endpoint steps
#   display_name   - display name shared by the dataset, training job and endpoint
#   deploy_machine - machine type for the deployed model's dedicated resources
#   bq_source      - BigQuery source for the dataset
#   var_target     - target column for training
#   features       - column specs passed to the training job
#   labels         - labels attached to the dataset, training job and endpoint
#   dataname, var_omit - accepted as pipeline inputs but not referenced by any step here
@kfp.dsl.pipeline(
    name=f'kfp-{NOTEBOOK}-{DATANAME}-{TIMESTAMP}',
    pipeline_root=f'{URI}/{TIMESTAMP}/kfp/',
)
def pipeline(
    project: str,
    dataname: str,
    display_name: str,
    deploy_machine: str,
    bq_source: str,
    var_target: str,
    var_omit: str,
    features: dict,
    labels: dict 
):

    # Step 1: create the tabular dataset from the BigQuery source.
    dataset_task = (
        TabularDatasetCreateOp(
            project=project,
            display_name=display_name,
            bq_source=bq_source,
            labels=labels,
        )
        .set_cpu_limit(CPU_LIMIT)
        .set_memory_limit(MEMORY_LIMIT)
    )

    # Step 2: AutoML classification training on the dataset from step 1,
    # using the `splits` column as the predefined data split.
    training_task = (
        AutoMLTabularTrainingJobRunOp(
            project=project,
            display_name=display_name,
            optimization_prediction_type="classification",
            optimization_objective="maximize-au-prc",
            budget_milli_node_hours=1000,
            disable_early_stopping=False,
            column_specs=features,
            dataset=dataset_task.outputs['dataset'],
            target_column=var_target,
            predefined_split_column_name='splits',
            labels=labels,
        )
        .set_cpu_limit(CPU_LIMIT)
        .set_memory_limit(MEMORY_LIMIT)
    )

    # Step 3: create the endpoint (no data dependency on the training step).
    endpoint_task = EndpointCreateOp(
        project=project,
        display_name=display_name,
        labels=labels,
    )

    # Step 4: deploy the trained model to the endpoint on a single replica.
    deploy_task = ModelDeployOp(
        model=training_task.outputs["model"],
        endpoint=endpoint_task.outputs["endpoint"],
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=1,
        traffic_split={"0": 100},
        dedicated_resources_machine_type=deploy_machine,
    )

---
## Compile Pipeline

In [54]:
# Compile the pipeline function into a JSON spec inside the local working directory.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="{}/{}.json".format(DIR, NOTEBOOK))

Move compiled pipeline files to GCS Bucket
- Note: Before running the command below, make sure that you have executed the following command in the shell CLI:
   - gcloud auth login

In [55]:
# Echo the GCS prefix under which the compiled pipeline spec is stored.
URI

'gs://fifth-sprite-402605/fraud/models/02c'

In [56]:
# Copy the compiled pipeline spec to this run's timestamped kfp/ folder in GCS.
!gsutil cp {DIR}/{NOTEBOOK}.json {URI}/{TIMESTAMP}/kfp/

Copying file://temp/02c/02c.json [Content-Type=application/json]...
/ [1 files][ 46.7 KiB/ 46.7 KiB]                                                
Operation completed over 1 objects/46.7 KiB.                                     


---
## Create Vertex AI Pipeline Job

Get features dictionary for the pipeline input:

In [57]:
# get feature names
query = f"SELECT * FROM {DATANAME}.INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{DATANAME}_prepped'"
schema = bq.query(query).to_dataframe()
OMIT = VAR_OMIT.split() + [VAR_TARGET, 'splits']
features = schema[~schema.column_name.isin(OMIT)].column_name.tolist()
features = dict.fromkeys(features, 'auto')

Run The pipeline:

In [58]:
# Echo the machine type that is passed to the pipeline as `deploy_machine`.
DEPLOY_COMPUTE

'n1-standard-1'

In [59]:
# Create the Vertex AI PipelineJob from the compiled spec uploaded to GCS.
# NOTE(review): this rebinds `pipeline` from the pipeline function to the job object, so the
# compile cell cannot be re-run afterwards without re-running the pipeline definition cell first.
pipeline = aiplatform.PipelineJob(
    display_name='_'.join([NOTEBOOK, DATANAME, TIMESTAMP]),
    template_path='/'.join([URI, TIMESTAMP, 'kfp', f'{NOTEBOOK}.json']),
    # One entry per parameter of the pipeline function.
    parameter_values=dict(
        project=PROJECT_ID,
        dataname=DATANAME,
        display_name='_'.join([NOTEBOOK, DATANAME, TIMESTAMP]),
        deploy_machine=DEPLOY_COMPUTE,
        bq_source=f'bq://{PROJECT_ID}.{DATANAME}.{DATANAME}_prepped',
        var_target=VAR_TARGET,
        var_omit=VAR_OMIT,
        features=features,
        labels={'notebook': NOTEBOOK},
    ),
    labels={'notebook': NOTEBOOK},
    enable_caching=False,
)

In [60]:
# Echo the account the pipeline job will be run as.
SERVICE_ACCOUNT

'117917517031-compute@developer.gserviceaccount.com'

In [61]:
# Submit the pipeline job under SERVICE_ACCOUNT.
response = pipeline.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/117917517031/locations/us-central1/pipelineJobs/kfp-02c-fraud-20231024031909-20231024032709
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/117917517031/locations/us-central1/pipelineJobs/kfp-02c-fraud-20231024031909-20231024032709')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/kfp-02c-fraud-20231024031909-20231024032709?project=117917517031
PipelineJob projects/117917517031/locations/us-central1/pipelineJobs/kfp-02c-fraud-20231024031909-20231024032709 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/117917517031/locations/us-central1/pipelineJobs/kfp-02c-fraud-20231024031909-20231024032709 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/117917517031/locations/us-central1/pipelineJobs/kfp-02c-fraud-20231024031909-20231024032709 current state:
PipelineState.PIPELINE_STATE_R

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [endpoint-create, tabular-dataset-create].; Job (project_id = fifth-sprite-402605, job_id = 7595808404967260160) is failed due to the above error.; Failed to handle the job: {project_number = 117917517031, job_id = 7595808404967260160}"


A visual representation of the pipeline can be viewed in the console:

In [None]:
# Print a console link to the run; the run id is the last segment of the job's resource name.
print(f"Review the Pipeline as it runs here:\nhttps://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{pipeline.resource_name.split('/')[-1]}?project={PROJECT_ID}")

Retrieve the pipeline information:

In [None]:
# Look up this pipeline by the same name given in the pipeline decorator and show the result.
aiplatform.get_pipeline_df(pipeline = f'kfp-{NOTEBOOK}-{DATANAME}-{TIMESTAMP}')

---
## Evaluation
While the model above was trained using AutoML with the API, it is still possible to review the evaluation metrics directly in the Google Cloud Console.  Just visit the Models section of Vertex AI service and select the model and it will present the evaluation metrics with many helpful visuals.

It is also possible to retrieve the evaluation metrics for your model using the API.

For more information review [this page](https://cloud.google.com/vertex-ai/docs/training/evaluating-automl-models).

Get the Model:

In [None]:
# List models carrying this notebook's label, newest first, so that taking the first
# element selects the most recently created model when earlier runs left others behind.
models = aiplatform.Model.list(filter=f'labels.notebook={NOTEBOOK}', order_by='create_time desc')

In [None]:
# Use the first model returned for this notebook's label and show its full resource name.
model = models[0]
model.resource_name

Retrieves the aggregate model evaluation metrics for the model as a whole.  

Either:
- First, use `model.list_model_evaluations()` to retrieve the evaluation id, then use `model.get_model_evaluation(evaluation_id = )` for the evaluation id
- Or, use `.get_model_evaluation()` and it will retrieve the first model evaluation

In [None]:
# Convert the evaluation to a plain dict so the cells below can index it by key.
evaluation = model.get_model_evaluation().to_dict() # get first evaluation

In [None]:
# Top-level fields of the evaluation dict.
evaluation.keys()

In [None]:
# Names of the metrics stored in the evaluation.
evaluation['metrics'].keys()

In [None]:
# The auPrc metric; the training step was configured with the objective `maximize-au-prc`.
evaluation['metrics']['auPrc']

In [None]:
# Inspect a single entry of the confidence metrics.
# NOTE(review): index 3 looks like an arbitrary sample — confirm which threshold it corresponds to.
evaluation['metrics']['confidenceMetrics'][3]

Review several of the metrics included in the evaluation.  Also, compare these to the results in the console view.

In [None]:
# Print a console link to this model version's evaluation; the evaluation id is the last segment of its name.
print(f"Review this model in the console:\nhttps://console.cloud.google.com/vertex-ai/locations/{REGION}/models/{model.name}/versions/{model.version_id}/evaluations/{evaluation['name'].split('/')[-1]}?project={PROJECT_ID}")

In [None]:
# Show auPrc again so it can be compared with the value in the console view.
evaluation['metrics']['auPrc']

In [None]:
# Print each confusion-matrix row next to the display name of its true label.
confusion = evaluation['metrics']['confusionMatrix']
for i, spec in enumerate(confusion['annotationSpecs']):
    print('True Label = ', spec['displayName'], ' has Predicted labels = ', confusion['rows'][i])

For models with labels you can retrieve the evaluation metrics for each slice of the model as well using the .gapic api version:

In [None]:
# Low-level (gapic) model service client pointed at the regional API endpoint.
model_client = aiplatform.gapic.ModelServiceClient(
    client_options=dict(api_endpoint=f'{REGION}-aiplatform.googleapis.com')
)

In [None]:
# List the evaluation slices that belong to the evaluation, addressed by its resource name.
slices = model_client.list_model_evaluation_slices(parent = evaluation['name'])

In [None]:
# Print auPrc per evaluation slice.
# The loop variable is deliberately not called `slice`, which would shadow the Python
# builtin for every cell that runs afterwards in this kernel.
for eval_slice in slices:
    print('Label = ', eval_slice.slice_.value, 'has auPrc = ', eval_slice.metrics['auPrc'])

---
## Prediction

### Prepare a record for prediction: instance and parameters lists

In [None]:
# Pull a few TEST-split rows without the target, split and omitted columns.
# VAR_OMIT is a space-delimited string, so it is split and joined with commas here;
# pasting it raw into EXCEPT(...) is only valid SQL when it names exactly one column.
pred = bq.query(
    query = f"""
        SELECT * EXCEPT({', '.join([VAR_TARGET, 'splits', *VAR_OMIT.split()])})
        FROM {DATANAME}.{DATANAME}_prepped
        WHERE splits='TEST'
        LIMIT 10
    """
).to_dataframe()

In [None]:
# Preview the first four prediction input rows.
pred.head(4)

In [None]:
# Send `Time` as a string.
# NOTE(review): presumably the deployed model expects this column as a string — confirm against the model schema.
pred['Time'] = pred['Time'].astype(str)
# One dict per row, keyed by column name.
newobs = pred.to_dict(orient='records')
newobs[0]

Need to understand the format of variables that the predictions expect.  AutoML may convert the type of some variables. The following cells retrieve the model from the endpoint and its schemata:

In [None]:
# Convert each record dict into a protobuf Value for the prediction request.
instances = [json_format.ParseDict(newob, Value()) for newob in newobs]

### Get Predictions: Python Client

In [None]:
# Show the endpoints that carry this notebook's label.
aiplatform.Endpoint.list(filter=f'labels.notebook={NOTEBOOK}')

In [None]:
# Pick the endpoint for this notebook's label. Ordering newest-first makes index 0 the most
# recently created endpoint when earlier runs left other endpoints with the same label.
endpoint = aiplatform.Endpoint.list(filter=f'labels.notebook={NOTEBOOK}', order_by='create_time desc')[0]
endpoint.display_name

In [None]:
# Request online predictions for the prepared instances and show the first result.
prediction = endpoint.predict(instances = instances) # or instances = newobs
prediction.predictions[0]

In [None]:
# Class with the highest score for the first prediction.
prediction.predictions[0]['classes'][np.argmax(prediction.predictions[0]['scores'])]

### Get Predictions: REST

In [None]:
# Write a single-instance request body to the working directory for the REST and CLI calls.
with open(f'{DIR}/request.json','w') as file:
    json.dump({"instances": [newobs[0]]}, file)

In [None]:
# POST the request file to the endpoint's :predict method, authorised with an application-default access token.
!curl -X POST \
-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
-H "Content-Type: application/json; charset=utf-8" \
-d @{DIR}/request.json \
https://{REGION}-aiplatform.googleapis.com/v1/{endpoint.resource_name}:predict

### Get Predictions: gcloud (CLI)

In [None]:
# Send the same request file through the gcloud CLI, passing the last path segment of endpoint.name.
!gcloud beta ai endpoints predict {endpoint.name.rsplit('/',1)[-1]} --region={REGION} --json-request={DIR}/request.json

---
## Explanations
Interpretation Guide
- https://cloud.google.com/vertex-ai/docs/predictions/interpreting-results-automl#tabular

In [None]:
# Request explanations from the endpoint for the same instances.
explanation = endpoint.explain(instances = instances)

In [None]:
# First prediction returned by the explain call.
explanation.predictions[0]

In [None]:
# Summarise the first attribution of the first explanation, one field per line.
attribution = explanation.explanations[0].attributions[0]
print("attribution:")
for label, field in (
    ("baseline output", "baseline_output_value"),
    ("instance output", "instance_output_value"),
    ("output_index", "output_index"),
    ("output display value", "output_display_name"),
    ("approximation error", "approximation_error"),
):
    print(label, getattr(attribution, field))

In [None]:
import matplotlib.pyplot as plt

# Feature attributions of the first attribution for the first explained instance.
feature_attributions = explanation.explanations[0].attributions[0].feature_attributions

# Order by score (ties broken by feature name) so the bars are drawn from lowest to highest.
# Distinct names are used so the `features` dict that feeds the pipeline job is not
# overwritten, which would break a re-run of the PipelineJob cell.
ranked = sorted((feature_attributions[name], name) for name in feature_attributions)
attr_scores = [score for score, _ in ranked]
attr_features = [name for _, name in ranked]

fig, ax = plt.subplots(figsize=(9, 9))
ax.barh(attr_features, attr_scores)
ax.set_xlabel("Attribution")
ax.set_ylabel("Feature")
ax.set_title("Feature attributions for the first instance")
# plt.show() instead of fig.show(): the latter warns on the non-GUI inline backend.
plt.show()