In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fraudfinder - Model training and deployment using Vertex AI

<table align="left">
  <td>
    <a href="https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/vertex_ai/05_model_training_xgboost_formalization.ipynb">
       <img src="https://www.gstatic.com/cloud/images/navigation/vertex-ai.svg" alt="Google Cloud Notebooks">Open in Cloud Notebook
    </a>
  </td> 
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/fraudfinder/blob/main/vertex_ai/05_model_training_xgboost_formalization.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Open in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/vertex_ai/05_model_training_xgboost_formalization.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

## Overview

[Fraudfinder](https://github.com/googlecloudplatform/fraudfinder) is a series of labs on how to build a real-time fraud detection system on Google Cloud. Throughout the Fraudfinder labs, you will learn how to read historical bank transaction data stored in data warehouse, read from a live stream of new transactions, perform exploratory data analysis (EDA), do feature engineering, ingest features into a feature store, train a model using feature store, register your model in a model registry, evaluate your model, deploy your model to an endpoint, do real-time inference on your model with feature store, and monitor your model.

### Objective

In the following notebook, you will learn how to:

* Build a Vertex AI dataset
* Build a Docker container and train a custom Pytorch model using Vertex AI
* Evaluate the model locally
* Deploy the model to Vertex AI as an endpoint. 

This tutorial uses the following Google Cloud data analytics and services:

- [BigQuery](https://cloud.google.com/bigquery/)
- [Vertex AI](https://cloud.google.com/vertex-ai/)

### Costs 

This tutorial uses billable components of Google Cloud:

* BigQuery
* Vertex AI

Learn about [BigQuery Pricing](https://cloud.google.com/bigquery/pricing), [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## ***Load old config, and create a new one.

In [1]:
import json

try:
    with open("../config_path.json", "r") as f:
        config_path = json.load(f)
except FileNotFoundError:
    print("config_path.json not found. Please make sure the file exists.")
    ID = None

In [2]:
from utils import read_from_bucket, VertexConfig


config = read_from_bucket(config_path["bucket"], config_path["conf_uri"])
config = VertexConfig(**config)

PROJECT_NUM = !gcloud projects list --filter="$config.PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]

### Import libraries

In [4]:
# General
import os
from typing import Union
from datetime import datetime, timedelta

# Data Preprocessing
import pandas as pd

# Model Training with Vertex AI
from google.cloud import bigquery

# Model Deployment and Evaluation
from sklearn.metrics import precision_recall_fscore_support

### Define constants

In [5]:
# General
import os


DATA_DIR = os.path.join(os.pardir, "data")
TRAIN_DATA_DIR = os.path.join(DATA_DIR, "train")
os.makedirs(TRAIN_DATA_DIR, exist_ok=True)

# Feature Store
START_DATE_TRAIN = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")

### Initialize Vertex AI SDK and BigQuery Client for Python

In [6]:
bq_client = bigquery.Client(project=config.PROJECT_ID, location=config.REGION)

In [7]:
# Dynamically retrieve Persistent Resource location
PERSISTENT_RESOURCE_REGION = ""
check_regions = ["us-central1", "asia-southeast1", "europe-west4"]

for region in check_regions:
    shell_output = !gcloud ai persistent-resources list --project=$config.PROJECT_ID --region=$region
    if "Listed 0 items." not in shell_output:
        print(f"Persistent Resource found in {region}")
        PERSISTENT_RESOURCE_REGION = region

In [8]:
import re

DESCRIBE_PR_OUTPUT = !gcloud ai persistent-resources describe $config.PERSISTENT_RESOURCE_ID --project=$config.PROJECT_ID --region=$PERSISTENT_RESOURCE_REGION
#print(DESCRIBE_PR_OUTPUT) 

# Join the output lines with spaces
PR_DETAILS = " ".join(DESCRIBE_PR_OUTPUT)  

# Extract machine type
match = re.search(r"machineType: (\w+-\w+-\w+)", PR_DETAILS)
if match:
    MACHINE_TYPE = match.group(1)
    print(f"MACHINE_TYPE: {MACHINE_TYPE}")
else:
    MACHINE_TYPE = config.TRAIN_COMPUTE
    print("MACHINE_TYPE not found in output.")

# Extract replica count
match = re.search(r"replicaCount: '(\d+)'", PR_DETAILS)
if match:
    REPLICA_COUNT = int(match.group(1))
    print(f"REPLICA_COUNT: {REPLICA_COUNT}")
else:
    REPLICA_COUNT = config.REPLICA_COUNT
    print("REPLICA_COUNT not found in output.")
    
# Extract Accelerator Count
match = re.search(r"acceleratorCount: (\d+)", PR_DETAILS)
if match:
    ACCELERATOR_COUNT = int(match.group(1))
    print(f"ACCELERATOR COUNT: {ACCELERATOR_COUNT}")
else:
    ACCELERATOR_COUNT = 0
    print("ACCELERATOR COUNT not found in output.")

# Extract Accelerator Type
match = re.search(r"acceleratorType: (\w+)", PR_DETAILS)
if match:
    ACCELERATOR_TYPE = match.group(1)
    print(f"ACCELERATOR TYPE: {ACCELERATOR_TYPE}")
else:
    ACCELERATOR_TYPE = "ACCELERATOR_TYPE_UNSPECIFIED"
    print("ACCELERATOR TYPE not found in output.")

MACHINE_TYPE not found in output.
REPLICA_COUNT not found in output.
ACCELERATOR COUNT not found in output.
ACCELERATOR TYPE not found in output.


In [9]:
from google.cloud import aiplatform as vertex_ai


# Set the default region for launching jobs.
REGION = PERSISTENT_RESOURCE_REGION

vertex_ai.init(
    project=config.PROJECT_ID,
    location=REGION,
    staging_bucket=config.BUCKET_NAME,
    experiment=config.EXPERIMENT_NAME,
)

### Helper Functions
You will now run some helper functions that we will use throughout the notebook.

We're also using the BigQuery helper function. 

In [10]:
# Wrapper to use BigQuery client to run query/job, return job ID or result as DF
def run_bq_query(sql: str) -> Union[str, pd.DataFrame]:
    """
    Run a BigQuery query and return the job ID or result as a DataFrame
    Args:
        sql: SQL query, as a string, to execute in BigQuery
    Returns:
        df: DataFrame of results from query,  or error, if any
    """

    bq_client = bigquery.Client()

    # Try dry run before executing query to catch any errors
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=job_config)

    # If dry run succeeds without errors, proceed to run query
    job_config = bigquery.QueryJobConfig()
    client_result = bq_client.query(sql, job_config=job_config)

    job_id = client_result.job_id

    # Wait for query/job to finish running. then get & return data frame
    df = client_result.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return df

## Fetching feature values for model training

To fetch training data, we have to specify the following inputs to batch serving:

- a file containing a "query", with the entities and timestamps for each label
- a list of feature values to fetch
- the destination location and format


### Read-instance list

In our case, we need a csv file with content formatted like the table below:

|customer                     |terminal|timestamp                                    |
|-----------------------------|--------|---------------------------------------------|
|xxx3859                         |xxx8811    |2021-07-07 00:01:10 UTC                      |
|xxx4165                         |xxx8810    |2021-07-07 00:01:55 UTC                      |
|xxx2289                         |xxx2081    |2021-07-07 00:02:12 UTC                      |
|xxx3227                         |xxx3011    |2021-07-07 00:03:23 UTC                      |
|xxx2819                         |xxx6263    |2021-07-07 00:05:30 UTC                      |

where the column names are the names of entities in Feature Store and the timestamps represents the time an event occurred.

In [11]:
read_instances_query = f"""
    SELECT
        gt.timestamp,
        gt.customer_id,
        gt.terminal_id,
        gt.tx_amount,
        gt.tx_fraud,
    FROM 
        `{config.PROJECT_ID}.tx.{config.READ_INSTANCES_TABLE}` as gt;
"""

query_df = run_bq_query(read_instances_query)
query_df.head(4)

Finished job_id: 416cd057-7252-49b8-9f81-a74e6446617f


Unnamed: 0,timestamp,customer_id,terminal_id,tx_amount,tx_fraud
0,2025-05-23 00:16:27+00:00,3840595290689182,64542,68.06,0
1,2025-05-23 11:50:50+00:00,4176271867313310,64542,80.96,0
2,2025-05-23 00:35:20+00:00,1663084588852665,64542,31.71,0
3,2025-05-23 17:27:18+00:00,9500124409117407,64542,38.75,0


### Get Feature Store historical training data
repeat what you have done in previous module.

Next fetch a batch of data from the Vertex AI Feature Store. 

In [12]:
import bigframes
import bigframes.pandas
from vertexai.resources.preview.feature_store import FeatureGroup, offline_store


customer_fg = FeatureGroup(name=config.CUSTOMER_ENTITY_ID)
customer_features_str = [
    "customer_id_nb_tx_1day_window",
    "customer_id_nb_tx_7day_window",
    "customer_id_nb_tx_14day_window",
    "customer_id_avg_amount_1day_window",
    "customer_id_avg_amount_7day_window",
    "customer_id_avg_amount_14day_window",
    "customer_id_nb_tx_15min_window",
    "customer_id_nb_tx_30min_window",
    "customer_id_nb_tx_60min_window",
    "customer_id_avg_amount_15min_window",
    "customer_id_avg_amount_30min_window",
    "customer_id_avg_amount_60min_window",
]
customer_features = [customer_fg.get_feature(c_feat) for c_feat in customer_features_str]
terminal_fg = FeatureGroup(name=config.TERMINAL_ENTITY_ID)
terminal_features_str = [
    "terminal_id_nb_tx_1day_window",
    "terminal_id_nb_tx_7day_window",
    "terminal_id_nb_tx_14day_window",
    "terminal_id_risk_1day_window",
    "terminal_id_risk_7day_window",
    "terminal_id_risk_14day_window",
    "terminal_id_nb_tx_15min_window",
    "terminal_id_nb_tx_30min_window",
    "terminal_id_nb_tx_60min_window",
    "terminal_id_avg_amount_15min_window",
    "terminal_id_avg_amount_30min_window",
    "terminal_id_avg_amount_60min_window",
]
terminal_features = [terminal_fg.get_feature(t_feat) for t_feat in terminal_features_str]
sample_df = offline_store.fetch_historical_feature_values(
    entity_df=query_df,
    features=customer_features + terminal_features,
)
sample_df = sample_df.to_pandas()

Now you will create a copy of the training data in your local notebook instance so that you can use it later for testing the model.

In [13]:
# Save the data to both gcs and local
import io
from utils import gcs_write


csv_io = io.StringIO()
sample_df.to_csv(csv_io, index=False)
gcs_write(config.PROJECT_ID, config.BUCKET_NAME, config.TRAIN_DATA_URI, csv_io)
sample_df.to_csv(os.path.join(TRAIN_DATA_DIR, "train.csv"), index=False)

Exporting the features into cloud storage will generate a csv file. Let's list the local file:

In [14]:
!ls $TRAIN_DATA_DIR

train.csv


## Building a fraud detection model using Vertex AI custom training

#### Building a Vertex AI dataset
In this section, you will create a managed [Vertex AI dataset](https://cloud.google.com/vertex-ai/docs/training/using-managed-datasets). Vertex AI datasets can be used to train AutoML models or custom-trained models.  

In [15]:
# retrieve list of local files
flist = !ls $TRAIN_DATA_DIR
obj_list = [os.path.join("gs://", config.BUCKET_NAME, config.TRAIN_DATA_URI) for fname in flist]
obj_list

['gs://ai-takeoff-2025-fraudfinder-ucx2v/data/train.csv']

In [16]:
# create Vertex AI managed dataset
dataset = vertex_ai.TabularDataset.create(
    display_name=config.DATASET_NAME,
    gcs_source=obj_list[0],
)
print("Dataset:", f"{dataset.display_name}")
print("Name: \t", f"{dataset.resource_name}")

Creating TabularDataset
Create TabularDataset backing LRO: projects/401714874268/locations/us-central1/datasets/5726527587073654784/operations/727518634754179072
TabularDataset created. Resource name: projects/401714874268/locations/us-central1/datasets/5726527587073654784
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/401714874268/locations/us-central1/datasets/5726527587073654784')
Dataset: fraud_finder_dataset_ucx2v
Name: 	 projects/401714874268/locations/us-central1/datasets/5726527587073654784


In [17]:
# Load the created dataset, only needed when you want to reload, otherwise the previously created dataset is sufficient
# dataset = vertex_ai.TabularDataset('projects/<project-number>/locations/us-central1/datasets/<dataset-id>')

In [18]:
dataset.gca_resource.metadata['inputConfig']['gcsSource']['uri']

['gs://ai-takeoff-2025-fraudfinder-ucx2v/data/train.csv']

### Train a custom model

In this section, you will need to train a Pytorch model on Vertex AI custom training. Custom training on Vertex AI requires a container, which contains all of the necessary code, files, and code dependencies needed to train the model.

#### Create the training job with Pytorch and Dask

To perform custom training, you can use either a pre-built container or build your container. In this notebook we will being use Pytorch with the Dask framework, and so we will need to build a custom container for Pytorch and use it to train a model with the Vertex AI custom training service.

You will use Dask. Dask is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels at handling large distributed data science workflows. Moreover, when you read/write data from data source, Dask allows regex and multiple file loading/writing.

##### Vertex AI and containers
The first step is to write your training code. Then, you will need to write a Dockerfile and build a container image based on it. The following cell writes our code into `train_torch.py`. We will copy this code into our container to run through the Vertex AI training service.

A custom container is a Docker image that you create to run your training application. By running your machine learning (ML) training job in a custom container, you can use ML frameworks, non-ML dependencies, libraries, and binaries that are not otherwise supported on Vertex AI. You can read more in our [documentation](https://cloud.google.com/vertex-ai/docs/training/containers-overview). 

In [19]:
# create a folder for all container-related files
!mkdir -p -m 777 build_training

In [20]:
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    with open(line, "a") as f:
        f.write(cell.format(**globals()))

In [21]:
%%writefile build_training/train_torch.py

"""
train_xgb.py is the module for training a XGBClassifier pipeline
"""

# Libraries --------------------------------------------------------------------------------------------------------------------------

import argparse
from typing import List, Union
import numpy as np
import pandas as pd
import os
import json
import logging
from pathlib import Path
import dask.dataframe as dask_df
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.metrics import (roc_curve, confusion_matrix, average_precision_score, f1_score, 
                            log_loss, precision_score, recall_score)

# Variables --------------------------------------------------------------------------------------------------------------------------

## Read environmental variables
def gcs_path_to_local_path(old_path: str) -> str:
    new_path = old_path.replace("gs://", "/gcs/")
    return new_path

TRAINING_DATA_PATH = gcs_path_to_local_path(os.environ["AIP_TRAINING_DATA_URI"])
TEST_DATA_PATH = gcs_path_to_local_path(os.environ["AIP_TEST_DATA_URI"])
MODEL_DIR = gcs_path_to_local_path(os.environ["AIP_MODEL_DIR"])
MODEL_PATH = os.path.join(MODEL_DIR, "model.pt")



Writing build_training/train_torch.py


In [22]:
%%writetemplate build_training/train_torch.py

TARGET_COLUMN = "{config.TARGET_COLUMN}"
FEAT_COLUMNS = {config.FEAT_COLUMNS}
DROP_COLUMNS = {config.DROP_COLUMNS}
DATA_SCHEMA = {config.DATA_SCHEMA}

In [23]:
%%writefile -a build_training/train_torch.py

## Training variables
# Helpers -----------------------------------------------------------------------------------------------------------------------------
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    # Data files arguments
    parser.add_argument("--bucket", dest="bucket", type=str,
                        required=True, help="Bucket uri")
    
    return parser.parse_args()

def resample(df: pd.DataFrame, replace: bool, frac: float = 1, random_state: int = 8) -> pd.DataFrame:
    shuffled_df = df.sample(frac=frac, replace=replace, random_state=random_state)
    return shuffled_df

def preprocess(df: pd.DataFrame, drop_cols: List[str] = None) -> pd.DataFrame:
    if drop_cols:
        df = df.drop(columns=drop_cols, errors="ignore")

    # Drop rows with NaN"s
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["float32", "float64"]).columns
    numeric_format = {col:"float32" for col in numeric_columns}
    df = df.astype(numeric_format)

    return df

def evaluate_model(model: nn.Module, x_true: torch.Tensor, y_true: torch.Tensor, thresh: float) -> dict:
    #calculate metrics
    metrics={}
    
    y_score = model(x_true)[:, 1].detach().numpy()
    y_true = y_true.detach().numpy()
    fpr, tpr, thr = roc_curve(
         y_true=y_true, y_score=y_score, pos_label=True
    )
    fpr_list = fpr.tolist()[::1000]
    tpr_list = tpr.tolist()[::1000]
    thr_list = thr.tolist()[::1000]

    y_pred = np.where(y_score >= thresh, 1, 0)
    c_matrix = confusion_matrix(y_true, y_pred)
    
    avg_precision_score = round(average_precision_score(y_true, y_score), 3)
    f1 = round(f1_score(y_true, y_pred), 3)
    lg_loss = round(log_loss(y_true, y_pred), 3)
    prec_score = round(precision_score(y_true, y_pred), 3)
    rec_score = round(recall_score(y_true, y_pred), 3)
    
    metrics["fpr"] = [round(f, 3) for f in fpr_list]
    metrics["tpr"] = [round(f, 3) for f in tpr_list]
    metrics["thrs"] = [round(f, 3) for f in thr_list]
    metrics["confusion_matrix"] = c_matrix.tolist()
    metrics["avg_precision_score"] = avg_precision_score
    metrics["f1_score"] = f1
    metrics["log_loss"] = lg_loss
    metrics["precision_score"] = prec_score
    metrics["recall_score"] = rec_score
    
    return metrics


class Model(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.l1 = nn.Linear(input_dim, 100)
        self.l2 = nn.Linear(100, 30)
        self.l3 = nn.Linear(30, 2)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        x = F.softmax(x, dim=1)
        return x


def main():
    args = get_args()
    THRESHOLD = 0.5
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    # variables
    bucket = gcs_path_to_local_path(args.bucket)
    deliverable_uri = (Path(bucket)/"deliverables")
    metrics_uri = (deliverable_uri/"metrics.json")

    # read data
    train_df = dask_df.read_csv(TRAINING_DATA_PATH, dtype=DATA_SCHEMA)
    train_df = train_df.compute()
    test_df = dask_df.read_csv(TEST_DATA_PATH, dtype=DATA_SCHEMA)
    test_df = test_df.compute()
    
    # preprocessing
    preprocessed_train_df = preprocess(train_df, DROP_COLUMNS)
    preprocessed_test_df = preprocess(test_df, DROP_COLUMNS)
    
    # downsampling
    train_nfraud_df = preprocessed_train_df[preprocessed_train_df[TARGET_COLUMN]==0]
    train_fraud_df = preprocessed_train_df[preprocessed_train_df[TARGET_COLUMN]==1]
    train_nfraud_downsample = resample(train_nfraud_df,
                          replace=True,
                          frac=len(train_fraud_df)/len(train_df))
    ds_preprocessed_train_df = pd.concat([train_nfraud_downsample, train_fraud_df])
    
    # target, features split
    x_train = ds_preprocessed_train_df[FEAT_COLUMNS].astype(np.float32).values
    x_train = Variable(torch.from_numpy(x_train)).float().to(device)
    y_train = ds_preprocessed_train_df.loc[:, TARGET_COLUMN].astype(int).values
    y_train = Variable(torch.from_numpy(y_train)).long().to(device)
    x_true = preprocessed_test_df[FEAT_COLUMNS].astype(np.float32).values
    x_true = Variable(torch.from_numpy(x_true)).float().to(device)
    y_true = preprocessed_test_df.loc[:, TARGET_COLUMN].astype(int).values
    y_true = Variable(torch.from_numpy(y_true)).long().to(device)
    
    # train model; for demo purpose, no validation/early stopping is implemented
    EPOCHS = 5
    model = Model(x_train.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    for _ in tqdm.trange(EPOCHS):
        y_pred = model(x_train)
        loss = loss_fn(y_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if not Path(MODEL_DIR).exists():
        Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)
    model_jit = torch.jit.trace(model, example_kwarg_inputs={"x": torch.from_numpy(np.random.random((10, x_train.shape[1]))).float()})
    torch.jit.save(model_jit, MODEL_PATH)
    
    #generate metrics
    metrics = evaluate_model(model, x_true, y_true, THRESHOLD)
    if not Path(deliverable_uri).exists():
        Path(deliverable_uri).mkdir(parents=True, exist_ok=True)
    with open(metrics_uri, "w") as file:
        json.dump(metrics, file, sort_keys = True, indent = 4)


if __name__ == "__main__":
    main()

Appending to build_training/train_torch.py


#### Define a custom image for model training

Now you will build a custom container. By running your training job in a custom container, you can use any ML framework, non-ML dependencies, libraries, and binaries. Next you will package your training code into a Docker container image, push the container image to Artifact Registry, and create a custom job on Vertex AI, which will use the container image on Artifact Registry. As the evolution of Container Registry, Artifact Registry is a single place for your organization to manage container images and language packages. It's fullly intergrated with the Vertex AI platform. You can read more in our [documentation](https://cloud.google.com/artifact-registry). 

In [24]:
# Create image repository
!gcloud artifacts repositories create $config.IMAGE_REPOSITORY      --repository-format=docker      --location=$config.REGION     --description="FraudFinder Docker Image repository"

# List repositories under the project
!gcloud artifacts repositories list

# Get info on the repository
!gcloud artifacts repositories describe $config.IMAGE_REPOSITORY --location=$config.REGION

Create request issued for: [fraudfinder-ucx2v]
Waiting for operation [projects/ai-takeoff-2025/locations/us-central1/operation
s/0b35a29f-b192-4705-a2b1-a17eff934761] to complete...done.                    
Created repository [fraudfinder-ucx2v].
Listing items under project ai-takeoff-2025, across all locations.

                                                                                ARTIFACT_REGISTRY
REPOSITORY         FORMAT  MODE                 DESCRIPTION                          LOCATION     LABELS  ENCRYPTION          CREATE_TIME          UPDATE_TIME          SIZE (MB)
fraudfinder-ucx2v  DOCKER  STANDARD_REPOSITORY  FraudFinder Docker Image repository  us-central1          Google-managed key  2025-05-24T13:07:13  2025-05-24T13:07:13  0
Encryption: Google-managed key
Repository Size: 0.000MB
createTime: '2025-05-24T13:07:13.782081Z'
description: FraudFinder Docker Image repository
format: DOCKER
mode: STANDARD_REPOSITORY
name: projects/ai-takeoff-2025/locations/us-central

Run the follow cell to allow this notebook to push to Artifact Registry

In [25]:
!gcloud auth configure-docker $config.REGION-docker.pkg.dev -q


{
  "credHelpers": {
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


Next you need to write your Dockerfile in order to create your container. 

In [26]:
%%writefile build_training/Dockerfile
# Specifies base image and tag
FROM python:3.12
WORKDIR /root


# Copies the trainer code to the docker image.
COPY ./train_torch.py /root/

# Installs additional packages
RUN pip install dask[dataframe]==2025.3.0 numpy==2.2.4 pandas==2.2.3 tqdm==4.67.1 torch==2.6.0 scikit-learn==1.6.1
# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python3", "train_torch.py"]

Writing build_training/Dockerfile


Next, build and push the Docker container. 

In [27]:
# Build and push Docker container
!docker build -t $config.IMAGE_URI ./build_training/
!docker push $config.IMAGE_URI

print("Done")

Sending build context to Docker daemon  11.78kB
Step 1/5 : FROM python:3.12
3.12: Pulling from library/python

[1B9d1a9511: Pulling fs layer 
[1B7ed901b1: Pulling fs layer 
[1Bf47ad444: Pulling fs layer 
[1B099911d6: Pulling fs layer 
[1B7412051f: Pulling fs layer 
[3B099911d6: Waiting fs layer 
[1B187bfbf4: Pull complete  249B/249B4MBB[6A[2K[6A[2K[5A[2K[5A[2K[5A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[3A[2K[4A[2K[7A[2K[4A[2K[7A[2K[2A[2K[2A[2K[2A[2K[1A[2K[4A[2K[7A[2K[4A[2K[4A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[7A[2K[4A[2K[4A[2K[4A[2K[4A[2K[4A[2K[4A[2K[7A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[6A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[5A[2K[4A[2

### Custom serving with FastAPI
We will also include XAI capability in our pytorch model deployment. In order to do so, we also need to build a custom model serving image.

In [28]:
# create a folder for all container-related files
!mkdir -p -m 777 build_serving

In [29]:
%%writefile build_serving/__init__.py
# package init

Writing build_serving/__init__.py


In [30]:
%%writefile build_serving/classifier.py
import torch
import torch.nn as nn



Writing build_serving/classifier.py


In [31]:
%%writetemplate build_serving/classifier.py

FEAT_COLUMNS = {config.FEAT_COLUMNS}



In [32]:
%%writefile -a build_serving/classifier.py

class Classifier:
    def __init__(self, model: nn.Module):
        self.net = model
        self.classes = {
            0: "no_fraud",
            1: "is_fraud",
        }
        
    def predict(self, features: dict):
        self.net.eval()
        X = [features[feat_col] for feat_col in FEAT_COLUMNS]
        X = torch.tensor(X, dtype=torch.float32)
        X = torch.unsqueeze(X, 0)
        with torch.no_grad():
            output = self.net(X)
            prob, clas = output.max(1)

        return {'class': self.classes[int(clas.cpu().detach().numpy()[0])],
                'probability': float(prob.cpu().detach().numpy()[0])}

Appending to build_serving/classifier.py


In [33]:
%%writefile build_serving/main.py
from fastapi import FastAPI, Request
from starlette.responses import JSONResponse
import uvicorn

import os
from contextlib import asynccontextmanager
import torch

from google.cloud import storage
from classifier import Classifier


@asynccontextmanager
async def lifespan(app: FastAPI):
    global my_model
    global my_classifier
    gcs_client = storage.Client()
    local_model_path = "model.pt"
    with open(local_model_path, "wb") as model_f:
        gcs_client.download_blob_to_file(
            f"{os.environ['AIP_STORAGE_URI']}/{local_model_path}", model_f
        )
    my_model = torch.jit.load(local_model_path)
    my_classifier = Classifier(my_model)
    yield


app = FastAPI(lifespan=lifespan)


@app.get(os.environ['AIP_HEALTH_ROUTE'], status_code=200)
def health():
    return {}


@app.post(os.environ['AIP_PREDICT_ROUTE'])
async def predict(request: Request):
    body = await request.json()
    
    instances = body["instances"]
    output = []
    for i in instances:
        output.append(my_classifier.predict(i))
    #return 'class' and 'probability'
    return JSONResponse({"predictions": output})


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)

Writing build_serving/main.py


In [34]:
%%writefile build_serving/Dockerfile
# Specifies base image and tag
FROM python:3.12
WORKDIR /root


# Copies the trainer code to the docker image.
COPY ./* /root/

# Installs additional packages
RUN pip install numpy==2.2.4 torch==2.6.0 "fastapi[standard]==0.115.7" uvicorn==0.34.0 google-cloud-storage==2.18.2
# Sets up the entry point to invoke the trainer.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

Writing build_serving/Dockerfile


In [35]:
# Build and push serving Docker container
!docker build -t $config.MODEL_SERVING_IMAGE_URI ./build_serving/
!docker push $config.MODEL_SERVING_IMAGE_URI

print("Done")

Sending build context to Docker daemon   7.68kB
Step 1/5 : FROM python:3.12
 ---> 1b0cfae6e4d8
Step 2/5 : WORKDIR /root
 ---> Using cache
 ---> dafde1a95ff0
Step 3/5 : COPY ./* /root/
 ---> 5de85eec661f
Step 4/5 : RUN pip install numpy==2.2.4 torch==2.6.0 "fastapi[standard]==0.115.7" uvicorn==0.34.0 google-cloud-storage==2.18.2
 ---> Running in fc6316fe1896
Collecting numpy==2.2.4
  Downloading numpy-2.2.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting torch==2.6.0
  Downloading torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting fastapi==0.115.7 (from fastapi[standard]==0.115.7)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn==0.34.0
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting google-cloud-storage==2.18.2
  Downloading google_cloud_storage-2.18.2-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting filelock (from torch==2.6.0)
  Downloading filelock-3.18.0-py3-n

Both images belong to the same repository, and we can use tag:latest if multiple versions exist.
![ar](../misc/images/vertexai-custom-train-images.png)

#### Start a custom training job on Vertex AI
Now that you have created your custom container, you will create a training job on Vertex AI. This will create a custom training job, load our dataset and register the model to Vertex AI Model Registry  after the training job is successfully completed. Learn more about the creaton of custom jobs [here](https://cloud.google.com/vertex-ai/docs/training/create-custom-job).

In [36]:
# Check if Service Account is enabled on Persistent Resource
SA_ENABLED = ""

DESCRIBE_PR_OUTPUT = !gcloud ai persistent-resources describe $config.PERSISTENT_RESOURCE_ID --project=$config.PROJECT_ID --region=$PERSISTENT_RESOURCE_REGION
PR_DETAILS = " ".join(DESCRIBE_PR_OUTPUT)

if "enableCustomServiceAccount: true" in PR_DETAILS:
    SA_ENABLED = True
    print(f"Service Account is ENABLED on Persistent Resource")
    print("----------------------------------------------------------------\n")
    print("Please proceed to next step to start the training job!")
else:
    SA_ENABLED = False
    print(f"Service Account is NOT ENABLED on Persistent Resource")
    print("----------------------------------------------------------------\n")
    print("Please open the Terminal in a new tab and run the command -> gcloud auth login")
    print("Once done, continue on to start the training job")

Service Account is NOT ENABLED on Persistent Resource
----------------------------------------------------------------

Please open the Terminal in a new tab and run the command -> gcloud auth login
Once done, continue on to start the training job


In [37]:
MODEL_NAME = f"{config.MODEL_NAME}_torch_{config.ID}"

job = vertex_ai.CustomContainerTrainingJob(
    display_name=config.JOB_NAME,
    container_uri=config.IMAGE_URI,
    model_serving_container_image_uri=config.MODEL_SERVING_IMAGE_URI,
)

CMDARGS = [
    f"--bucket={config.BUCKET_NAME}",
]

if SA_ENABLED:
    model = job.run(
    dataset=dataset,
    model_display_name=MODEL_NAME,
    args=CMDARGS,
    replica_count=1,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,    
    # persistent_resource_id=config.PERSISTENT_RESOURCE_ID,
    service_account=config.SERVICE_ACCOUNT,
)

else:
    model = job.run(
    dataset=dataset,
    model_display_name=MODEL_NAME,
    args=CMDARGS,
    replica_count=1,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    # persistent_resource_id=config.PERSISTENT_RESOURCE_ID,
)

Training Output directory:
gs://ai-takeoff-2025-fraudfinder-ucx2v/aiplatform-custom-training-2025-05-24-13:46:11.110 
No dataset split provided. The service will use a default split.
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6715483200417693696?project=401714874268
CustomContainerTrainingJob projects/401714874268/locations/us-central1/trainingPipelines/6715483200417693696 current state:
3
CustomContainerTrainingJob projects/401714874268/locations/us-central1/trainingPipelines/6715483200417693696 current state:
3
CustomContainerTrainingJob projects/401714874268/locations/us-central1/trainingPipelines/6715483200417693696 current state:
3
CustomContainerTrainingJob projects/401714874268/locations/us-central1/trainingPipelines/6715483200417693696 current state:
3
CustomContainerTrainingJob projects/401714874268/locations/us-central1/trainingPipelines/6715483200417693696 current state:
3
View backing custom job:
https://console.cloud.google.c

While the model is training, you can visit the model URL, or go to the console page for [Vertex AI training jobs](https://console.cloud.google.com/vertex-ai/training/training-pipelines) to track its progress.

#### Evaluate the model locally

Before you can run the model via an endpoint, you need to transform the data so that the model can perform a prediction on that.

In [None]:
!gsutil cp -r $model.uri .

In [None]:
# load the model and get feature names
import torch


model_local = torch.jit.load("./model/model.pt")

In [19]:
import numpy as np

NA_VALUES = ["NA", "."]


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Converts categorical features to numeric. Removes unused columns.

    Args:
      df: Pandas df with raw data

    Returns:
      df with preprocessed data
    """
    df = df.drop(columns=config.DROP_COLUMNS)

    # Drop rows with NaN's
    df = df.dropna()

    # Convert integer valued (numeric) columns to floating point
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df[numeric_columns] = df[numeric_columns].astype("float32")

    dummy_columns = list(df.dtypes[df.dtypes == "category"].index)
    df = pd.get_dummies(df, columns=dummy_columns)

    return df


# test set
train_sample_path = os.path.join(TRAIN_DATA_DIR, "train.csv")
df_test = pd.read_csv(train_sample_path)
preprocessed_test_Data = preprocess(df_test)

x_test = preprocessed_test_Data[config.FEAT_COLUMNS].astype(np.float32).values
x_test = torch.from_numpy(x_test)
y_test = preprocessed_test_Data.loc[:, config.TARGET_COLUMN].astype(int)

Next you will copy the model artifact to the local directory to evaluate the model localy before deploying the model:

Now it's time to test the model.

In [None]:
y_pred_prob = model_local(x_test)
y_pred = torch.argmax(y_pred_prob, dim=1)
precision_recall_fscore_support(y_test.values, y_pred, average="weighted")

(0.9864392960493612, 0.987903903785099, 0.9861961150759684, None)

#### Deploy the model
Before you use your model to make predictions, you need to deploy it to an Endpoint. You can do this by calling the deploy function on the Model resource. While our model training does not include XAI config, this will do two things:

- Upload the model with XAI configuration
- create an Endpoint resource
- deploy the Model resource to the Endpoint resource


In [39]:
from google.cloud.aiplatform import explain


explanation_params = explain.ExplanationParameters(
    sampled_shapley_attribution=explain.SampledShapleyAttribution(
        path_count=10,
    ),
)
explanation_metadata = explain.ExplanationMetadata(
    inputs={
        feat: {} for feat in config.FEAT_COLUMNS
    },
    outputs={
        "probability": {}
    }
)

new_model = vertex_ai.Model.upload(
    serving_container_image_uri=config.MODEL_SERVING_IMAGE_URI,
    artifact_uri=model.uri,
    display_name=MODEL_NAME,
    description="Vertex AI Pipeline Custom Model",
    explanation_metadata=explanation_metadata,
    explanation_parameters=explanation_params,
    upload_request_timeout=1800,
    serving_container_deployment_timeout=1800,
)

Creating Model
Create Model backing LRO: projects/401714874268/locations/us-central1/models/5590448179729399808/operations/7998017283190423552
Model created. Resource name: projects/401714874268/locations/us-central1/models/5590448179729399808@1
To use this Model in another session:
model = aiplatform.Model('projects/401714874268/locations/us-central1/models/5590448179729399808@1')


In [40]:
# Percentage of traffic that the model will receive in the endpoint
TRAFFIC_SPLIT = {"0": 100}

# Parameters to configure the minimum and maximum nodes during autoscaling
MIN_NODES = 1
MAX_NODES = 1

endpoint = new_model.deploy(
    deployed_model_display_name=MODEL_NAME,
    traffic_split=TRAFFIC_SPLIT,
    machine_type=config.DEPLOY_COMPUTE,
    accelerator_count=0,
    min_replica_count=MIN_NODES,
    max_replica_count=MAX_NODES,
)

Creating Endpoint
Create Endpoint backing LRO: projects/401714874268/locations/us-central1/endpoints/6306461146853408768/operations/6379744421478400
Endpoint created. Resource name: projects/401714874268/locations/us-central1/endpoints/6306461146853408768
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/401714874268/locations/us-central1/endpoints/6306461146853408768')
Deploying model to Endpoint : projects/401714874268/locations/us-central1/endpoints/6306461146853408768
Deploy Endpoint model backing LRO: projects/401714874268/locations/us-central1/endpoints/6306461146853408768/operations/1164930748562538496
Endpoint model deployed. Resource name: projects/401714874268/locations/us-central1/endpoints/6306461146853408768


The online prediction will show 1 active model with 100% traffic.
![model](../misc/images/vertexai-online-prediction.png)

#### Test the deployed model (Make an online prediction request)
Send an online prediction request to your deployed model. To make sure your deployed model is working, test it out by sending a request to the endpoint.

Let's first get a test data.

In [22]:
for item in x_test[:2]:
    print(item)

tensor([  8.4062,   0.0000,   9.3820,   0.0000,   0.0000,   8.3183,  47.0000,
          0.0000,   5.0000,   0.0000,   0.0000,  24.0000,   0.0000,   0.0000,
          0.0000, 217.0000,   0.0000,  25.0000,   0.0000,   0.0000, 170.0000,
          0.0000,   0.0000,   0.0000,   9.8100])
tensor([4.0091e+01, 0.0000e+00, 4.5390e+01, 0.0000e+00, 0.0000e+00, 3.9517e+01,
        5.3000e+01, 0.0000e+00, 3.0000e+00, 0.0000e+00, 0.0000e+00, 2.8000e+01,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 1.5000e+02, 0.0000e+00, 1.3000e+01,
        0.0000e+00, 0.0000e+00, 1.1500e+02, 6.6667e-03, 0.0000e+00, 0.0000e+00,
        4.9430e+01])


In [35]:
payload = {
    "instances": [{feat: val for feat, val in zip(config.FEAT_COLUMNS, vals)} for vals in x_test.detach().numpy()[:2].tolist()]
}

In [43]:
resp = endpoint.predict(payload['instances'])
print(resp.predictions)

[{'class': 'no_fraud', 'probability': 0.9968664050102234}, {'class': 'no_fraud', 'probability': 0.9397484064102173}]


In [44]:
resp = endpoint.explain(payload['instances'])
print(resp.explanations)

[attributions {
  baseline_output_value: 0.5
  instance_output_value: 0.99686640501022339
  feature_attributions {
    struct_value {
      fields {
        key: "tx_amount"
        value {
          number_value: 0.0025898218154907231
        }
      }
      fields {
        key: "terminal_id_risk_7day_window"
        value {
          number_value: 0
        }
      }
      fields {
        key: "terminal_id_risk_1day_window"
        value {
          number_value: 0
        }
      }
      fields {
        key: "terminal_id_risk_14day_window"
        value {
          number_value: 0
        }
      }
      fields {
        key: "terminal_id_nb_tx_7day_window"
        value {
          number_value: 0.3711762487888336
        }
      }
      fields {
        key: "terminal_id_nb_tx_60min_window"
        value {
          number_value: 0
        }
      }
      fields {
        key: "terminal_id_nb_tx_30min_window"
        value {
          number_value: 0
        }
      }
      fie

Now that we understand we packaged our pytorch model and started a custom training job on Vertex AI we can take the ML workflow and formalize it into a Vertex AI Pipeline.

You can continue with the next Notebook: `06_formalization.ipynb`.