# Vertex AI Pipelines Hands-on (PyTorch GPU training)
- このハンズオンでは Vertex AI Pipelines で GPU を利用した Custom Training を実行するパイプラインを作成します。
- 主に Continuous Training を意識したパイプラインになっています。

## パッケージのインストール

In [None]:
# As of 2025/02/12, running this on Workbench shows roughly the following versions installed.
# KFP SDK version: 2.5.0
# google-cloud-aiplatform==1.75.0
# kfp==2.5.0
# kfp-pipeline-spec==0.2.2
# kfp-server-api==2.0.5

! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep -e aiplatform -e kfp

In [None]:
# Run this only when needed, e.g. when using Workbench Instances.
# The uninstall step is included as a workaround for a bug.


# !pip uninstall -y protobuf python3-protobuf
# !pip install --no-cache-dir --upgrade "kfp>2" \
#                                         google-cloud-aiplatform

In [None]:
# As of 2025/02/12, running this on Workbench installs google-cloud-pipeline-components==2.18.0.

!pip3 install -U google-cloud-pipeline-components
!pip3 freeze | grep google-cloud-pipeline-components

## 環境変数の設定

In [47]:
# Read the project ID from the active gcloud configuration. The `!` capture
# yields the command's output lines; the first line is used as the ID.
shell_output = !gcloud config get project
PROJECT_ID = shell_output[0]
PROJECT_ID

'yuyaono-dev1'

In [48]:
# Region passed as the location of the Vertex AI resources in this notebook.
REGION = "us-central1"
# Upper-cased text before the first "-" of REGION ("US" for "us-central1").
BQ_REGION = REGION.partition("-")[0].upper()

In [49]:
# Cloud Storage bucket URI used by this notebook; the project ID is embedded in the bucket name.
BUCKET_URI = f"gs://your-bucket-name-{PROJECT_ID}-unique"  # @param {type:"string"}

In [50]:
# When no special configuration has been made (e.g. when using Workbench),
# the default Compute Engine service account is used.
SERVICE_ACCOUNT = ""  # @param {type:"string"}

In [51]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

Service Account: 635273855015-compute@developer.gserviceaccount.com


In [52]:
PATH = %env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
date_string = !date '+%Y%m%d%H%M%S'
YYYYMMDDHHmmSS = date_string[0]


PIPELINE_ROOT = f"{BUCKET_URI}/vai_pipelines_handson_pipeline_gpu_training_{YYYYMMDDHHmmSS}"  # This is where all pipeline artifacts are sent. You'll need to ensure the bucket is created ahead of time
PIPELINE_ROOT
print(f"PIPELINE_ROOT: {PIPELINE_ROOT}")


CONTAINER_IMAGE_URL=f"us-central1-docker.pkg.dev/{PROJECT_ID}/custom-prediction-pytorch-cpu/custom-prediction-pytorch-cpu:latest"
print(CONTAINER_IMAGE_URL)

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin:/home/jupyter/.local/bin:/home/jupyter/.local/bin
PIPELINE_ROOT: gs://your-bucket-name-yuyaono-dev1-unique/vai_pipelines_handson_pipeline_gpu_training_20250214110836
us-central1-docker.pkg.dev/yuyaono-dev1/custom-prediction-pytorch-cpu/custom-prediction-pytorch-cpu:latest


## 環境構築

In [53]:
# Enable the Vertex AI API (aiplatform.googleapis.com) via gcloud.
! gcloud services enable aiplatform.googleapis.com

In [54]:
# Create the bucket in REGION under PROJECT_ID. When the bucket name is
# already taken, gsutil reports a 409 error and nothing is created.
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Creating gs://your-bucket-name-yuyaono-dev1-unique/...
ServiceException: 409 A Cloud Storage bucket named 'your-bucket-name-yuyaono-dev1-unique' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [55]:
# Grant the service account the object-creator and object-viewer roles on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

No changes made to gs://your-bucket-name-yuyaono-dev1-unique/
No changes made to gs://your-bucket-name-yuyaono-dev1-unique/


## Vertex AI Pipelines の利用準備

### ライブラリのインポート

In [56]:
import google.cloud.aiplatform as aiplatform
import kfp
from kfp import compiler, dsl
from kfp.dsl import Artifact, Dataset, Input, Metrics, Model, Output, component
from google_cloud_pipeline_components.v1.vertex_notification_email import VertexNotificationEmailOp
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.custom_job import create_custom_training_job_from_component
from google_cloud_pipeline_components.types import artifact_types
from kfp.dsl import importer_node
from typing import NamedTuple

### Vertex AI の初期化

In [57]:
# Initialize the Vertex AI SDK. The location is passed explicitly so the SDK
# targets the same REGION as the pipeline tasks rather than its built-in default.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

## パイプラインコンポーネントの定義

### GPU をつかったカスタム トレーニングを行うコンポーネント

In [58]:
@component(
    base_image='us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.2-3.py310:latest',
    packages_to_install=[
        "torch==2.6.0",
        "torchvision==0.21.0",
        "numpy==1.26.4"
    ],
)
def pytorch_training(
    model: Output[Model],
    metrics: Output[Metrics],
) -> NamedTuple("Outputs", [("auc", float), ("model_uri", str)]) :
    """Train a small classifier on FashionMNIST and export its weights.

    Follows the PyTorch quickstart tutorial
    (https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html):
    downloads FashionMNIST, trains a three-layer fully connected network with
    SGD for a fixed number of epochs, evaluates it on the test split after
    every epoch, and saves the ``state_dict`` as ``model.pth`` under
    ``model.path``.

    Args:
        model: Output artifact; ``model.path`` is used as the directory that
            receives ``model.pth``.
        metrics: Output artifact that receives the test-set accuracy and
            average loss measured after the last epoch.

    Returns:
        A tuple ``(auc, model_uri)``. ``auc`` is a hard-coded placeholder
        (0.9) that is NOT computed from the trained model; ``model_uri`` is
        ``model.uri``.
    """
    # Imports and helper functions are kept inside the function body, as in
    # the original component definition.
    import os

    import torch
    from torch import nn
    from torch.utils.data import DataLoader
    from torchvision import datasets
    from torchvision.transforms import ToTensor

    batch_size = 64
    epochs = 5

    # Use the available accelerator when there is one, otherwise the CPU.
    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
    print(f"Using {device} device")

    # Download training data from open datasets.
    training_data = datasets.FashionMNIST(
        root="data",
        train=True,
        download=True,
        transform=ToTensor(),
    )

    # Download test data from open datasets.
    test_data = datasets.FashionMNIST(
        root="data",
        train=False,
        download=True,
        transform=ToTensor(),
    )

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    # Print the shape of the first test batch only, as a sanity check.
    for X, y in test_dataloader:
        print(f"Shape of X [N, C, H, W]: {X.shape}")
        print(f"Shape of y: {y.shape} {y.dtype}")
        break

    # Define model
    class NeuralNetwork(nn.Module):
        """Flatten the input, then apply 784 -> 512 -> 512 -> 10 linear layers with ReLU."""

        def __init__(self):
            super().__init__()
            self.flatten = nn.Flatten()
            self.linear_relu_stack = nn.Sequential(
                nn.Linear(28*28, 512),
                nn.ReLU(),
                nn.Linear(512, 512),
                nn.ReLU(),
                nn.Linear(512, 10)
            )

        def forward(self, x):
            x = self.flatten(x)
            logits = self.linear_relu_stack(x)
            return logits

    nn_model = NeuralNetwork().to(device)
    print(nn_model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(nn_model.parameters(), lr=1e-3)

    def train(dataloader, net, loss_fn, optimizer):
        """Run one training epoch over ``dataloader``, printing the loss every 100 batches."""
        size = len(dataloader.dataset)
        net.train()
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = net(X)
            loss = loss_fn(pred, y)

            # Backpropagation
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if batch % 100 == 0:
                loss_value, current = loss.item(), (batch + 1) * len(X)
                print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

    def test(dataloader, net, loss_fn):
        """Evaluate ``net`` on ``dataloader`` and return ``(accuracy, avg_loss)``.

        ``accuracy`` is the fraction of correctly classified samples and
        ``avg_loss`` is the loss averaged over batches.
        """
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        net.eval()
        test_loss, correct = 0, 0
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(device), y.to(device)
                pred = net(X)
                test_loss += loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
        return correct, test_loss

    accuracy, avg_loss = 0.0, 0.0
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, nn_model, loss_fn, optimizer)
        accuracy, avg_loss = test(test_dataloader, nn_model, loss_fn)
    print("Done!")

    # Record the evaluation results of the last epoch on the metrics artifact,
    # which the original declared but never populated.
    metrics.log_metric("accuracy", float(accuracy))
    metrics.log_metric("avg_loss", float(avg_loss))

    # Export the model to a file
    print(f'model.path: {model.path}')
    print(f'os.path.join(model.path, "model.pth"): {os.path.join(model.path, "model.pth")}')
    os.makedirs(model.path, exist_ok=True)
    print(f'The directory has been created.')
    torch.save(nn_model.state_dict(), os.path.join(model.path, "model.pth"))
    print("Saved PyTorch Model State to model.pth")

    # Placeholder value: no AUC is computed for this model.
    auc = 0.9

    return (auc, model.uri)

In [59]:
# Look up the numeric project number for PROJECT_ID. The `!` capture yields
# the command's output lines, so the first line is taken as the value.
PROJECT_NUMBER = !gcloud projects describe {PROJECT_ID} --format="value(projectNumber)"
PROJECT_NUMBER = PROJECT_NUMBER[0]
print(f'PROJECT_NUMBER: {PROJECT_NUMBER}')

# Disabled: network lookup, only needed for the commented-out `network=`
# argument of the custom training job.
# NETWORK = !gcloud compute networks describe default --format="value(id)"
# NETWORK = NETWORK[0]
# NETWORK='default'
# print(f'NETWORK: {NETWORK}')

PROJECT_NUMBER: 635273855015


In [60]:
# Wrap the `pytorch_training` component as a custom training job that runs on
# a g2-standard-16 machine with one NVIDIA L4 GPU and a 100 GB pd-ssd boot disk.
# `accelerator_count` and `boot_disk_size_gb` are passed as integers, matching
# the parameter types declared by the API (the original passed strings).
custom_training_job = create_custom_training_job_from_component(
    pytorch_training,
    display_name='pytorch_training',
    machine_type='g2-standard-16',
    accelerator_type='NVIDIA_L4',
    accelerator_count=1,
    boot_disk_type='pd-ssd',
    boot_disk_size_gb=100,
    # network=f'projects/{PROJECT_NUMBER}/global/networks/{NETWORK}'
)


## モデルサービングのためのコンテナを作成する

### Artifact Registry にリポジトリを作成

In [61]:
# Create a Docker-format Artifact Registry repository in us-central1 for the
# serving image, then list the repositories. If the repository already exists,
# the create command reports ALREADY_EXISTS.
!gcloud artifacts repositories create custom-prediction-pytorch-cpu \
 --repository-format=docker \
 --location=us-central1
!gcloud artifacts repositories list

[1;31mERROR:[0m (gcloud.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists
Listing items under project yuyaono-dev1, across all locations.

                                                                                    ARTIFACT_REGISTRY
REPOSITORY                     FORMAT  MODE                 DESCRIPTION                   LOCATION         LABELS  ENCRYPTION          CREATE_TIME          UPDATE_TIME          SIZE (MB)
gke-dojo                       DOCKER  STANDARD_REPOSITORY                                asia-northeast1          Google-managed key  2024-01-12T08:57:19  2024-01-13T08:03:13  340.095
beans-model-trainer            DOCKER  STANDARD_REPOSITORY                                us-central1              Google-managed key  2025-02-04T13:54:08  2025-02-05T03:05:14  2070.043
cicd-sample-repo               DOCKER  STANDARD_REPOSITORY                                us-central1              Google-managed key  2023-11-08T07:16:13  2023-11-08T07:16

### コンテナを作成

In [62]:
# Build the serving image. No tag suffix is given, so Docker applies its default tag.
# NOTE(review): the Dockerfile is read from app/ while the build context is
# app_prediction_pytorch_cpu — confirm that both paths are intended.
!docker build \
  --tag=us-central1-docker.pkg.dev/{PROJECT_ID}/custom-prediction-pytorch-cpu/custom-prediction-pytorch-cpu \
  -f app/Dockerfile \
  app_prediction_pytorch_cpu

Sending build context to Docker daemon  18.99kB
Step 1/9 : FROM python:3.9-slim
 ---> 096343841dd9
Step 2/9 : ENV PYTHONUNBUFFERED True
 ---> Using cache
 ---> e961af5d249b
Step 3/9 : ENV APP_HOME /app
 ---> Using cache
 ---> 22a279ee07c1
Step 4/9 : WORKDIR $APP_HOME
 ---> Using cache
 ---> b429102d4bdc
Step 5/9 : COPY . ./
 ---> 98db5e2244b3
Step 6/9 : RUN apt-get update -y     && apt-get install -y libgomp1
 ---> Running in 5f4f9a6f5516
Get:1 http://deb.debian.org/debian bookworm InRelease [151 kB]
Get:2 http://deb.debian.org/debian bookworm-updates InRelease [55.4 kB]
Get:3 http://deb.debian.org/debian-security bookworm-security InRelease [48.0 kB]
Get:4 http://deb.debian.org/debian bookworm/main amd64 Packages [8792 kB]
Get:5 http://deb.debian.org/debian bookworm-updates/main amd64 Packages [13.5 kB]
Get:6 http://deb.debian.org/debian-security bookworm-security/main amd64 Packages [245 kB]
Fetched 9305 kB in 2s (5797 kB/s)
Reading package lists...
Reading package lists...
Building 

### Artifact Registry に登録（Push）

In [63]:
# Register gcloud as the Docker credential helper for the us-central1
# Artifact Registry host, then push the serving image.
!gcloud auth configure-docker --quiet us-central1-docker.pkg.dev
!docker push us-central1-docker.pkg.dev/{PROJECT_ID}/custom-prediction-pytorch-cpu/custom-prediction-pytorch-cpu


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.
Using default tag: latest
The push refers to repository [us-central1-docker.pkg.dev/yuyaono-dev1/custom-prediction-pytorch-cpu/custom-prediction-pytorch-cpu]

[1B61004087: Preparing 
[1Bf750d681: Preparing 
[1B67b96c04: Preparing 
[1Bfea1f92e: Preparing 
[1Bf983fc2e: Preparing 
[1Be9b5727d: Preparing 
[1Bbff797f9: Preparing 
[1B13317391: Preparing 
[9B61004087: Pushed   9.096GB/9.068GB[9A[2K[4A[2K[9A[2K[2A[2K[7A[2K[9A[2K[7A[2K[9A[2K[7A[2K[8A[2K[7A[2K[8A[2K[7A[2K[9A[2K[8A[2K[9A[2K[7A[2K[9A[2K[7A[2K[9A[2K[8A[2K[9A[2K[8A[2K[9A[2K[8A[2K[7A[2K[9A[2K[8A[2K[9A[2K[8A[2K[9A[

## パイプラインの定義（定義したコンポーネントを利用）

In [64]:
# Display the serving container image URI that the pipeline passes as `imageUri`.
CONTAINER_IMAGE_URL

'us-central1-docker.pkg.dev/yuyaono-dev1/custom-prediction-pytorch-cpu/custom-prediction-pytorch-cpu:latest'

In [65]:
@dsl.pipeline(
    name="vai-pipelines-handson-gpu-training",
)
def pipeline():
    """Train on GPU as a custom job, then upload the result as a model.

    Steps:
      1. Run `custom_training_job` in PROJECT_ID / REGION.
      2. Import the trained model location as an `UnmanagedContainerModel`
         artifact whose container spec points at CONTAINER_IMAGE_URL.
      3. Upload that artifact with `ModelUploadOp`.
    """
    training_job_task = custom_training_job(
        project=PROJECT_ID,
        location=REGION,
    ).set_display_name('training-job-task')

    # URI output of the training task; reused as the artifact URI, in the
    # artifact metadata, and as an environment variable of the serving container.
    trained_model_uri = training_job_task.outputs["model_uri"]

    import_unmanaged_model_task = importer_node.importer(
        artifact_uri=trained_model_uri,
        artifact_class=artifact_types.UnmanagedContainerModel,
        metadata={
            "artifactUri": trained_model_uri,
            "containerSpec": {
                "imageUri": CONTAINER_IMAGE_URL,
                "healthRoute": "/",
                "predictRoute": "/predict",
                "env": [
                    {
                        "name": "SRC_MODEL_URI",
                        "value": trained_model_uri
                    }
                ]
            },
        },
    )

    # Pass the location explicitly so the upload targets the same REGION as
    # the training task instead of the operator's default location.
    ModelUploadOp(
        project=PROJECT_ID,
        location=REGION,
        display_name="custom-prediction-pytorch-cpu",
        unmanaged_container_model=import_unmanaged_model_task.outputs["artifact"],
    )


## パイプラインのコンパイル（YAML 生成）

In [66]:
# Compile the pipeline function into a YAML pipeline definition file.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path="pipeline-gpu-training.yaml",
)

## パイプラインの実行

In [67]:
# Create a pipeline job from the compiled YAML, storing its artifacts under
# PIPELINE_ROOT, and run it.
job = aiplatform.PipelineJob(
    template_path="pipeline-gpu-training.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name="vai-pipelines-handson-gpu-training",
)
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/635273855015/locations/us-central1/pipelineJobs/vai-pipelines-handson-gpu-training-20250214112035
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/635273855015/locations/us-central1/pipelineJobs/vai-pipelines-handson-gpu-training-20250214112035')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/vai-pipelines-handson-gpu-training-20250214112035?project=635273855015
PipelineJob projects/635273855015/locations/us-central1/pipelineJobs/vai-pipelines-handson-gpu-training-20250214112035 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/635273855015/locations/us-central1/pipelineJobs/vai-pipelines-handson-gpu-training-20250214112035 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/635273855015/locations/us-central1/pipelineJobs/vai-pipelines-handson-gpu-training-20250214112035 current s