In [None]:
#####################################################################
#
# extract table from BQ to GCS CSVs
#
#####################################################################

In [52]:
# set params
# The `!` shell capture returns the command's output as a list of lines;
# the first line is taken as the active gcloud project ID.
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
# Region constant; later forwarded to the pipeline as the "region" runtime parameter.
REGION = "us-central1"

In [55]:
# BigQuery dataset and table that the `bq extract` cell below exports
BQ_DATASET = "ds_central1"
BQ_TABLE = "tab_class_732inps_1600krows_tra"
# GCS bucket that receives the exported CSV shards, the header file and the composed CSV
TRAINING_DATA_BUCKET = "ap-alto-ml-1000-bucket-us-central1"

In [57]:
# Export {BQ_DATASET}.{BQ_TABLE} to comma-delimited CSV files without a header row.
# The `*` in the destination URI is a wildcard, so the export may be written as
# several shard files named {BQ_TABLE}_<suffix>.csv in the training bucket.
! bq extract \
  --destination_format=CSV \
  --field_delimiter=',' \
  --print_header=false \
  {BQ_DATASET}.{BQ_TABLE} \
  gs://{TRAINING_DATA_BUCKET}/{BQ_TABLE}_*.csv

Waiting on bqjob_rbddeba452ee2275_00000185a8933615_1 ... (108s) Current status: DONE   


In [None]:
#####################################################################
#
# combine CSVs into a single file
#
#####################################################################

In [15]:
import csv
from google.cloud import storage

In [None]:
# write file containing column headers
l = list()
l.append("label")
for i in range(732):
    l.append(f"double_field_{i}")


file_with_headers = f"{BQ_TABLE}_headers.csv"
with open(file_with_headers, 'w', newline='') as csvfile:
    w = csv.writer(csvfile, delimiter=',')
    w.writerow(l)
    
# copy file to GCS dir with data
! gsutil cp {file_with_headers} gs://{TRAINING_DATA_BUCKET}/{file_with_headers}

In [61]:
# Compose the header file and the exported CSV shards into a single GCS object
# named {BQ_TABLE}.csv, with the header row first.
storage_client = storage.Client()
bucket = storage_client.bucket(TRAINING_DATA_BUCKET)

# The header blob must exist; get_blob returns None for a missing object, which
# would otherwise only surface as an obscure failure inside compose().
header_blob = bucket.get_blob(file_with_headers)
if header_blob is None:
    raise FileNotFoundError(
        f"gs://{TRAINING_DATA_BUCKET}/{file_with_headers} not found; "
        "run the header-file cell first"
    )

# Select only the shards exported for this table. Listing the whole bucket would
# also pick up unrelated objects and, on a re-run, the intermediate
# {BQ_TABLE}_temp_N.csv objects. The final {BQ_TABLE}.csv does not match the
# "{BQ_TABLE}_" prefix, so a previous result is never fed back in.
shard_prefix = f"{BQ_TABLE}_"
temp_prefix = f"{BQ_TABLE}_temp_"
sources = [header_blob]
sources.extend(
    b for b in bucket.list_blobs(prefix=shard_prefix)
    if b.name != file_with_headers
    and not b.name.startswith(temp_prefix)
    and b.name.endswith(".csv")
)
if len(sources) == 1:
    raise RuntimeError(
        f"no exported shards found under gs://{TRAINING_DATA_BUCKET}/{shard_prefix}*.csv"
    )

# chunk for GCS compose: a compose request accepts a limited number of source
# objects, so use chunks of 30 (plus one carried-over intermediate object).
n = 30
source_lists = [sources[i:i + n] for i in range(0, len(sources), n)]

# Chain the compose calls: every chunk but the last is written to a temp object
# that is prepended to the next chunk; the last chunk produces the final object.
destination = None
for idx, source_list in enumerate(source_lists):
    is_last_chunk = idx == len(source_lists) - 1
    if destination is None:
        components = source_list
    else:
        components = [bucket.get_blob(destination.name)] + source_list

    if is_last_chunk:
        destination_blob_name = f"{BQ_TABLE}.csv"
    else:
        destination_blob_name = f"{BQ_TABLE}_temp_{idx}.csv"

    destination = bucket.blob(destination_blob_name)
    destination.content_type = "text/plain"
    destination.compose(components)

In [None]:
# URI of the composed training CSV; later passed to the pipeline as "gcs_train_uri"
GCS_TRAIN_URI = f"gs://{TRAINING_DATA_BUCKET}/{BQ_TABLE}.csv"

In [None]:
#####################################################################
#
# training pipeline
#
#####################################################################

In [3]:
# imports for this notebook to run
import sys
from datetime import datetime
from typing import NamedTuple

from google.cloud import aiplatform as vertex

import kfp
from kfp.v2 import dsl, compiler

In [4]:
# specify parameters
# Project ID is read from the local gcloud configuration (first output line of
# the shell capture). PROJECT_ID and REGION repeat the assignments made in the
# "set params" cell near the top of the notebook.
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
REGION = "us-central1"
# Bucket name and gs:// paths derived from the project ID
BUCKET_NAME = f"bkt-{PROJECT_ID}-vpipelines"
BUCKET_PATH = f"gs://{BUCKET_NAME}"
# PIPELINE_ROOT is handed to the PipelineJob as its pipeline_root
PIPELINE_ROOT = f"{BUCKET_PATH}/pipeline_root"
# NOTE(review): PIPELINE_DATA and TIMESTAMP are not referenced by any cell shown here
PIPELINE_DATA = f"{BUCKET_PATH}/data"
# Current local time formatted as YYYYmmddHHMMSS
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

In [5]:
# train model
@dsl.component(
    base_image="us-central1-docker.pkg.dev/ap-alto-ml-1000/ap-docker-repo/autogluon_0.5.2-cpu-framework-ubuntu20.04-py3.8:latest",
    packages_to_install=["protobuf<=3.18.1"],
    output_component_file="component_autogluon_train.yaml",
)
def autogluon_train(
    dataset: dsl.Input[dsl.Dataset],
    model: dsl.Output[dsl.Model]
):
    # Fit an AutoGluon tabular predictor with default settings on the input
    # dataset, writing the predictor to the output model artifact's path.

    # the import stays local to the component function
    from autogluon.tabular import TabularDataset, TabularPredictor

    # load the training table from the input artifact's local path
    training_frame = TabularDataset(dataset.path)

    # the target column is named 'label'; the predictor is stored at model.path
    tabular_predictor = TabularPredictor(label='label', path=model.path)
    tabular_predictor.fit(training_frame)

    # append the predictor.pkl file name to the output artifact's URI
    model.uri += "/predictor.pkl"

In [6]:
#####################################################################
#
# define the pipeline
#
#####################################################################

In [7]:
# define a pipeline
@dsl.pipeline(name="autogluon-testing", description="my pipeline description")
def my_pipeline(
    project_id: str,
    region: str,
    gcs_train_uri: str
):
    # Pipeline inputs: project_id, region and gcs_train_uri.
    # Only gcs_train_uri is read by the steps below; project_id and region are
    # accepted as runtime parameters but not used in this function body.

    # step 1: import the object at gcs_train_uri as a Dataset artifact
    import_step = dsl.importer(
        artifact_uri=gcs_train_uri,
        artifact_class=dsl.Dataset,
        reimport=True,
    )

    # step 2: train on the imported dataset with explicit memory and CPU limits
    train_step = (
        autogluon_train(import_step.output)
        .set_memory_limit('384G')
        .set_cpu_limit('96')
    )

In [8]:
#####################################################################
#
# compile and run the pipeline
#
#####################################################################

In [None]:
# compile my_pipeline into a pipeline specification file on local disk
my_package_path = 'my_vertex_pipeline_specification_file.json'
compiler.Compiler().compile(
    pipeline_func=my_pipeline,
    package_path=my_package_path,
)

In [10]:
# runtime parameters to pass to pipeline (keys match my_pipeline's arguments)
pipeline_params = {
    "project_id": PROJECT_ID,
    "region": REGION,
    "gcs_train_uri": GCS_TRAIN_URI,
}

# run the pipeline
# Pass the location explicitly so the job is created in REGION; without it the
# SDK falls back to its default region and the REGION setting is silently ignored.
vertex.init(project=PROJECT_ID, location=REGION)

# Build the job from the compiled specification; caching is disabled so every
# submission re-executes all steps.
job = vertex.PipelineJob(
    display_name="my-pipeline-job-name",
    template_path=my_package_path,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=pipeline_params,
    enable_caching=False,
)

# submit() returns without waiting for the run to finish; the job runs as the
# given service account.
job.submit(service_account=f"sa-vertex-pipelines@{PROJECT_ID}.iam.gserviceaccount.com")

Creating PipelineJob
PipelineJob created. Resource name: projects/720376660491/locations/us-central1/pipelineJobs/autogluon-testing-20230112225517
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/720376660491/locations/us-central1/pipelineJobs/autogluon-testing-20230112225517')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autogluon-testing-20230112225517?project=720376660491
