# Training and deploying a tabular model using Vertex AutoML

![Training pipeline](../images/automl.png)

## Import the required packages

In [6]:
import os
import pprint
import pandas as pd
import time

import matplotlib.pyplot as plt

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform_v1beta1 import types
from google.cloud import bigquery
from google.cloud import exceptions

## Configure GCP settings

*Before running the notebook, follow the repo's README file to install the prerequisites and configure GCP authentication.*

In [7]:
# Google Cloud project and region used throughout the notebook.
PROJECT = 'jk-vertex-demos'
REGION = 'us-central1'

# Cloud Storage location passed to the Vertex AI SDK as its staging bucket.
# Plain literal: the string has no placeholders, so it needs no f-prefix.
# NOTE(review): the bucket name embeds the project id; update it together
# with PROJECT when pointing the notebook at another project.
STAGING_BUCKET = 'gs://jk-vertex-demos-bucket'

# Service account e-mail, derived from PROJECT.
VERTEX_SA = f'training-sa@{PROJECT}.iam.gserviceaccount.com'

## Create data splits

In [8]:
# BigQuery location, dataset and table that hold the engineered features.
BQ_LOCATION = 'US'
BQ_DATASET_NAME = 'chicago_taxi_dataset'
BQ_TABLE_NAME = 'features'

# Sampling knobs -- presumably the row cap and the trip year for the features
# table; confirm against the cells that consume them.
YEAR = 2020
SAMPLE_SIZE = 500_000

#### Create a BQ dataset to host the splits

In [9]:
# Bind the client to the configured project so that datasets and query jobs
# are created in PROJECT rather than in whatever default project the ambient
# credentials resolve to.
client = bigquery.Client(project=PROJECT)

dataset_id = f'{PROJECT}.{BQ_DATASET_NAME}'
dataset = bigquery.Dataset(dataset_id)
dataset.location = BQ_LOCATION

# Re-running the cell is safe: a Conflict only means that the dataset is
# already there, so it is reported instead of raised.
try:
    dataset = client.create_dataset(dataset, timeout=30)
except exceptions.Conflict:
    print('Dataset {} already exists'.format(dataset_id))
else:
    print('Created dataset: ', dataset_id)

Dataset jk-vertex-demos.chicago_taxi_dataset already exists


#### Create a table with training features

In [10]:
# Take the sampling parameters from the configuration cell so that SAMPLE_SIZE
# and YEAR are the single source of truth. Previously this cell hard-coded
# 1000000 rows, silently overriding SAMPLE_SIZE. The lowercase names are kept
# as aliases for any cell that still refers to them.
sample_size = SAMPLE_SIZE
year = YEAR

# Statement that (re)builds the features table from the public Chicago taxi
# trips table. Tokens starting with '@' are substituted textually below.
#   * rows need non-null pickup/dropoff coordinates, positive miles, seconds
#     and fare, and a start timestamp in @YEAR;
#   * calendar parts (month, day, day of week, hour) are extracted from the
#     start timestamp;
#   * pickup/dropoff points are snapped to a 0.1 grid and stored as text;
#   * `euclidean` holds the ST_Distance between pickup and dropoff points;
#   * `tip_bin` is 1 when tips/fare >= 0.2, otherwise 0;
#   * `data_split` comes from a hash of the formatted trip date modulo 10:
#     9 -> TEST, 8 -> VALIDATE, anything else -> TRAIN, so all trips of one
#     date share a split;
#   * the result is capped at @LIMIT rows. There is no ORDER BY, so which
#     rows survive the cap is left to BigQuery.
sql_script_template = '''
CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TABLE` 
AS (
    WITH
      taxitrips AS (
      SELECT
        FORMAT_DATETIME('%Y-%d-%m', trip_start_timestamp) AS date,
        trip_start_timestamp,
        trip_seconds,
        trip_miles,
        payment_type,
        pickup_longitude,
        pickup_latitude,
        dropoff_longitude,
        dropoff_latitude,
        tips,
        fare
      FROM
        `bigquery-public-data.chicago_taxi_trips.taxi_trips`
      WHERE 1=1 
      AND pickup_longitude IS NOT NULL
      AND pickup_latitude IS NOT NULL
      AND dropoff_longitude IS NOT NULL
      AND dropoff_latitude IS NOT NULL
      AND trip_miles > 0
      AND trip_seconds > 0
      AND fare > 0
      AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR
    )

    SELECT
      trip_start_timestamp,
      EXTRACT(MONTH from trip_start_timestamp) as trip_month,
      EXTRACT(DAY from trip_start_timestamp) as trip_day,
      EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,
      EXTRACT(HOUR from trip_start_timestamp) as trip_hour,
      trip_seconds,
      trip_miles,
      payment_type,
      ST_AsText(
          ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)
      ) AS pickup_grid,
      ST_AsText(
          ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)
      ) AS dropoff_grid,
      ST_Distance(
          ST_GeogPoint(pickup_longitude, pickup_latitude), 
          ST_GeogPoint(dropoff_longitude, dropoff_latitude)
      ) AS euclidean,
      IF((tips/fare >= 0.2), 1, 0) AS tip_bin,
      CASE (ABS(MOD(FARM_FINGERPRINT(date),10))) 
          WHEN 9 THEN 'TEST'
          WHEN 8 THEN 'VALIDATE'
          ELSE 'TRAIN' END AS data_split
    FROM
      taxitrips
    LIMIT @LIMIT
)
'''

# Fill in the '@' tokens; every value comes from the notebook configuration.
sql_script = (
    sql_script_template
    .replace('@PROJECT', PROJECT)
    .replace('@DATASET', BQ_DATASET_NAME)
    .replace('@TABLE', BQ_TABLE_NAME)
    .replace('@YEAR', str(year))
    .replace('@LIMIT', str(sample_size))
)

# Submit the statement and wait for it to finish before moving on.
job = client.query(sql_script)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f916a7de310>

#### Review the created features

In [11]:
# Load the features table (minus the raw timestamp) into pandas and preview
# the first rows, transposed so that every feature is shown on its own row.
# The table path is back-quoted, matching the CREATE TABLE statement, so a
# project id containing dashes is always parsed as a single identifier.
sql_script = f'''
SELECT * EXCEPT (trip_start_timestamp)
FROM `{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}`
'''
df = client.query(sql_script).result().to_dataframe()
df.head().T

Unnamed: 0,0,1,2,3,4
trip_month,6,6,6,6,6
trip_day,19,19,19,19,19
trip_day_of_week,6,6,6,6,6
trip_hour,0,0,0,0,0
trip_seconds,383,241,1780,1200,240
trip_miles,0.8,0.77,9.64,4.1,0.9
payment_type,Credit Card,Cash,Prcard,Cash,Cash
pickup_grid,POINT(-87.6 41.9),POINT(-87.7 41.9),POINT(-87.7 41.8),POINT(-87.6 41.8),POINT(-87.7 42)
dropoff_grid,POINT(-87.6 41.9),POINT(-87.7 41.9),POINT(-87.6 41.7),POINT(-87.7 41.9),POINT(-87.7 42)
euclidean,0.0,0.0,12414.729032,2210.049904,2406.040979


## Creating a tabular dataset in Vertex

### Initialize Vertex AI SDK

In [12]:
# Configure the Vertex AI SDK with the notebook's project, region and
# staging bucket.
vertex_ai.init(project=PROJECT, location=REGION, staging_bucket=STAGING_BUCKET)

### Create a dataset and import data

In [13]:
display_name = 'Chicago taxi trips'
bq_source_uri = f'bq://{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}'

# Named `dataset_filter` rather than `filter` so that the builtin filter()
# is not shadowed for the rest of the kernel session.
dataset_filter = f'display_name="{display_name}"'

# Reuse a dataset with the same display name when one exists; otherwise
# create it from the BigQuery features table. The lookup result gets its own
# name so that `dataset` only ever refers to a single TabularDataset.
existing_datasets = vertex_ai.TabularDataset.list(filter=dataset_filter)
if existing_datasets:
    # Several datasets may share a display name; the first match is used.
    print("Using existing dataset: ", existing_datasets[0].resource_name)
    dataset = vertex_ai.TabularDataset(dataset_name=existing_datasets[0].resource_name)
else:
    print("Creating a new dataset.")
    dataset = vertex_ai.TabularDataset.create(
        display_name=display_name, bq_source=bq_source_uri,
    )

    dataset.wait()

Creating a new dataset.
INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/1026026909625/locations/us-central1/datasets/2983010230528376832/operations/8322841502258561024
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/1026026909625/locations/us-central1/datasets/2983010230528376832
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/1026026909625/locations/us-central1/datasets/2983010230528376832')


## Launching an AutoML training job

In [14]:
# Names shown for the training pipeline and for the model it produces.
display_name = 'Chicago Taxi classifier training'
model_display_name = 'Chicago Taxi classifier'

# Learning task: classify `tip_bin`, maximizing recall at a precision of 0.7.
target_column = 'tip_bin'
optimization_prediction_type = 'classification'
optimization_objective = 'maximize-recall-at-precision'
optimization_objective_precision_value = 0.7

# Column holding the split assignment, and the training budget
# (1000 milli node hours, i.e. one node hour).
split_column = 'data_split'
budget_milli_node_hours = 1000

# Feature columns grouped by the transformation applied to them.
categorical_columns = [
    'trip_month',
    'trip_day',
    'trip_day_of_week',
    'trip_hour',
    'payment_type',
    'pickup_grid',
    'dropoff_grid',
]
numeric_columns = ['trip_seconds', 'euclidean', 'trip_miles']

# One {transformation: {'column_name': ...}} entry per feature, categorical
# columns first and numeric columns after them.
column_transformations = (
    [{'categorical': {'column_name': name}} for name in categorical_columns]
    + [{'numeric': {'column_name': name}} for name in numeric_columns]
)

job = vertex_ai.AutoMLTabularTrainingJob(
    display_name=display_name,
    optimization_prediction_type=optimization_prediction_type,
    optimization_objective=optimization_objective,
    optimization_objective_precision_value=optimization_objective_precision_value,
    column_transformations=column_transformations,
)

# sync=False is passed so that the cell does not block while the training
# pipeline runs.
model = job.run(
    dataset=dataset,
    target_column=target_column,
    predefined_split_column_name=split_column,
    budget_milli_node_hours=budget_milli_node_hours,
    model_display_name=model_display_name,
    sync=False,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8759892701375102976?project=1026026909625
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/1026026909625/locations/us-central1/trainingPipelines/8759892701375102976 current state:
PipelineState.PIPELINE_STATE_PENDING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/1026026909625/locations/us-central1/trainingPipelines/8759892701375102976 current state:
PipelineState.PIPELINE_STATE_PENDING
