# Imports and Setup

In [11]:
import os 
import sys
import jsonlines
from google.cloud import aiplatform, storage
from google.protobuf import json_format
from datetime import datetime
import tensorflow as tf 
from tfx import v1 as tfx

# Set GOOGLE_APPLICATION_CREDENTIALS for this kernel to a local credentials JSON file.
# NOTE(review): this is an absolute path on one particular machine, so the notebook
# is not portable as written — consider supplying the path from outside the notebook.
%env GOOGLE_APPLICATION_CREDENTIALS /media/david/warehaus1/youtube_series/proven-script.json

# --- Google Cloud project settings ---
GOOGLE_CLOUD_REGION = 'us-central1'
GOOGLE_CLOUD_PROJECT = 'proven-script-347020'
GCS_BUCKET_NAME = 'salary-pipeline-347020'

# Short name reused in every GCS path below and in the endpoint name.
PIPELINE_NAME = 'salary-pipeline'

# Local wall-clock time at cell execution, formatted as YYYYMMDDHHMMSS.
TIMESTAMP = f'{datetime.now():%Y%m%d%H%M%S}'

# GCS location for the pipeline's artifacts.
PIPELINE_ROOT = 'gs://{}/pipeline_root/{}'.format(GCS_BUCKET_NAME, PIPELINE_NAME)

# GCS location for the user's Python module.
MODULE_ROOT = 'gs://{}/pipeline_module/{}'.format(GCS_BUCKET_NAME, PIPELINE_NAME)

# GCS location for the user's data.
DATA_ROOT = 'gs://{}/data/{}'.format(GCS_BUCKET_NAME, PIPELINE_NAME)

# Name of the Vertex AI Endpoint; the timestamp suffix changes each time this cell runs.
ENDPOINT_NAME = f'{PIPELINE_NAME}-{TIMESTAMP}'


env: GOOGLE_APPLICATION_CREDENTIALS=/media/david/warehaus1/youtube_series/proven-script.json


In [5]:
# Environment setup kept for reference only: the install commands are wrapped in a
# string literal, so they do not run. Because that literal is the cell's last
# expression, the notebook echoes it back as the cell's output.
'''
!pip install tfx==1.4.0
!pip install "kfp<2"
'''


'\n!pip install tfx==1.4.0\n!pip install "kfp<2"\n'

In [6]:
# Create the GCS bucket in the configured region, then upload the local data.csv under DATA_ROOT.
# NOTE(review): `gsutil mb` presumably fails once the bucket already exists, so this
# cell may not be safely re-runnable — confirm before relying on Restart & Run All.
!gsutil mb -l {GOOGLE_CLOUD_REGION} gs://{GCS_BUCKET_NAME}
!gsutil cp data.csv {DATA_ROOT}/

Creating gs://salary-pipeline-347020/...
Copying file://data.csv [Content-Type=text/csv]...
\ [1 files][  3.2 MiB/  3.2 MiB]                                                
Operation completed over 1 objects/3.2 MiB.                                      


In [10]:
import pandas as pd  
# Load the local CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


# Create Pipeline

In [12]:
def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root: str,
                     module_file: str, endpoint_name: str, project_id: str,
                     region: str, use_gpu: bool) -> tfx.dsl.Pipeline:
    """Build a TFX pipeline that currently contains a single CsvExampleGen step.

    Args:
        pipeline_name: Name passed through to ``tfx.dsl.Pipeline``.
        pipeline_root: Root location passed through to ``tfx.dsl.Pipeline``.
        data_root: Location handed to ``CsvExampleGen`` as ``input_base``.
        module_file: Accepted but not used by the current body.
        endpoint_name: Accepted but not used by the current body.
        project_id: Accepted but not used by the current body.
        region: Accepted but not used by the current body.
        use_gpu: Accepted but not used by the current body.

    Returns:
        The assembled ``tfx.dsl.Pipeline``.
    """
    # Data ingestion is the only component wired in so far.
    csv_ingest = tfx.components.CsvExampleGen(input_base=data_root)

    return tfx.dsl.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[csv_ingest],
    )

# Write Module code

# Run Pipeline

In [13]:
# File name that the runner is given as its output_filename; the submit cell
# later passes the same name as the job's template_path.
PIPELINE_DEFINITION_FILE = f'{PIPELINE_NAME}_pipeline.json'

# Kubeflow V2 runner with a default config, targeting the definition file above.
runner = tfx.orchestration.experimental.KubeflowV2DagRunner(
    output_filename=PIPELINE_DEFINITION_FILE,
    config=tfx.orchestration.experimental.KubeflowV2DagRunnerConfig(),
)

# module_file and endpoint_name are passed as empty placeholders;
# _create_pipeline does not use them yet.
_ = runner.run(
    _create_pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=PIPELINE_ROOT,
        data_root=DATA_ROOT,
        project_id=GOOGLE_CLOUD_PROJECT,
        region=GOOGLE_CLOUD_REGION,
        module_file='',
        endpoint_name='',
        use_gpu=False,
    )
)

# Submit job to Vertex AI platform

In [14]:
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
import logging 

# Let INFO-level messages through on the root logger so the SDK's job details
# (resource name, console URL) appear in the cell output.
logging.getLogger().setLevel(level=logging.INFO)

aiplatform.init(location=GOOGLE_CLOUD_REGION, project=GOOGLE_CLOUD_PROJECT)

# Build a pipeline job from the definition file and submit it.
job = pipeline_jobs.PipelineJob(
    display_name=PIPELINE_NAME,
    template_path=PIPELINE_DEFINITION_FILE,
)
job.submit()

Creating PipelineJob


INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


PipelineJob created. Resource name: projects/167901156608/locations/us-central1/pipelineJobs/salary-pipeline-20220504143126


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/167901156608/locations/us-central1/pipelineJobs/salary-pipeline-20220504143126


To use this PipelineJob in another session:


INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:


pipeline_job = aiplatform.PipelineJob.get('projects/167901156608/locations/us-central1/pipelineJobs/salary-pipeline-20220504143126')


INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/167901156608/locations/us-central1/pipelineJobs/salary-pipeline-20220504143126')


View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/salary-pipeline-20220504143126?project=167901156608


INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/salary-pipeline-20220504143126?project=167901156608
