### Vertex AI with kubeflow implementation - Basic

Steps for implementation:
1. Set up all the necessary APIs (AI Platform, Container Registry and Compute).
    You can do this using the Cloud Console or Cloud Shell.
2. Look for APIs such as 
    - Compute API
    - Container Registry
    - AI Platform
3. If you want to enable the APIs from the shell, run the commands below:
    - `gcloud services enable compute.googleapis.com`
    - `gcloud services enable aiplatform.googleapis.com`
    - `gcloud services enable containerregistry.googleapis.com`

4. Create a bucket, either from the Cloud Console or from the shell using the command below (`<BUCKET_NAME>` must be a `gs://...` URI)
    - `gsutil mb -l '<location>'  <BUCKET_NAME> ` 

5. Assign the relevant IAM permissions on the bucket to the service account

In [2]:
from kfp import compiler, dsl
from kfp.dsl import  component, Dataset, Input, Metrics, \
Model, Output, OutputPath, InputPath, pipeline, Artifact
from google.cloud import aiplatform

from google.cloud import aiplatform_v1
import os 
import matplotlib.pyplot as plt
import warnings 
import pandas as pd
warnings.filterwarnings("ignore")

In [3]:
# Install the Kubeflow Pipelines SDK quietly into the user site-packages.
!pip install -q --user kfp
import kfp
# Last expression of the cell: shows the installed SDK version as the cell output.
kfp.__version__

'2.0.1'

In [4]:
# Start with an empty project ID; it stays empty when IS_TESTING is set.
PROJECT_ID=""
if not os.getenv("IS_TESTING"):
    # IPython `!` capture: run gcloud and collect its output lines.
    shell_out = !gcloud config list --format 'value(core.project)'
    # The first output line is taken as the project ID.
    PROJECT_ID = shell_out[0]

In [5]:
# Cloud Storage bucket URI used as the base of the pipeline root.
# NOTE(review): the value ends with "/", so appending a path that itself
# starts with "/" produces a double slash in the resulting URI.
BUCKET_NAME = "gs://absolute-water-398310-2/"

In [None]:
# List Cloud Storage buckets via the gcloud CLI (results appear as cell output).
!gcloud storage ls

gs://absolute-water-398310-2/
gs://absolute-water-398310-bucket/


In [6]:
PATH = %env PATH
%env PATH = {PATH}:/home/jupyter/.local/bin
REGION = "US-central1"

PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline/root"

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin


In [7]:
@component(
    packages_to_install=["google-cloud-bigquery", "pandas", "pyarrow", "db-dtypes","packaging",'google-resumable-media'],
    base_image="python:3.9",
    output_component_file="create_dataset.yaml",

)
def get_dataframe_func(
    bq_table: str,
    output_data_path: OutputPath("Dataset"),
):
    """Export a BigQuery table to a shuffled CSV file.

    Args:
        bq_table: Table reference string, parsed with
            ``bigquery.TableReference.from_string``.
        output_data_path: File path the shuffled table is written to as CSV.

    Raises:
        KeyError: If the ``CLOUD_ML_PROJECT_ID`` environment variable is unset.
    """
    from google.cloud import bigquery
    import os

    # The project used for the BigQuery client comes from the environment.
    project_number = os.environ["CLOUD_ML_PROJECT_ID"]
    bqclient = bigquery.Client(project=project_number)

    table = bigquery.TableReference.from_string(bq_table)
    rows = bqclient.list_rows(table)
    dataframe = rows.to_dataframe(create_bqstorage_client=True)

    # Shuffle all rows with a fixed seed so the order is reproducible.
    dataframe = dataframe.sample(frac=1, random_state=2)

    # index=False: do not write the (shuffled) row index as an extra unnamed
    # column, which a reader of the CSV would otherwise see as a data column.
    dataframe.to_csv(output_data_path, index=False)

In [8]:

@component(
    # "scikit-learn" is the real PyPI distribution; "sklearn" is a deprecated alias.
    packages_to_install=["scikit-learn", "pandas", "joblib", "db-dtypes"],
    base_image="python:3.9",
    output_component_file="beans_model_component.yaml",
)
def sklearn_train(
    dataset : Input[Dataset],
    metrics: Output[Metrics],
    model : Output[Model]
):
    """Train a decision tree on a CSV dataset and save it with joblib.

    Args:
        dataset: Input artifact whose ``path`` is a CSV file containing a
            ``Class`` label column; all remaining columns are used as features.
        metrics: Output artifact that receives the accuracy (as a percentage),
            the framework name and the dataset row count.
        model: Output artifact; the fitted model is written to
            ``model.path + ".joblib"``.
    """
    from joblib import dump
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    import pandas as pd

    df = pd.read_csv(dataset.path)
    # Split the label column off; what is left in df are the features.
    labels = df.pop("Class").tolist()
    data = df.values.tolist()

    X_train, X_test, y_train, y_test = train_test_split(data, labels)

    skmodel = DecisionTreeClassifier()
    # Fit on the training features and the training labels.
    skmodel.fit(X_train, y_train)
    score = skmodel.score(X_test, y_test)
    print('accuracy is :', score)

    metrics.log_metric("accuracy",(score * 100.0))
    metrics.log_metric("framework", "Scikit Learn")
    metrics.log_metric("dataset_size", len(df))
    dump(skmodel, model.path + ".joblib")

In [9]:
@component(
    packages_to_install=['google-cloud-aiplatform'],
    base_image="python:3.9",
    output_component_file='beans_deploy_component.yaml',
)
def deploy_model(
    model: Input[Model],
    project:str,
    region: str,
    vertex_endpoint : Output[Artifact],
    vertex_model : Output[Model]
):
    """Upload a trained model to Vertex AI and deploy it to an endpoint.

    Args:
        model: Input model artifact; the directory containing it is used as
            the upload ``artifact_uri``.
        project: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        vertex_endpoint: Output artifact whose ``uri`` is set to the deployed
            endpoint's resource name.
        vertex_model: Output artifact whose ``uri`` is set to the uploaded
            model's resource name.
    """
    from google.cloud import aiplatform
    aiplatform.init(project = project, location = region)

    # Use the parent directory of the artifact (keeping the trailing "/").
    # Dropping only the last path segment avoids mangling a URI that happens
    # to contain "model" elsewhere in its path.
    artifact_dir = model.uri.rpartition("/")[0] + "/"

    deployed_model = aiplatform.Model.upload(
        artifact_uri = artifact_dir,
        display_name = 'beans_model_pipeline',
        serving_container_image_uri = 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest',
    )
    endpoint = deployed_model.deploy(machine_type="n1-standard-4")

    # Record the created resources on the output artifacts.
    vertex_endpoint.uri = endpoint.resource_name
    vertex_model.uri  = deployed_model.resource_name
    
    

In [10]:
@pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="mlmd-pipeline",
    display_name = 'mlmd-pipeline'
)
def pipeline_fun(
    bq_table: str = "",
    output_data_path: str = "data.csv",
    project: str = PROJECT_ID,
    region: str = REGION
):
    """Chain dataset export, model training and model deployment.

    Args:
        bq_table: BigQuery table reference forwarded to the dataset step.
        output_data_path: Not used by any step; kept so the pipeline's
            parameter list is unchanged.
        project: Project forwarded to the deploy step.
        region: Location forwarded to the deploy step.
    """
    # bq_table is forwarded as received, so the value supplied at submission
    # time is the one the dataset step reads.
    dataset_task = get_dataframe_func(bq_table=bq_table)

    model_task = sklearn_train(
        dataset=dataset_task.output
    )

    deploy_task = deploy_model(
        model=model_task.outputs["model"],
        project=project,
        region=region
    )

In [11]:
# Compile the pipeline function into a job spec file in the working directory.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    package_path='initail_pipeline.json',
    pipeline_func=pipeline_fun,
)

In [12]:
# Build a Vertex AI pipeline job from the compiled spec file.

from datetime import datetime

# Second-resolution timestamp used as the job ID suffix.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
run1 = aiplatform.PipelineJob(
    display_name="beans_pipeline",
    template_path="initail_pipeline.json",
    job_id=f"mldm-pipeline-small-{TIMESTAMP}",
    parameter_values={"bq_table": "sara-vertex-demos.beans_demo.small_dataset"},
    enable_caching=True,
)

In [13]:
# Submit the `run1` pipeline job.
run1.submit()