# Kubeflow - Vertex AI pipelines tutorial
## Installing required libraries

In [None]:
# Install (or upgrade) the KFP SDK at a version above 2 and google-cloud-aiplatform, bypassing the pip cache.
! pip3 install --no-cache-dir --upgrade "kfp>2" \
                                        google-cloud-aiplatform

In [None]:
# Print the installed KFP SDK version and the aiplatform entries from `pip3 freeze` to confirm the install.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform

## Define your values

In [None]:
import random
import string
# Google Cloud project and region.
PROJECT_ID = "your-project-id"
LOCATION = "us-central1"

# Random 8-character suffix (lowercase letters and digits) appended to the bucket name.
# After the first run, comment out the assignment below and replace it with the
# printed value so that re-running this cell does not yield a new bucket name.
random_suffix = "".join(
    random.choices(string.ascii_lowercase + string.digits, k=8)
)
print("Este es el valor a reemplazar en random_suffix: " + random_suffix)

# Bucket name, its gs:// URI, and the pipeline root folder inside it.
BUCKET_NAME = f"{PROJECT_ID}-bucket-{random_suffix}"
BUCKET_URI = f"gs://{BUCKET_NAME}"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/"

# Upper-cased first segment of the region, e.g. "us-central1" -> "US".
BQ_LOCATION = LOCATION.partition("-")[0].upper()

In [None]:
# Service account
# Capture the output lines of `gcloud auth list` (stderr discarded) into a list.
shell_output = !gcloud auth list 2>/dev/null
# Take the third output line and strip any "*" characters and surrounding whitespace.
# NOTE(review): assumes the active account is always on line index 2 and that it is
# a service account — confirm against the gcloud output in your environment.
SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()
print(SERVICE_ACCOUNT)

In [None]:
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_URI

## Initialize Vertex AI pipelines

In [None]:
import google.cloud.aiplatform as aiplatform
import kfp
from kfp import compiler, dsl
from kfp.dsl import Artifact, Dataset, Input, Metrics, Model, Output, component

In [None]:
# Pass the region explicitly so that editing LOCATION actually changes where
# Vertex AI resources are created, instead of silently using the SDK default.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

## Linear pipeline

In [None]:
@component
def start_step() -> str:
    """Return the constant greeting string.

    Returns:
        The literal text ``hello world``.
    """
    greeting = 'hello world'
    return greeting

@component
def print_step(my_var: str) -> str:
    """Print the given string and hand it back unchanged.

    Args:
        my_var: String to print and forward.

    Returns:
        The same string that was passed in.
    """
    message = f'the data artifact is: {my_var}'
    print(message)
    return my_var

@component
def final_step(my_var: str):
    """Print the given string; nothing is returned.

    Args:
        my_var: String to print.
    """
    message = f'the data artifact is still: {my_var}'
    print(message)

@dsl.pipeline
def linear_pipeline():
    """Chain start_step, print_step and final_step, each fed by the previous output."""
    start_task = start_step()
    print_task = print_step(my_var=start_task.output)
    final_step(my_var=print_task.output)


# Compile the linear pipeline definition into a YAML package on local disk.
compiler.Compiler().compile(
    package_path='linear_pipeline.yaml',
    pipeline_func=linear_pipeline,
)

In [None]:
# Build a pipeline job from the compiled linear pipeline and run it.
job = aiplatform.PipelineJob(
    template_path="linear_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name="linear_pipeline",
)
job.run()

## Branch pipeline

In [None]:
@component
def step_a() -> int:
    """Return the constant integer 1."""
    value = 1
    return value

@component
def step_b() -> int:
    """Return the constant integer 2."""
    value = 2
    return value

@component
def join_results(a: int, b: int):
    """Print both inputs followed by their sum.

    Args:
        a: First integer.
        b: Second integer.
    """
    total = a + b
    print(f'a is {a}')
    print(f'b is {b}')
    print(f'total is {total}')


@dsl.pipeline
def branch_pipeline():
    """Run step_a and step_b, then pass both outputs to join_results."""
    task_a = step_a()
    task_b = step_b()
    join_results(
        a=task_a.output,
        b=task_b.output,
    )



# Compile the branch pipeline definition into a YAML package on local disk.
compiler.Compiler().compile(
    package_path='branch_pipeline.yaml',
    pipeline_func=branch_pipeline,
)

# Build a pipeline job from the compiled package and run it.
job = aiplatform.PipelineJob(
    template_path="branch_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name="branch_pipeline",
)
job.run()

## Foreach pipeline

In [None]:
from kfp import dsl
from kfp.dsl import component, PipelineTask
from kfp.compiler import Compiler

@component
def process_title(title: str) -> str:
    """Append the word ``processed`` to a title.

    Args:
        title: Title text to mark as processed.

    Returns:
        The title followed by a space and ``processed``.
    """
    processed = f"{title} processed"
    return processed

@component
def join_results(processed_titles: list):
    """Print every entry of the given list, one per line.

    Args:
        processed_titles: Items to print.
    """
    for entry in processed_titles:
        print(entry)

@dsl.pipeline
def foreach_pipeline():
    """Create one process_title task per hard-coded title and join all outputs."""
    titles = ['Stranger Things', 'House of Cards', 'Narcos']

    # One task per title, created in list order.
    processed_tasks: list[PipelineTask] = [
        process_title(title=title) for title in titles
    ]

    outputs = [task.output for task in processed_tasks]
    join_results(processed_titles=outputs)


# Compile the foreach pipeline definition into a YAML package on local disk.
compiler.Compiler().compile(
    package_path='foreach_pipeline.yaml',
    pipeline_func=foreach_pipeline,
)

In [None]:
# Build a pipeline job from the compiled foreach pipeline and run it.
job = aiplatform.PipelineJob(
    template_path="foreach_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name="foreach_pipeline",
)
job.run()

## Input parameters

In [None]:
from kfp import dsl
from kfp.dsl import component
from kfp.compiler import Compiler

@component
def print_alpha(alpha: float):
    """Print the given alpha value.

    Args:
        alpha: Number to print.
    """
    message = f'alpha is {alpha}'
    print(message)

@dsl.pipeline
def parameter_pipeline(alpha: float = 0.01):
    """Forward the pipeline-level ``alpha`` parameter to print_alpha.

    Args:
        alpha: Value handed to the print_alpha step; defaults to 0.01.
    """
    print_alpha(
        alpha=alpha,
    )


# Compile the parameter pipeline definition into a YAML package on local disk.
compiler.Compiler().compile(
    package_path='parameter_pipeline.yaml',
    pipeline_func=parameter_pipeline,
)

In [None]:
# Build a pipeline job from the compiled parameter pipeline, supplying a custom
# alpha value in place of the pipeline's default, and run it.
job = aiplatform.PipelineJob(
    template_path="parameter_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={'alpha': 0.199},
    display_name="parameter_pipeline",
)
job.run()

## Dealing with artifacts (datasets, models, etc.)

In [None]:
from kfp.dsl import component, Input, Output, Dataset

@component(packages_to_install=['pandas'])
def generate_data(data_out: Output[Dataset]):
    """Write a three-row, single-column CSV named ``data.csv`` under the output artifact path.

    Args:
        data_out: Output dataset artifact; its path is used as a directory.
    """
    import os
    import pandas as pd

    frame = pd.DataFrame({'col': [1, 2, 3]})

    # The artifact path is treated as a directory, so create it before writing.
    os.makedirs(data_out.path, exist_ok=True)

    csv_path = f"{data_out.path}/data.csv"
    frame.to_csv(csv_path, index=False)

@component(packages_to_install=['pandas'])
def consume_data(data_in: Input[Dataset]):
    """Load ``data.csv`` from the input artifact directory and print it.

    Args:
        data_in: Input dataset artifact whose path contains ``data.csv``.
    """
    import pandas as pd

    csv_path = f"{data_in.path}/data.csv"
    frame = pd.read_csv(csv_path)
    print(frame)
    
@dsl.pipeline
def dataset_pipeline():
    """Run generate_data and feed its ``data_out`` artifact into consume_data."""
    producer = generate_data()
    dataset = producer.outputs["data_out"]
    consume_data(data_in=dataset)
    
# Compile the dataset pipeline definition into a YAML package on local disk.
compiler.Compiler().compile(
    package_path='dataset_pipeline.yaml',
    pipeline_func=dataset_pipeline,
)

In [None]:
# Build a pipeline job from the compiled dataset pipeline and run it.
job = aiplatform.PipelineJob(
    template_path="dataset_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name="dataset_pipeline",
)
job.run()