# Kubeflow Pipelines - Vertex AI Pipelines tutorial
## Installing required libraries

In [None]:
! pip3 install --no-cache-dir --upgrade "kfp>2" \
                                        google-cloud-aiplatform

In [None]:
# Print the KFP SDK version as reported by a separate python3 process.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# List the `pip3 freeze` entries that contain "aiplatform".
! pip3 freeze | grep aiplatform

## Define your values

In [None]:
import random
import string

PROJECT_ID = "your-project-id"
LOCATION = "us-central1"

# A fresh 8-character suffix is drawn every time this cell runs. To avoid
# ending up with multiple buckets, comment out the assignment below and
# hard-code the value printed by this cell instead.
random_suffix = "".join(
    random.choices(string.ascii_lowercase + string.digits, k=8)
)
print(f"Este es el valor a reemplazar en random_suffix: {random_suffix}")

BUCKET_NAME = f"{PROJECT_ID}-bucket-{random_suffix}"
BUCKET_URI = f"gs://{BUCKET_NAME}"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/"

# Upper-cased first segment of LOCATION ("us-central1" -> "US").
BQ_LOCATION = LOCATION.partition("-")[0].upper()

In [None]:
# Service account
shell_output = !gcloud auth list 2>/dev/null
SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()
print(SERVICE_ACCOUNT)

In [None]:
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
!gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.admin $BUCKET_URI

## Initialize Vertex AI pipelines

In [None]:
import google.cloud.aiplatform as aiplatform
import kfp
from kfp import compiler, dsl
from kfp.dsl import Artifact, Dataset, Input, Metrics, Model, Output, component

In [None]:
# Pass LOCATION explicitly so the region configured in the values cell is the
# one the SDK uses; without it, editing LOCATION has no effect on the SDK.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

# Exercise: build a training pipeline 

In [None]:
from kfp import dsl
from kfp.dsl import component, Input, Output, Dataset, Model
from kfp.compiler import Compiler

@component(packages_to_install=['scikit-learn', 'numpy'])
def ingest_data(X_out: Output[Dataset], y_out: Output[Dataset]):
    # Example first component: it declares Output artifacts and saves its
    # results to disk instead of returning them directly as a variable or
    # object. The Iris features go to X.npy and the targets to y.npy.
    import os

    import numpy as np
    from sklearn.datasets import load_iris

    dataset = load_iris()
    targets = (
        (X_out.path, "X.npy", dataset.data),
        (y_out.path, "y.npy", dataset.target),
    )

    # Each artifact path is used as a directory, so create both before writing.
    for directory, _, _ in targets:
        os.makedirs(directory, exist_ok=True)

    for directory, filename, values in targets:
        np.save(f"{directory}/{filename}", values)

# Exercise skeleton: split the ingested data into train and test sets.
# NOTE: this definition is not valid Python until every TODO annotation below
# has been filled in.
@component(packages_to_install=['scikit-learn', 'numpy'])
def split_data(
    X_in: # TODO: add the KFP type annotation, followed by a comma
    y_in: # TODO: add the KFP type annotation, followed by a comma
    X_train_out: # TODO: add the KFP type annotation, followed by a comma
    X_test_out: # TODO: add the KFP type annotation, followed by a comma
    y_train_out: # TODO: add the KFP type annotation, followed by a comma
    y_test_out: # TODO: add the KFP type annotation, followed by a comma
    test_size: float = 0.2
):
    from sklearn.model_selection import train_test_split
    import numpy as np
    import os

    # TODO: Load the files saved by the previous component; remember to use the matching object name and variable

    # TODO: Train - test split

    # TODO: Create the corresponding folders to avoid write errors (see the previous component)

    # TODO: Save the splits that were created (see the previous component)
    

# Exercise skeleton: train a RandomForestClassifier on the training split.
# NOTE: this definition is not valid Python until every TODO annotation below
# has been filled in.
@component(packages_to_install=['scikit-learn', 'numpy'])
def train(
    X_train: # TODO: add the KFP type annotation, followed by a comma
    y_train: # TODO: add the KFP type annotation, followed by a comma
    model_out: # TODO: add the KFP type annotation, followed by a comma
    max_depth: # TODO: add the type (training_pipeline declares it as int), followed by a comma
    n_estimators: # TODO: add the type (training_pipeline declares it as int), followed by a comma
    random_state: # TODO: add the type (training_pipeline declares it as int)
):
    from sklearn.ensemble import RandomForestClassifier
    import pickle, numpy as np, os

    # TODO: Load the training X and y

    # TODO: Train the model
    
    # TODO: Create the corresponding folder to avoid write errors (see the previous component)

    # TODO: Save the trained model

# Exercise skeleton: evaluate the trained model on the test split and print
# the metrics.
# NOTE: this definition is not valid Python until every TODO annotation below
# has been filled in.
@component(packages_to_install=['scikit-learn', 'numpy'])
def show_metrics(
    model: # TODO: add the KFP type annotation, followed by a comma
    X_test: # TODO: add the KFP type annotation, followed by a comma
    y_test: # TODO: add the KFP type annotation
):
    from sklearn.metrics import classification_report, confusion_matrix
    import pickle, numpy as np

    # TODO: Load the test X and y
    
    # TODO: Load the model

    # TODO: Predict and print the corresponding metrics


# Exercise skeleton: chains ingest_data -> split_data -> train -> show_metrics.
# The three pipeline parameters are the hyperparameters meant for `train`.
@dsl.pipeline
def training_pipeline(max_depth: int = 2, n_estimators: int = 100, random_state: int = 0):
    # The first two components are wired up as an example for the remaining ones:
    ingest = ingest_data()
    
    # Feed ingest_data's two output artifacts into split_data.
    split = split_data(
        X_in=ingest.outputs['X_out'],
        y_in=ingest.outputs['y_out'],
    )

    model = train(
        # TODO: pass the arguments for train (see its signature)
    )

    show_metrics(
        # TODO: pass the arguments for show_metrics (see its signature)
    )

# Compile the pipeline function into the YAML spec that the PipelineJob cell
# loads. Uses the Compiler class imported at the top of this cell.
Compiler().compile(
    pipeline_func=training_pipeline,
    package_path="training_pipeline.yaml",
)


In [None]:
# Build a Vertex AI pipeline job from the compiled spec; artifacts are rooted
# at PIPELINE_ROOT.
job = aiplatform.PipelineJob(
    template_path="training_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    display_name="training_pipeline",
)

# Launch the job.
job.run()