# MLflow v2 Kubeflow pipeline integration
In this example Jupyter notebook we create a simple three-step Kubeflow pipeline. The steps consist of downloading the dataset, preprocessing the dataset and training a model (an ElasticNet regressor). The training step uploads the resulting model artifact directly to the MLflow registry. The following environment variables are expected to be set (in the case of Charmed Kubeflow they are set for you):

* MLFLOW_S3_ENDPOINT_URL: endpoint for object storage 
* MLFLOW_TRACKING_URI: endpoint for mlflow server
* AWS_SECRET_ACCESS_KEY: secret key for object storage
* AWS_ACCESS_KEY_ID: username for object storage

In [1]:
# Installing extra libs for mlflow integration
!pip install mlflow boto3 awscli pyarrow scikit-learn mlflow



In [2]:
# Make sure to use up to date kfp (kubeflow pipeline python SDK)
!pip install kfp==1.8.22 --upgrade -q

In [3]:
import kfp
import os

from kfp import dsl

In [4]:
# In airgapped environment upload data manually
!wget https://raw.githubusercontent.com/Barteus/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv

--2023-05-31 11:28:32--  https://raw.githubusercontent.com/Barteus/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [text/plain]
Saving to: ‘winequality-red.csv.3’


2023-05-31 11:28:33 (68.4 MB/s) - ‘winequality-red.csv.3’ saved [84199/84199]



In [5]:
# A pipeline component can be built directly from a component.yaml hosted
# remotely; here we load the "Download" component from the Kubeflow Pipelines repo.
WEB_DOWNLOADER_COMPONENT_URL = 'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml'

web_downloader_op = kfp.components.load_component_from_url(WEB_DOWNLOADER_COMPONENT_URL)

In [6]:
# A data preprocessing step written as a plain Python function; we can define as
# many of these as we need. Imports are placed inside the function body.
def preprocess(file_path: kfp.components.InputPath('CSV'),
              output_file: kfp.components.OutputPath('parquet')):
    """Convert the semicolon-separated CSV into a parquet file.

    Column names are lower-cased and their spaces replaced with underscores.

    Args:
        file_path: Path of the input CSV file (first row is the header).
        output_file: Path the parquet file is written to.
    """
    import pandas as pd

    def _normalise(column_name):
        # "Fixed Acidity" -> "fixed_acidity"
        return column_name.lower().replace(' ', '_')

    raw_frame = pd.read_csv(file_path, sep=";", header=0)
    tidy_frame = raw_frame.rename(columns=_normalise)
    tidy_frame.to_parquet(output_file)

In [7]:
# Sanity-check the step locally before wrapping it as a pipeline component:
# reads the downloaded CSV and writes preprocessed.parquet next to it.
preprocess(file_path='winequality-red.csv', output_file='preprocessed.parquet')

In [8]:
# Turn the Python function into a pipeline component with the SDK.
preprocess_op = kfp.components.create_component_from_func(
    preprocess,
    base_image='python:3.9',  # any base image we need
    packages_to_install=['pandas', 'pyarrow'],  # additional packages to install
    # Optional: saves the component spec to a file for future use.
    output_component_file='preprocess-component.yaml',
)

In [9]:
def trainning(file_path: kfp.components.InputPath('parquet'))->str:
    """Train an ElasticNet model on the preprocessed data and register it in MLflow.

    The data is split 75/25 (stratified on the target), the model is fitted on
    the training part and evaluated on the hold-out part. Hyper-parameters and
    hold-out metrics are logged to the MLflow run, and the model is logged and
    registered under the name ``wine-elasticnet``.

    Args:
        file_path: Path of the parquet file to train on. It must contain a
            ``quality`` column, which is the target; every other column is
            used as a feature.

    Returns:
        The artifact URI of the logged model within the MLflow run.
    """
    # Imports are kept inside the function, as in the other component functions.
    import pandas as pd
    import mlflow
    import mlflow.sklearn  # explicit import of the flavor used below
    from sklearn.linear_model import ElasticNet
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    df = pd.read_parquet(file_path)

    target_column = 'quality'
    train_x, test_x, train_y, test_y = train_test_split(
        df.drop(columns=[target_column]),
        df[target_column],
        test_size=.25,
        random_state=1337,
        stratify=df[target_column],
    )

    with mlflow.start_run(run_name='elastic_net_models'):
        alpha = 0.5
        l1_ratio = 0.5
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Record what was trained and how well it does on the hold-out split,
        # so that registered model versions can be compared with each other.
        mlflow.log_params({'alpha': alpha, 'l1_ratio': l1_ratio})
        predictions = lr.predict(test_x)
        mlflow.log_metrics({
            'rmse': float(mean_squared_error(test_y, predictions) ** 0.5),
            'mae': float(mean_absolute_error(test_y, predictions)),
            'r2': float(r2_score(test_y, predictions)),
        })

        result = mlflow.sklearn.log_model(lr, "model", registered_model_name="wine-elasticnet")
        return f"{mlflow.get_artifact_uri()}/{result.artifact_path}"

In [10]:
# Local test: train on the parquet file produced by the local preprocess run.
# The cell output is the returned model artifact URI.
trainning(file_path='preprocessed.parquet')

Registered model 'wine-elasticnet' already exists. Creating a new version of this model...
2023/05/31 11:28:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: wine-elasticnet, version 6
Created version '6' of model 'wine-elasticnet'.


's3://mlflow/0/0709fcb68d0b4f6f9a5c42afb1fc67de/artifacts/model'

In [11]:
# Wrap the training function as a component, the same way as the preprocess step.
training_op = kfp.components.create_component_from_func(
    trainning,
    base_image='python:3.9.13',
    packages_to_install=['pandas', 'pyarrow', 'scikit-learn', 'mlflow', 'boto3'],
    # Optional: saves the component spec to a file for future use.
    output_component_file='train-component.yaml',
)

In [12]:
from kubernetes.client.models import V1EnvVar
from kfp.onprem import use_k8s_secret

@dsl.pipeline(
    name="e2e_wine_pipeline",
    description="WINE pipeline",
)
def wine_pipeline(url):
    """Download the dataset, preprocess it and train/register the model.

    The MLflow and object-storage settings are read from the notebook's
    environment when the pipeline is compiled and are passed to the training
    step as container environment variables.

    Args:
        url: Location of the CSV dataset to download.

    Raises:
        RuntimeError: If any of the required environment variables is not set
            (or is empty) in the notebook environment.
    """
    # NOTE(review): the values, including the object-storage credentials, are
    # copied as plain text into the pipeline definition. Consider mounting them
    # from a Kubernetes secret instead (e.g. with kfp.onprem.use_k8s_secret).
    required_env = (
        'MLFLOW_TRACKING_URI',
        'MLFLOW_S3_ENDPOINT_URL',
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
    )
    env_values = {env_name: os.getenv(env_name) for env_name in required_env}

    # Fail while compiling rather than submitting a training step that runs
    # without its MLflow / object-storage configuration.
    missing = [env_name for env_name, env_value in env_values.items() if not env_value]
    if missing:
        raise RuntimeError(
            "Required environment variables are not set: " + ", ".join(missing))

    web_downloader_task = web_downloader_op(url=url)
    preprocess_task = preprocess_op(file=web_downloader_task.outputs['data'])

    train_task = training_op(file=preprocess_task.outputs['output'])
    for env_name, env_value in env_values.items():
        train_task = train_task.add_env_variable(V1EnvVar(name=env_name, value=env_value))

In [13]:
# Submit a run of the pipeline, passing the dataset location as its "url" argument.
DATASET_URL = "https://raw.githubusercontent.com/Barteus/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv"

client = kfp.Client()
client.create_run_from_pipeline_func(wine_pipeline, arguments={"url": DATASET_URL})

RunPipelineResult(run_id=a04858a2-8836-42f7-aa80-32f01423e9a9)