# E2E scenario for Wine dataset as KFP

Steps:
- download
- clean/preprocess
- train/hyperparameter tuning with results in MLflow + MinIO
- seldon serving
- example inference

Artefacts:
- raw data, preprocessed
- model per experiment
- experiment metadata and results

In [1]:
!pip install mlflow==2.1.1 boto3 awscli pyarrow scikit-learn==1.2.2 "numpy<1.20"



In [8]:
!pip install "kfp<2.0" --upgrade -q

In [13]:
!pip install "mlflow==2.1.1" --upgrade

Collecting mlflow==2.1.1
  Downloading mlflow-2.1.1-py3-none-any.whl (16.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting Flask<3 (from mlflow==2.1.1)
  Obtaining dependency information for Flask<3 from https://files.pythonhosted.org/packages/fd/56/26f0be8adc2b4257df20c1c4260ddd0aa396cf8e75d90ab2f7ff99bc34f9/flask-2.3.3-py3-none-any.whl.metadata
  Downloading flask-2.3.3-py3-none-any.whl.metadata (3.6 kB)
Collecting importlib-metadata!=4.7.0,<6,>=3.7.0 (from mlflow==2.1.1)
  Downloading importlib_metadata-5.2.0-py3-none-any.whl (21 kB)
Collecting packaging<23 (from mlflow==2.1.1)
  Downloading packaging-22.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow<11,>=4.0.0 (from mlflow==2.1.1)
  Downloading pyarrow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_

In [9]:
import kfp
from kfp import dsl

# Local development

In most cases you will need to either skip the cells below or adjust their values to match your environment.

In [4]:
# List the buckets available at the S3 endpoint given by $MLFLOW_S3_ENDPOINT_URL.
!aws --endpoint-url $MLFLOW_S3_ENDPOINT_URL s3 ls

2023-11-09 09:42:43 mlflow
2023-11-09 09:42:32 mlpipeline


# Download data

In [15]:
#In airgapped environment upload data manually
!wget https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv

--2023-11-09 10:37:29--  https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [text/plain]
Saving to: ‘winequality-red.csv.1’


2023-11-09 10:37:29 (4.61 MB/s) - ‘winequality-red.csv.1’ saved [84199/84199]



In [23]:
# Load the shared "web download" component definition straight from the
# kubeflow/pipelines repository.
# NOTE(review): the URL tracks the `master` branch, so the component may change
# between runs; consider pinning to a specific commit.
web_downloader_op = kfp.components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml'
)

# Preprocess data

In [24]:
def preprocess(file_path: kfp.components.InputPath('CSV'),
              output_file: kfp.components.OutputPath('parquet')):
    """Convert the raw semicolon-separated wine CSV into a parquet file.

    Column names are normalised to lower case with spaces replaced by
    underscores; the data itself is written unchanged.

    Args:
        file_path: Path of the input CSV (first row is the header, ';' separated).
        output_file: Path where the parquet file is written.
    """
    # Imported here because KFP serialises only this function's source.
    import pandas as pd

    raw_frame = pd.read_csv(file_path, header=0, sep=";")
    tidy_frame = raw_frame.rename(
        columns=lambda label: label.lower().replace(' ', '_')
    )
    tidy_frame.to_parquet(output_file)

In [16]:
# Local development: run the preprocessing step directly on the downloaded CSV.
preprocess(file_path='winequality-red.csv', output_file='preprocessed.parquet')

In [25]:
# Workflow component: wrap `preprocess` so it can run as a pipeline step.
preprocess_op = kfp.components.create_component_from_func(
    preprocess,
    base_image='python:3.9.15',
    packages_to_install=['pandas', 'pyarrow'],
    # Optional: saves the component spec for future use.
    output_component_file='preprocess-component.yaml',
)

# Train model

In [26]:
def trainning(file_path: kfp.components.InputPath('parquet'))->str:
    """Train an ElasticNet model on the preprocessed wine data and log it to MLflow.

    The data is split 75/25 (stratified on the target), the model is fitted on
    the training part, and the fitted model is logged and registered under the
    name "wine-elasticnet". MLflow connection settings are not set here; in the
    pipeline they are supplied through environment variables on the task.

    Args:
        file_path: Path of the parquet file produced by the preprocessing step;
            it must contain a `quality` column, which is used as the target.

    Returns:
        The artifact URI of the logged model (`<run artifact uri>/model`).
    """
    # Imports live inside the function because KFP serialises only this
    # function's source into the component.
    import pandas as pd
    from sklearn.model_selection import train_test_split
    import mlflow
    from sklearn.linear_model import ElasticNet

    df = pd.read_parquet(file_path)

    target_column = 'quality'
    train_x, test_x, train_y, test_y = train_test_split(df.drop(columns=[target_column]),
                                                    df[target_column], test_size=.25,
                                                    random_state=1337, stratify=df[target_column])

    mlflow.sklearn.autolog()
    with mlflow.start_run(run_name='elastic_net_models') as run:
        alpha = 0.5
        l1_ratio = 0.5
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)
        model_dir = "model"
        # Environment used to load the model at serving time.
        # The pip requirements must be nested inside `dependencies` as a
        # {'pip': [...]} entry; a top-level 'pip' key is not part of the conda
        # environment format. scikit-learn is pinned to the version installed
        # for training so the pickled model can be loaded again.
        env = {
            'name': 'mlflow-env',
            'channels': ['defaults'],
            'dependencies': [
                'python=3.8.10',
                'pip',
                {
                    'pip': [
                        'mlflow==2.1.1',
                        'scikit-learn==1.2.2',
                    ],
                },
            ],
        }
        mlflow.sklearn.log_model(lr, model_dir, registered_model_name="wine-elasticnet", conda_env=env)
        return f"{run.info.artifact_uri}/{model_dir}"

In [16]:
# Show which mlflow version pip reports as installed.
!pip freeze | grep mlflow

mlflow==2.1.1


In [15]:
# Local development: train on the locally preprocessed file; the cell output is
# the artifact URI of the logged model.
trainning(file_path='preprocessed.parquet')

Registered model 'wine-elasticnet' already exists. Creating a new version of this model...
2023/11/09 10:44:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: wine-elasticnet, version 2
Created version '2' of model 'wine-elasticnet'.


's3://mlflow/0/fbef245cea284c1dbedf145b8e1e2853/artifacts/model'

In [27]:
# Workflow component: wrap `trainning` so it can run as a pipeline step.
training_op = kfp.components.create_component_from_func(
    trainning,
    base_image='python:3.8.10',
    packages_to_install=[
        'pandas',
        'pyarrow',
        'scikit-learn==1.2.2',
        'mlflow==2.1.1',
        'boto3',
        "numpy<1.20",
    ],
    # Optional: saves the component spec for future use.
    output_component_file='train-component.yaml',
)

# Deploy model

Known issues:
- https://github.com/SeldonIO/seldon-core/issues/4017

In [28]:
def deploy(model_uri:str = "default_model_uri"):
    """Deploy the model at `model_uri` as a SeldonDeployment via `kubectl apply`.

    The manifest is rendered with `model_uri` as the graph's `modelUri`, printed,
    written to /tmp/manifest.yaml and applied to the `admin` namespace.

    Args:
        model_uri: URI of the MLflow model to serve. It is inserted verbatim
            into the YAML manifest.

    Raises:
        subprocess.CalledProcessError: if `kubectl apply` exits with a non-zero
            status.
    """
    # Imported here because KFP serialises only this function's source.
    import subprocess

    manifest_path = "/tmp/manifest.yaml"
    manifest = """
apiVersion: machinelearning.seldon.io/v1
kind: SeldonDeployment
metadata:
  name: mlflow
spec:
  name: wines
  predictors:
  - componentSpecs:
    - spec:
        containers:
        - name: classifier
          image: seldonio/mlflowserver:1.16.0
          imagePullPolicy: Always
          livenessProbe:
            initialDelaySeconds: 80
            failureThreshold: 200
            periodSeconds: 5
            successThreshold: 1
            httpGet:
              path: /health/ping
              port: http
              scheme: HTTP
          readinessProbe:
            initialDelaySeconds: 80
            failureThreshold: 200
            periodSeconds: 5
            successThreshold: 1
            httpGet:
              path: /health/ping
              port: http
              scheme: HTTP
    graph:
      children: []
      implementation: MLFLOW_SERVER
      modelUri: """+model_uri+"""
      envSecretRefName: mlflow-server-seldon-rclone-secret
      name: classifier
    name: wine-super-model
    replicas: 1
        """
    # Echo the rendered manifest so it shows up in the step's logs.
    print(manifest)
    with open(manifest_path, "w") as f:
        f.write(manifest)

    # check=True makes a failed `kubectl apply` fail the step with the command
    # and exit code in the error. Unlike an `assert`, the check is not removed
    # when Python runs with -O.
    subprocess.run(['kubectl', 'apply', '-f', manifest_path, '-n', 'admin'], check=True)
    

In [29]:
# Workflow component: wrap `deploy` so it can run as a pipeline step.
# No extra Python packages are installed on top of the base image.
deploy_op = kfp.components.create_component_from_func(
    deploy,
    base_image='bponieckiklotz/seldon-deploy:0.1',
    packages_to_install=[],
    # Optional: saves the component spec for future use.
    output_component_file='deploy-component.yaml',
)

# Create pipeline

In [30]:
from kubernetes.client.models import V1EnvVar
from kfp.onprem import use_k8s_secret

@dsl.pipeline(
    name="e2e_wine_pipeline",
    description="WINE pipeline",
)
def wine_pipeline(url):
    """Download -> preprocess -> train -> deploy pipeline for the wine dataset.

    Args:
        url: Location of the raw wine-quality CSV to download.
    """
    download_step = web_downloader_op(url=url)
    preprocess_step = preprocess_op(file=download_step.outputs['data'])

    # Environment for the training step, added in this order.
    training_env = {
        'MLFLOW_TRACKING_URI': 'http://mlflow-server.kubeflow.svc.cluster.local:5000',
        'MLFLOW_S3_ENDPOINT_URL': 'http://minio.kubeflow.svc.cluster.local:9000',
        'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'python',
    }
    train_step = training_op(file=preprocess_step.outputs['output'])
    for env_name, env_value in training_env.items():
        train_step.add_env_variable(V1EnvVar(name=env_name, value=env_value))

    # Expose the object-store credentials from the k8s secret as AWS_* variables.
    # https://kubeflow-pipelines.readthedocs.io/en/stable/source/kfp.extensions.html#kfp.onprem.use_k8s_secret
    minio_credentials = use_k8s_secret(
        secret_name='mlpipeline-minio-artifact',
        k8s_secret_key_to_env={
            'accesskey': 'AWS_ACCESS_KEY_ID',
            'secretkey': 'AWS_SECRET_ACCESS_KEY',
        },
    )
    train_step.apply(minio_credentials)

    # The training step's single output is the model URI to deploy.
    deploy_op(model_uri=train_step.output)
    

In [31]:
# Create a KFP API client using the default connection settings.
client = kfp.Client()

In [32]:
# Compile `wine_pipeline` and submit it as a run, passing the dataset URL as the
# pipeline's `url` argument.
client.create_run_from_pipeline_func(
    pipeline_func=wine_pipeline,
    arguments={
        "url": "https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv",
    },
)

RunPipelineResult(run_id=1a2868af-00d8-49f3-ad00-23587b021380)

In [None]:
# Compile the pipeline into a workflow file that can be uploaded separately.
kfp.compiler.Compiler().compile(
    pipeline_func=wine_pipeline,
    package_path='wine-pipeline.yaml',
)

In [None]:
# Record the installed package versions reported by pip in nb-requirements.txt.
!pip freeze > nb-requirements.txt