# E2E scenario for Wine dataset on KFP

Steps:
- download
- clean/preprocess
- train/hyperparam tuning with results in mlflow+minio
- seldon serving
- example inference

Artifacts:
- raw data, preprocessed
- model per experiment
- experiment metadata and results

In [1]:
!pip install kfp==1.8.22 mlflow==2.1.1 boto3 awscli pyarrow scikit-learn==1.2.2 "numpy<1.20" -q

In [2]:
import kfp
from kfp import dsl

## Local Development

For most cases not running below lines or changing their values is needed!

In [3]:
!aws --endpoint-url $MLFLOW_S3_ENDPOINT_URL s3 ls

2023-07-03 12:06:17 mlflow


## Download Data

In [4]:
#In airgapped environment upload data manually
!wget https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv

--2023-07-06 08:50:11--  https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [text/plain]
Saving to: ‘winequality-red.csv.1’


2023-07-06 08:50:11 (64.2 MB/s) - ‘winequality-red.csv.1’ saved [84199/84199]



In [5]:
web_downloader_op = kfp.components.load_component_from_url(
    "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml"
)

## Preprocess Data

In [6]:
def preprocess(
    file_path: kfp.components.InputPath("CSV"),
    output_file: kfp.components.OutputPath("parquet")
):
    import pandas as pd
    df = pd.read_csv(file_path, header=0, sep=";")
    df.columns = [c.lower().replace(" ", "_") for c in df.columns]
    df.to_parquet(output_file)

In [7]:
#local development
preprocess("winequality-red.csv", "preprocessed.parquet")

In [8]:
#workflow component
preprocess_op = kfp.components.create_component_from_func(
    func=preprocess,
    output_component_file="preprocess-component.yaml", # This is optional. It saves the component spec for future use.
    base_image="python:3.9.15",
    packages_to_install=["pandas", "pyarrow"],
)

## Train Model

In [9]:
def training(file_path: kfp.components.InputPath("parquet")) -> str:
    import mlflow
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    
    df = pd.read_parquet(file_path)
    
    target_column="quality"
    train_x, test_x, train_y, test_y = train_test_split(
        df.drop(columns=[target_column]),
        df[target_column], test_size=.25,
        random_state=1337, stratify=df[target_column]
    )

    mlflow.sklearn.autolog()
    with mlflow.start_run(run_name="elastic_net_models") as run:
        alpha =  0.5
        l1_ratio =  0.5
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)
        model_dir = "model"
        env = {
            "name": "mlflow-env",
            "channels": ["defaults"],
            "dependencies": [
                "python=3.8.10",
                "pip"],
            
            "pip":[
                "mlflow==2.1.1",
                "scikit-learn==0.23.2"
            ]
        }
        mlflow.sklearn.log_model(lr, model_dir, registered_model_name="wine-elasticnet", conda_env=env)
        return f"{run.info.artifact_uri}/{model_dir}"

In [10]:
training("preprocessed.parquet")

Registered model 'wine-elasticnet' already exists. Creating a new version of this model...
2023/07/06 08:50:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: wine-elasticnet, version 13
Created version '13' of model 'wine-elasticnet'.


's3://mlflow/0/1e627fdb29e045d4bcfea4d0413d007a/artifacts/model'

In [11]:
#workflow component
training_op = kfp.components.create_component_from_func(
    func=training,
    output_component_file="train-component.yaml", # This is optional. It saves the component spec for future use.
    base_image="python:3.8.10",
    packages_to_install=["pandas", "pyarrow", "scikit-learn==1.2.2", "mlflow==2.1.1", "boto3", "numpy<1.20"],
)

## Deploy Model

Known issues:
- https://githubhot.com/repo/SeldonIO/seldon-core/issues/4017 

In [12]:
def deploy(model_uri: str = "default_model_uri"):
    import subprocess
    
    with open("/tmp/manifest.yaml", "w") as f:
        manifest = """
apiVersion: machinelearning.seldon.io/v1
kind: SeldonDeployment
metadata:
  name: mlflow
spec:
  name: wines
  predictors:
  - componentSpecs:
    - spec:
        containers:
        - name: classifier
          image: seldonio/mlflowserver:1.16.0
          imagePullPolicy: Always
          livenessProbe:
            initialDelaySeconds: 80
            failureThreshold: 200
            periodSeconds: 5
            successThreshold: 1
            httpGet:
              path: /health/ping
              port: http
              scheme: HTTP
          readinessProbe:
            initialDelaySeconds: 80
            failureThreshold: 200
            periodSeconds: 5
            successThreshold: 1
            httpGet:
              path: /health/ping
              port: http
              scheme: HTTP
    graph:
      children: []
      implementation: MLFLOW_SERVER
      modelUri: """+model_uri+"""
      envSecretRefName: seldon-init-container-secret
      name: classifier
    name: wine-super-model
    replicas: 1
        """
        print(manifest)
        f.write(manifest)
    
    result = subprocess.call(["kubectl", "apply", "-f", "/tmp/manifest.yaml", "-n", "admin"])
    assert result == 0

In [13]:
deploy_op = kfp.components.create_component_from_func(
    func=deploy,
    output_component_file="deploy-component.yaml", # This is optional. It saves the component spec for future use.
    base_image="bponieckiklotz/seldon-deploy:0.1",
    packages_to_install=[],
)

## Create Pipeline

In [14]:
from kubernetes.client.models import V1EnvVar
from kfp.onprem import use_k8s_secret

@dsl.pipeline(
    name="e2e_wine_pipeline",
    description="WINE pipeline",
)
def wine_pipeline(url):
    web_downloader_task = web_downloader_op(url=url)
    preprocess_task = preprocess_op(file=web_downloader_task.outputs["data"])
    
    train_task = (
        training_op(file=preprocess_task.outputs["output"])
        .add_env_variable(V1EnvVar(name="MLFLOW_TRACKING_URI", value="http://mlflow-server.kubeflow.svc.cluster.local:5000"))
        .add_env_variable(V1EnvVar(name="MLFLOW_S3_ENDPOINT_URL", value="http://minio.kubeflow.svc.cluster.local:9000"))
        .add_env_variable(V1EnvVar(name="PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION", value="python")) 
        #https://kubeflow-pipelines.readthedocs.io/en/stable/source/kfp.extensions.html#kfp.onprem.use_k8s_secret
        .apply(
            use_k8s_secret(
                secret_name="mlpipeline-minio-artifact",
                k8s_secret_key_to_env={
                    "accesskey": "AWS_ACCESS_KEY_ID",
                    "secretkey": "AWS_SECRET_ACCESS_KEY",
                },
            )
        )
    )
    deploy_task = deploy_op(model_uri=train_task.output)

In [15]:
client = kfp.Client()

In [16]:
client.create_run_from_pipeline_func(
    wine_pipeline,
    arguments={
        "url": "https://raw.githubusercontent.com/canonical/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv",
    },
)

RunPipelineResult(run_id=659cd89c-5e18-4b0e-87ae-bf0d642cdf63)

In [17]:
kfp.compiler.Compiler().compile(wine_pipeline, "wine-pipeline.yaml")

In [18]:
!pip freeze > nb-requirements.txt

In [19]:
import os
os.environ

environ{'SHELL': '/bin/bash',
        'KUBERNETES_SERVICE_PORT_HTTPS': '443',
        'UMOYA_PORT_80_TCP': 'tcp://10.152.183.59:80',
        'ML_PIPELINE_VISUALIZATIONSERVER_PORT_8888_TCP_PROTO': 'tcp',
        'KUBERNETES_SERVICE_PORT': '443',
        'UMHLANGA_SERVICE_HOST': '10.152.183.60',
        'UMOYA_PORT_80_TCP_PORT': '80',
        'HOSTNAME': 'umhlanga-0',
        'LANGUAGE': 'en_US.UTF-8',
        'UMHLANGA_SERVICE_PORT': '80',
        'ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_PORT': '8888',
        'KF_PIPELINES_SA_TOKEN_PATH': '/var/run/secrets/kubeflow/pipelines/token',
        'UMHLANGA_PORT_80_TCP_PROTO': 'tcp',
        'NB_UID': '1000',
        'ML_PIPELINE_UI_ARTIFACT_PORT_80_TCP': 'tcp://10.152.183.52:80',
        'PWD': '/home/jovyan',
        'NB_PREFIX': '/notebook/test/umhlanga',
        'MLFLOW_S3_ENDPOINT_URL': 'http://mlflow-minio.kubeflow:9000',
        'UMHLANGA_PORT_80_TCP_ADDR': '10.152.183.60',
        'UMHLANGA_SERVICE_PORT_HTTP_UMHLANGA': '80',
        '