# E2E scenario for Wine dataset as KFP

Steps:
- download
- datadrift inference
- clean/preprocess
- train/hyperparameter tuning with results in MLflow + MinIO
- serving
- example inference

Artefacts:
- raw data, preprocessed
- model per experiment
- experiment metadata and results

In [4]:
!pip install mlflow boto3 awscli pyarrow sklearn mlflow -q

You should consider upgrading via the '/home/barteus/Work/DSV/kubeflow-examples/data-drift/venv/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
!pip install kfp --upgrade -q

You should consider upgrading via the '/home/barteus/Work/DSV/kubeflow-examples/data-drift/venv/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
import kfp
from kfp import dsl

# Local development

In most cases you will need to either skip the cell below or change its values to match your own environment.

In [7]:
import os

# Point MLflow and the S3 client at the local-development endpoints.
# In-cluster equivalents:
#   MLFLOW_TRACKING_URI    -> http://mlflow-server.kubeflow.svc.cluster.local:5000
#   MLFLOW_S3_ENDPOINT_URL -> http://minio.kubeflow.svc.cluster.local:9000
# NOTE(review): the access key / secret are hard-coded here — acceptable only for a throwaway dev setup.
os.environ.update({
    'MLFLOW_TRACKING_URI': "http://10.1.100.19:5000",
    "MLFLOW_S3_ENDPOINT_URL": "http://10.1.100.27:9000",
    "AWS_ACCESS_KEY_ID": "minio",
    "AWS_SECRET_ACCESS_KEY": "minio123",
})

# Download data

In [1]:
#In airgapped environment upload data manually
!wget "https://raw.githubusercontent.com/Barteus/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv"

--2022-04-14 07:44:18--  https://raw.githubusercontent.com/Barteus/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84199 (82K) [text/plain]
Saving to: ‘winequality-red.csv.1’


2022-04-14 07:44:19 (310 KB/s) - ‘winequality-red.csv.1’ saved [84199/84199]



In [27]:
# Reusable "download from URL" component, loaded from the Kubeflow Pipelines repository.
# NOTE(review): the spec is fetched from the `master` branch, so it can change between runs.
web_downloader_op = kfp.components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml',
)

# Data drift detection

In [28]:
def datadrift(file_path: kfp.components.InputPath('CSV'),
              model_uri: str = 's3://mlflow/0/f8ef7180c6f64ff7b7cbae0f2497d27d/artifacts/datadrift') -> int:
    """Run a stored drift detector over the wine features and report its verdict.

    Args:
        file_path: Path to the `;`-separated CSV. It must contain a `quality`
            column, which is dropped before the remaining columns are scored.
        model_uri: MLflow model URI of the drift detector to load. Defaults to
            the detector this notebook was originally written against.

    Returns:
        The detector's `is_drift` flag as a plain `int`. The return annotation
        is what makes KFP expose the value as a component output; without it
        the flag is only visible in the step's log.
    """
    # Imports live inside the function because KFP ships only the function
    # source into the component container.
    import mlflow
    import numpy as np
    import pandas as pd

    df = pd.read_csv(file_path, header=0, sep=";")
    features = np.asarray(df.drop(columns=['quality']), np.float32)
    detector = mlflow.pyfunc.load_model(model_uri)
    drift_result = detector.predict(features)
    print(drift_result)
    # Coerce to a built-in int so the value serialises as an Integer output
    # even if the detector hands back a numpy scalar.
    return int(drift_result['data']['is_drift'])

In [29]:
#local development
datadrift(file_path='winequality-red.csv')

{'data': {'is_drift': 0, 'distance': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), 'p_val': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32), 'threshold': 0.004545454545454546}, 'meta': {'name': 'KSDrift', 'detector_type': 'offline', 'data_type': None, 'version': '0.9.1'}}


0

In [30]:
#workflow component
# Optionally pass output_component_file='datadrift-component.yaml' to save the component spec for future use.
datadrift_op = kfp.components.create_component_from_func(
    datadrift,
    packages_to_install=['pandas', 'numpy', 'mlflow', 'boto3'],
    base_image='python:3.9',
)

# Preprocess data

In [31]:
def preprocess(file_path: kfp.components.InputPath('CSV'),
               output_file: kfp.components.OutputPath('parquet')):
    """Normalise the CSV's column names and store the table as parquet.

    Args:
        file_path: Path to the `;`-separated CSV with a header row.
        output_file: Destination path of the parquet file.
    """
    import pandas as pd

    def snake_case(column_name):
        # "fixed acidity" -> "fixed_acidity"
        return column_name.lower().replace(' ', '_')

    raw_wines = pd.read_csv(file_path, header=0, sep=";")
    raw_wines.rename(columns=snake_case).to_parquet(output_file)

In [32]:
# Local development: run the preprocessing step directly on the downloaded CSV.
preprocess(file_path='winequality-red.csv', output_file='preprocessed.parquet')

In [33]:
# Workflow component wrapping `preprocess`.
# Optionally pass output_component_file='preprocess-component.yaml' to save the component spec for future use.
preprocess_op = kfp.components.create_component_from_func(
    preprocess,
    packages_to_install=['pandas', 'pyarrow'],
    base_image='python:3.9',
)

# Train model

In [34]:
def trainning(file_path: kfp.components.InputPath('parquet')) -> str:
    """Train an ElasticNet model on the wine data and log it to MLflow.

    The data is split 75/25 (stratified on the target). The model is fitted on
    the training part, scored on the held-out part, and the hyper-parameters,
    test metrics and fitted model are recorded in a single MLflow run. The
    model is also registered as "wine-elasticnet".

    Args:
        file_path: Path to the preprocessed parquet file; must contain a
            `quality` column, which is used as the target.

    Returns:
        The artifact URI of the logged model.
    """
    # Imports live inside the function because KFP ships only the function
    # source into the component container.
    import mlflow
    import mlflow.sklearn
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    df = pd.read_parquet(file_path)

    target_column = 'quality'
    train_x, test_x, train_y, test_y = train_test_split(df.drop(columns=[target_column]),
                                                        df[target_column], test_size=.25,
                                                        random_state=1337, stratify=df[target_column])

    with mlflow.start_run(run_name='elastic_net_models'):
        alpha = 0.5
        l1_ratio = 0.5
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Record what was trained and how well it does on the held-out split,
        # so runs can be compared in the MLflow UI.
        mlflow.log_params({'alpha': alpha, 'l1_ratio': l1_ratio})
        predictions = lr.predict(test_x)
        mlflow.log_metrics({
            'rmse': float(mean_squared_error(test_y, predictions) ** 0.5),
            'mae': float(mean_absolute_error(test_y, predictions)),
            'r2': float(r2_score(test_y, predictions)),
        })

        result = mlflow.sklearn.log_model(lr, "model", registered_model_name="wine-elasticnet")
        return f"{mlflow.get_artifact_uri()}/{result.artifact_path}"

In [35]:
#local development
trainning(file_path='preprocessed.parquet')

Successfully registered model 'wine-elasticnet'.
2022/04/13 14:57:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: wine-elasticnet, version 1
Created version '1' of model 'wine-elasticnet'.


's3://mlflow/0/458f69b027ff44c7979ed7dc31e54963/artifacts/model'

In [36]:
#workflow component
# `scikit-learn` is the real distribution name; the `sklearn` alias on PyPI is
# deprecated and its installation fails, which would break the component at start-up.
training_op = kfp.components.create_component_from_func(
    func=trainning,
    # output_component_file='train-component.yaml',  # This is optional. It saves the component spec for future use.
    base_image='python:3.9',
    packages_to_install=['pandas', 'pyarrow', 'scikit-learn', 'mlflow', 'boto3'])

# Deploy model

Known issues:
- https://github.com/SeldonIO/seldon-core/issues/4017

In [None]:
def deploy(model_uri: str = "default_model_uri"):
    """Write a SeldonDeployment manifest for the model and apply it with kubectl.

    The manifest is written to /tmp/manifest.yaml and applied to the `admin`
    namespace.

    Args:
        model_uri: Value placed verbatim in the manifest's `modelUri` field.

    Raises:
        subprocess.CalledProcessError: If `kubectl apply` exits with a
            non-zero status.
    """
    import subprocess

    manifest_path = "/tmp/manifest.yaml"
    # NOTE: model_uri is concatenated into the YAML as-is; it must not contain
    # characters that would break the document structure.
    manifest = """
apiVersion: machinelearning.seldon.io/v1alpha2
kind: SeldonDeployment
metadata:
  name: mlflow
spec:
  name: wines
  predictors:
  - componentSpecs:
    - spec:
        containers:
        - name: classifier
          image: seldonio/mlflowserver:1.14.0-dev
          imagePullPolicy: Always
          livenessProbe:
            initialDelaySeconds: 80
            failureThreshold: 200
            periodSeconds: 5
            successThreshold: 1
            httpGet:
              path: /health/ping
              port: http
              scheme: HTTP
          readinessProbe:
            initialDelaySeconds: 80
            failureThreshold: 200
            periodSeconds: 5
            successThreshold: 1
            httpGet:
              path: /health/ping
              port: http
              scheme: HTTP
    graph:
      children: []
      implementation: MLFLOW_SERVER
      modelUri: """ + model_uri + """
      envSecretRefName: seldon-init-container-secret
      logger:
          url: http://broker-ingress.knative-eventing.svc.cluster.local/default/wine-inference-requests
          mode: request
      name: classifier
    name: wine-super-model
    replicas: 1
        """
    with open(manifest_path, "w") as f:
        f.write(manifest)

    # check_call (unlike `assert`) still fails the step when Python runs with
    # -O, and the raised error carries the command and its exit status.
    subprocess.check_call(['kubectl', 'apply', '-f', manifest_path, '-n', 'admin'])


In [None]:
# Workflow component wrapping `deploy`; the component spec is also saved to
# deploy-component.yaml for future use.
deploy_op = kfp.components.create_component_from_func(
    deploy,
    base_image='bponieckiklotz/seldon-deploy:0.1',
    output_component_file='deploy-component.yaml',
)

# Create pipeline

In [None]:
from kubernetes.client.models import V1EnvVar
from kfp.onprem import use_k8s_secret


@dsl.pipeline(
    name="e2e_wine_pipeline",
    description="WINE pipeline",
)
def wine_pipeline(url):
    """Download the wine CSV, check it for drift, preprocess, train and deploy.

    Args:
        url: Location of the raw CSV handed to the web downloader component.
    """
    mlflow_env_vars = [
        V1EnvVar(name='MLFLOW_TRACKING_URI',
                 value='http://mlflow-server.kubeflow.svc.cluster.local:5000'),
        V1EnvVar(name='MLFLOW_S3_ENDPOINT_URL',
                 value='http://minio.kubeflow.svc.cluster.local:9000'),
    ]
    minio_credentials = use_k8s_secret(
        secret_name='mlpipeline-minio-artifact',
        k8s_secret_key_to_env={
            'accesskey': 'AWS_ACCESS_KEY_ID',
            'secretkey': 'AWS_SECRET_ACCESS_KEY',
        },
    )

    def with_mlflow_access(task):
        # Give a step the MLflow endpoints plus the MinIO credentials from the k8s secret.
        for env_var in mlflow_env_vars:
            task = task.add_env_variable(env_var)
        return task.apply(minio_credentials)

    download = web_downloader_op(url=url)
    raw_csv = download.outputs['data']

    # Drift detection and preprocessing both consume the raw download.
    with_mlflow_access(datadrift_op(file=raw_csv))
    cleaned = preprocess_op(file=raw_csv)

    trained = with_mlflow_access(training_op(file=cleaned.outputs['output']))
    deploy_op(model_uri=trained.output)


In [None]:
# KFP API client created with default settings (no host or credentials are passed explicitly).
client = kfp.Client()

In [None]:
# Submit a one-off run of the pipeline against the public copy of the dataset.
client.create_run_from_pipeline_func(
    pipeline_func=wine_pipeline,
    arguments=dict(
        url="https://raw.githubusercontent.com/Barteus/kubeflow-examples/main/e2e-wine-kfp-mlflow/winequality-red.csv",
    ),
)

In [None]:
# Compile the pipeline into a package file that can be uploaded later.
kfp.compiler.Compiler().compile(
    pipeline_func=wine_pipeline,
    package_path='wine-pipeline.yaml',
)