## Test Evidently and Kubeflow Pipelines

In [1]:
from kfp.components import InputPath, OutputPath, create_component_from_func
import kfp
from kfp import components
import os

import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing

from evidently import ColumnMapping

from evidently.report import Report
from evidently.metric_preset import *
from evidently.metric_preset import TargetDriftPreset

from evidently.test_suite import TestSuite
from evidently.test_preset import DataStabilityTestPreset
from evidently.tests import *

### Load dataset

## Demo Pipeline

In [2]:
import json
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset as pandas objects: a feature frame and a label series.
data = load_breast_cancer(as_frame=True)
X = data.data
y = data.target

# First split: hold out 30% of the rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Second split: take 20% of the remaining rows as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=123
)

print('Train/Valid/Test sizes:', len(y_train), len(y_valid), len(y_test))



Train/Valid/Test sizes: 318 80 171


In [116]:

# Combine features and label into a new frame so the report covers both;
# X itself is left untouched.
dataset = X.assign(target=y)

# Data-quality report over the whole dataset; no reference data is supplied.
report = Report(metrics=[DataQualityPreset()])
report.run(current_data=dataset, reference_data=None)
report.save_html('test-decision-tree.html')


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [124]:
# Attach the label to each split (looked up by row index) without mutating
# the original feature frames.
X_train_target = X_train.assign(target=y.loc[X_train.index])
X_valid_target = X_valid.assign(target=y.loc[X_valid.index])

# Target drift: training split is the reference, validation split is current.
num_target_drift_report = Report(metrics=[TargetDriftPreset()])
num_target_drift_report.run(
    reference_data=X_train_target,
    current_data=X_valid_target,
)
num_target_drift_report.save_html('num_target_drift_report.html')

In [128]:
# Feature drift between the training split (reference) and the validation
# split (current); labels are not part of these frames.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=X_train,
    current_data=X_valid,
)
data_drift_report.save_html('data_drift_report.html')

In [112]:
# Peek at the features and at the label column. Only the value of the last
# expression in a cell is rendered, so just the label preview is shown.
dataset.drop(columns=['target']).head()
dataset['target'].head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [3]:
from sklearn.ensemble import GradientBoostingClassifier

# Fixed hyper-parameters and seed so the reported accuracies are reproducible.
boost = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=8,
    random_state=1)

boost.fit(X_train, y_train)

# Score every split exactly once and reuse the formatted lines for both the
# notebook output and the text file (previously each score was computed twice
# and the message formats were duplicated).
accuracy_lines = [
    "%s Accuracy: %0.2f" % (split_name, boost.score(features, labels))
    for split_name, features, labels in (
        ("Training", X_train, y_train),
        ("Validation", X_valid, y_valid),
        ("Test", X_test, y_test),
    )
]

for line in accuracy_lines:
    print(line)

with open('test.txt', 'w') as f:
    f.writelines(line + "\n" for line in accuracy_lines)

Training Accuracy: 1.00
Validation Accuracy: 0.96
Test Accuracy: 0.96


In [136]:
# Predict on the held-out test split and keep the predictions next to the
# true labels in a new frame (X_test is not modified).
y_pred = boost.predict(X_test)
X_test_results = X_test.assign(prediction=y_pred, target=y_test)
X_test_results.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,prediction,target
101,6.981,13.43,43.79,143.5,0.117,0.07568,0.0,0.0,0.193,0.07818,...,50.41,185.2,0.1584,0.1202,0.0,0.0,0.2932,0.09382,1,1
329,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,...,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953,0,0
42,19.07,24.81,128.3,1104.0,0.09081,0.219,0.2107,0.09961,0.231,0.06343,...,177.4,1651.0,0.1247,0.7444,0.7242,0.2493,0.467,0.1038,0,0
432,20.18,19.54,133.8,1250.0,0.1133,0.1489,0.2133,0.1259,0.1724,0.06053,...,146.0,1479.0,0.1665,0.2942,0.5308,0.2173,0.3032,0.08075,0,0
41,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,...,87.22,514.0,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606,0,0


In [139]:
# Classification report on the test-set predictions; no reference data is supplied.
classification_report = Report(metrics=[ClassificationPreset()])
classification_report.run(current_data=X_test_results, reference_data=None)
classification_report.save_html('classification_report.html')

In [3]:
def download_data(
    output_path: OutputPath('CSV'),
) -> None:
    """Write the scikit-learn breast-cancer dataset to ``output_path`` as CSV.

    The file contains one column per feature plus a ``target`` column holding
    the class label. The frame's index is not written: it is a plain row
    counter, and a reader using a bare ``pd.read_csv`` would otherwise pick it
    up as an extra ``Unnamed: 0`` feature column.

    Args:
        output_path: Destination file path. Missing parent directories are
            created before writing.
    """
    # Imports are kept inside the function so that it is self-contained when
    # it is packaged as a component below.
    from pathlib import Path
    from sklearn.datasets import load_breast_cancer

    data = load_breast_cancer(as_frame=True)

    # Features and label side by side in a single frame.
    dataset = data.data.copy(deep=True)
    dataset['target'] = data.target

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    dataset.to_csv(output_path, index=False)

download_data_op = create_component_from_func(
    download_data,
    output_component_file='download_breast_cancer_data.yaml',
    base_image='quay.io/ibm/kubeflow-notebook-image-ppc64le:latest',
    annotations={
        'author':'Adam Shedivy'
    }
)
    

In [42]:
def train_test_split_op(
    input_path: InputPath('CSV'),
    output_path: OutputPath('JSON'),
    test_size: float = 0.2,
    random_sate: int = 123,
) -> None:
    """Split a labelled CSV into train/validation/test parts and dump them as JSON.

    The CSV must contain a ``target`` column; every other column is treated as
    a feature. 30% of the rows are held out as the test split, then
    ``test_size`` of the remaining rows become the validation split. Both
    splits are stratified on the label.

    The JSON document maps ``X_train``, ``X_valid``, ``X_test`` to
    ``DataFrame.to_dict()`` output and ``y_train``, ``y_valid``, ``y_test`` to
    ``Series.to_dict()`` output.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the JSON file to write. Missing parent
            directories are created.
        test_size: Fraction of the non-test rows used for validation.
        random_sate: Seed passed as ``random_state`` to both splits. The
            spelling is kept because it is part of the component interface.
    """
    import json
    import pandas as pd
    from pathlib import Path
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(input_path)

    # A CSV written together with its index comes back with an 'Unnamed: 0'
    # column; drop such columns so the row counter is not used as a feature.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

    X, y = df.loc[:, df.columns != 'target'], df['target']

    X_temp, X_test, y_temp, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=random_sate, stratify=y)

    X_train, X_valid, y_train, y_valid = \
        train_test_split(X_temp, y_temp, test_size=test_size, random_state=random_sate, stratify=y_temp)

    # The held-out test split is exported as well (it used to be computed and
    # then discarded); existing keys are unchanged.
    DATA = {
        'X_train': X_train.to_dict(),
        'X_valid': X_valid.to_dict(),
        'X_test': X_test.to_dict(),
        'y_train': y_train.to_dict(),
        'y_valid': y_valid.to_dict(),
        'y_test': y_test.to_dict()
    }

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(DATA, f)

train_test_split_op = create_component_from_func(
    train_test_split_op,
    output_component_file='train_test_split.yaml',
    base_image='quay.io/ibm/kubeflow-notebook-image-ppc64le:latest',
    packages_to_install=['pandas', 'scikit-learn']
)
    

In [44]:
def run_gradient_boost(
    data_path: InputPath('JSON'),
    output_path: OutputPath(str),
) -> None:
    """Train a gradient-boosting classifier and write its accuracies to a text file.

    The input JSON must contain ``X_train``, ``X_valid`` (column -> {row: value}
    mappings) and ``y_train``, ``y_valid`` ({row: label} mappings). If both
    ``X_test`` and ``y_test`` are present, the test accuracy is reported too.

    Args:
        data_path: Path of the JSON file holding the data splits.
        output_path: Path of the text report to write. Missing parent
            directories are created.

    Raises:
        KeyError: If a required train/validation key is missing from the JSON.
    """
    import json
    from pathlib import Path
    import pandas as pd
    from sklearn.ensemble import GradientBoostingClassifier

    with open(data_path, 'r') as f:
        data = json.load(f)

    def load_split(name):
        """Rebuild the (features, labels) pair of one split from the JSON data."""
        features = pd.DataFrame.from_dict(data['X_' + name])
        # Labels are a flat {row: label} mapping, which DataFrame.from_dict
        # rejects (all-scalar values); build a Series instead and put it in
        # the same row order as the features.
        labels = pd.Series(data['y_' + name]).reindex(features.index)
        return features, labels

    X_train, y_train = load_split('train')
    splits = [
        ('Training', (X_train, y_train)),
        ('Validation', load_split('valid')),
    ]
    # The test split is optional: only report it when the input provides it.
    if 'X_test' in data and 'y_test' in data:
        splits.append(('Test', load_split('test')))

    boost = GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=100,
        max_depth=8,
        random_state=1)

    boost.fit(X_train, y_train)

    # output_path is a plain string, so wrap it in Path to reach its parent.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for split_name, (features, labels) in splits:
            f.write("%s Accuracy: %0.2f\n" % (split_name, boost.score(features, labels)))


run_gradient_boost_op = create_component_from_func(
    run_gradient_boost,
    output_component_file='run_gradient_boost.yaml',
    base_image='quay.io/ibm/kubeflow-notebook-image-ppc64le:latest',
    packages_to_install=[
        'scikit-learn'
    ]
)
        
    
    
    

# Pipeline definition 

In [45]:

# Component directories are resolved relative to the parent of the current
# working directory (abspath("") yields the working directory).
EXAMPLES = os.path.abspath("")
ROOT = os.path.dirname(EXAMPLES)
EVIDENTLY = os.path.join(ROOT, 'evidently')
DATA_QUALITY = os.path.join(ROOT, 'evidently', 'data-quality-report')
HTML_VIEW = os.path.join(ROOT, 'html-viewer')

# Components defined in this notebook are reloaded from their generated YAML
# files; the two shared components come from their own directories.
load_component = components.load_component_from_file

download_data_op = load_component('download_breast_cancer_data.yaml')
train_test_split_op = load_component('train_test_split.yaml')
run_gradient_boost_op = load_component('run_gradient_boost.yaml')
data_quality_op = load_component(os.path.join(DATA_QUALITY, 'component.yaml'))
html_view_op = load_component(os.path.join(HTML_VIEW, 'component.yaml'))


@kfp.dsl.pipeline(name='test-pipeline')
def test_pipeline():
    """Download the dataset, report on its quality, then split it and train a model.

    The downloaded CSV feeds two branches: the data-quality report (rendered
    by the HTML viewer) and the split -> gradient-boost training chain.
    """
    data = download_data_op().output

    report = data_quality_op(cur=data).output
    html_view_op(html=report)

    prepared_data = train_test_split_op(input=data)

    # Pass the task's single output artifact, as the other steps do; `.outputs`
    # is the whole name -> output mapping and is not a valid argument value.
    run_gradient_boost_op(data=prepared_data.output)

kfp_endpoint=None
kfp.compiler.Compiler().compile(test_pipeline, 'testPipeline.yaml')