# Kubeflow Pipeline - Fashion MNIST Example

## Install Packages

In [14]:
!pip install --user --upgrade kfp
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Requirement already up-to-date: kfp in ./.local/lib/python3.7/site-packages (1.0.4)


In [15]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

## Define Pipeline Component Functions

In [16]:
def load_data(data_path):
    """Download Fashion MNIST and pickle it under ``data_path``.

    The object returned by ``keras.datasets.fashion_mnist.load_data()`` is
    written unchanged to ``{data_path}/fashion_mnist`` so that the training
    component can read it back.

    Args:
        data_path: Directory to write the pickled dataset into.
    """
    # The function is turned into a standalone component, so imports have to
    # live inside the body. The TensorFlow base image already provides them.
    import pickle
    # Use the public Keras entry point, as the predict component does;
    # `tensorflow.python` is a private namespace.
    from tensorflow import keras

    # Download the dataset. train_model unpacks this object as
    # (train_images, train_labels), (test_images, test_labels).
    fashion_mnist = keras.datasets.fashion_mnist.load_data()

    # Save the dataset to be used in the next component.
    with open(f'{data_path}/fashion_mnist', 'wb') as f:
        pickle.dump(fashion_mnist, f)

In [38]:
def train_model(data_path, model_file):
    """Train a small dense classifier on the pickled dataset and save it.

    Reads ``{data_path}/fashion_mnist``, fits the model for 10 epochs, prints
    the accuracy measured on the test split, then writes the model to
    ``{data_path}/{model_file}`` and the scaled test split to
    ``{data_path}/test_data`` for the predict component.

    Args:
        data_path: Directory holding the pickled dataset; outputs go here too.
        model_file: File name for the saved model, relative to ``data_path``.
    """
    # The function is turned into a standalone component, so imports have to
    # live inside the body. The TensorFlow base image already provides them.
    import pickle
    # Use the public Keras entry point so the model is saved through the same
    # API the predict component loads it with; `tensorflow.python` is private.
    from tensorflow import keras

    # Load the dataset written by the previous component.
    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only safe because the file is produced by this pipeline's own load step.
    with open(f'{data_path}/fashion_mnist', 'rb') as f:
        fashion_mnist_data = pickle.load(f)

    (train_images, train_labels), (test_images, test_labels) = fashion_mnist_data

    # Normalize the data so that the values all fall between 0 and 1.
    train_images = train_images / 255.0
    test_images = test_images / 255.0

    # Define the model using Keras. The last layer has no activation: it
    # emits logits, which is what the loss below is configured for.
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(10),
    ])

    model.compile(optimizer='adam',
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    # Run a training job with the specified number of epochs.
    model.fit(train_images, train_labels, epochs=10)

    # Evaluate the model and print the results.
    test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
    print('Test accuracy:', test_acc)

    # Save the model under the requested file name.
    model.save(f'{data_path}/{model_file}')

    # Save the test data as a pickle file to be used by the predict component.
    with open(f'{data_path}/test_data', 'wb') as f:
        pickle.dump((test_images, test_labels), f)

In [39]:
def predict(data_path, model_file, image_number):
    """Classify one test image with the saved model and record the outcome.

    Loads the model from ``{data_path}/{model_file}`` and the test split from
    ``{data_path}/test_data``, runs a single-image prediction, and writes the
    predicted class, its confidence and the true class to
    ``{data_path}/result.txt``.

    Args:
        data_path: Directory holding the saved model and the pickled test data.
        model_file: File name of the saved model, relative to ``data_path``.
        image_number: Index of the test image to classify; converted with int().
    """
    # func_to_container_op requires packages to be imported inside of the function.
    import pickle
    import numpy as np
    import tensorflow as tf
    from tensorflow import keras

    # Restore the trained Keras model.
    classifier = keras.models.load_model(f'{data_path}/{model_file}')

    # Read back the (images, labels) pair saved by the training step.
    with open(f'{data_path}/test_data','rb') as test_file:
        test_images, test_labels = pickle.load(test_file)

    # Human-readable names, indexed by label value.
    labels = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
              'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot')

    # Append a Softmax layer so the outputs can be read as probabilities.
    softmax_classifier = tf.keras.Sequential([classifier, tf.keras.layers.Softmax()])

    # Cast explicitly; see https://github.com/kubeflow/pipelines/issues/2320
    # for why the value cannot be assumed to arrive as an int.
    image_number = int(image_number)

    # Wrap the chosen image in a batch of one and run the prediction.
    batch = np.expand_dims(test_images[image_number], 0)
    probabilities = softmax_classifier.predict(batch)

    # Most likely class for the single image in the batch.
    predicted_index = np.argmax(probabilities[0])

    # Ground-truth label of the same image.
    true_index = test_labels[image_number]

    class_prediction = labels[predicted_index]
    confidence = 100 * np.max(probabilities)
    actual = labels[true_index]

    with open(f'{data_path}/result.txt', 'w') as result:
        result.write(
            " Prediction: {} | Confidence: {:2.0f}% | Actual: {}".format(
                class_prediction, confidence, actual
            )
        )

    print('Prediction has be saved successfully!')
    

## Create Pipeline Components from Functions 

In [40]:
# Create load, train, and predict lightweight components.
# All three run on the same base image, so the tag is named once here.
BASE_IMAGE = 'tensorflow/tensorflow:latest-gpu-py3'

load_op = comp.func_to_container_op(load_data, base_image=BASE_IMAGE)
train_op = comp.func_to_container_op(train_model, base_image=BASE_IMAGE)
predict_op = comp.func_to_container_op(predict, base_image=BASE_IMAGE)

## Define the Kubeflow Pipeline 

In [42]:
@dsl.pipeline(
    name='MNIST Fashion Pipeline',
    description='Demonstration pipeline for getting to know kubeflow pipelines functionality.'
)
def mnist_fashion_pipeline(
    data_path,
    model_file,
    image_number
):
    """Chain the load, train and predict components over one shared volume.

    Args:
        data_path: Path at which the volume is mounted in every step; also
            passed to each component as its working directory.
        model_file: File name handed to the train and predict components.
        image_number: Test image index handed to the predict component.
    """
    # Volume used to share data between the components.
    shared_volume = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Load step: mounts the freshly created volume at data_path.
    load_step = load_op(data_path).add_pvolumes(
        {data_path: shared_volume.volume}
    )

    # Train step: mounts the pvolume handed on by the load step.
    train_step = train_op(data_path, model_file).add_pvolumes(
        {data_path: load_step.pvolume}
    )

    # Predict step: mounts the pvolume handed on by the train step.
    predict_op(data_path, model_file, image_number).add_pvolumes(
        {data_path: train_step.pvolume}
    )
    

In [46]:
# Values supplied to the pipeline parameters when the run is submitted.
DATA_PATH = '/mnt'               # data_path: where the shared volume is mounted
MODEL_PATH = 'mnist_model.h5'    # model_file: file name of the saved model
IMAGE_NUMBER = 8                 # image_number: index of the test image to classify

In [47]:
# Alias for the pipeline function; the cell that compiles and submits the run
# refers to it by this name.
pipeline_func = mnist_fashion_pipeline

In [48]:
# Names under which the experiment and this run are created.
experiment_name = 'fashion_mnist_kubeflow'
run_name = f'{pipeline_func.__name__} run'

# One entry per pipeline parameter.
arguments = {
    "data_path": DATA_PATH,
    "model_file": MODEL_PATH,
    "image_number": IMAGE_NUMBER,
}

# Compile pipeline to generate compressed YAML definition of the pipeline.
package_path = f'{experiment_name}.zip'
kfp.compiler.Compiler().compile(pipeline_func, package_path)

# Create a client for the Kubeflow Pipelines endpoint.
client = kfp.Client(host='2dba4b335ce47b69-dot-us-east1.pipelines.googleusercontent.com')

# Submit the pipeline directly from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)