In [33]:
# Install the third-party packages for this notebook via a shell escape.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases.
# NOTE(review): boto3 is not imported directly below — presumably required by
# MLflow for S3-backed artifact storage; confirm.
!pip3 install mlflow keras boto3

Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/66/fa/8b48eaf034186c938b96f121acc0df17ed9ad63d03bfd672ef538b1acfbd/boto3-1.9.188-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 6.9MB/s ta 0:00:01
Collecting jmespath<1.0.0,>=0.7.1 (from boto3)
  Downloading https://files.pythonhosted.org/packages/83/94/7179c3832a6d45b266ddb2aac329e101367fbdb11f425f13771d27f225bb/jmespath-0.9.4-py2.py3-none-any.whl
Collecting botocore<1.13.0,>=1.12.188 (from boto3)
[?25l  Downloading https://files.pythonhosted.org/packages/10/cb/8dcfb3e035a419f228df7d3a0eea5d52b528bde7ca162f62f3096a930472/botocore-1.12.188-py2.py3-none-any.whl (5.6MB)
[K    100% |████████████████████████████████| 5.6MB 9.0MB/s eta 0:00:011
[?25hCollecting s3transfer<0.3.0,>=0.2.0 (from boto3)
[?25l  Downloading https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl (70kB)
[K    100%

In [49]:
from __future__ import print_function

import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import os
import subprocess
import argparse
import time

import mlflow
import mlflow.keras


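# Reduce TensorFlow C++ log spam (level 3 hides INFO, WARNING and ERROR messages).
# NOTE(review): this is usually set before TensorFlow is imported — confirm it takes effect here.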
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'

def preprocessing():
  """Load Fashion-MNIST and prepare it for the convolutional model.

  Pixel values are divided by 255.0 so they lie in 0.0-1.0, and every image is
  reshaped to (28, 28, 1) so it carries an explicit single channel axis.

  Returns:
    A tuple ``(train_images, train_labels, test_images, test_labels)``.
  """
  fashion_mnist = keras.datasets.fashion_mnist
  (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

  # scale the values to 0.0 to 1.0
  train_images = train_images / 255.0
  test_images = test_images / 255.0

  # reshape for feeding into the model (adds the trailing channel axis)
  train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
  test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)

  # Class names, listed in label order (presumably index 0-9 — confirm against
  # the dataset documentation): T-shirt/top, Trouser, Pullover, Dress, Coat,
  # Sandal, Shirt, Sneaker, Bag, Ankle boot.

  print('\ntrain_images.shape: {}, of {}'.format(train_images.shape, train_images.dtype))
  print('test_images.shape: {}, of {}'.format(test_images.shape, test_images.dtype))

  return train_images, train_labels, test_images, test_labels

def train(train_images, train_labels, epochs, model_summary_path, batch_size=64):
  """Build, compile and fit the small CNN classifier.

  Args:
    train_images: Training images; the first layer expects shape (28, 28, 1)
      per sample.
    train_labels: Integer class labels (the loss is sparse categorical
      cross-entropy).
    epochs: Number of training epochs.
    model_summary_path: Directory for TensorBoard logs. When falsy, no
      TensorBoard callback is attached.
    batch_size: Mini-batch size passed to ``model.fit``. Defaults to 64, the
      value that was previously hard-coded.

  Returns:
    The fitted ``keras.Sequential`` model.
  """
  # Collect optional callbacks so that a single fit() call serves both cases.
  callbacks = []
  if model_summary_path:
    callbacks.append(keras.callbacks.TensorBoard(log_dir=model_summary_path))

  model = keras.Sequential([
    keras.layers.Conv2D(input_shape=(28,28,1), filters=8, kernel_size=3,
                        strides=2, activation='relu', name='Conv1'),
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation=tf.nn.softmax, name='Softmax')
  ])
  model.summary()

  model.compile(optimizer=tf.train.AdamOptimizer(),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy']
                )

  model.fit(train_images, train_labels, epochs=epochs, batch_size=batch_size,
            callbacks=callbacks)

  # Log the batch size that was actually used, not a separately typed literal.
  mlflow.log_param('batch_size', batch_size)

  return model

def eval(model, test_images, test_labels):
  """Evaluate ``model`` on the test split, print the scores and log them to MLflow.

  Note: this function shadows Python's builtin ``eval`` within this notebook.
  """
  loss_value, accuracy_value = model.evaluate(test_images, test_labels)

  report = '\nTest accuracy: {}, test loss {}'.format(accuracy_value, loss_value)
  print(report)

  # Record both scores on the active MLflow run, accuracy first.
  for metric_name, metric_value in (("accuracy", accuracy_value),
                                    ("loss", loss_value)):
    mlflow.log_metric(metric_name, metric_value)

def export_model(model, model_export_path, version=1):
  """Export ``model`` as a TensorFlow SavedModel under a versioned directory.

  Args:
    model: Keras model whose input and outputs are exported.
    model_export_path: Base directory for the export.
    version: Name of the version sub-directory created under
      ``model_export_path``. Defaults to 1, the value that was previously
      hard-coded.
  """
  export_path = os.path.join(model_export_path, str(version))

  # The graph and weights are taken from the Keras backend session.
  tf.saved_model.simple_save(
    keras.backend.get_session(),
    export_path,
    inputs={'input_image': model.input},
    outputs={t.name: t for t in model.outputs})

  print('\nSaved model: {}'.format(export_path))


def main(argv=None):
  """Train, evaluate and track the Fashion-MNIST model in one MLflow run.

  Args:
    argv: Optional list of command-line style arguments (for example
      ``['--epochs=3', '--model_export_path=/tmp/model']``). When ``None``,
      the notebook default ``['--epochs=10']`` is used.
  """
  parser = argparse.ArgumentParser(description='Fashion MNIST Tensorflow Example')
  parser.add_argument('--model_export_path', type=str, help='Model export path')
  parser.add_argument('--model_summary_path', type=str,  help='Model summary files for Tensorboard visualization')
  parser.add_argument('--epochs', type=int, default=5, help='Training epochs')
  # Honour the caller's argv. It was previously ignored: the argument list was
  # always ['--epochs=10'], which stays the default when argv is None.
  args = parser.parse_args(args=['--epochs=10'] if argv is None else argv)

  # File Based Tracking URI. Use NFS in this case
  # users_home = '/tmp/shjiaxin'
  # experiment_base_path = '%s/experiments' % users_home
  # tracking_uri='file://%s' % experiment_base_path

  # Remote Tracking Server URI. Use kubernetes Service.
  tracking_uri = "http://mlflow-tracking-server:5000"
  mlflow.set_tracking_uri(tracking_uri)

  experiment_name = 'mlflow'
  mlflow.set_experiment(experiment_name)

  with mlflow.start_run():
    start_time = time.time()
    train_images, train_labels, test_images, test_labels = preprocessing()
    model = train(train_images, train_labels, args.epochs, args.model_summary_path)
    eval(model, test_images, test_labels)

    mlflow.log_param('epochs', args.epochs)

    # SavedModel export is optional and only happens when a path was given.
    if args.model_export_path:
      export_model(model, args.model_export_path)

    # Use MLFlow fashion to persist model
    mlflow.keras.log_model(model, 'model_keras')

    # Measure running time (data loading, training, evaluation and model logging)
    duration_in_seconds = time.time() - start_time
    print("This model took", duration_in_seconds, "seconds to train and test.")
    mlflow.log_metric("time_duration", duration_in_seconds)

# Entry point: run the whole experiment when this cell or script is executed directly.
if __name__ == '__main__':
  main()


train_images.shape: (60000, 28, 28, 1), of float64
test_images.shape: (10000, 28, 28, 1), of float64
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv1 (Conv2D)               (None, 13, 13, 8)         80        
_________________________________________________________________
flatten_7 (Flatten)          (None, 1352)              0         
_________________________________________________________________
Softmax (Dense)              (None, 10)                13530     
Total params: 13,610
Trainable params: 13,610
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test accuracy: 0.8758000135421753, test loss 0.3537561374902725
This model took 30.343011379241943 seconds to train and test.
