In [1]:
%reload_ext autoreload
%autoreload 2

import os
import mlflow
import numpy as np
import git
import subprocess
from tqdm import tqdm

import mlflow_info

import keras 
from keras.utils import np_utils
from keras.layers.core import Dense, Dropout, Activation

import warnings
warnings.filterwarnings('ignore')

%reload_ext mlflow_info

## creating a ssh-tunnel to server in the background

In [2]:
""" creating a ssh-tunnel to server in the background """
command = 'ssh -N -L 5000:localhost:5432 artinmajdi@data7-db1.cyverse.org &'
ssh_session = subprocess.Popen('exec ' + command, stdout=subprocess.PIPE, shell=True)

## Load data

In [3]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

num_pixels = x_train.shape[1] * x_train.shape[2]

x_train = x_train.reshape((x_train.shape[0], num_pixels)).astype('float32')
x_test  = x_test.reshape( (x_test.shape[0],  num_pixels)).astype('float32')

x_train = x_train[1:5000] / 255
x_test  = x_test / 255

y_train = np_utils.to_categorical(y_train[1:5000])
y_test  = np_utils.to_categorical(y_test)

num_classes = y_test.shape[1]

## Architecture

In [4]:
mlflow.keras.autolog()

model = keras.models.Sequential()
model.add(Dense(512, input_shape=(784,)))
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(10))
model.add(Activation('softmax'))

## setting up mlflow

In [5]:

""" MLflow settings: 
    The style we should use when running mlflow ui
        Postgres server: server = f'{dialect_driver}://{username}:{password}@{ip}/{database_name}' 
        Local:           server = "file:/Users/artinmac/Documents/Research/Data7/mlflow/mlrun_store" """

postgres_connection_type = { 'direct':    ('5432', 'data7-db1.cyverse.org'),
                            'ssh-tunnel': ('5000', 'localhost') }

port, host = postgres_connection_type['ssh-tunnel']


""" Setting up the artifact server """
username = 'username'
password = 'password'
database_name  = 'resbaz2021'
dialect_driver = 'postgresql'

server = '{dialect_driver}://{username}:{password}@{host}:{port}/{database_name}'

Artifacts = {
    'local':      'file:/{path_to_artifact_store}',
    'hpc':        'sftp://{user}:{password}@filexfer.hpc.arizona.edu:{path_to_artifact_store}',
    'atmosphere': 'sftp://{user}:{password}@{ip_address}:{path_to_artifact_store}',
    'cyverse':    'file:/{path_to_artifact_store}',
    'data7_db1':  'sftp://{user}:{password}@{ip_address}:{path_to_artifact_store}'}

artifact = Artifacts['data7_db1']

In [6]:
server, artifact = mlflow_info.load()

""" setting the trackinng uri """
mlflow.set_tracking_uri(server)

""" Creating/Setting the experiment
    Line below should be commented if the experiment is already created
    If kept commented during the first run of a new experiment, the set_experiment 
    will automatically create the new experiment with local artifact storage """

experiment_name = 'exp_mnist'
mlflow.create_experiment(name=experiment_name, artifact_location=artifact)
mlflow.set_experiment(experiment_name=experiment_name)

## model training and optimization

In [7]:
%%time 

# Starting the MLflow 
run = mlflow.start_run()
mlflow.set_tag(f'mlflow.note.content',f'run_id: {run.info.run_id}')


# model compiling
learning_rate = 0.001
model.compile( optimizer = keras.optimizers.Adam(learning_rate=learning_rate), 
               loss      = keras.losses.categorical_crossentropy,
               metrics   = [keras.metrics.binary_accuracy] )

# model optimization
history = model.fit(x_train, y_train, epochs=5, batch_size=100, validation_data=(x_test, y_test))

# Model evaluation
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Accuracy:', test_acc) 
print('Loss: '   , test_loss)

prediction = model.predict(x_test)
predicted_classes = np.argmax(prediction, axis=1)

# Saving MLflow parameters & metrics
mlflow.log_param("epochs",          history.params['epochs'])
mlflow.log_param("steps_per_epoch", history.params['steps'])
mlflow.log_metric("accuracy",       test_acc)
mlflow.log_metric("test_loss",      test_loss)

# saving git commit hash
repo = git.Repo(search_parent_directories=True)
git_commit_hash = repo.head.object.hexsha
print('git commit hash', git_commit_hash)
mlflow.set_tag('mlflow.source.git.commit', git_commit_hash)


# ending mlflow session
mlflow.end_run()

print('process completed')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.9860796928405762
Loss:  0.23816725611686707
git commit hash 58524e1b77bdba60e3cd5a37e53b636331e81194
process completed
CPU times: user 13.3 s, sys: 2.03 s, total: 15.4 s
Wall time: 9.4 s


## finding the optimum learning rate

In [8]:
# Starting the MLflow 
parent_run = mlflow.start_run(run_name='learning rate')

for learning_rate in tqdm(np.linspace(start=0.01,stop=0.1,num=5)):
    
    learning_rate = np.floor(learning_rate*1000)/1000

    with mlflow.start_run(run_name=f'LR {learning_rate}', nested=True) as child_run:
        mlflow.set_tag(f'mlflow.note.content',f'run_id: {child_run.info.run_id}')

        # model compiling
        model.compile( optimizer = keras.optimizers.Adam(learning_rate=learning_rate), 
                        loss     = keras.losses.categorical_crossentropy,
                        metrics  = [keras.metrics.binary_accuracy] )

        # model optimization
        history = model.fit(x_train, y_train, epochs=5, batch_size=100, validation_data=(x_test, y_test),verbose=0)

        # Model evaluation
        test_loss, test_acc = model.evaluate(x_test, y_test)
        print('\nAccuracy:', test_acc) 
        print('Loss: '   , test_loss,'\n')

        prediction = model.predict(x_test)
        predicted_classes = np.argmax(prediction, axis=1)

        # Saving MLflow parameters & metrics
        mlflow.log_param("epochs",          history.params['epochs'])
        mlflow.log_param("steps_per_epoch", history.params['steps'])
        mlflow.log_metric("accuracy",       test_acc)
        mlflow.log_metric("test_loss",      test_loss)

        # saving git commit hash
        repo = git.Repo(search_parent_directories=True)
        git_commit_hash = repo.head.object.hexsha
        mlflow.set_tag('mlflow.source.git.commit', git_commit_hash)

# ending mlflow session
mlflow.end_run()

print('\nprocess completed')


Accuracy: 0.9867396354675293
Loss:  0.23927639424800873 


Accuracy: 0.9780802726745605
Loss:  0.6071300506591797 


Accuracy: 0.9459601640701294
Loss:  1.3074679374694824 


Accuracy: 0.9023482203483582
Loss:  2.256269693374634 


Accuracy: 0.900047242641449
Loss:  2.302863359451294 

100%|██████████| 5/5 [00:44<00:00,  8.88s/it]
process completed



## finding the optimum batch size

In [9]:
# Starting the MLflow 
parent_run = mlflow.start_run(run_name='batch size')


for batch_size in tqdm(np.linspace(start=50,stop=200,num=6)):
    batch_size = int(batch_size)

    with mlflow.start_run(run_name=f'bsize {batch_size}', nested=True) as child_run:
        mlflow.set_tag(f'mlflow.note.content',f'run_id: {child_run.info.run_id}')

        # model compiling
        model.compile( optimizer = keras.optimizers.Adam(learning_rate=0.001), 
                        loss     = keras.losses.categorical_crossentropy,
                        metrics  = [keras.metrics.binary_accuracy] )

        # model optimization
        history = model.fit(x_train, y_train, epochs=5, batch_size=batch_size, validation_data=(x_test, y_test),verbose=0)

        # Model evaluation
        test_loss, test_acc = model.evaluate(x_test, y_test)
        print('\nAccuracy:', test_acc) 
        print('Loss: '   , test_loss,'\n')

        prediction = model.predict(x_test)
        predicted_classes = np.argmax(prediction, axis=1)

        # Saving MLflow parameters & metrics
        mlflow.log_param("epochs",          history.params['epochs'])
        mlflow.log_param("batch_size",      batch_size)
        mlflow.log_param("steps_per_epoch", history.params['steps'])

        mlflow.log_metric("accuracy",       test_acc)
        mlflow.log_metric("test_loss",      test_loss)

        # saving git commit hash
        repo = git.Repo(search_parent_directories=True)
        git_commit_hash = repo.head.object.hexsha
        mlflow.set_tag('mlflow.source.git.commit', git_commit_hash)

# ending mlflow session
mlflow.end_run()

print('\nprocess completed')


Accuracy: 0.9000972509384155
Loss:  2.2998170852661133 


Accuracy: 0.9000972509384155
Loss:  2.2995998859405518 


Accuracy: 0.9001073241233826
Loss:  2.299405336380005 


Accuracy: 0.9001073241233826
Loss:  2.2993671894073486 


Accuracy: 0.9001073241233826
Loss:  2.2993459701538086 


Accuracy: 0.9001472592353821
Loss:  2.298449754714966 

100%|██████████| 6/6 [00:59<00:00,  9.96s/it]
process completed



## list all runs

In [None]:
mlflow.list_run_infos(experiment_id='4')[0]

## modify an existing run

In [None]:
with mlflow.start_run(run_id='0861236387ba4d7683e589f206dff964') as run: 

    mlflow.set_tag('status','final optimized learning rate')

## downloading artifacts

In [None]:
client = mlflow.tracking.MlflowClient()
client.download_artifacts(run_id='0861236387ba4d7683e589f206dff964', path='model/MLmodel', dst_path='../')

## duplicate a run

In [None]:
source_run = mlflow.get_run(run_id='0861236387ba4d7683e589f206dff964')

mlflow.create_experiment(name='new_exp', artifact_location=artifact)
mlflow.set_experiment(experiment_name='new_exp')

mlflow.log_metrics(source_run.data.metrics)
mlflow.log_params(source_run.data.params)

file_path = client.download_artifacts(run_id='0861236387ba4d7683e589f206dff964', path='', dst_path='../')



mlflow.set_tag('mlflow.source.git.commit', repo.head.object.hexsha)
mlflow.set_tag('mlflow.source.name'      , old_run.data.tags['mlflow.source.name'])
mlflow.set_tag('mlflow.log-model.history', old_run.data.tags['mlflow.log-model.history'])


mlflow.log_artifacts(file_path + '/model')

mlflow.end_run()

## loading models

In [None]:
MODEL_REGISTRY_NAME = 'mnist_classifier'

# loading from model registry using the version of the registered model
model = mlflow.keras.load_model(model_uri=f'models:/{MODEL_REGISTRY_NAME}/1',compile=False)

# loading from model registry using the stage of the registered model
model = mlflow.keras.load_model(model_uri=f'models:/{MODEL_REGISTRY_NAME}/production',compile=False)

# loading from a specific run
run_id = 'f7d6e3b515da4ed89578cdd53412fcf8'
model = mlflow.keras.load_model(model_uri=f'runs:/{run_id}/model',compile=False)


In [11]:
# End the ssh session. If this failed, we can type 'pkill ssh' in the terminal 
ssh_session.kill()

print('Optimization Complete')

Optimization Complete


In [12]:
# mlflow ui --backend-store-uri postgresql://artinmajdi:1234@localhost:5000/chest_db --port 6789