# Project - Day 4 - MLFlow evaluate and fit

## Set parameters
The cell below is already tagged as `parameters`. Use it to declare any papermill parameter that you may want to override at MLFlow runtime (e.g. the location of the models trained in the previous step).

In [None]:
# Papermill parameter: URI of the model artifacts produced by the previous
# pipeline step. "dummy" is only a placeholder default; the real value is
# expected to be injected at run time.
model_run_uri = "dummy"

## Loading libraries, data and model

### Loading libraries and model from MLFlow

In [2]:
import warnings

warnings.filterwarnings('ignore')
## We will be using Numpy, Pyplot and Tensorflow as our scientific tool box
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow as tf

## BytesIO for defining in-memory file-like objects
from io import BytesIO

## Dask and in particular dask array for defining OOM pipelines
import dask
import dask.array as da

## Progress bars
from tqdm import tqdm

import mlflow


### Reproduce the final result plot based on the new model trained from the pipeline

You should now be able to reproduce the steps of the Day-3 model deployment and adapt them to the MLFlow pipeline:

- load the model from the artifact location of the previous step
  - little help: `mlflow.artifacts.download_artifacts(artifact_uri=model_run_uri, dst_path="./models")`
- evaluate and fit the results, storing the plot as MLFlow artifacts


In [3]:
import hashlib
import os

import boto3

# Single source of truth for the MinIO endpoint shared by both boto3 handles.
# Can be overridden with the MINIO_ENDPOINT_URL environment variable.
MINIO_ENDPOINT_URL = os.environ.get(
    "MINIO_ENDPOINT_URL",
    "https://minio.131.154.99.220.myip.cloud.infn.it",
)

# The account name doubles as the bucket name further down in the notebook.
# It can be overridden with MINIO_USERNAME so the notebook is not tied to a
# single user.
username = os.environ.get("MINIO_USERNAME", 'onofrioa')

# By default the secret key is the MD5 hex digest of the user name.
# NOTE(review): a secret derived from a public value is not really secret;
# set MINIO_PASSWORD in the environment to supply a proper one.
hash_object = hashlib.md5(f'{username}'.encode())
password = os.environ.get("MINIO_PASSWORD", hash_object.hexdigest())

# High-level resource API: used to download objects (Bucket/Object handles).
s3 = boto3.resource(
    's3',
    endpoint_url=MINIO_ENDPOINT_URL,
    aws_access_key_id=username,
    aws_secret_access_key=password
)

# Low-level client API: used to list the content of a bucket.
s3client = boto3.client(
    's3',
    aws_access_key_id=username,
    aws_secret_access_key=password,
    endpoint_url=MINIO_ENDPOINT_URL,
    region_name='default',
)


def check_latest_uploads(bucket_name):
    """Print and return the key of every object stored in a bucket.

    The listing goes through the module-level ``s3client`` and is paginated,
    so buckets holding more keys than fit in a single response are listed
    completely. An empty bucket yields an empty list instead of raising
    ``KeyError`` on the missing ``Contents`` entry.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to list.

    Returns
    -------
    list of str
        Object keys, in the order returned by the service.
    """
    object_names = []
    paginator = s3client.get_paginator('list_objects')
    for page in paginator.paginate(Bucket=bucket_name):
        # 'Contents' is absent from a page that carries no objects.
        for entry in page.get('Contents', []):
            print(entry['Key'])
            object_names.append(entry['Key'])
    return object_names

# Full listing of the user's bucket (the bucket is named after the user).
object_names_old = check_latest_uploads(username)

# Keep only the keys that contain 'cygno-store-preprocess-day3-bis'.
object_names = [
    key
    for key in object_names_old
    if 'cygno-store-preprocess-day3-bis' in key
]
        

292831993526707729/a68a40bb153d485daf1fa023e92f8649/artifacts/test.txt
482129323306701776/2b8632fae5f94915a4e31b4d4aaadcc5/artifacts/model/MLmodel
482129323306701776/2b8632fae5f94915a4e31b4d4aaadcc5/artifacts/model/conda.yaml
482129323306701776/2b8632fae5f94915a4e31b4d4aaadcc5/artifacts/model/model.pkl
482129323306701776/2b8632fae5f94915a4e31b4d4aaadcc5/artifacts/model/python_env.yaml
482129323306701776/2b8632fae5f94915a4e31b4d4aaadcc5/artifacts/model/requirements.txt
482129323306701776/37fef82508e34ef59ac81c44fa25575a/artifacts/dataset.csv
482129323306701776/3cd370b64eb24d369099680eb561187a/artifacts/test.csv
482129323306701776/3cd370b64eb24d369099680eb561187a/artifacts/train.csv
482129323306701776/46d585f75f034787b38189cf7daa531d/artifacts/model/MLmodel
482129323306701776/46d585f75f034787b38189cf7daa531d/artifacts/model/conda.yaml
482129323306701776/46d585f75f034787b38189cf7daa531d/artifacts/model/model.pkl
482129323306701776/46d585f75f034787b38189cf7daa531d/artifacts/model/python_en

In [4]:
def load_npz_from_minio(s3,bucket_name,object_name):
    """Download an object from Minio and open it with ``np.load``.

    The whole object body is read into memory and wrapped in a ``BytesIO``
    so that numpy can treat it as a file.

    Parameters
    ----------
    s3 : object exposing ``Bucket(name).Object(key).get()``
        Handle used to reach the storage (a boto3 S3 resource in this notebook).
    bucket_name : str
        Bucket holding the object.
    object_name : str
        Key of the object to download.

    Returns
    -------
    Whatever ``np.load`` returns for the downloaded bytes.
    """
    response = s3.Bucket(bucket_name).Object(object_name).get()
    payload = response["Body"].read()
    return np.load(BytesIO(payload))
    

def inspect_np(np_file):
    """Print a one-line summary (key, shape, dtype) for each array in a npz file.

    Parameters
    ----------
    np_file : mapping of str to array
        Object exposing ``keys()`` and item access, such as the result of
        ``np.load`` on a npz archive.
    """
    names = list(np_file.keys())
    print ("Keys in file: ", ", ".join(names))

    # Fixed-width columns keep the report aligned across entries.
    row_template = " - {:<15s}   shape: {:<20s}   dtype: {}"
    for name in names:
        entry = np_file[name]
        print (row_template.format(name, str(entry.shape), entry.dtype))

# Open the last object of the filtered listing and report what it contains.
latest_object_name = object_names[-1]
npz_file = load_npz_from_minio(s3, username, latest_object_name)
print(npz_file)
inspect_np(npz_file)


<numpy.lib.npyio.NpzFile object at 0x7ff9e949a620>
Keys in file:  image, tstamp
 - image             shape: (10, 128, 128)         dtype: float64
 - tstamp            shape: (10,)                  dtype: datetime64[us]


In [5]:
@dask.delayed
def load_array_from_minio(minio_client, bucket_name, object_name, npz_key):
    """Return the ``npz_key`` entry of a npz file stored in Minio.

    Wrapped with ``dask.delayed``: calling it only records the task, and the
    download happens when the resulting graph is computed.
    """
    return load_npz_from_minio(minio_client, bucket_name, object_name)[npz_key]

#test = load_array_from_minio(s3, username, obj, 'image')


# One lazy dask chunk per Minio object. `from_delayed` cannot look inside the
# delayed payload, so shape and dtype are declared up front and must match
# the stored arrays: the `image` entry is float64 (see the `inspect_np`
# report), not uint8 as previously declared.
# NOTE(review): the shape assumes every object holds exactly 10 frames of
# 128x128; only the last object was inspected, so confirm for the others.
delayed_images = [
    da.from_delayed(
        load_array_from_minio(s3, username, obj, 'image'),
        shape=(10, 128, 128),
        dtype=np.float64
    )
    for obj in object_names
]

# Concatenate along the first axis: one block per source object.
my_delayed_img = da.concatenate(delayed_images)

# Sanity check: materialise a single frame to confirm the pipeline works.
my_delayed_img[-15].compute()

array([[0.425, 0.375, 0.425, ..., 0.325, 0.25 , 0.65 ],
       [0.4  , 0.35 , 0.4  , ..., 0.425, 0.3  , 0.55 ],
       [0.4  , 0.4  , 0.425, ..., 0.6  , 0.375, 0.45 ],
       ...,
       [0.55 , 0.65 , 0.475, ..., 1.   , 0.425, 0.55 ],
       [0.525, 0.5  , 0.575, ..., 0.575, 0.6  , 0.475],
       [0.3  , 0.225, 0.625, ..., 0.4  , 0.5  , 0.75 ]])

In [None]:
with mlflow.start_run(tags={"mlflow.runName": "Test"}) as mlrun:
    # Fetch the model logged by the previous pipeline step. Its location is
    # the papermill parameter `model_run_uri`; the name `model_path` used
    # here before is not defined anywhere in the notebook.
    local_model_path = mlflow.artifacts.download_artifacts(
        artifact_uri=model_run_uri, dst_path="./models"
    )
    loaded_model = mlflow.pyfunc.load_model(local_model_path)

    # Score one dask block at a time so that a single chunk is materialised
    # per prediction call. The pyfunc wrapper returned by `load_model` exposes
    # `predict` (it has no `predict_on_batch`), so each lazy block is computed
    # into a numpy array before being handed to the model.
    # NOTE(review): if the logged model carries a signature, the block's
    # dtype/shape must match it; confirm against the training step.
    predictions = np.concatenate([
        np.asarray(loaded_model.predict(block.compute())).flatten()
        for block in tqdm(my_delayed_img.blocks, total=my_delayed_img.numblocks[0])
    ])
    