In [None]:
# Import modules and libraries needed
import mlflow
from mlflow.tracking.client import MlflowClient
from mlflow.tracking import MlflowClient as mf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials
from hyperopt.pyll import scope
import numpy as np
import pandas as pd
from joblib import dump

# Point the client at the tracking server BEFORE selecting the experiment.
# set_experiment() resolves (or creates) the experiment against whatever
# tracking URI is current, so calling it first would target the default
# local store instead of the server.
mlflow.set_tracking_uri("http://localhost:5000")

# Set the MLflow experiment used for tracking
mlflow.set_experiment("Classification Model")

In [1]:
# Import the project's packaged training helpers and print whatever
# get_run_id returns for train_model.
# NOTE(review): presumably this runs the tuning job and yields its MLflow run ID
# -- confirm against functions/train_model.py.
# A later cell in this notebook defines its own train_model, which shadows this import.
from functions.train_model import train_model, get_run_id
print(get_run_id(train_model))

100%|██████████| 2/2 [00:20<00:00, 10.08s/trial, best loss: -0.9336666666666666]
a62c477c99234057aab1438b01ca60ca


In [None]:
# TODO: try the RAPIDS equivalents (cudf, cuml) so this can run on GPUs,
# which the author expects to be far faster.
# Load the diabetes data set and discard the PatientID column.
df = pd.read_csv("data/diabetes.csv").drop(columns=['PatientID'])

def train_model(params, test_size = 0.3, registered_model_name= None):
    """Fit a RandomForestClassifier on the module-level ``df`` and log it to MLflow.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``RandomForestClassifier``.
    test_size : float, default 0.3
        Hold-out size passed to ``train_test_split``. When it is not greater
        than 0 no split is made: the model is fitted on every row and no
        metrics are logged.
    registered_model_name : str or None, default None
        Forwarded to ``mlflow.sklearn.log_model``.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``, the shape hyperopt's
        ``fmin`` expects. The loss is NaN when no hold-out set was evaluated.
    """
    # Split between features and label
    X = df.drop(["Diabetic"], axis = 1)
    y = df["Diabetic"]

    # train_test_split rejects a test size of 0, so only split when a
    # hold-out set was actually requested; otherwise train on everything.
    evaluate = test_size > 0.0
    if evaluate:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 123)
    else:
        X_train, y_train = X, y

    # model and fit
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    # Log the fitted model. MLflow serialises it itself, so no separate
    # pickle/artifact step is needed.
    mlflow.sklearn.log_model(model, artifact_path="Model_Artifacts", registered_model_name=registered_model_name)

    if evaluate:
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        mlflow.log_metric("recall", rec)
    else:
        # Nothing was held out, so there is no score to report.
        acc = np.nan

    # fmin (hyperopt) minimises the loss; we want to maximise accuracy, so
    # the loss is the negated accuracy.
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
# Start the tracking server from a terminal before running the code below:
# mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts --host 0.0.0.0

# One manual training run with hand-picked hyperparameters
params = dict(max_features=0.7, n_estimators=100, max_depth=10)
train_model(params)

In [None]:
# HyperOpt
# Search space: integer depth and forest size, fractional max_features
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 1, 10, 1)),
    # alternative: 'max_features': hp.choice("max_features", ["auto", "sqrt"])
    max_features=hp.uniform("max_features", 0.0, 1.0),
    n_estimators=scope.int(hp.quniform('n_estimators', 100, 200, 50)),
)

algo = tpe.suggest

# Plain hyperopt Trials object passed to fmin below
# (the original note mentions Spark trials for Databricks; this is not one).
trials = Trials()

mlflow.end_run()  # close out any run in progress

# Run the hyperparameter tuning job inside a single MLflow run.
# fmin returns the best parameters it found.
with mlflow.start_run() as run:
    mlflow.set_tags({"model_type": "Classification Model", "resource_type": "POC Work"})
    best = fmin(fn=train_model, space=search_space, algo=algo, trials=trials, max_evals=2)

    mlflow.set_tag("best_params", str(best))

    # fmin reports the quantised values as floats; cast the integer ones back
    best_params = {
        "max_features": best["max_features"],
        "n_estimators": int(best["n_estimators"]),
        "max_depth": int(best["max_depth"]),
    }

    # TODO(investigate): calling
    #   train_model(best_params, test_size = 0.0001, registered_model_name="Diabetes_Prediction")
    # failed with
    #   RestException: INVALID_PARAMETER_VALUE: Model registry functionality is unavailable;
    #   got unsupported URI './mlruns' for model registry data storage. Supported URI schemes
    #   are: ['postgresql', 'mysql', 'sqlite', 'mssql'].
    # See https://www.mlflow.org/docs/latest/tracking.html#storage for how to run an MLflow
    # server against one of the supported backend storage locations.
    # (mlflow.sklearn.save_model to a fixed output path was also tried; it errors if the path exists.)

    mlflow.log_params(best_params)

    # Refit with the best parameters. This uses train_model's default
    # test_size, so a hold-out split is still made here.
    train_model(best_params)

    run_id = run.info.run_id

mlflow.end_run()

In [None]:
# Artifact path the model was logged under, and the ID of the tuning run
artifact_path = "Model_Artifacts"
print(run_id)

In [None]:
# Tried:
# # mlflow models serve -m runs:/bce20ce3cc5f409e9de0c9bb4b13d415/model --port 1234
# mlflow models serve --model-uri runs:/bce20ce3cc5f409e9de0c9bb4b13d415/model --no-conda
# Error:
# MlflowException: Run 'bce20ce3cc5f409e9de0c9bb4b13d415' not found

# Tried:
# mlflow models serve -m "models:/Diabetes Prediction Model/Staging" --port 1234
# Error:
# MlflowException: Model Registry features are not supported by the store with URI: 'file:///C:/Users/lugba/OneDrive/Desktop/InfoSys/EY_POC/Models/test_ml_flow/mlruns'. Stores with the following URI schemes are supported: ['databricks', 'http', 'https', 'postgresql', 'mysql', 'sqlite', 'mssql']






In [None]:
# Register the model that was logged under `artifact_path` in run `run_id`,
# using the registry name we want it to appear under.
result = mlflow.register_model(
    model_uri="runs:/{}/{}".format(run_id, artifact_path),
    name="Diabetes Prediction Model",
)

In [None]:
client = mf() # alias for MlflowClient

model_name = "Diabetes Prediction Model"
filter_string = "name='{}'".format(model_name)
results = client.search_registered_models(filter_string=filter_string)

# latest_versions holds the latest version per stage, so simply keeping the
# last one iterated does not necessarily give the newest version. Collect
# them all and pick the numerically highest instead.
candidate_versions = [mv for res in results for mv in res.latest_versions]
if not candidate_versions:
    # Fail here with a clear message rather than with a NameError in a later cell.
    raise LookupError("No registered versions found for model '{}'".format(model_name))

model_version = max(candidate_versions, key=lambda mv: int(mv.version)).version
print(model_version)

In [None]:
# Move the selected model version into the staging stage
client.transition_model_version_stage(name=model_name, version=model_version, stage='staging')

# # Transition model version and retrieve details using API
# # https://docs.databricks.com/applications/mlflow/model-registry-example.html#:~:text=%20MLflow%20Model%20Registry%20example%20%201%20Load,component%20defines%20functions%20for%20loading%20models...%20More%20?msclkid=79201fa1b94311eca1609084b53d7e5c
# client.transition_model_version_stage(
#   name=model_details.name,
#   version=model_details.version,
#   stage='Production',
# )
# model_version_details = client.get_model_version(
#   name=model_details.name,
#   version=model_details.version,
# )
# print("The current model stage is: '{stage}'".format(stage=model_version_details.current_stage))

# latest_version_info = client.get_latest_versions(model_name, stages=["Production"])
# latest_production_version = latest_version_info[0].version
# print("The latest production version of the model '%s' is '%s'." % (model_name, latest_production_version))

In [None]:
# Load the sklearn model that the tuning run logged
model_recent_run = mlflow.sklearn.load_model("runs:/{}/{}".format(run_id, artifact_path))
type(model_recent_run)

# One sample of eight feature values (no PatientID), shaped as a single row
test_data_no_ID = [0, 171, 80, 34, 23, 43.509726, 1.213191, 21]
test_data_reshaped_noID = np.array([test_data_no_ID])

prediction = model_recent_run.predict(test_data_reshaped_noID)
print(prediction)

In [None]:
import mlflow.pyfunc


def _load_registered(model_uri):
    """Announce the URI being loaded, then load it as a pyfunc model."""
    print("Loading registered model version from URI: '{model_uri}'".format(model_uri=model_uri))
    return mlflow.pyfunc.load_model(model_uri)


# Load the model version looked up earlier
model_version_uri = "models:/{model_name}/{model_version}".format(model_name=model_name, model_version=model_version)
model_version_latest = _load_registered(model_version_uri)

# Load whichever version is currently in staging
model_staging_uri = "models:/{model_name}/staging".format(model_name=model_name)
model_staging = _load_registered(model_staging_uri)

# # load model in production
# model_production_uri = "models:/{model_name}/production".format(model_name=model_name)

# print("Loading registered model version from URI: '{model_uri}'".format(model_uri=model_production_uri))
# model_production = mlflow.pyfunc.load_model(model_production_uri)

In [None]:
# Same single-row sample as before, scored with the registered model version
test_data_no_ID = [0, 171, 80, 34, 23, 43.509726, 1.213191, 21]
test_data_reshaped_noID = np.array([test_data_no_ID])

prediction_latest = model_version_latest.predict(test_data_reshaped_noID)
print(prediction_latest)

In [None]:
# Trying out this function for flask --- don't know why it is not working within flask
def mlflow_model_version():
    """Return the registered model's name and its highest registered version.

    Queries the MLflow model registry for "Diabetes Prediction Model" and
    inspects the ``latest_versions`` of every match.

    Returns
    -------
    tuple
        ``(model_name, model_version)``, where ``model_version`` is the
        ``version`` attribute exactly as the registry reported it.

    Raises
    ------
    LookupError
        If the registry returns no versions for the model (for example when
        the client is not pointed at the server that holds the registry).
    """
    client = MlflowClient()
    model_name = "Diabetes Prediction Model"
    filter_string = "name='{}'".format(model_name)
    results = client.search_registered_models(filter_string=filter_string)

    # latest_versions holds the latest version per stage, so the last one
    # iterated is not necessarily the newest. Gather them all first.
    versions = [mv.version for res in results for mv in res.latest_versions]
    if not versions:
        raise LookupError(
            "No registered versions found for model '{}'; check the tracking/registry URI".format(model_name)
        )

    # Compare numerically: as strings, "9" would sort above "10".
    model_version = max(versions, key=int)
    return model_name, model_version

def preprocessDataAndPredict(Pregnancies, PlasmaGlucose, DiastolicBloodPressure, TricepsThickness, SerumInsulin, BMI, DiabetesPedigree, Age):
    """Score one patient record with the registered model.

    The eight inputs are packed, in argument order, into a single-row array.
    The model name and version come from ``mlflow_model_version()``, and the
    matching ``models:/<name>/<version>`` URI is loaded with
    ``mlflow.pyfunc.load_model`` on every call.

    Returns whatever the loaded model's ``predict`` returns for that row.
    """
    features = np.array([
        Pregnancies,
        PlasmaGlucose,
        DiastolicBloodPressure,
        TricepsThickness,
        SerumInsulin,
        BMI,
        DiabetesPedigree,
        Age,
    ]).reshape(1, -1)

    registered_name, registered_version = mlflow_model_version()
    uri = "models:/{model_name}/{model_version}".format(model_name=registered_name, model_version=registered_version)

    # An earlier approach loaded a pickled file instead: joblib.load("output/model.pkl")
    return mlflow.pyfunc.load_model(uri).predict(features)

# Resolve the registered model's name/version and build its registry URI
model_name, model_version = mlflow_model_version()
model_version_uri = "models:/{model_name}/{model_version}".format(model_name=model_name, model_version=model_version)
print("Model Name: {}\n Model Version: {}".format(model_name, model_version))

In [None]:
# Model from latest registered version
test_model = mlflow.pyfunc.load_model(model_version_uri)
test_pred = test_model.predict(test_data_reshaped_noID)
print("Printing Test Prediction Results")
print(test_pred)

print("Printing Function Prediction Results")
func_pred = preprocessDataAndPredict(0, 171, 80, 34, 23, 43.509726, 1.213191, 21)
(func_pred)

In [None]:
# The tuning run has been closed by this point, so mlflow.active_run() may
# return None. Fall back to the run ID captured while the run was open.
current_run = mlflow.active_run()
current_run.info.run_id if current_run is not None else run_id