In [12]:
# Import modules and libraries needed
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials
from hyperopt.pyll import scope
import numpy as np
import pandas as pd
from joblib import dump

In [37]:
# TODO: try the RAPIDS equivalents (cudf for loading, cuml for the model) so this
# workflow can run on GPUs, which is expected to be faster.
# Load the diabetes dataset and discard the PatientID identifier column.
df2 = pd.read_csv("data/diabetes.csv").drop(columns='PatientID')

def train_model(params, test_size = 0.3, registered_model_name= None):
    """Fit a random forest on the module-level ``df2`` and return a hyperopt result.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``RandomForestClassifier``.
    test_size : float, default 0.3
        Hold-out size passed to ``train_test_split``. A value <= 0 means
        "no hold-out": the model is fitted on every row and nothing is scored.
    registered_model_name : str or None, default None
        Not used by this function; kept so existing callers that pass it
        keep working.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``. The accuracy is negated
        because hyperopt's ``fmin`` minimises the loss. When there is no
        hold-out set the loss is NaN, so that mode must not be used as an
        ``fmin`` objective.
    """
    # Split between features and label
    X = df2.drop(["Diabetic"], axis = 1)
    y = df2["Diabetic"]

    has_holdout = test_size > 0.0

    if has_holdout:
        # Fixed random_state so every trial is scored on the same split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size = test_size, random_state = 123
        )
    else:
        # train_test_split raises ValueError for a zero/negative test_size, so
        # the no-hold-out case has to bypass it and train on all the data.
        X_train, y_train = X, y

    # model and fit
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    if has_holdout:
        acc = accuracy_score(y_test, model.predict(X_test))
    else:
        # Nothing to evaluate on
        acc = np.nan

    # fmin (hyperopt) minimises the loss; maximising accuracy is the same as
    # minimising its negative.
    return {'loss': -acc, 'status': STATUS_OK}

In [38]:
# Baseline run with hand-picked hyperparameters, before any tuning
params = dict(max_features=0.7, n_estimators=100, max_depth=10)
train_model(params)

{'loss': -0.9376666666666666, 'status': 'ok'}

In [39]:
# Hyperparameter search with HyperOpt
# Search space: max_depth and n_estimators are drawn with quniform (which yields
# floats) and cast to int via scope.int; max_features is drawn uniformly from
# [0, 1] and handed to the forest as a fraction.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 1, 10, 1)),
    # alternative kept for reference: hp.choice("max_features", ["auto", "sqrt"])
    max_features=hp.uniform("max_features", 0.0, 1.0),
    n_estimators=scope.int(hp.quniform('n_estimators', 100, 200, 50)),
)

# Tree-structured Parzen Estimator as the suggestion algorithm
algo = tpe.suggest

# Plain hyperopt Trials object: keeps a record of every evaluation.
# (The earlier note mentioned a Spark trials object for Databricks; that is not what is used here.)
trials = Trials()

# fmin returns the best raw value found for each hyperparameter label
best = fmin(fn=train_model, space=search_space, algo=algo, trials=trials, max_evals=2)

# The quniform-based entries come back as floats, so cast them to int before
# handing them to the classifier.
best_params = {
    "max_features": best["max_features"],
    **{name: int(best[name]) for name in ("n_estimators", "max_depth")},
}
# Disabled: train_model(best_params, test_size = 0.0001, registered_model_name="Diabetes_Prediction")
# registered_model_name kept giving an error --> investigate:
# RestException: INVALID_PARAMETER_VALUE:  Model registry functionality is unavailable; got unsupported URI './mlruns' for model registry data storage. Supported URI schemes are: ['postgresql', 'mysql', 'sqlite', 'mssql']. See https://www.mlflow.org/docs/latest/tracking.html#storage for how to run an MLflow server against one of the supported backend storage locations.

# Feature matrix and label vector as plain numpy arrays
X = df2.drop(columns="Diabetic").values
y = df2["Diabetic"].values

# Refit on the full dataset with the best hyperparameters found
best_model = RandomForestClassifier(**best_params)
best_model.fit(X, y)

100%|██████████| 2/2 [00:01<00:00,  1.64trial/s, best loss: -0.9256666666666666]


In [40]:
import os

# Make sure the output folder exists. exist_ok=True makes this a no-op when the
# folder is already there and avoids the race between a separate existence
# check and the directory creation.
os.makedirs('output_no_mlflow', exist_ok=True)

# Persist the fitted model with joblib; dump returns the list of files written,
# which is displayed as the cell output.
dump(best_model, "output_no_mlflow/model.pkl")

['output_no_mlflow/model.pkl']

In [42]:
from joblib import load
import pickle  # only referenced by the disabled pickle.load experiment further down
import collections

# A single hand-written record, wrapped into a one-row 2-D array for predict()
test_data = [0, 171, 80, 34, 23, 43.509726, 1.213191, 21]
test_data_reshaped = np.array([test_data])

# --- In-memory model: single record, then the whole training matrix ---
prediction_1 = best_model.predict(test_data_reshaped)
print(prediction_1)

prediction_1_batch = best_model.predict(X)
print(collections.Counter(prediction_1_batch))

# --- Model reloaded from disk: the same two checks should print the same results ---
loaded_model = load("output_no_mlflow/model.pkl")

prediction_2 = loaded_model.predict(test_data_reshaped)
print(prediction_2)

prediction_2_batch = loaded_model.predict(X)
print(collections.Counter(prediction_2_batch))

# Pickle.Load gave this error "UnpicklingError: invalid load key, '\x00'."
# (presumably because the file was written by joblib.dump and should be read with joblib.load -- TODO confirm)
# loaded_model_pkl = pickle.load(open("output_no_mlflow/model.pkl", "rb"))
# prediction_3 = loaded_model_pkl.predict(test_data_reshaped)
# print(prediction_3)

# Same record again under the "no ID" name (the values are identical to test_data)
test_data_no_ID = [0, 171, 80, 34, 23, 43.509726, 1.213191, 21]
test_data_reshaped_noID = np.array([test_data_no_ID])
prediction_4 = loaded_model.predict(test_data_reshaped_noID)
print(prediction_4)

[0]
Counter({0: 6888, 1: 3112})
[0]
Counter({0: 6888, 1: 3112})
[0]
