# H2O AutoML approach
## The following code can be used in a Jupyter Notebook (Python 3.8.X, H2O cluster version 3.36.0.1).

__Import the required modules.__

In [None]:
import h2o
from h2o.automl import H2OAutoML
import numpy as np
from pandas import read_csv
from numpy import set_printoptions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import shap
import pandas as pd

Attempting to start a local H2O server

In [None]:
# Connect to an H2O server, starting a local one if needed; every H2O call
# that follows in this notebook goes through this connection.
h2o.init()

In [None]:
Read the training data.

In [None]:
# Load the training set into a pandas DataFrame; the file is looked up
# relative to the notebook's working directory.
filename = 'MetS Train.csv'
dataframe = read_csv(filename)

Data preprocessing: fill missing values, substitute values, and select the training features and the target feature.

In [None]:
# Keep the first column (record ID) and the last column (label) as NumPy
# arrays alongside the H2O frame.
array = dataframe.values
ID_train, y_train = array[:, 0], array[:, -1]

# Upload the data to H2O and make the target categorical so that AutoML
# treats the task as classification.
y = 'Metabolic syndrome'
htrain = h2o.H2OFrame(dataframe)
htrain[y] = htrain[y].asfactor()

# Predictors are all columns except the target and the ID column.
x = htrain.columns
for excluded in (y, 'Nr_Crt'):
    x.remove(excluded)

__Model selection and tuning__. The time limit for running AutoML is set to fifteen minutes (`max_runtime_secs=900`) and at most three models are built (`max_models=3`). In this scenario we excluded the Stacked Ensemble and Deep Learning algorithms.

In [None]:
# Configure the AutoML run: at most 3 models, a 900-second (15-minute)
# budget, two algorithm families excluded, and a fixed seed.
aml = H2OAutoML(
    max_models=3,
    max_runtime_secs=900,
    exclude_algos=['StackedEnsemble', 'DeepLearning'],
    seed=1,
)

__Training H2O AutoML__. The AutoML leaderboard uses cross-validation metrics to rank the models. The leader model is stored at _aml.leader_ and the leaderboard is stored at _aml.leaderboard_.

In [None]:
# Run AutoML on the training frame, using the columns listed in x as
# predictors and the column named by y as the response.
aml.train(x=x, y=y, training_frame=htrain)

Checking the Leaderboard.

In [None]:
# Fetch the leaderboard of trained models; as the last expression of the
# cell, head() is displayed by the notebook.
lb = aml.leaderboard
lb.head()

__Save the best model to filesystem__.

In [None]:
# Persist the leader model under the "h2o_model" directory. The returned
# path is kept in model_path because it is needed later to reload the model.
model_path = h2o.save_model(aml.leader, path = "h2o_model")
print(model_path) 

__H2O Explainability interface__ is a convenient wrapper to a number of explainability methods and visualizations in H2O. The _explain()_ function generates a list of explanations – individual units of explanation such as a Partial Dependence plot, a Feature Importance plot or a SHapley Additive exPlanations (SHAP) Summary of Top Tree-based Model.

In [None]:
# Generate the explanation bundle for the leader model, evaluated on the
# training frame.
xplain_model = aml.leader.explain(htrain)

In [None]:
# Explain the best XGBoost model on the leaderboard. get_best_model()
# returns None when no model of the requested algorithm was trained (for
# example when XGBoost is unavailable on the platform), so guard against
# that instead of failing with an AttributeError on None.
xgboost = aml.get_best_model('xgboost')
if xgboost is None:
    print('No XGBoost model found on the AutoML leaderboard; skipping its explanation.')
else:
    xplain_model = xgboost.explain(htrain)

__Predicting on train data using the leader model__. The predict function outputs predicted classes, as well as the probability estimates for each of the classes (confidence).

In [None]:
# Score the training frame with the leader model.
pred_h2o = aml.leader.predict(htrain)
# Bring the H2O prediction frame into pandas, then expose it as a NumPy array.
pred_pandas=pred_h2o.as_data_frame(use_pandas=True)
probs = pred_pandas.values

__Restore the model from the filesystem__.

In [None]:
# Reload the model from the path that h2o.save_model returned earlier.
saved_model = h2o.load_model(model_path)

In [None]:
# Load the held-out test set.
filename = 'MetS Test.csv'
dataframe = read_csv(filename)

# Keep the first column (record ID) and the last column (label) as NumPy
# arrays for the evaluation steps.
array = dataframe.values
ID_test, y_test = array[:, 0], array[:, -1]

# Upload to H2O and make the target categorical, mirroring the training frame.
htest = h2o.H2OFrame(dataframe)
htest['Metabolic syndrome'] = htest['Metabolic syndrome'].asfactor()

__Predicting on test data using the saved model__. The predict function outputs predicted classes, as well as the probability estimates for each of the classes (confidence).

In [None]:
# Score the test frame with the reloaded model and pull the result into NumPy.
pred_h2o = saved_model.predict(htest)
pred_pandas = pred_h2o.as_data_frame(use_pandas=True)
probs = pred_pandas.values

# Column 0 holds the predicted class. The confidence is read from column 1
# when the predicted class is 0 and from column 2 otherwise.
y_pred = [row[0] for row in probs]
pred_confidence = [row[1] if row[0] == 0 else row[2] for row in probs]


In [None]:
import pandas as pd

# Output column name -> values, in the order the columns appear in the CSV.
columns = {
    "ID": ID_test,
    "ACTUALVALUE": y_test,
    "PREDICTEDVALUE": y_pred,
    "PREDICTIONCONFIDENCE": pred_confidence,
}

# Build one single-column frame per entry and join them side by side.
frames = [pd.DataFrame(values, columns=[name]) for name, values in columns.items()]
dataframe = pd.concat(frames, axis=1)
dataframe.to_csv('h2o_test_pred.csv', index=False)

__Build the confusion matrix__

In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate the actual test labels against the model's predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

__Display the confusion matrix__

In [None]:
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

# Draw the confusion matrix as a small green heat map.
fig, ax = plot_confusion_matrix(
    conf_mat=conf_matrix,
    figsize=(2, 2),
    cmap=plt.cm.Greens,
)

# Label the axes and title of the figure that was just created, then render it.
label_size = 11
plt.xlabel('Predictions', fontsize=label_size)
plt.ylabel('Actuals', fontsize=label_size)
plt.title('Confusion Matrix', fontsize=label_size)
plt.show()

__Calculate the performance metrics__

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Recall of the default positive class and recall of class 0 (reported as
# specificity); both are reused for the balanced accuracy.
sensitivity = recall_score(y_test, y_pred)
specificity = recall_score(y_test, y_pred, pos_label=0)

print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % sensitivity)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('F1 Score: %.3f' % f1_score(y_test, y_pred))
print('Specificity : %.3f' % specificity)

# Balanced accuracy is the mean of the two per-class recalls.
ba = (specificity + sensitivity) / 2.0
print('Balanced accuracy : %.3f' % ba)

__Prepares the test data for the calculation of SHAP values__

In [None]:
# Drop the ID and target columns and convert the remaining feature columns
# of the test frame to a pandas DataFrame.
X_test = htest.drop('Nr_Crt').drop('Metabolic syndrome').as_data_frame()

__A wrapper class for H2O models__

In [None]:
class H2OProbWrapper:
    """Expose an H2O model as a plain function of feature rows.

    The wrapper rebuilds a labelled frame from the raw rows it is given,
    scores it with the wrapped model and returns one value per row.
    """

    def __init__(self, h2o_model, feature_names):
        """Store the model and the column names used to label incoming rows."""
        self.feature_names = feature_names
        self.h2o_model = h2o_model

    def predict_binary_prob(self, X):
        """Return the last column of the model's prediction output as float64.

        X may be a single pandas Series (treated as one row) or anything
        pandas.DataFrame accepts as 2-D data with ``feature_names`` columns.
        The intermediate frame and the raw predictions are kept on the
        instance as ``dataframe`` and ``predictions``.
        """
        # A lone Series is 1-D; turn it into a one-row 2-D array.
        rows = X.values.reshape(1, -1) if isinstance(X, pd.Series) else X
        self.dataframe = pd.DataFrame(rows, columns=self.feature_names)

        scored = self.h2o_model.predict(h2o.H2OFrame(self.dataframe))
        self.predictions = scored.as_data_frame().values

        # Last column: probability of the True class.
        as_float = self.predictions.astype('float64')
        return as_float[:, -1]

__SHAP explainer instantiation__

In [None]:
# Wrap the reloaded model so SHAP can call it as a function of feature rows.
h2o_wrapper = H2OProbWrapper(saved_model, X_test.columns)
# X_test is passed as the explainer's background dataset.
# NOTE(review): KernelExplainer cost presumably grows with the background
# size; consider a summarised background if this is slow — confirm.
h2o_explainer = shap.KernelExplainer(h2o_wrapper.predict_binary_prob, X_test)

__Computing SHAP values__

In [None]:
# Compute SHAP values for the rows of X_test; the result's .values attribute
# is what the summary plot consumes.
shap_values = h2o_explainer(X_test)

__Show summary plot__

In [None]:
# Plot the SHAP values together with the corresponding feature values of X_test.
shap.summary_plot(shap_values.values, X_test)