# Auto-sklearn approach
## The following code can be used in a Jupyter Notebook (Python 3.8.X, Auto-sklearn 0.14.3).

__Import Python modules.__

In [None]:
import numpy as np
from pandas import read_csv
from numpy import set_printoptions
import autosklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier
from sklearn.model_selection import train_test_split
import shap
import pandas as pd

__Prepare the training data set.__

In [None]:
# Load the training CSV and split it by column position:
# column 0 -> ID_train, last column -> y_train, the columns between -> X_train.
filename = 'MetS Train.csv'
df = read_csv(filename)
array = df.values
ID_train = array[:, 0]
# Features are cast to float32; labels are stringified and integer-encoded.
X_train = array[:, 1:-1].astype('float32')
y_train = LabelEncoder().fit_transform(array[:, -1].astype('str'))

__Build a classification model__. A time budget of fifteen minutes has been set. The _n_jobs_ parameter establishes the number of jobs to run in parallel. The _per_run_time_limit_ value must be high enough so that a typical machine learning algorithm can be fit on the training data without exceeding this time limit. The _metric_ parameter specifies the evaluation metric used to assess model performance. For classification, the possible values are: _roc_auc_, _precision_, _accuracy_, _balanced_accuracy_, _f1_, _f1_micro_, _f1_macro_, _f1_weighted_, _f1_samples_, _recall_, _recall_micro_, _recall_macro_, _recall_samples_, _recall_weighted_, _precision_macro_, _precision_micro_, _precision_samples_, _precision_weighted_, _log_loss_ and _average_precision_.
_Fit_ both optimizes the machine learning models and builds an ensemble out of them.

In [None]:
# Configure the Auto-sklearn search, then run it on the training data.
# NOTE(review): `ensemble_class` may not be accepted by the Auto-sklearn 0.14.3
# release named in the notebook header -- confirm against the installed version.
model = AutoSklearnClassifier(
    time_left_for_this_task=900,  # overall search budget in seconds (15 minutes)
    per_run_time_limit=300,  # time limit in seconds for a single model fit
    n_jobs=2,  # number of jobs run in parallel
    metric=autosklearn.metrics.accuracy,  # metric optimized during the search
    ensemble_size=1,
    ensemble_class='SingleBest',
)
model.fit(X_train, y_train)

__Analysis and inspection of the model__. The _PipelineProfiler_ package is a very useful tool for interactive analysis and inspection of the classification model; the steps and algorithms used for its construction are displayed.

In [None]:
# PipelineProfiler is imported in this cell rather than with the other modules.
import PipelineProfiler
# Hand the fitted `model` to PipelineProfiler and render its pipeline matrix.
profiler_data = PipelineProfiler.import_autosklearn(model)
PipelineProfiler.plot_pipeline_matrix(profiler_data)

__Save optimized model to a file__. 

In [None]:
import joblib
# NOTE: `filename` is reused; from here on it is the model file, not a CSV path.
filename = 'auto_model.sav'
# Write the fitted classifier to disk.
joblib.dump(model, filename)

__Restore tuned model from the file__. 

In [None]:
# Read the classifier back from `filename` (set to the model file in the save cell).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
loaded_model = joblib.load(filename)

Prepare the testing data set.

In [None]:
# Load the test CSV and split it by column position:
# column 0 -> ID_test, last column -> y_test, the columns between -> X_test.
filename = 'MetS Test.csv'
df = read_csv(filename)
array = df.values
ID_test = array[:, 0]
X_test = array[:, 1:-1].astype('float32')
# NOTE(review): a new LabelEncoder is fit on the test labels; the integer codes
# match the training ones only if both files contain the same set of label
# values -- confirm.
y_test = LabelEncoder().fit_transform(array[:, -1].astype('str'))

__Score the machine learning model__: the predicted values and the probability estimates for each value are obtained. 

In [None]:
# Predict class labels for the test features with the model restored from disk.
y_pred = loaded_model.predict(X_test)
# Share of test samples whose predicted label equals the actual label.
acc = accuracy_score(y_test, y_pred)
print("Accuracy: %.3f" % acc)

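Predicting on test data using the tuned model. The _predict_proba_ function outputs the probability estimates for each of the classes; the probability of the predicted class is kept as the prediction confidence and saved, together with the actual and predicted values, to a CSV file.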

In [None]:
import pandas as pd

# predict_proba returns one row per sample and one probability column per class.
probs = loaded_model.predict_proba(X_test)
# Confidence is the probability of the predicted class: column 0 when the
# prediction is 0, otherwise column 1. np.where selects it for all rows at once
# instead of looping over row indices.
pred_confidence = np.where(y_pred == 0, probs[:, 0], probs[:, 1])

# One row per test sample: ID, actual label, predicted label and confidence.
dataframe = pd.DataFrame({
    "ID": ID_test,
    "ACTUALVALUE": y_test,
    "PREDICTEDVALUE": y_pred,
    "PREDICTIONCONFIDENCE": pred_confidence,
})
dataframe.to_csv('autosklearn_test_pred.csv', index=False)

__Build the confusion matrix__

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the result correspond to actual classes, columns to predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

__Display the confusion matrix__

In [None]:
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

# Draw `conf_matrix` as a small (2x2 inch) figure using the Greens colour map.
fig, ax = plot_confusion_matrix(conf_mat=conf_matrix, figsize=(2, 2), cmap=plt.cm.Greens)
# Label the axes and title on the current figure, then display it.
plt.xlabel('Predictions', fontsize=11)
plt.ylabel('Actuals', fontsize=11)
plt.title('Confusion Matrix', fontsize=11)
plt.show()

__Calculate the performance metrics__

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Recall of the default positive class (label 1) and recall of label 0
# (specificity) are computed once and reused for the balanced accuracy.
sensitivity = recall_score(y_test, y_pred)
specificity = recall_score(y_test, y_pred, pos_label=0)

print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % sensitivity)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('F1 Score: %.3f' % f1_score(y_test, y_pred))
print('Specificity : %.3f' % specificity)
# Balanced accuracy here is the mean of the two per-class recalls.
ba = (specificity + sensitivity) / 2.0
print('Balanced accuracy : %.3f' % ba)

__Prepare the test data for the calculation of SHAP values__ (note that, despite its name, `df_train` is built from the test data frame loaded above)

In [None]:
# `df` was last read from 'MetS Test.csv', so despite its name `df_train`
# holds the test-set rows, restricted to the feature columns listed here.
df_train = df[[
    'DMFT',
    'CPI',
    'Periodontal pockets',
    'Bleeding',
    'Tooth brushing',
    'Dental control',
    'Gingival attachment loss',
    'CV risk',
    'Carotid atherosclerosis',
    'EQ-5D-5L score',
]]

__A wrapper class for Auto-sklearn models__

In [None]:
class SKLProbWrapper:
    """Expose a classifier's ``predict_proba`` as a one-value-per-row function.

    ``skl_model`` must provide ``predict_proba``; ``feature_names`` are the
    column names used to frame the incoming data.
    """

    def __init__(self, skl_model, feature_names):
        self.skl_model = skl_model
        self.feature_names = feature_names

    def predict_binary_prob(self, X):
        """Return the last ``predict_proba`` column for ``X`` as float64.

        A single ``pd.Series`` is treated as one sample and reshaped into a
        one-row matrix first. The framed input and the raw probabilities are
        kept on the instance as ``dataframe`` and ``predictions``.
        """
        rows = X.values.reshape(1, -1) if isinstance(X, pd.Series) else X
        self.dataframe = pd.DataFrame(rows, columns=self.feature_names)
        self.predictions = self.skl_model.predict_proba(self.dataframe.values)
        as_float64 = self.predictions.astype('float64')
        # Last column: the "True" class for a two-class model.
        return as_float64[:, -1]

__SHAP explainer instantiation__

In [None]:
# Wrap the classifier so SHAP gets a function returning one probability per row.
# NOTE(review): this uses the in-memory `model`, not `loaded_model` restored
# from disk -- confirm this is intended.
skl_wrapper = SKLProbWrapper(model, df_train.columns)
# All of df_train is passed as the background data for the KernelExplainer.
skl_explainer = shap.KernelExplainer(skl_wrapper.predict_binary_prob, df_train)

__Computing SHAP values__

In [None]:
# Compute SHAP values for every row of df_train (the same frame used as background).
shap_values = skl_explainer(df_train)

__Show summary plot__

In [None]:
# Plot the raw SHAP value array against the feature values in df_train.
shap.summary_plot(shap_values.values, df_train)