<a href="https://colab.research.google.com/github/catarina-moreira/COVID-19/blob/master/Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demystifying Predictive Black-Box Models: An Interpretable Probabilistic Approach

Catarina Moreira, Yu-Liang Chou, Mythreyi Velmurugan, Renuka Sindhgatta Rajan, Chun Ouyang, Peter Bruza

**Abstract** 


In [1]:
# Select TensorFlow 2.x when running on Google Colab
try:
  # The %tensorflow_version line magic only exists in Colab
  %tensorflow_version 2.x
except Exception:
  # Best effort: if the magic fails, keep whichever TensorFlow is already installed
  pass


In [2]:
# For reproducibility reasons:
import numpy as np
import random as rn
import tensorflow as tf
import csv

# Necessary for starting NumPy-generated random numbers in a well-defined initial state
np.random.seed(515)

# Necessary for starting core Python generated random numbers in a well-defined state
rn.seed(515)

# Force TensorFlow to a single thread.
# Multiple threads are a potential source of non-reproducible research results.
# NOTE(review): session_conf is only constructed here; no visible cell hands it to a
# session, so the single-thread setting may not actually take effect -- confirm.
session_conf = tf.compat.v1.ConfigProto( intra_op_parallelism_threads=1,
                                          inter_op_parallelism_threads=1 )

# tf.compat.v1.set_random_seed() makes random number generation in the TensorFlow backend
# start from a well-defined initial state
# more details: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.compat.v1.set_random_seed(515)

In [None]:
# Install pyAgrum, the library used to deal with Bayesian Networks.
# NOTE(review): the version is not pinned, so re-runs may pick up a different release.
!pip install pyagrum

In [4]:
# Bayesian networks
from sklearn.preprocessing import KBinsDiscretizer
from pylab import *
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

# for classification purposes
from pyAgrum.lib.bn2roc import showROC


In [5]:
# Mount Google Drive at /content/drive (the google.colab module is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Auxiliary Functions

In [16]:
# Base directory used by perform_grid_search as the prefix for its checkpoint files.
# NOTE(review): this is an absolute path on a local machine; adjust it before running
# the notebook in another environment (e.g. Colab).
PATH = "/Users/catarina/GitHub/causabilityXAi/"

### Evaluation Measures

In [8]:
# RECALL
# Recall of an evaluation setting: true positives / actual positives
def recall_m(y_true, y_pred):
    """Compute the recall of a set of predictions.

    y_true: groundtruth labels
    y_pred: predictions from the blackbox

    Values are clipped to [0, 1] and rounded before being counted, and
    K.epsilon() is added to the denominator before dividing.
    """
    matched = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(matched) / (K.sum(actual) + K.epsilon())

# PRECISION
# Precision of an evaluation setting: true positives / predicted positives
def precision_m(y_true, y_pred):
    """Compute the precision of a set of predictions.

    y_true: groundtruth labels
    y_pred: predictions from the blackbox

    Values are clipped to [0, 1] and rounded before being counted, and
    K.epsilon() is added to the denominator before dividing.
    """
    matched = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(matched) / (K.sum(flagged) + K.epsilon())

# F1
# F1 measure of an evaluation setting, built from precision_m and recall_m
def f1_m(y_true, y_pred):
    """Compute the F1 measure of a set of predictions.

    y_true: groundtruth labels
    y_pred: predictions from the blackbox

    Returns 2 * (P * R) / (P + R + K.epsilon()), where P and R are the
    values returned by precision_m and recall_m.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio


### Neural Networks

In [9]:
# CREATE_MODEL
# creates a neural network model with a certain number of hidden layers and a certain
# number of neurons in each layer.
def create_model(input_dim, output_dim, nodes, hidden_layers=1, loss_func='categorical_crossentropy', optim='nadam', metrics=['accuracy'], name='model'):
    """Build and compile a fully connected Sequential classifier.

    input_dim: an integer specifying the number of input features
    output_dim: an integer specifying the number of output neurons (the number of labels)
    nodes: an integer specifying the number of neurons in every non-output Dense layer
    hidden_layers: an integer specifying how many Dense layers are added after the
        first one; the network therefore has hidden_layers + 1 ReLU layers in total
    loss_func: the loss function of the model ('categorical_crossentropy' by default)
    optim: the optimiser; the string "nadam" (default) is replaced by a Nadam
        instance with learning rate 0.0001, anything else is passed to compile() as is
    metrics: a list with the metrics to be evaluated
    name: the name given to the Sequential model

    Returns the compiled model.
    """
    model = Sequential(name=name)

    # first ReLU layer; it also declares the input dimension
    model.add(Dense(nodes, input_dim=input_dim, activation='relu'))

    # additional hidden layers
    for _ in range(hidden_layers):
        model.add(Dense(nodes, activation='relu'))

    # output layer
    model.add(Dense(output_dim, activation='softmax'))

    if optim == "nadam":
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        optim = keras.optimizers.Nadam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999)

    # copy `metrics` so the shared default list is never handed out to a model
    model.compile(loss=loss_func, optimizer=optim,
                  metrics=list(metrics))
    return model

In [12]:
# GRID_SEARCH
# Generates a set of models with different configurations, ranging from an
# initial number of neurons to a maximum number of neurons
def grid_search_model_generator(start_nodes = 1, max_nodes = 12, max_hlayers = 5, debug = False, metrics = ['accuracy'] ):
    """Create one model per (neurons, hidden layers) combination.

    start_nodes: an integer specifying the initial number of neurons to generate a model from
    max_nodes:   an integer specifying the maximum number of neurons (inclusive)
    max_hlayers: an integer specifying the maximum number of hidden layers (inclusive)
    debug: boolean flag. If True, the summary of each generated model is displayed
    metrics: a list with the metrics to be evaluated

    The input and output sizes are read from the notebook-level variables
    `n_features` and `n_classes`, which must be defined before calling.

    Returns the list of models, named "model_h<layers>_N<neurons>".
    """
    models = []

    # Iterate over the function's own arguments. Earlier revisions read the
    # MAX_NODES / MAX_HLAYERS / DEBUG globals here, silently ignoring the
    # values passed by the caller.
    for neurons in range(start_nodes, max_nodes + 1):
        for hidden_layer in range(1, max_hlayers + 1):
            model_name = "model_h" + str(hidden_layer) + "_N" + str(neurons)
            model = create_model(n_features, n_classes, neurons, hidden_layer,
                                 name=model_name, metrics=metrics)
            models.append(model)

    # plot general information for each model
    if debug:
        for model in models:
            model.summary()

    return models


In [13]:
# PERFORM_GRID_SEARCH
# given a list of models with different configurations, fit the data to the models,
# and evaluate the model.
def perform_grid_search( models ):
    """Train and evaluate every model in `models`.

    models: list of compiled models

    Relies on notebook-level variables that must exist before the call:
    X_train / Y_train, X_validation / Y_validation, X_test / Y_test,
    BATCH_SIZE, EPOCHS, PATH and the HISTORY_DICT dictionary.

    For each model the function fits it, prints test/train loss and accuracy
    (assumes 'accuracy' is the first compiled metric -- TODO confirm), and
    stores [training history, model] in HISTORY_DICT under the model's name.

    Returns HISTORY_DICT.
    """
    # grid search over each model
    for model in models:
        # Build fresh callbacks for every model: a callback object keeps state
        # between fit() calls (e.g. the best val_loss seen by ModelCheckpoint),
        # so sharing one instance would let an earlier model's result decide
        # whether a later model's checkpoint gets saved.

        # stop training when val_loss has not improved for 10 epochs
        early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')

        # save the model whenever val_loss improves
        model_checkpoint = ModelCheckpoint(PATH + "training/diabetes/model_{epoch:02d}-{val_loss:.2f}.h5",
                                           monitor='val_loss',
                                           verbose=0,
                                           save_best_only=True,
                                           mode='min')

        callbacks_list = [early_stop, model_checkpoint]

        print('MODEL NAME:', model.name)
        history_callback = model.fit(X_train, Y_train, batch_size = BATCH_SIZE, epochs = EPOCHS,
                                     verbose=0, validation_data=(X_validation, Y_validation),
                                     callbacks=callbacks_list)

        score_test = model.evaluate( X_test, Y_test, verbose=0 )
        score_train = model.evaluate( X_train, Y_train  )

        print('Test loss:     ', format(score_test[0], '.4f'), '\tTrain loss: ', format(score_train[0], '.4f') )
        print('Test accuracy: ', format(score_test[1], '.4f'), '\tTrain accu: ', format(score_train[1], '.4f') )
        print('Abs accuracy:  ', format( np.abs( score_test[1] - score_train[1] ), '.4f'))
        print('Abs loss:      ', format( np.abs( score_test[0] - score_train[0] ), '.4f'))
        print('\n###########################################################\n')

        HISTORY_DICT[model.name] = [history_callback, model]

    return HISTORY_DICT


In [14]:
# SAVE_MODEL
# Writes a model's architecture (JSON) and weights (HDF5) to disk
def serialize_model( model, model_name, path ):
    """Persist `model` as two files under `path`.

    model: object exposing to_json() and save_weights(filepath)
    model_name: base name of the output files
    path: prefix prepended to the file names as is (no separator is added)

    Creates <path><model_name>_DUO.json and <path><model_name>_DUO.h5 and
    prints both file names.
    """
    json_target = path + model_name + "_DUO.json"
    weights_target = path + model_name + "_DUO.h5"

    # architecture -> JSON
    architecture = model.to_json()
    with open(json_target, "w") as handle:
        handle.write(architecture)

    # weights -> HDF5
    model.save_weights(weights_target)

    for message in ("Saving files:", json_target, weights_target, "Model saved to disk"):
        print(message)

# SAVE_MODEL_HISTORY
# Writes the training history of a model to a CSV file
def serialize_model_history(  model_hist, model_name, path ):
    """Save a training history as <path><model_name>_hist.csv.

    model_hist: object whose `history` attribute maps a metric name to a
        list of values
    model_name: base name of the output file
    path: prefix prepended to the file name as is (no separator is added)

    Each row holds the metric name followed by the string form of its list
    of values.
    """
    hist_path = path + model_name + "_hist.csv"

    # `with` closes the file even if a row fails to serialise; newline=""
    # is what the csv module requires to avoid doubled line endings.
    with open(hist_path, "w", newline="") as file:
        w = csv.writer( file )
        for key, val in model_hist.history.items():
            w.writerow([key, val])

    # report the file that was actually written (previously the weights
    # file name was printed here by mistake)
    print(hist_path)
    print("Model history saved to disk")


In [17]:
# LOAD_MODEL_HISTORY ------------------------------------------
# Reads back a history file written by serialize_model_history
def load_model_history( model_name, path):
    """Load <path><model_name>_hist.csv into a dictionary.

    model_name: base name of the history file
    path: prefix prepended to the file name as is (no separator is added)

    Each row is expected to hold a metric name followed by a bracketed,
    comma-separated list of numbers, e.g.  loss,"[0.5, 0.25]".  Rows whose
    list has a single value (written without quotes) or no value at all are
    handled as well.

    Returns a dict mapping each metric name to a list of floats.
    Raises ValueError if a non-empty row has no values column or a value is
    not a number.
    """
    model_hist_loaded = {}

    # csv.reader undoes the quoting applied by csv.writer, so the parsing no
    # longer depends on whether the list happened to be quoted.
    with open(path + model_name + "_hist.csv", "r", newline="") as hist_file:
        for row in csv.reader(hist_file):
            if not row:
                continue  # blank line
            if len(row) < 2:
                raise ValueError("malformed history row: %r" % (row,))

            metric, raw_values = row[0], row[1]
            entries = raw_values.strip().strip("[]").split(",")
            model_hist_loaded[metric] = [float(entry) for entry in entries if entry.strip()]

    return model_hist_loaded

# LOAD_MODEL ------------------------------------------
# Rebuilds a model from the files written by serialize_model
def load_model( model_name, path ):
    """Load a model's architecture and weights from disk.

    model_name: base name of the saved files
    path: prefix prepended to the file names as is (no separator is added)

    Reads <path><model_name>_DUO.json, rebuilds the model from it with
    model_from_json, then loads the weights from <path><model_name>_DUO.h5.

    Returns the loaded model.
    """
    with open( path + model_name +  "_DUO.json", 'r') as source:
        architecture = source.read()

    # rebuild the architecture, then load the weights into the new model
    restored = model_from_json(architecture)
    restored.load_weights(path + model_name +  "_DUO.h5")
    print("Loaded model from disk")
    return restored

## Training a Neural Network

In [None]:
# Location of the project folder on the mounted Google Drive and name of the dataset file
MY_PATH = "/content/drive/My Drive/Colab Notebooks/ExplanatoryAI/"
DATASET_NAME = "diabetes.csv"

# Name of the class variable -- presumably the label column of the dataset; confirm
class_var = "Outcome"
dataset_path = MY_PATH + "datasets/" + DATASET_NAME
# NOTE(review): `pd` is not imported in any visible cell above -- confirm that
# pandas is imported before this cell runs on a fresh kernel.
data = pd.read_csv( dataset_path )
# Display 5 randomly sampled rows as a quick sanity check of the loaded data
data.sample(5)