In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import normalize
from imblearn.over_sampling import SMOTE, SMOTENC 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from math import sqrt

def get_train_test(dataset_name, path, r_seed):
    """Load one of the supported datasets and split it into train and test sets.

    Parameters:
        dataset_name: one of "HeartRisk", "cancer", "adult", "kidney",
            "credit", "student".
        path: folder prefix; the dataset file name is appended to it as-is,
            so it is expected to end with a path separator.
        r_seed: random state passed to train_test_split.

    Returns:
        (train_set, test_set, feature_columns, categoricals, binary, fairness)
        where the two sets are DataFrames with the target as last column,
        feature_columns is the list of all column names except the last, and
        categoricals / binary / fairness are the per-dataset lists of column
        names (or None when the dataset has none).

    Raises:
        Exception: if dataset_name is not one of the supported names.
    """
    if dataset_name == "HeartRisk":
        print("Loading Heart")
        print("------------------------------------------------------------")
        heart_train_risk_path = path + "Dataset-Merged.xlsx"
        dataset = pd.read_excel(heart_train_risk_path).drop(columns = ["P_ID"])

        target = "HeartRisk"
        binary = ['male', 'smoker?', 'BPMeds', 'prevalentStroke',
                        'prevalentHyp', 'diabetes']
        categoricals = None
        fairness = ["male"]

        dataset = dataset.dropna()

        # In these columns the value -1 is treated as "missing": drop such rows.
        miss_values_features = ["edu", "cigsPerDay", "BPMeds", "totChol", "glucose"]
        for f in miss_values_features:
            dataset = dataset.drop(dataset[dataset[f] == -1].index)

        test_size = 0.2
    elif dataset_name == "cancer":
        print("Loading Cancer")
        print("------------------------------------------------------------")
        cancer_path = path + "wdbc2.csv"
        dataset = pd.read_csv(cancer_path, sep=',').drop(columns = ["ID"])
        target = "cancer type"
        categoricals = None
        fairness = None
        binary = None
        test_size = 0.2
    elif dataset_name == "adult":
        print("Loading Adult")
        print("------------------------------------------------------------")
        adult_path = path + "adult.csv"
        dataset = pd.read_csv(adult_path, sep=',').drop(columns = ["ID"])
        target = "class"
        categoricals = ['workclass', 'education', 'marital-status', 'occupation',
                        'relationship', 'race', 'sex', 'native-country']
        fairness = ["sex"]
        # This branch used to leave `binary` unassigned, which made the final
        # `return` fail with UnboundLocalError for the adult dataset.
        binary = None
        dataset = dataset.dropna()
        test_size = 0.2
    elif dataset_name == "kidney":
        print("Loading Kidney")
        print("------------------------------------------------------------")
        kidney_path = path + "kidney.csv"
        dataset = pd.read_csv(kidney_path, sep=',')
        target = "class"
        categoricals = ["sg", "al", "su"]
        fairness = None
        binary = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
        dataset = dataset.dropna()
        # Rows holding a '?' in any column are dropped as well.
        for f in dataset.columns:
            dataset = dataset.drop(dataset[dataset[f] == '?'].index)
        test_size = 0.4
    elif dataset_name == "credit":
        credit_card_path = path + "credit.csv"
        dataset = pd.read_csv(credit_card_path, sep = ',')
        target = "default payment next month"
        categoricals = ["EDUCATION", "MARRIAGE", "PAY_0", "PAY_2", "PAY_3"
                        , "PAY_4", "PAY_5", "PAY_6"]
        fairness = ["SEX"]
        binary = ["SEX"]

        test_size = 0.2
    elif dataset_name == "student":
        print("Loading Student")
        print("------------------------------------------------------------")
        student_path = path + "student.csv"
        dataset = pd.read_csv(student_path, sep = ',')
        target = "class"
        categoricals = ["Mjob", "Fjob", "reason", "guardian"]
        fairness = ["sex"]
        binary = ["school", "sex","address", "famsize", "Pstatus", "schoolsup", "famsup",
                  "paid", "activities", "nursery", "higher", "internet", "romantic"]

        test_size = 0.2

    else:
        raise Exception("DATASET INESISTENTE")

    if categoricals is not None:
        # Replace every categorical column by its one-hot columns "<name>_<value>".
        for f in categoricals:
            one_hot = pd.get_dummies(dataset[f],
                                     prefix = f)
            dataset = dataset.drop(columns = [f])
            dataset = dataset.join(one_hot)

    # The return value below (and the callers) treat the LAST column as the
    # label, so always move the target there - not only after one-hot encoding.
    dataset = dataset[[c for c in dataset if c not in [target]] + [target]]

    train_set, test_set = train_test_split(dataset, test_size = test_size, random_state = r_seed,
                                           stratify = dataset[target])

    return train_set, test_set, list(train_set.columns[:-1]), categoricals, binary, fairness



def preprocess(dataset, train_samples, train_labels, test_samples, test_labels, r_seed, 
               columns, categoricals, binary, fairness):
    """Standardise, re-encode and oversample the train and test arrays.

    Steps:
      1. Columns that are neither one-hot ("<categorical>_...") nor binary are
         standardised with a StandardScaler fitted on the train samples.
      2. In one-hot and binary columns the value 0 is replaced by -1.
      3. Both splits are oversampled (SMOTENC when there are one-hot/binary
         columns, SMOTE otherwise).
      4. If a fairness column is available, row indices of the concatenated
         (train + test) samples are split by its value (1 vs -1).

    Parameters:
        dataset: unused; kept for interface compatibility.
        train_samples, test_samples: 2-D float arrays. They are modified in
            place by steps 1 and 2.
        train_labels, test_labels: label arrays.
        r_seed: random state for the oversampler.
        columns: list of feature names, in the column order of the arrays.
        categoricals, binary, fairness: lists of column names or None.

    Returns:
        (train_samples, train_labels, test_samples, test_labels, males, females)
        where males / females are lists of row indices, or None when no
        fairness column can be located.
    """
    if categoricals is not None:
        categoricals_idx = [columns.index(c2) for c1 in categoricals for c2 in columns if c2.startswith(c1+"_")]
    else:
        categoricals_idx = []

    if binary is not None:
        binary_idx = [columns.index(c) for c in binary if c in columns]
    else:
        binary_idx = []

    # sorted() makes the column order deterministic (set iteration order is not guaranteed).
    num_features_idx = sorted(set(range(len(columns))) - set(categoricals_idx) - set(binary_idx))

    if fairness is not None:
        fairness_idx = [columns.index(c) for c in fairness if c in columns]
    else:
        fairness_idx = []

    # StandardScaler rejects an input with zero columns, so only scale when
    # there is at least one numeric feature.
    if num_features_idx:
        std = StandardScaler()
        std.fit(train_samples[:,num_features_idx])
        train_samples[:,num_features_idx] = std.transform(train_samples[:,num_features_idx])
        test_samples[:,num_features_idx] = std.transform(test_samples[:,num_features_idx])

    rescale_idx = categoricals_idx + binary_idx
    if rescale_idx:
        # Encode "absent" as -1 instead of 0 in one-hot and binary columns.
        train_samples[:,rescale_idx] = np.where(train_samples[:,rescale_idx] == 0, -1, train_samples[:,rescale_idx])
        test_samples[:,rescale_idx] = np.where(test_samples[:,rescale_idx] == 0, -1, test_samples[:,rescale_idx])
        sm = SMOTENC(categorical_features = rescale_idx, random_state = r_seed)
    else:
        sm = SMOTE(random_state = r_seed)
    train_samples, train_labels = sm.fit_resample(train_samples, train_labels)
    # NOTE(review): the test split is oversampled too, so test metrics are
    # computed partly on synthetic samples. Kept as-is because changing it
    # would alter every reported result - confirm this is intended.
    test_samples, test_labels = sm.fit_resample(test_samples, test_labels)

    if fairness_idx:
        samples = np.concatenate((train_samples, test_samples))
        # Vectorised split on the (first) fairness column. The previous per-row
        # comparison relied on the truth value of a tiny array and could not
        # cope with a missing fairness column (e.g. one that was one-hot encoded).
        fairness_col = samples[:, fairness_idx[0]]
        males = np.flatnonzero(fairness_col == 1).tolist()
        females = np.flatnonzero(fairness_col == -1).tolist()
    else:
        # No fairness attribute, or it is not present as a single column.
        males = None
        females = None

    return train_samples, train_labels, test_samples, test_labels, males, females

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report

class EarlyStoppingAtMinLoss(keras.callbacks.Callback):
    """Stop training when the loss on a held-out set stops improving.

    After every epoch the model is evaluated on the samples/labels given to
    the constructor and the first value returned by `model.evaluate` (the
    loss) is compared with the best value seen so far.

    Parameters:
        test_samples, test_labels: data the loss is monitored on.
        patience: number of non-improving epochs (counted only from epoch
            index `min_iterations` onwards) after which training stops.
        min_iterations: epoch index before which non-improving epochs are
            not counted.

    When training is stopped early, the weights of the best epoch are restored.
    """

    def __init__(self, test_samples, test_labels, patience = 2, min_iterations = 10):
        super().__init__()
        self.min_iterations = min_iterations
        self.patience = patience
        self.samples = test_samples
        self.labels = test_labels
        # best_weights to store the weights at which the minimum loss occurs.
        self.best_weights = None

    def on_train_begin(self, logs=None):
        # The number of epoch it has waited when loss is no longer minimum.
        self.wait = 0
        # The epoch the training stops at.
        self.stopped_epoch = 0
        # Initialize the best as infinity (np.Inf was removed in NumPy 2.0).
        self.best = np.inf
        # Do not carry weights over from a previous fit() with this callback.
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        current = self.model.evaluate(self.samples, self.labels, verbose = 0)[0]
        if np.less(current, self.best):
            self.best = current
            self.wait = 0
            # Record the best weights if current results is better (less).
            self.best_weights = self.model.get_weights()
        elif epoch >= self.min_iterations:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True
                # best_weights stays None when the loss never improved
                # (e.g. it was NaN from the first epoch); nothing to restore then.
                if self.best_weights is not None:
                    print("Restoring model weights from the end of the best epoch.")
                    self.model.set_weights(self.best_weights)

    def on_train_end(self, logs=None):
        if self.stopped_epoch > 0:
            print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))

def compute_model(train_samples, train_labels, test_samples, test_labels, file_path, model_layers, 
                  patience = 2, min_iterations = 10, regularizer = None):
    """Train a small dense binary classifier and save it, unless already saved.

    Nothing is done when `file_path` already exists as a directory. Otherwise
    a model with one hidden layer (`model_layers == 1`) or two hidden layers
    (any other value) is trained with the EarlyStoppingAtMinLoss callback,
    its training history is saved to `file_path + ".npy"` and the model
    itself to `file_path`.

    Parameters:
        train_samples, train_labels: training data; the number of columns of
            train_samples defines the input size and the first layer's width.
        test_samples, test_labels: data monitored by the early-stopping callback.
        file_path: where the model is saved.
        model_layers: 1 for a single hidden layer, anything else for two.
        patience, min_iterations: forwarded to EarlyStoppingAtMinLoss.
        regularizer: None, "l1", "l2" or "l1_l2" (each with factor 0.0001);
            any other value is handed to Keras unchanged.

    Returns:
        None.
    """
    input_dim = train_samples.shape[1]
    if regularizer == "l1":
        regularizer = tf.keras.regularizers.l1(l1 = 0.0001)
    elif regularizer == "l2":
        regularizer = tf.keras.regularizers.l2(l2 = 0.0001)
    elif regularizer == "l1_l2":
        regularizer = tf.keras.regularizers.l1_l2(l1 = 0.0001, l2 = 0.0001)
    n_class = 1

    # A saved model directory acts as the cache marker: skip training.
    if os.path.isdir(file_path):
        return

    layers = [Dense(units = input_dim, input_shape = (input_dim,),
                    activation = "relu", kernel_regularizer = regularizer)]
    if model_layers != 1:
        # Dense expects an integer width; `input_dim/2` was a float. Floor
        # division gives the same width as truncating it, with a minimum of 1.
        layers.append(Dense(units = max(1, input_dim // 2), activation = "relu",
                            kernel_regularizer = regularizer))
    layers.append(Dense(units = n_class, activation = "sigmoid"))
    model = Sequential(layers)

    early_stopping = EarlyStoppingAtMinLoss(test_samples, test_labels, patience, min_iterations)
    callbacks = [early_stopping]

    model.compile(optimizer = Adam(learning_rate = 0.001),
                  loss = "binary_crossentropy", metrics = ["binary_accuracy"])

    history = model.fit(x = train_samples, y = train_labels, batch_size = 10,
                        epochs = 1000, verbose = 1, callbacks = callbacks)

    # history.history is a dict, so np.save pickles it (load with allow_pickle=True).
    np.save(file_path+".npy", history.history)

    model.save(file_path)
    
def evaluate_model(model, train_samples, train_labels, test_samples, test_labels):  
    """Compute a classification report and an AUC value on train and test data.

    Predictions are the model outputs thresholded at 0.5.

    Parameters:
        model: object with a `predict(samples)` method.
        train_samples, train_labels, test_samples, test_labels: evaluation data.

    Returns:
        (dict_train, dict_test, auc_train, auc_test): the two
        classification_report dictionaries and the two AUC values.
    """
    def _evaluate_split(title, header, samples, labels):
        # One prediction pass per split; report and AUC for that split.
        print(title)
        print(header)
        pred = (model.predict(samples) > 0.5).astype("int32")
        report = classification_report(labels, pred, output_dict = True)
        # NOTE(review): the ROC curve is built from the thresholded 0/1
        # predictions, not from the raw scores, so this AUC has a single
        # operating point. Kept unchanged so results stay comparable with
        # earlier runs - confirm whether score-based AUC was intended.
        fpr, tpr, _ = roc_curve(labels, pred)
        auc_value = auc(fpr, tpr)
        print("AUC: ", auc_value)
        print("___________________________________________________________")
        return report, auc_value

    dict_train, auc_train = _evaluate_split("Evaluate on train data", "TRAIN: ",
                                            train_samples, train_labels)
    # The previous version ran an additional model.predict(test_samples) whose
    # result was never used; that extra pass has been removed.
    dict_test, auc_test = _evaluate_split("Evaluate on test data", "TEST: ",
                                          test_samples, test_labels)
    return dict_train, dict_test, auc_train, auc_test
    
def confusion_matrix(model, samples, labels):
    """Return the row indices of true/false positives and negatives.

    Predictions are the model outputs thresholded at 0.5.

    Parameters:
        model: object with a `predict(samples)` method returning one score
            per sample (shape (n,) or (n, 1)).
        samples: input rows handed to `model.predict`.
        labels: 0/1 labels, one per sample.

    Returns:
        (tp, tn, fp, fn): four integer index arrays (in ascending order).
        The four counts are also printed.
    """
    predictions = (model.predict(samples) > 0.5).astype("int32")
    # Flatten so (n, 1) model outputs line up with 1-D labels.
    predicted = np.asarray(predictions).reshape(-1)
    actual = np.asarray(labels).reshape(-1)

    # Vectorised masks instead of a per-row Python loop; flatnonzero always
    # yields integer indices, even when a category is empty.
    tp = np.flatnonzero((predicted == 1) & (actual == 1))
    tn = np.flatnonzero((predicted == 0) & (actual == 0))
    fp = np.flatnonzero((predicted == 1) & (actual == 0))
    fn = np.flatnonzero((predicted == 0) & (actual == 1))

    print(len(tp), len(tn), len(fp), len(fn))
    print("_____________________________________________________________")
    return tp, tn, fp, fn


In [3]:
import numpy as np
import statistics
import os
import time
import csv
import shap
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import normalize
from scipy.spatial import distance

def compute_shap_values(file_path, samples, columns, explainer):
    """Compute SHAP values for `samples` and store them, unless already stored.

    When `file_path + ".tsv"` already exists nothing is done. Otherwise
    `explainer.shap_values(samples, silent=True)` is evaluated, element 0 of
    the result is written to `file_path + ".tsv"` (tab separated, no index)
    and a SHAP summary plot is saved to `file_path + ".png"`.

    NOTE(review): what element 0 of the result is depends on the container
    returned by the explainer - presumably the values for the model's single
    output; verify against the installed shap version.
    """
    tsv_path = file_path + ".tsv"
    if os.path.isfile(tsv_path):
        # Cached result present: skip the expensive computation.
        return

    print("START CALCULATING SHAP VALUES...")
    shap_values = explainer.shap_values(samples, silent = True)
    print("END CALCULATING SHAP VALUES")

    first_output = pd.DataFrame(shap_values[0])
    first_output.to_csv(tsv_path, sep = "\t", index = False)

    shap.summary_plot(shap_values, samples, feature_names = columns, show=False)
    plt.savefig(file_path + ".png")
    # Clear the current figure so the next plot starts empty.
    plt.clf()
    print("_____________________________________________________________________")

def get_shap_values(file_path, model, samples):
    """Read the SHAP values stored in `file_path + ".tsv"` as a float array.

    `model` and `samples` are not used; they are accepted only to keep the
    call signature.
    """
    tsv_path = file_path + ".tsv"
    stored = pd.read_csv(tsv_path, sep='\t')
    values = np.asarray(stored)
    return values.astype("float")
         

def weight_samples(shap_values, samples, scaler):
    """Scale each sample entry by the magnitude of its SHAP value.

    Returns `samples * |shap_values| / scaler`, computed element-wise.
    """
    magnitudes = np.abs(shap_values)
    weighted = np.multiply(samples, magnitudes)
    return weighted / scaler

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from numpy.random import seed
import statistics
import shap
import matplotlib.pyplot as plt
import csv
import time
from sklearn.preprocessing import StandardScaler

def save_results(dict_train, dict_test, zeros_train, zeros_test, auc_train, auc_test, file_path, title):
    """Append the train and test metrics of one run to the results TSV file.

    The output file is "<dir>/results <title>.tsv", where <dir> is
    `file_path` with its last two "/"-separated components removed. The last
    component of `file_path` is used as the run name in the header rows.

    Parameters:
        dict_train, dict_test: dictionaries with keys "0", "1", "macro avg"
            (each holding "precision", "recall", "f1-score") and "accuracy".
        zeros_train, zeros_test: values written in the "zeros shap" column.
        auc_train, auc_test: values written in the "AUC" column.
        file_path: path of the run's model, with "/" as separator.
        title: suffix of the results file name.
    """
    path_parts = file_path.split("/")
    run_name = path_parts[-1]
    out_path = "/".join(path_parts[:-2]) + "/results " + title + ".tsv"

    def _metric_rows(report, zeros, auc_value):
        # Rows for class 0, class 1 and the macro average of one split.
        rows = [[label, report[str(label)]["precision"], report[str(label)]["recall"],
                 report[str(label)]["f1-score"]] for label in (0, 1)]
        macro = report["macro avg"]
        rows.append(["macro", macro["precision"], macro["recall"], macro["f1-score"],
                     zeros, auc_value, report["accuracy"]])
        return rows

    # newline="" is what the csv module requires; without it extra blank
    # lines appear on platforms that translate "\n". The `with` block closes
    # the file, so no explicit close() is needed.
    with open(out_path, "a", newline = "") as res_file:
        tsv_writer = csv.writer(res_file, delimiter = "\t")
        print(run_name)
        tsv_writer.writerow([run_name + " train","precision","recall", "f1", "zeros shap", "AUC", "Accuracy"])
        tsv_writer.writerows(_metric_rows(dict_train, zeros_train, auc_train))

        tsv_writer.writerow([run_name + " test","precision","recall", "f1"])
        tsv_writer.writerows(_metric_rows(dict_test, zeros_test, auc_test))
                        
def save_confusion_matrix(tps, tns, fps, fns, title):
    """Write the per-iteration TP / TN / FP / FN index lists to `title + ".tsv"`.

    For every iteration i four pairs of rows are written: a label row
    ("Iter i TPs", "Iter i TNs", "Iter i FPs", "Iter i FNs") followed by the
    corresponding sequence of indices. An existing file is overwritten.

    Parameters:
        tps, tns, fps, fns: sequences of equal length, one entry per iteration.
        title: output path without the ".tsv" extension.
    """
    groups = (("TPs", tps), ("TNs", tns), ("FPs", fps), ("FNs", fns))
    # newline="" is what the csv module requires; the `with` block closes the
    # file, so no explicit close() is needed.
    with open(title + ".tsv", "w", newline = "") as cf_file:
        tsv_writer = csv.writer(cf_file, delimiter = "\t")
        for i in range(len(tps)):
            for suffix, values in groups:
                tsv_writer.writerow(["Iter " + str(i) + " " + suffix])
                tsv_writer.writerow(values[i])

def count_zeros(data):
    """Count the entries of `data` whose absolute value is below 0.01.

    `data` is iterated row by row; the per-row counts are summed with np.sum.
    """
    per_row_counts = []
    for row in data:
        near_zero = np.abs(row) < 0.01
        per_row_counts.append(np.count_nonzero(near_zero))
    return np.sum(per_row_counts)

    
# Single seed used for the whole notebook: it is read by main() as the
# random state for the data split and the oversampling.
r_seed = 1

# Seed NumPy's global generator (`seed` is numpy.random.seed) and
# TensorFlow's global generator before any model is built.
seed(r_seed)
tf.random.set_seed(r_seed)
"""
Runs the IDW method. Parameters:
    - dataset: Name of the chosen dataset. {"cancer", "HeartRisk", "student", "kidney"}
    - data_path: Path to the dataset folder
    - results_path: Path where the results are saved
    - regularizer: Type of regularizer. {None, "l1", "l2", "l1_l2"}
    - title: Title appended to the names of the result files

Results saved by the method:
    - keras models trained during IDW: {n_iter}_median_model_1_hidden
    - history of the keras models: {n_iter}_median_model_1_hidden.npy
    - SHAP scores computed on the train set: {n_iter}_median_model_1_hidden.tsv and {n_iter}_median_model_1_hidden.png
    - SHAP scores computed on the test set: {n_iter}_median_model_1_hidden_test.tsv and {n_iter}_median_model_1_hidden_test.png
    - precision, recall, f1, XCP, AUC and accuracy: results {title}.tsv
    - confusion matrices: confusion matrix {title}.tsv
"""
def main(dataset, data_path, results_path, title, regularizer = None):
    """Run the iterative train / explain / re-weight loop for one dataset.

    The dataset is loaded, split and preprocessed once. Then, for each
    iteration (5 when `regularizer` is None, otherwise 1):
      1. a one-hidden-layer model is trained (or reused if already saved)
         and evaluated on the current train and test samples;
      2. SHAP values are computed for the train and test samples with a
         KernelExplainer whose background is built with shap.kmeans;
      3. the samples are multiplied by the absolute SHAP values and
         re-standardised; they become the input of the next iteration;
      4. the metrics are appended to the results file.
    Finally the confusion-matrix indices of all iterations are saved and the
    number of epochs and elapsed seconds per iteration are printed.

    Parameters:
        dataset: dataset name understood by get_train_test.
        data_path: folder prefix of the dataset files.
        results_path: prefix of all per-iteration output files; it is split
            on "/" to derive the folder of the summary files.
        title: suffix of the summary file names.
        regularizer: None, "l1", "l2" or "l1_l2"; forwarded to compute_model.
    """
    model_layers = 1

    train_set, test_set, columns, categoricals, binary, fairness = get_train_test(dataset, data_path, r_seed)

    # The last column is treated as the label, every other column as a feature.
    train_samples = np.asarray(train_set[train_set.columns[:-1]]).astype("float")
    train_labels = np.asarray(train_set[train_set.columns[-1]]).astype("int64")

    test_samples = np.asarray(test_set[test_set.columns[:-1]]).astype("float")
    test_labels = np.asarray(test_set[test_set.columns[-1]]).astype("int64")

    train_samples, train_labels, test_samples, test_labels, males, females = \
                preprocess(dataset, train_samples,
                    train_labels, test_samples,
                    test_labels, r_seed, columns,
                    categoricals, binary, fairness)

    tps = []
    tns = []
    fps = []
    fns = []
    new_train = train_samples
    new_test = test_samples
    times = []
    epochs = []

    n_iter = 5 if regularizer is None else 1
    for i in range(n_iter):
        # model_layers is fixed to 1 above, hence the "1_hidden" file name.
        file_path = results_path + str(i) + "_median_model_1_hidden"

        start_time = time.time()
        compute_model(new_train, train_labels, new_test,
                      test_labels, file_path, model_layers, patience = 5,
                      min_iterations = 10, regularizer = regularizer)

        model = keras.models.load_model(file_path)
        # The history was saved as a pickled dict inside a 0-d array.
        history = np.load(file_path + '.npy', allow_pickle = True).item()

        dict_train, dict_test, auc_train, auc_test = evaluate_model(model, new_train,
                                               train_labels, new_test, test_labels)

        samples = new_train
        if males is not None and females is not None:
            # males / females index the concatenated train + test rows;
            # keep only the indices that fall inside the train samples.
            males = [e for e in males if e < samples.shape[0]]
            females = [e for e in females if e < samples.shape[0]]

            m = samples[males]
            f = samples[females]

            # Two k-means centres per group, four background rows in total.
            background_sample_m = shap.kmeans(m, 2)
            background_sample_f = shap.kmeans(f, 2)
            background_sample = np.concatenate((background_sample_m.data, background_sample_f.data))
        else:
            background_sample = shap.kmeans(samples, 4).data

        samples = np.concatenate((new_train, new_test))
        labels = np.concatenate((train_labels, test_labels))
        tp, tn, fp, fn = confusion_matrix(model, samples, labels)

        explainer = shap.KernelExplainer(model.predict, background_sample)

        tps.append(tp)
        tns.append(tn)
        fps.append(fp)
        fns.append(fn)

        shap_path = file_path
        compute_shap_values(shap_path, new_train, train_set.columns[:-1], explainer)
        shap_values_train = get_shap_values(shap_path, model, new_train)
        # Fraction of SHAP values that count_zeros considers (near) zero.
        zeros_train = count_zeros(shap_values_train)
        zeros_train = zeros_train/(new_train.shape[0] * new_train.shape[1])

        shap_path = file_path + "_test"
        compute_shap_values(shap_path, new_test, test_set.columns[:-1], explainer)
        shap_values_test = get_shap_values(shap_path, model, new_test)
        zeros_test = count_zeros(shap_values_test)
        zeros_test = zeros_test/(new_test.shape[0] * new_test.shape[1])

        # Inputs of the next iteration: samples weighted by |SHAP|, then
        # standardised with statistics of the weighted train samples.
        new_train = weight_samples(shap_values_train, new_train, 1)
        new_test =  weight_samples(shap_values_test, new_test, 1)

        std = StandardScaler()
        std.fit(new_train)
        new_train = std.transform(new_train)
        new_test = std.transform(new_test)

        save_results(dict_train, dict_test, zeros_train, zeros_test, auc_train, auc_test, file_path, title)

        time_ = time.time() - start_time
        times.append(time_)
        epochs.append(len(history['loss']))

    title_ = "/".join(file_path.split("/")[:-2]) + "/confusion matrix " + title
    save_confusion_matrix(tps,tns,fps,fns,title_)
    print(epochs, times)



# Folder prefix of the dataset files; handed to main() as data_path.
data_path = "dataset/cancer/" 

# Prefix of the per-iteration output files and suffix of the summary files.
results_path = "results IDW/cancer/1 hidden layer/"
title = "cancer 1 hidden"
# Run the whole pipeline on the "cancer" dataset with no regularizer.
main("cancer", data_path, results_path, title)

Loading Cancer
------------------------------------------------------------
Evaluate on train data
TRAIN: 
AUC:  0.9947368421052631
___________________________________________________________
Evaluate on test data
TEST: 
AUC:  0.9861111111111112
___________________________________________________________
353 356 1 4
_____________________________________________________________
0_median_model_1_hidden
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
 1/57 [..............................] - ETA: 0s - loss: 5.8275e-04 - binary_accuracy: 1.0000Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
INFO:tensorflow:Assets written to: results IDW/cancer/1 hidden layer/1_median_model_1_hidden\assets
Evaluate on train data
TRAIN: 
AUC:  0.9912280701754386
__________________________________

In [None]:
# Plot the test accuracy of every iteration, read from the results file.
dataset = "cancer"

labels = ["Baseline", "Iter. 1", "Iter. 2", "Iter. 3", "Iter. 4"]

accs = []

path =  "results IDW/"
# Open the results file in a `with` block so the handle is closed again
# (it used to stay open for the lifetime of the kernel).
with open(path + dataset + "/" + "results " + dataset +  " 1 hidden.tsv") as file:
    res = pd.read_csv(file, sep='\t')

# Only the "macro" rows carry a value in the "Accuracy" column.
accs.append(res[res[res.columns[0]] == "macro"]["Accuracy"].to_numpy(dtype="float64"))
accs = np.array(accs).flatten()

# Rows are selected by position: even positions go to train, odd ones to test.
accs_train = accs[0::2]
accs_test = accs[1::2]

steps = len(labels)

f, ax = plt.subplots(1,1, figsize = (6,6))

ax.set_ylim(0.5, 1.01)
ax.set_yticks(np.arange(0.5,1.05,0.1))
# Labels start at 0.51 so that rounding to one decimal yields 0.5 ... 1.0.
ax.set_yticklabels(labels = np.round(np.arange(0.51,1.05,0.1),1), fontsize = 20)
ax.set_xticks(np.arange(0,steps,1))
ax.set_xticklabels(labels = labels, rotation = 90, fontsize = 20)
ax.plot(accs_test[:5], color = 'r', alpha = 0.7)
ax.set_ylabel("Accuracy", fontsize = 20)

In [None]:
# Plot, for every iteration, the fraction of test-set SHAP values whose
# absolute value is below 0.01.
xcps = []
for i in range(5):
    # A dedicated name is used so that `path` from the previous cell is not overwritten.
    shap_path = "results IDW"+"/"+dataset+"/"+"1 hidden layer/"+str(i)+"_median_model_1_hidden_test"

    # These are the values stored for the TEST samples ("_test" suffix).
    shap_test = get_shap_values(shap_path, None, None)

    den = shap_test.shape[0] * shap_test.shape[1]
    xcps.append(np.count_nonzero(np.abs(shap_test) < 0.01)/den)
f, ax = plt.subplots(1,1, figsize = (6,6))

steps = len(labels)

ax.plot(xcps, lw = 2, alpha = 0.7)
# Only the font size of the y tick labels is changed. Forcing the fixed
# labels 0.5 ... 1.0 onto automatically placed ticks (as done before) shows
# numbers that do not correspond to the plotted values.
ax.tick_params(axis = "y", labelsize = 20)
ax.set_xticks(np.arange(0,steps,1))
ax.set_xticklabels(labels = labels, rotation = 90, fontsize = 18)
ax.set_ylabel("Explanation Compactness Percentage", fontsize = 18)