In [None]:
# Put the project root first on sys.path so the project's own packages
# (utils, network_model) can be imported from this notebook.
# NOTE(review): the absolute path below is machine-specific — adjust per environment.
import os
import sys
sys.path.insert(0, os.path.abspath('/home/stacked_ensemble/'))

# Silence all Python warnings and set TensorFlow's native log level to '2'
# (presumably hides INFO and WARNING messages — confirm for the TF version in use).
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
import tensorflow as tf
import keras.backend.tensorflow_backend as K
import shap
import datetime
import re
import copy
# NOTE(review): bare attribute reference — `set_session` is never called here,
# so this line has no effect. Confirm whether a session was meant to be installed.
K.set_session

import numpy as np
import pandas as pd
import utils.definition_network as dn
import pickle

from sklearn.utils import shuffle
from network_model.custom_ensemble import CustomEnsemble
from network_model.stacked_ensemble import StackedEnsemble
from utils.preprocess_data import PreprocessData
from utils.shap_analyse_plots import ShapAnalysePlots
from utils.con_postgres import ConPostgres

### 1. Helper functions and objects available for all tests in this experiment

In [None]:
def set_period_time_end(time_ini, task_desc):
    """Print a one-line timing report for a task that started at `time_ini`.

    Args:
        time_ini: `datetime.datetime` taken when the task started.
        task_desc: text printed at the beginning of the report line.

    The end time is taken as "now"; nothing is returned.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    finished_at = datetime.datetime.now()
    elapsed = finished_at - time_ini

    report_fields = (task_desc,
                     time_ini.strftime(stamp_format),
                     finished_at.strftime(stamp_format),
                     elapsed)
    print('%s\t Ini: %s  End: %s  Total: %s' % report_fields)

In [None]:
def select_samples_by_total_class(data_df, labels_set, total_sample_by_label):
    """Take the leading rows of each requested label and stack them together.

    Args:
        data_df: DataFrame with a `label` column.
        labels_set: iterable of label values; the output follows this order.
        total_sample_by_label: upper bound of the row slice taken per label.

    Returns:
        DataFrame built with `pd.concat` from the per-label slices
        (original index values are kept).
    """
    per_label_frames = [
        data_df[data_df.label == wanted_label][0:total_sample_by_label]
        for wanted_label in labels_set
    ]
    return pd.concat(per_label_frames)

### 2. SHAP stacked ensemble analysis (level 1: deep learning)

#### 2.1 Auxiliary Functions

In [None]:
def set_params_stacked_ensemble(dataset_train_path, dataset_test_path, load_cad_submodels):
    """Build the stacked-ensemble configuration dict and its experiment name.

    All hyper-parameters are fixed inside this function; the caller only
    supplies the dataset paths and the `load_cad_submodels` flag, which are
    stored in the configuration as-is. The submodel family is read from the
    notebook-level global `SUBMODEL`, which therefore must be defined before
    this function is called.

    Args:
        dataset_train_path: value stored under 'dataset_train_path'.
        dataset_test_path: value stored under 'dataset_test_path'.
        load_cad_submodels: value stored under 'load_cad_submodels'.

    Returns:
        Tuple `(name_test, set_network)`: the experiment name string and the
        configuration dictionary.
    """
    # Training schedule and network size.
    epoch = 16
    batch_size = 8
    use_submodel = {SUBMODEL: [1, 2, 3]}
    neurons_by_submodel = 12
    hidden_layer = 3

    # Compilation settings.
    metric = 'accuracy'
    loss_fn = 'binary_crossentropy'
    activation_output_fn = 'sigmoid'
    optimizer_fn = 'adam'

    # Settings applied to every hidden layer.
    activation_hidden_fn = 'tanh'
    kernel_initializer = 'glorot_uniform'
    use_bias = True
    bias_initializer = 'zeros'
    kernel_regularizer = None
    bias_regularizer = None
    activity_regularizer = None
    kernel_constraint = None
    bias_constraint = None

    path_submodels = dn.PATH_PROJECT + "weak_classifiers/"
    type_submodels = dn.TypePredictionLabel.MULTI_LABEL_CATEGORICAL

    # One independent dict per hidden layer (never share a single dict object).
    hidden_layers_set = [
        {
            'units': neurons_by_submodel,
            'activation': activation_hidden_fn,
            'use_bias': use_bias,
            'kernel_initializer': kernel_initializer,
            'bias_initializer': bias_initializer,
            'kernel_regularizer': kernel_regularizer,
            'bias_regularizer': bias_regularizer,
            'activity_regularizer': activity_regularizer,
            'kernel_constraint': kernel_constraint,
            'bias_constraint': bias_constraint,
        }
        for _ in range(hidden_layer)
    ]

    # The keys (including the 'optmizer_function' spelling) are part of the
    # configuration contract and are kept exactly as they are.
    set_network = {
        'epochs': epoch,
        'batch_size': batch_size,
        'patient_train': int(epoch / 2),
        'activation_output_fn': activation_output_fn,
        'loss_function': loss_fn,
        'optmizer_function': optimizer_fn,
        'main_metric': metric,
        'load_cad_submodels': load_cad_submodels,
        'dataset_train_path': dataset_train_path,
        'dataset_test_path': dataset_test_path,
        'path_submodels': path_submodels,
        'type_submodels': type_submodels,
        'submodels': use_submodel,
        'hidden_layers': hidden_layers_set,
    }

    # NOTE(review): '_US_' uses the number of keys in `use_submodel` (always 1
    # here), and '_AO_' is derived from `bias_constraint` rather than
    # `activation_output_fn`. Both are kept as-is; confirm before changing,
    # since stored artefacts may already carry names built this way.
    name_parts = [
        'E_', str(epoch),
        '_BS_', str(batch_size),
        '_US_', str(len(use_submodel)),
        '_N_', str(neurons_by_submodel),
        '_HL_', str(hidden_layer),
        '_M_', str(metric)[0:2],
        '_AO_', str(bias_constraint)[0:2],
        '_LF_', str(loss_fn)[0:2],
        '_OP_', str(optimizer_fn),
        '_AH_', str(activation_hidden_fn)[0:2],
        '_KI_', str(kernel_initializer)[0:2],
        '_UB_', str(use_bias)[0],
        '_BI_', str(bias_initializer)[0:2],
        '_KR_', str(kernel_regularizer),
        '_BR_', str(bias_regularizer),
        '_AR_', str(activity_regularizer),
        '_KC_', str(kernel_constraint)[0:2],
        '_BC_', str(bias_constraint)[0:2],
    ]
    name_test = ''.join(name_parts)

    return name_test, set_network


In [None]:
def load_stacked_ensemble():
    """Create a StackedEnsemble, configure it and load a pre-trained model file.

    The ensemble name and the configuration both depend on the notebook-level
    global `SUBMODEL`, so that global must be set before calling this function.

    Returns:
        The configured `StackedEnsemble` instance after its submodels and the
        pre-trained model have been loaded.
    """
    train_path = 'dataset/anx_dep_multilabel/SMHD_multi_label_test_train_2112.df'
    test_path = 'dataset/anx_dep_multilabel/SMHD_multi_label_test_test_528.df'

    experiment_name, network_conf = set_params_stacked_ensemble(train_path, test_path, False)
    print("Experiment: " + experiment_name)

    stacked = StackedEnsemble('stacked_submodels_'+SUBMODEL, 1, '')

    stacked.list_report_metrics = []
    stacked.ensemble_stacked_conf = network_conf
    stacked.k_fold = 5
    # Two separate list objects on purpose, so the attributes do not alias.
    stacked.labels_set = ['control', 'anxiety', 'depression']
    stacked.labels_ensemble = ['control', 'anxiety', 'depression']

    stacked.type_predict_label = dn.TypePredictionLabel.MULTI_LABEL_CATEGORICAL
    stacked.metrics_based_sample = False

    stacked.set_network_params_ensemble_stack()
    stacked.load_submodels()

    pretrained_model_file = (
        "experiments/stacked_ensemble/"
        "t2_E_16_BS_8_US_2_N_12_HL_3_M_ac_AO_No_LF_bi_OP_adam_AH_ta_KI_gl_UB_T_BI_ze_KR_None_BR_None_AR_None_KC_No_BC_No_train_valid_kf_0_ens_stk_model.h5"
    )
    stacked.load_pre_trained_model(dn.PATH_PROJECT + pretrained_model_file)
    return stacked


In [None]:
def get_tokenizer_by_submodel(ens_model):
    """Load the tokenizer of every submodel of the ensemble.

    Args:
        ens_model: object exposing `all_submodels`, a mapping whose values hold
            an 'exp' entry with a `pp_data.load_tokenizer()` method.

    Returns:
        dict mapping each submodel key to its loaded tokenizer.
    """
    started_at = datetime.datetime.now()

    tokenizers_by_model = {}
    for model_key, submodel in ens_model.all_submodels.items():
        tokenizers_by_model[model_key] = submodel['exp'].pp_data.load_tokenizer()

    set_period_time_end(started_at, "get_word_lookup_by_submodel...")

    return tokenizers_by_model

In [None]:
def fill_posts(x_data):
    """Replace the leading zero padding of one encoded post with its own tokens.

    The number of zeros in `x_data` is taken as the amount of padding, and
    everything after that many leading positions of the first row is treated
    as the real text (i.e. zeros are expected only as left padding). The
    padding is then overwritten with repetitions of the text so the row keeps
    its length but contains no pad tokens.

    Changes from the previous implementation:
      * the target length is the row's own length instead of a hard-coded
        5000 (results are identical for 5000-long rows);
      * a row made only of padding is returned unchanged instead of raising
        ZeroDivisionError;
      * no local variable shadows the `tf` (tensorflow) import any more.

    Args:
        x_data: 2-D numpy array holding a single row, shape (1, length).

    Returns:
        New numpy array of shape (1, length); `x_data` itself is not modified.
    """
    row = x_data[0]
    seq_len = len(row)
    total_pads = len(np.where(x_data == 0)[1])
    valid_terms = row[total_pads:]
    total_valid_terms = len(valid_terms)

    # Nothing to repeat (row is empty or all padding): hand back a copy.
    if total_valid_terms == 0:
        return np.array(x_data)

    if total_valid_terms >= total_pads:
        # The text is at least as long as the padding: the first `total_pads`
        # tokens of the text take the place of the padding.
        filled = np.concatenate((valid_terms[:total_pads], valid_terms))
    else:
        # The text is shorter than the padding: repeat it as many whole times
        # as fit in the padding (at least twice, as before) ...
        repeats = max(2, total_pads // total_valid_terms)
        filled = np.concatenate([valid_terms] * repeats)

        # ... then prepend the start of the repeated text to reach full length.
        missing = seq_len - len(filled)
        if missing > 0:
            filled = np.concatenate((filled[:missing], filled))

    return np.array([filled])

In [None]:
def fill_posts_samples(x_data):
    """Run `fill_posts` on every sample and store the result back in place.

    Args:
        x_data: indexable collection of encoded posts; it is modified in place.

    Returns:
        The same `x_data` object, with every sample replaced by its filled form.
    """
    for position in range(len(x_data)):
        # fill_posts works on a single-row 2-D array, so wrap the sample first.
        single_post = np.array([x_data[position]])
        x_data[position] = fill_posts(single_post)
    return x_data

In [None]:
def generate_explainer_by_submodel(ens_model, data_df):
    """Build one `shap.KernelExplainer` per submodel of the ensemble.

    For each submodel, `data_df` is encoded with that submodel's own
    pre-processing, its padding is filled by `fill_posts_samples`, and the
    result is used as the explainer's background data together with the
    submodel's `predict` function.

    Args:
        ens_model: ensemble exposing `all_submodels`, `type_predict_label`
            and `labels_set`.
        data_df: DataFrame handed to each submodel's `load_subdataset_generic`.

    Returns:
        dict mapping each submodel key to its KernelExplainer.
    """
    explainers_by_model = {}

    for model_key, submodel in ens_model.all_submodels.items():
        started_at = datetime.datetime.now()

        experiment = submodel['exp']
        experiment.pp_data.type_prediction_label = ens_model.type_predict_label
        background, _ = experiment.pp_data.load_subdataset_generic(data_df, ens_model.labels_set)

        # Overwrite the padding with the post's own text.
        background = fill_posts_samples(background)

        predict_fn = submodel['model_class'].model.predict
        explainers_by_model[model_key] = shap.KernelExplainer(predict_fn, background)
        set_period_time_end(started_at, "generate_explainer "+str(model_key)+"...")

    return explainers_by_model

In [None]:
def generate_shap_values_all_submodels(ens_model, data_df, explainers_dict, nsamples):
    """Predict and compute SHAP values for `data_df` with every submodel.

    Args:
        ens_model: ensemble exposing `all_submodels`, `type_predict_label`
            and `labels_set`.
        data_df: DataFrame handed to each submodel's `load_subdataset_generic`.
        explainers_dict: mapping of submodel key to an explainer offering
            `shap_values`.
        nsamples: forwarded to `shap_values` as `nsamples`.

    Returns:
        Tuple `(predicts_by_model, shap_values_dict)`, both keyed by submodel.
    """
    predictions = {}
    shap_by_model = {}

    for model_key, submodel in ens_model.all_submodels.items():
        started_at = datetime.datetime.now()

        experiment = submodel['exp']
        experiment.pp_data.type_prediction_label = ens_model.type_predict_label
        encoded, targets = experiment.pp_data.load_subdataset_generic(data_df, ens_model.labels_set)

        # Overwrite the padding with the post's own text.
        encoded = fill_posts_samples(encoded)

        predicted = submodel['model_class'].model.predict(encoded)
        predictions[model_key] = predicted
        print('Predict submodel ', model_key, ': ', predicted)

        shap_by_model[model_key] = explainers_dict[model_key].shap_values(
            encoded, y=targets, nsamples=nsamples)
        set_period_time_end(started_at, "generate_explainer "+str(model_key)+"...")

    return predictions, shap_by_model

In [None]:
def print_shap_values_words(shap_plot, ens_model, total_words):
    """Call `shap_plot.explainer_values_print` once per submodel key.

    Args:
        shap_plot: object offering `explainer_values_print(key, total_words)`.
        ens_model: ensemble whose `all_submodels` keys are iterated in order.
        total_words: forwarded unchanged to `explainer_values_print`.
    """
    for model_key in ens_model.all_submodels.keys():
        shap_plot.explainer_values_print(model_key, total_words)

In [None]:
def enconding_data_by_submodel(key_model, ens_model, data_df):
    """Encode `data_df` with one submodel's pre-processing and fill its padding.

    Args:
        key_model: key of the submodel inside `ens_model.all_submodels`.
        ens_model: ensemble exposing `all_submodels`, `type_predict_label`
            and `labels_set`.
        data_df: DataFrame handed to the submodel's `load_subdataset_generic`.

    Returns:
        Tuple `(x_test, y_test)`; `x_test` has been passed through
        `fill_posts_samples`.
    """
    preprocessor = ens_model.all_submodels[key_model]['exp'].pp_data
    preprocessor.type_prediction_label = ens_model.type_predict_label

    encoded, targets = preprocessor.load_subdataset_generic(data_df, ens_model.labels_set)

    # Overwrite the padding with the post's own text.
    return fill_posts_samples(encoded), targets

In [None]:
def predict_model(ens_model, data_df):
    """Print per-sample predictions of every submodel, then the ensemble's.

    One table row is printed per (submodel, sample) with the user id, the
    expected label, the rounded prediction and the raw prediction. Afterwards
    the ensemble is evaluated through `test_final_model` and its prediction is
    printed as well.

    Args:
        ens_model: ensemble exposing `all_submodels`, `type_predict_label`,
            `labels_set` and `test_final_model`.
        data_df: DataFrame with a `user_id` column, encoded per submodel.
    """
    print('\nModel \t UserId \t y      \t y_pred \t\t y_pred(%)')

    for model_key, submodel in ens_model.all_submodels.items():
        experiment = submodel['exp']
        experiment.pp_data.type_prediction_label = ens_model.type_predict_label
        encoded, targets = experiment.pp_data.load_subdataset_generic(data_df, ens_model.labels_set)

        # Overwrite the padding with the post's own text.
        encoded = fill_posts_samples(encoded)

        predicted = submodel['model_class'].model.predict(encoded)

        for position in range(len(targets)):
            row_fields = (model_key,
                          data_df.iloc[position].user_id,
                          targets[position],
                          np.round(predicted[position]),
                          predicted[position])
            print('%s \t %s \t %s \t %s \t %s' % row_fields)

    _, ensemble_prediction = ens_model.test_final_model(data_df)
    print('Predict ensemble model: ', ensemble_prediction)

In [None]:
def predict_model_by_year(ens_model, data_df):
    """Run `predict_model` separately for each distinct value of `data_df.year`.

    Args:
        ens_model: ensemble forwarded to `predict_model`.
        data_df: DataFrame with a `year` column.
    """
    for current_year in data_df.year.unique():
        rows_of_year = data_df[data_df.year == current_year]
        print('Predict sample for year = ', str(current_year))
        predict_model(ens_model, rows_of_year)

In [None]:
def generate_shap_values_by_submodel(key_model, shap_plot, ens_model, data_df, nsamples):
    """Predict and compute SHAP values for `data_df` with a single submodel.

    Args:
        key_model: key of the submodel inside `ens_model.all_submodels`.
        shap_plot: object whose `explainers_dict[key_model]` offers `shap_values`.
        ens_model: ensemble exposing `all_submodels`, `type_predict_label`
            and `labels_set`.
        data_df: DataFrame handed to the submodel's `load_subdataset_generic`.
        nsamples: forwarded to `shap_values` as `nsamples`.

    Returns:
        Tuple `(predicts_by_model, shap_values_dict)`; each dict holds exactly
        one entry, keyed by `key_model`.
    """
    started_at = datetime.datetime.now()

    submodel = ens_model.all_submodels[key_model]
    experiment = submodel['exp']
    experiment.pp_data.type_prediction_label = ens_model.type_predict_label
    encoded, targets = experiment.pp_data.load_subdataset_generic(data_df, ens_model.labels_set)

    # Overwrite the padding with the post's own text.
    encoded = fill_posts_samples(encoded)

    predicted = ens_model.all_submodels[key_model]['model_class'].model.predict(encoded)
    print('Predict submodel ', key_model, ': ', predicted)

    explainer = shap_plot.explainers_dict[key_model]
    shap_values = explainer.shap_values(encoded, y=targets, nsamples=nsamples)
    set_period_time_end(started_at, "generate_explainer "+str(key_model)+"...")

    return {key_model: predicted}, {key_model: shap_values}

In [None]:
def generate_dataframe_top_words(shap_plot, total_words, ens_model, samples_df, nsamples):
    """Collect, per submodel, the prediction, SHAP values and top words of a sample.

    For every submodel the SHAP values are computed (and stored on
    `shap_plot.shap_values_dict`, replacing its previous content), the sample
    is encoded, and the words returned by `shap_plot.get_signal_pos_neg_words`
    are gathered for each entry of `shap_plot.labels_classifier`. The
    `user_id`, `label` and `texts` columns of the result are taken from the
    first row of `samples_df` only.

    Args:
        shap_plot: plotting helper exposing `explainers_dict`,
            `labels_classifier`, `generate_word_lookup` and
            `get_signal_pos_neg_words`.
        total_words: forwarded to `get_signal_pos_neg_words`.
        ens_model: ensemble whose `all_submodels` keys are iterated.
        samples_df: DataFrame with `user_id`, `label` and `texts` columns.
        nsamples: forwarded to the SHAP value computation.

    Returns:
        DataFrame with one row per submodel.
    """
    column_names = ['user_id', 'label', 'texts', 'key_model',
                    'prediction', 'word_for_class', 'x_test',
                    'explainers_expected_value', 'shap_values', 'word_lookup']
    records = []

    for model_key in ens_model.all_submodels.keys():
        # Compute the Shapley values and the prediction.
        predictions, shap_plot.shap_values_dict = generate_shap_values_by_submodel(
            model_key, shap_plot, ens_model, samples_df, nsamples)

        # Recover the encoded words of the sample.
        encoded_posts, _ = enconding_data_by_submodel(model_key, ens_model, samples_df)

        # Overwrite the padding with the post's own text.
        encoded_posts = fill_posts_samples(encoded_posts)

        word_lookup = shap_plot.generate_word_lookup(model_key, encoded_posts)[0]

        words_by_class = {}
        for class_index, class_name in enumerate(shap_plot.labels_classifier):
            sig_pos, sig_neg, pos, neg = shap_plot.get_signal_pos_neg_words(
                model_key, word_lookup, total_words, class_index, 0)
            words_by_class[class_name] = {'sig_pos_words': sig_pos,
                                          'sig_neg_words': sig_neg,
                                          'pos_words': pos,
                                          'neg_words': neg}

        first_sample = samples_df[0:1]
        records.append([
            first_sample.user_id.values[0],
            first_sample.label.values[0],
            first_sample.texts.values[0],
            model_key,
            predictions[model_key],
            words_by_class,
            encoded_posts,
            shap_plot.explainers_dict[model_key].expected_value,
            {model_key: shap_plot.shap_values_dict[model_key]},
            # Stored as text; generate_plot turns it back into an object.
            str(word_lookup),
        ])

    return pd.DataFrame(records, columns=column_names)

In [None]:
def generate_plot(shap_plot, analise_model_df, samples_selec, max_features, submodel):
    """Draw the feature-importance plot of every selected sample and submodel.

    Submodel keys are built as `submodel + "1"` ... `submodel + "5"`. For each
    sample and key, the matching row of `analise_model_df` supplies the word
    lookup, the encoded sample and the SHAP values; the SHAP values are stored
    on `shap_plot.shap_values_dict[key]` before plotting.

    Keys that have no row for the sample in `analise_model_df` are skipped
    (previously this raised IndexError), so the function also works when the
    ensemble holds fewer than five submodels.

    Args:
        shap_plot: object exposing `shap_values_dict` and
            `feature_importance_plot(key, word_lookup, x_test, max_features=...)`.
        analise_model_df: DataFrame with `user_id`, `key_model`, `word_lookup`,
            `x_test` and `shap_values` columns.
        samples_selec: DataFrame with `user_id` and `label` columns.
        max_features: forwarded to `feature_importance_plot`.
        submodel: prefix of the submodel keys (e.g. 'CA').
    """
    key_models = [submodel+str(i) for i in range(1,6)]

    for i in range(len(samples_selec)):
        sample = samples_selec[i:i+1]
        user_id = sample.user_id.values[0]

        for key_model in key_models:
            # Select the analysis row once instead of re-filtering per column.
            rows = analise_model_df[(analise_model_df.user_id == user_id) &
                                    (analise_model_df.key_model == key_model)]
            if len(rows) == 0:
                # No analysis stored for this submodel key: nothing to plot.
                continue

            # NOTE(security): eval executes whatever text is stored in the
            # `word_lookup` column. Only use analysis frames produced by this
            # notebook; never load one from an untrusted source.
            wl = eval(rows.word_lookup.values[0])
            xt = rows.x_test.values[0]
            sv = rows.shap_values.values[0][key_model]

            print('AMOSTRA USER_ID %s, CLASS %s, MODEL %s' % (str(user_id), sample.label.values[0],
                                                              key_model))
            shap_plot.shap_values_dict[key_model] = sv

            shap_plot.feature_importance_plot(key_model, wl, xt, max_features=max_features)

In [None]:
def process_samples(ens_model, shap_plot, samples_selec, submodel):
    """Predict, explain, persist and plot every selected sample, one at a time.

    The predictions of all samples are printed first. Then, for each sample,
    the top-words analysis frame is generated, pickled under
    `dn.PATH_PROJECT` as 'analise_<submodel>_user_<user_id>.df' and plotted.

    The submodel name is now matched case-insensitively when choosing the
    number of SHAP samples (previously only the lower-case spelling worked,
    although the rest of the function already normalised the case).

    Args:
        ens_model: loaded stacked ensemble.
        shap_plot: plotting helper holding the explainers of that ensemble.
        samples_selec: DataFrame with the samples to explain (needs `user_id`).
        submodel: 'ca', 'cd' or 'cad' in any letter case.

    Raises:
        KeyError: if `submodel` is not one of the known names.
    """
    # Number of SHAP samples used per submodel family.
    nsample = dict({'ca': 30, 'cd':30, 'cad': 25})
    submodel_key = submodel.lower()

    predict_model(ens_model, samples_selec)

    for i in range(len(samples_selec)):
        sample = samples_selec[i:i+1]
        df = generate_dataframe_top_words(shap_plot, 100, ens_model, sample, nsample[submodel_key])

        df.to_pickle(dn.PATH_PROJECT + 'analise_'+submodel_key+'_user_'+
                     str(sample.user_id.values[0])+'.df')

        generate_plot(shap_plot, df, sample, 10, submodel.upper())

In [None]:
def generate_shap_ca():
    """Load the ensemble and build its SHAP helpers for the current settings.

    Reads the notebook-level globals `DATASET_TRAIN`, `LABEL_CLASS` and
    `TOTAL_SAMPLES_TRAIN_BY_CLASS` (and, through `load_stacked_ensemble`,
    `SUBMODEL`), so they must be set before calling.

    Returns:
        Tuple `(ensemble, shap_plot)`: the loaded stacked ensemble and a
        `ShapAnalysePlots` holding its labels, tokenizers and explainers.
    """
    print('SETS ----> ', DATASET_TRAIN, LABEL_CLASS)

    # Background data for the explainers: a capped number of rows per label.
    full_train_df = pd.read_pickle(dn.PATH_PROJECT + DATASET_TRAIN)
    background_df = select_samples_by_total_class(full_train_df, LABEL_CLASS,
                                                  TOTAL_SAMPLES_TRAIN_BY_CLASS)
    print(background_df.groupby('label').size())

    ensemble = load_stacked_ensemble()

    plots = ShapAnalysePlots()
    plots.labels_classifier = ensemble.labels_set
    plots.tokenizers_dict = get_tokenizer_by_submodel(ensemble)
    plots.explainers_dict = generate_explainer_by_submodel(ensemble, background_df)

    return ensemble, plots

In [None]:
def generate_shap_cd():
    """Load the ensemble and build its SHAP helpers for the current settings.

    Reads the notebook-level globals `DATASET_TRAIN`, `LABEL_CLASS` and
    `TOTAL_SAMPLES_TRAIN_BY_CLASS` (and, through `load_stacked_ensemble`,
    `SUBMODEL`), so they must be set before calling.

    Returns:
        Tuple `(ensemble, shap_plot)`: the loaded stacked ensemble and a
        `ShapAnalysePlots` holding its labels, tokenizers and explainers.
    """
    print('SETS ----> ', DATASET_TRAIN, LABEL_CLASS)

    # Background data for the explainers: a capped number of rows per label.
    source_df = pd.read_pickle(dn.PATH_PROJECT + DATASET_TRAIN)
    sampled_df = select_samples_by_total_class(source_df, LABEL_CLASS,
                                               TOTAL_SAMPLES_TRAIN_BY_CLASS)
    print(sampled_df.groupby('label').size())

    stacked = load_stacked_ensemble()

    analyser = ShapAnalysePlots()
    analyser.labels_classifier = stacked.labels_set
    analyser.tokenizers_dict = get_tokenizer_by_submodel(stacked)
    analyser.explainers_dict = generate_explainer_by_submodel(stacked, sampled_df)

    return stacked, analyser

In [None]:
def generate_shap_cad():
    """Load the ensemble and build its SHAP helpers for the current settings.

    Reads the notebook-level globals `DATASET_TRAIN`, `LABEL_CLASS` and
    `TOTAL_SAMPLES_TRAIN_BY_CLASS` (and, through `load_stacked_ensemble`,
    `SUBMODEL`), so they must be set before calling.

    Returns:
        Tuple `(ensemble, shap_plot)`: the loaded stacked ensemble and a
        `ShapAnalysePlots` holding its labels, tokenizers and explainers.
    """
    print('SETS ----> ', DATASET_TRAIN, LABEL_CLASS)

    # Background data for the explainers: a capped number of rows per label.
    loaded_df = pd.read_pickle(dn.PATH_PROJECT + DATASET_TRAIN)
    reference_df = select_samples_by_total_class(loaded_df, LABEL_CLASS,
                                                 TOTAL_SAMPLES_TRAIN_BY_CLASS)
    print(reference_df.groupby('label').size())

    ens_stk = load_stacked_ensemble()

    shap_helper = ShapAnalysePlots()
    shap_helper.labels_classifier = ens_stk.labels_set
    shap_helper.tokenizers_dict = get_tokenizer_by_submodel(ens_stk)
    shap_helper.explainers_dict = generate_explainer_by_submodel(ens_stk, reference_df)

    return ens_stk, shap_helper

#### 2.2 Analysis of the ensemble model via submodels

In [None]:
# Settings for the CA run. The helper functions read these names as globals,
# so they must be assigned before generate_shap_ca() is called.
# SUBMODEL: used in the ensemble name and as the key of the submodel config.
SUBMODEL = 'CA'
# DATASET_TRAIN: pickle path, relative to dn.PATH_PROJECT, of the explainer background data.
DATASET_TRAIN = 'dataset/anxiety/SMHD_train_1040.df'
# LABEL_CLASS: labels sampled from that dataset.
LABEL_CLASS = ['control', 'anxiety', 'depression']
# TOTAL_SAMPLES_TRAIN_BY_CLASS: maximum number of rows kept per label.
TOTAL_SAMPLES_TRAIN_BY_CLASS = 300

# Load the ensemble and build its tokenizers and explainers.
ens_stk_ca, shap_plot_ca = generate_shap_ca()

In [None]:
# Settings for the CD run. The helper functions read these names as globals,
# so they must be assigned before generate_shap_cd() is called.
# SUBMODEL: used in the ensemble name and as the key of the submodel config.
SUBMODEL = 'CD'
# DATASET_TRAIN: pickle path, relative to dn.PATH_PROJECT, of the explainer background data.
DATASET_TRAIN = 'dataset/depression/SMHD_train_2160.df'
# LABEL_CLASS: labels sampled from that dataset.
LABEL_CLASS = ['control', 'anxiety', 'depression']
# TOTAL_SAMPLES_TRAIN_BY_CLASS: maximum number of rows kept per label.
TOTAL_SAMPLES_TRAIN_BY_CLASS = 300

# Load the ensemble and build its tokenizers and explainers.
ens_stk_cd, shap_plot_cd = generate_shap_cd()

In [None]:
# Settings for the CAD run. The helper functions read these names as globals,
# so they must be assigned before generate_shap_cad() is called.
# SUBMODEL: used in the ensemble name and as the key of the submodel config.
SUBMODEL = 'CAD'
# DATASET_TRAIN: pickle path, relative to dn.PATH_PROJECT, of the explainer background data.
DATASET_TRAIN = 'dataset/anxiety,depression/SMHD_train_880.df'
# LABEL_CLASS: labels sampled from that dataset.
LABEL_CLASS = ['control', 'anxiety,depression']
# TOTAL_SAMPLES_TRAIN_BY_CLASS: maximum number of rows kept per label.
TOTAL_SAMPLES_TRAIN_BY_CLASS = 200

# Load the ensemble and build its tokenizers and explainers.
ens_stk_cad, shap_plot_cad = generate_shap_cad()

In [None]:
# Load the samples selected for interpretation and show the row count per label.
test_df = pd.read_pickle(dn.PATH_PROJECT + 'dataset/samples_for_interpretation/SMHD_multi_label_test_test_352_user_id.df')
test_df.groupby('label').size()

In [None]:
# Predict, explain, pickle and plot every test sample with the CA ensemble.
process_samples(ens_stk_ca, shap_plot_ca, test_df, 'ca')

In [None]:
# Predict, explain, pickle and plot every test sample with the CD ensemble.
process_samples(ens_stk_cd, shap_plot_cd, test_df, 'cd')

In [None]:
# Predict, explain, pickle and plot every test sample with the CAD ensemble.
process_samples(ens_stk_cad, shap_plot_cad, test_df, 'cad')