# Working with SHAP to do some visualization on the D1 DAN model
Sean Browning

In [1]:
# === Lib ==========
import os
import pickle as pkl
import shap

import numpy as np
import pandas as pd
import tensorflow.keras as keras
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import TensorBoard

import tools.analysis as ta
import tools.preprocessing as tp
import tools.keras as tk

In [12]:
# === GLOBALS ======
OUTCOME = "misa_pt"      # outcome name; also the prefix of the pickled sequence file
DAY_ONE_ONLY = True      # restrict each record's features to a single day (see data prep)
WEIGHTED_LOSS = False    # when True, balanced class weights are computed for fitting
TEST_SPLIT = 0.2         # share of all records held out as the test set
VAL_SPLIT = 0.1          # share of the remaining training records used for validation
RAND = 2021              # seed passed to the train/val/test splits
TB_UPDATE_FREQ = 100
MOD_NAME = "dan"
TIME_SEQ = 225           # input length handed to the model
BATCH_SIZE = 128
EPOCHS = 20

# === DIRS ================

# `_dh` is IPython's directory history; its first entry is used as the
# notebook's base directory. Outside IPython (e.g. the notebook exported to a
# plain script) `_dh` does not exist, so fall back to the working directory
# instead of raising KeyError.
pwd = globals().get('_dh', [os.getcwd()])[0]

# Build the data/output locations from the repo layout, relative to `pwd`
data_dir = os.path.abspath(os.path.join(pwd, "..", "data", "data", ""))
output_dir = os.path.abspath(os.path.join(pwd, "..", "output", ""))

tensorboard_dir = os.path.abspath(
    os.path.join(data_dir, "..", "model_checkpoints"))
pkl_dir = os.path.join(output_dir, "pkl")
stats_dir = os.path.join(output_dir, "analysis")
probs_dir = os.path.join(stats_dir, "probs")

## Load in data

In [5]:
# Data load
def _read_pickle(filename):
    """Return the object unpickled from `filename` inside `pkl_dir`.

    NOTE(review): unpickling executes arbitrary code, so these files must
    come from a trusted source.
    """
    with open(os.path.join(pkl_dir, filename), "rb") as handle:
        return pkl.load(handle)


# Per-record tuples; later cells read element 0 as features, 1 as
# demographics and 2 as the label
inputs = _read_pickle(OUTCOME + "_trimmed_seqs.pkl")

# Feature vocabulary (its size sets N_VOCAB)
vocab = _read_pickle("all_ftrs_dict.pkl")

all_feats = _read_pickle("feature_lookup.pkl")

# Demographic lookup (its size sets N_DEMOG)
demog_lookup = _read_pickle("demog_dict.pkl")

## Model-specific settings

In [6]:
# Determining number of vocab entries
# (the +1 presumably leaves room for the 0 padding index — TODO confirm)
N_VOCAB = len(vocab) + 1
N_DEMOG = len(demog_lookup) + 1
MAX_DEMOG = max(len(x) for _, x, _ in inputs)
N_CLASS = max(x for _, _, x in inputs) + 1

# Setting y here so it's stable
y = np.array([l[2] for l in inputs])

# Create some metrics
# F1Score's default threshold=None binarises predictions against the
# row-wise maximum. With a single output column that marks every prediction
# as positive, so the binary case needs an explicit 0.5 cut-off; the
# multi-class case keeps the max-based (argmax) behaviour.
IS_MULTICLASS = N_CLASS > 2
metrics = [
    keras.metrics.AUC(num_thresholds=int(1e5), name="ROC-AUC"),
    keras.metrics.AUC(num_thresholds=int(1e5), curve="PR", name="PR-AUC"),
    tfa.metrics.F1Score(num_classes=N_CLASS if IS_MULTICLASS else 1,
                        average="weighted",
                        threshold=None if IS_MULTICLASS else 0.5)
]

# NOTE(review): the loss is chosen from the outcome name while the metrics
# and training cells branch on N_CLASS > 2 — confirm the two always agree.
if OUTCOME == 'multi_class':
    loss_fn = keras.losses.categorical_crossentropy
else:
    loss_fn = keras.losses.binary_crossentropy

callbacks = [
    # Create early stopping callback: stop once val_loss has failed to
    # improve for 2 consecutive epochs
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                    min_delta=0,
                                    patience=2,
                                    mode="auto")
]

## Splitting the data

In [13]:
# Splitting the data
# Work on row positions rather than the rows themselves so the same index
# lists can be reused for both X and y.
all_idx = range(len(inputs))

# First carve off the test rows, stratified on the label
train, test = train_test_split(all_idx,
                               test_size=TEST_SPLIT,
                               stratify=y,
                               random_state=RAND)

# Then take the validation rows out of what is left of train, so VAL_SPLIT
# is a fraction of the post-test training rows, not of the whole dataset
train, val = train_test_split(train,
                              test_size=VAL_SPLIT,
                              stratify=y[train],
                              random_state=RAND)

# Optional weighting: balanced class weights estimated from training labels
weight_dict = None
if WEIGHTED_LOSS:
    classes = np.unique(y)
    weights = compute_class_weight('balanced', classes=classes, y=y[train])
    weight_dict = dict(zip(classes, weights))

## Preparing data

In [8]:
if DAY_ONE_ONLY:
    # Optionally limiting the features to only those from the first day
    # of the actual COVID visit
    # NOTE(review): this keeps the LAST element of each record's sequence;
    # confirm that the last element is the first day of the COVID visit.
    features = [l[0][-1] for l in inputs]
else:
    features = [tp.flatten(l[0]) for l in inputs]

# Handling demog
# Demographic codes are shifted by N_VOCAB - 1 before being appended to each
# record's feature codes (presumably so the two code ranges do not collide —
# TODO confirm).
# WARNING: this cell is not idempotent — it mutates `vocab` and reassigns
# N_VOCAB, so re-running it without re-running the earlier cells applies a
# different offset.
new_demog = [[i + N_VOCAB - 1 for i in l[1]] for l in inputs]
features = [
    features[i] + new_demog[i] for i in range(len(features))
]
demog_vocab = {k: v + N_VOCAB - 1 for k, v in demog_lookup.items()}
vocab.update(demog_vocab)
N_VOCAB = np.max([np.max(l) for l in features]) + 1

# Making the variables
# Pad/truncate to TIME_SEQ so X always matches the input_length the model is
# built with (previously a hard-coded 225 that could drift from TIME_SEQ)
X = keras.preprocessing.sequence.pad_sequences(features,
                                                maxlen=TIME_SEQ,
                                                padding='post')

## Model Training

In [19]:
# Produce DAN model to fit
# This cell passes y as plain integer labels, matching the binary branch of
# the other training cell in this notebook.
model = tk.DAN(vocab_size=N_VOCAB,
                ragged=False,
                input_length=TIME_SEQ)

model.compile(optimizer="adam", loss=loss_fn, metrics=metrics)

# class_weight is passed so WEIGHTED_LOSS is honoured here as well;
# weight_dict is None when weighting is disabled, which Keras treats as
# "no class weighting".
model.fit(X[train],
            y[train],
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_data=(X[val], y[val]),
            callbacks=callbacks,
            class_weight=weight_dict)

Epoch 1/20


ValueError: in user code:

    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow_addons\metrics\f_scores.py:159 update_state  *
        self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight))
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:823 assign_add  **
        assign_add_op = gen_resource_variable_ops.assign_add_variable_op(
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\ops\gen_resource_variable_ops.py:56 assign_add_variable_op
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\op_def_library.py:742 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\func_graph.py:591 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\ops.py:3477 _create_op_internal
        ret = Operation(
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\ops.py:1974 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimension 0 in both shapes must be equal, but are 2 and 1. Shapes are [2] and [1]. for '{{node AssignAddVariableOp_10}} = AssignAddVariableOp[dtype=DT_FLOAT](AssignAddVariableOp_10/resource, Sum_10)' with input shapes: [], [1].


In [14]:
# Only the label encoding and one model argument depend on the task type;
# resolve those first, then build/compile/fit once.
if N_CLASS > 2:
    # We have to pass one-hot labels for model fit, but CLF metrics
    # will take indices
    n_values = np.max(y) + 1
    y_one_hot = np.eye(n_values)[y]
    fit_labels = y_one_hot
    extra_model_args = {"n_classes": N_CLASS}
else:
    # Binary case: integer labels are used as-is and DAN keeps its default
    # number of classes
    fit_labels = y
    extra_model_args = {}

# Produce DAN model to fit
model = tk.DAN(vocab_size=N_VOCAB,
               ragged=False,
               input_length=TIME_SEQ,
               **extra_model_args)

model.compile(optimizer="adam", loss=loss_fn, metrics=metrics)

# Trailing semicolon keeps the notebook from echoing the returned History
model.fit(X[train],
          fit_labels[train],
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(X[val], fit_labels[val]),
          callbacks=callbacks,
          class_weight=weight_dict);

Epoch 1/20


ValueError: in user code:

    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow_addons\metrics\f_scores.py:159 update_state  *
        self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight))
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:823 assign_add  **
        assign_add_op = gen_resource_variable_ops.assign_add_variable_op(
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\ops\gen_resource_variable_ops.py:56 assign_add_variable_op
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\op_def_library.py:742 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\func_graph.py:591 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\ops.py:3477 _create_op_internal
        ret = Operation(
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\ops.py:1974 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    c:\Users\oet5\premier_analysis\venv\lib\site-packages\tensorflow\python\framework\ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimension 0 in both shapes must be equal, but are 2 and 1. Shapes are [2] and [1]. for '{{node AssignAddVariableOp_10}} = AssignAddVariableOp[dtype=DT_FLOAT](AssignAddVariableOp_10/resource, Sum_10)' with input shapes: [], [1].


## Generate Predictions

In [None]:
# Score the validation rows, then the test rows, with the DAN model
val_probs, test_probs = (model.predict(X[rows]) for rows in (val, test))

## Working with Shapley
I think here we can just use the `DeepExplainer` class, which is an implementation of DeepLIFT.


First we have to take a sample of the training data to pass to the explainer.

In [None]:
# Sample for the explainer: draw distinct training rows from X.
# (The original indexed an undefined lowercase `x`.)
# A RandomState seeded with RAND makes the draw reproducible, and the size is
# capped at the number of training rows so replace=False cannot fail.
N_SHAP_BACKGROUND = 100
shap_rng = np.random.RandomState(RAND)
shap_idx = shap_rng.choice(train,
                           min(N_SHAP_BACKGROUND, len(train)),
                           replace=False)
shap_sample = X[shap_idx]

Create our explainer

In [None]:
 
# Build the SHAP deep explainer around the model, with the sampled training
# rows as its data
explain = shap.DeepExplainer(model, data=shap_sample)