In [1]:
%matplotlib inline

import imp
import keras.backend
import keras.models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

import keras
from keras.datasets import mnist
from keras.models import Model
# from keras.optimizers import RMSprop
from keras import optimizers

import innvestigate
import innvestigate.applications
import innvestigate.applications.mnist
import innvestigate.utils as iutils
import innvestigate.utils.visualizations as ivis
from innvestigate.utils.tests.networks import base as network_base
import time

from IPython.core.display import display, HTML
from innvestigate.tools import Perturbation, PerturbationAnalysis

eutils = imp.load_source("utils", "../utils.py")
mnistutils = imp.load_source("utils_mnist", "../utils_mnist.py")

Using TensorFlow backend.


# Introduction

In this experiment, we are going to build a sentiment analysis classifier, similar to [Arras et al. (2017)][arras]. In particular, we are going to predict the sentiment of movie reviews, and apply explanation methods provided by **iNNvestigate** to analyze which words in each sentence influence the predicted sentiment.

The dataset that we are going to use is the [Stanford Sentiment Treebank][standford]. Although the original dataset contains reviews in 5 categories: *very negative, negative, neutral, positive,* and *very positive*, *neutral* reviews are excluded here because we are only interested in whether a review is positive or negative. In other words, we are going to build a binary classifier.


In the following, we first download a prepared dataset to the local machine and load it into the notebook. Then, we construct a neural network model that receives reviews as input and predicts their sentiments. Finally, we apply various explanation methods implemented in iNNvestigate to explain the decisions of the model. Below is an example of an explanation where red indicates a high relevance score in favour of the prediction, while blue indicates the opposite.

![][sample]

[arras]: http://www.aclweb.org/anthology/W16-1601
[standford]: https://nlp.stanford.edu/sentiment/
[sample]: https://i.imgur.com/AZm1YcD.png

# Data Preparation
As this notebook is meant to demonstrate the functionalities of iNNvestigate, the data has already been prepared and can be downloaded from:
```
https://drive.google.com/uc?authuser=0&id=1c58cNitBbiLhyC5mUh01JtWgzEElaZgF&export=download
```
The archive contains the following files:
```
sstb-bin
├── dict.txt
├── embeddings.npy
├── sequence_test.txt
└── sequence_train.txt
```

Once the download is complete, please extract the archive and set the `DATA_PATH` variable to the extracted directory.

In [23]:
# Project Parameters

# Directory that holds the extracted dataset files (must be edited before running).
DATA_PATH = '<PLEASE_UPDATE_THIS_PATH_ACCORDINGLY>'

# Reviews are cut off after this many tokens.
MAX_SEQ_LENGTH = 40

# Mapping from target index to a human-readable sentiment name.
LABEL_TO_CLASS_NAME = dict(enumerate(('negative', 'positive')))
NUM_CLASSES = len(LABEL_TO_CLASS_NAME)

In [3]:
# load embedding weights
embedding_weights = np.load('%s/embeddings.npy' % DATA_PATH)

# load dictionary: the row position of each entry is used as its vocabulary index.
# - '::' is a multi-character separator, which only the python parsing engine
#   supports; requesting that engine explicitly avoids pandas' ParserWarning
#   about falling back from the C engine.
# - keep_default_na=False keeps words such as 'null', 'nan' or 'n/a' as plain
#   strings instead of converting them to float NaN (which would later break
#   ' '.join(...) on decoded words).
df_vocab = pd.read_csv(
    '%s/dict.txt' % DATA_PATH,
    sep='::',
    header=None,
    names=['vocab'],
    engine='python',
    keep_default_na=False,
)
decode_dict = dict(enumerate(df_vocab['vocab'].tolist()))

  """


In [5]:
def read_data(path):
    """Read a sequence file into token-index sequences and labels.

    Each non-empty line is expected to hold whitespace-separated integers:
    the first one is the label, the remaining ones are vocabulary indices.
    Both are stored 1-based in the file and are shifted to 0-based here.

    Blank lines are skipped and runs of whitespace are treated as a single
    separator, so a trailing newline or a double space does not raise.

    Args:
        path: path of the text file to read.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a list of lists of 0-based
        vocabulary indices (one list per line, possibly empty) and ``y`` is
        a list with the 0-based label of each line.

    Raises:
        ValueError: if a token cannot be parsed as an integer.
    """
    x = []
    y = []

    with open(path) as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # nothing on this line (e.g. trailing blank line)
                continue

            tokens = np.array(fields).astype(int)

            # the vocab indices in the data start from 1
            x.append(list(tokens[1:] - 1))

            # the labels in the data start from 1 as well
            y.append(tokens[0] - 1)

    return x, y

In [8]:
dataset_names = {
    'train': 'sequence_train.txt',
    'test': 'sequence_test.txt',
}
datasets = dict()


for k, v in dataset_names.items():
    x, y = read_data('%s/%s' % (DATA_PATH, v))
    y = np.array(y)
    total_samples = y.shape[0]

    # create a 3-dimensional data matrix (sample, position, embedding);
    # reviews are truncated to MAX_SEQ_LENGTH and shorter ones stay zero-padded
    xd = np.zeros((total_samples, MAX_SEQ_LENGTH, embedding_weights.shape[1]))
    for i in range(total_samples):
        seq = x[i][:MAX_SEQ_LENGTH]
        if seq:
            xd[i, :len(seq), :] = embedding_weights[seq]

    # filter out neutral reviews (label 2 after the shift to 0-based labels)
    indices = np.where(y != 2)

    y_selected = y[indices].reshape(-1)
    y_final = np.zeros(y_selected.shape)
    # positive and very positive reviews get target `1`,
    # negative and very negative reviews get target `0`.
    y_final[y_selected > 2] = 1

    # Reviews have different lengths, so they are stored in a 1-d object array
    # filled element by element. Calling np.array() on a ragged list of lists
    # raises a ValueError on recent NumPy versions.
    reviews = np.empty(total_samples, dtype=object)
    for i, seq in enumerate(x):
        reviews[i] = seq

    padded_selected = xd[indices]

    # store padded reviews, targets, and reviews (without padding)
    datasets[k] = dict(
        padded_reviews=padded_selected,
        padded_reviews_4d=np.expand_dims(padded_selected, axis=1),
        targets=y_final.astype(int),
        reviews=reviews[indices]
    )

In [9]:
# Report how many (non-neutral) reviews ended up in each split.
print('We have %d reviews in the training set, and %d reviews in the testing set' %
      tuple(len(datasets[split]['reviews']) for split in ('train', 'test')))

We have 98788 reviews in the training set, and 1821 reviews in the testing set


In [10]:
sample_idx = 90860

# Decode the token indices of the chosen training review back into words.
sample_words = [decode_dict[token] for token in datasets['train']['reviews'][sample_idx]]
print('Review %d:\n%s' % (sample_idx, ' '.join(sample_words)))

Review 90860:
are jarring and deeply out of place in what could have -lrb- and probably should have -rrb- been a lighthearted comedy


# Model Construction

For this purpose, we are going to use a convolutional neural network, which was studied in [Arras et al. (2016)][arras2]. As shown below, the architecture has a convolutional layer, convolving word embeddings of two words, followed by a max pooling layer and a softmax layer.

![][arch]

[arch]: https://i.imgur.com/GE8nrWX.png
[arras2]: https://arxiv.org/abs/1612.07843

In [11]:
def build_network(input_shape, output_n, activation=None, dense_unit=256, dropout_rate=0.25):
    """Assemble the layers of the convolutional sentiment classifier.

    The graph is: input -> Conv2D (100 filters, kernel (1, 2), 'valid')
    -> MaxPooling2D with pool size (1, input_shape[2] - 1) -> Flatten
    -> dense output layer -> softmax.

    Args:
        input_shape: shape handed to ``network_base.input_layer``; index 2 is
            used to size the pooling window.
            NOTE(review): the caller in this notebook passes
            ``(None, 1, MAX_SEQ_LENGTH, embedding_dim)``; the pooling window
            presumably spans all positions left after the convolution, which
            assumes a channels-last layout. Confirm.
        output_n: number of units of the dense output layer.
        activation: activation of the dense output layer, passed through
            unchanged. The default ``None`` is forwarded as-is. Previously any
            truthy value was silently replaced by ``"relu"``.
        dense_unit: unused; kept for interface compatibility.
        dropout_rate: unused; kept for interface compatibility.

    Returns:
        A dict with the tensors ``"in"``, ``"conv"``, ``"pool"``, ``"out"``
        (pre-softmax output) and ``"sm_out"`` (softmax output), plus the
        ``"input_shape"`` and ``"output_n"`` values that were passed in.
    """
    net = {}
    net["in"] = network_base.input_layer(shape=input_shape)
    net["conv"] = keras.layers.Conv2D(filters=100, kernel_size=(1, 2), strides=(1, 1), padding='valid')(net["in"])
    net["pool"] = keras.layers.MaxPooling2D(pool_size=(1, input_shape[2] - 1), strides=(1, 1))(net["conv"])
    net["out"] = network_base.dense_layer(keras.layers.Flatten()(net["pool"]), units=output_n, activation=activation)
    net["sm_out"] = network_base.softmax(net["out"])

    net.update({
        "input_shape": input_shape,
        "output_n": output_n,
    })
    return net

# Build the layer graph for inputs of shape (batch, 1, MAX_SEQ_LENGTH, embedding_dim).
net = build_network((None, 1, MAX_SEQ_LENGTH, embedding_weights.shape[1]), NUM_CLASSES)

# Two models over the same input: one ends at the pre-softmax output,
# the other one at the softmax output.
model_without_softmax = Model(inputs=net['in'], outputs=net['out'])
model_with_softmax = Model(inputs=net['in'], outputs=net['sm_out'])

In [14]:
def train_model(model,  batch_size=128, epochs=20):
    """Compile and fit `model` on the training split, then print test metrics.

    The model is compiled with categorical cross-entropy, the Adam optimizer
    and the accuracy metric. Targets of both splits are one-hot encoded with
    NUM_CLASSES columns before use. Nothing is returned; the test loss and
    test accuracy are printed.
    """
    train_split, test_split = datasets['train'], datasets['test']

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(train_split['targets'], NUM_CLASSES)
    y_test = keras.utils.to_categorical(test_split['targets'], NUM_CLASSES)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.Adam(),
        metrics=['accuracy'],
    )

    model.fit(
        train_split['padded_reviews_4d'],
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
    )

    # evaluate() yields [loss, accuracy] for the single metric compiled above
    test_loss, test_accuracy = model.evaluate(test_split['padded_reviews_4d'], y_test, verbose=0)
    print('Test loss:', test_loss)
    print('Test accuracy:', test_accuracy)

In [15]:
# Train the softmax-terminated model for 5 epochs with a batch size of 256;
# the test loss and accuracy are printed once training finishes.
train_model(model_with_softmax, batch_size=256, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.4313657436149583
Test accuracy: 0.8215266338486648


In [16]:
# Copy the trained weights into the pre-softmax model.
# NOTE(review): both models are built from the same `net` tensors, so they may
# already share their layers and weights; confirm whether this copy is needed.
model_without_softmax.set_weights(model_with_softmax.get_weights())

# Model Analysis and Visualization

At this stage, we have a trained model and are ready to explain it via **iNNvestigate**'s analyzers.

In [17]:
# Specify the methods that you would like to use to explain the model.
# Please refer to iNNvestigate's documentation for the available methods.
methods = [
    'gradient',
    'guided_backprop',
    'lrp.z',
    'lrp.alpha_2_beta_1',
    'pattern.net',
]

In [18]:
# build an analyzer for each method
# every analyzer is fitted on the training reviews with an extra axis inserted
# at position 1, i.e. (samples, 1, MAX_SEQ_LENGTH, embedding_dim)
analyzer_train_inputs = np.expand_dims(datasets['train']['padded_reviews'], axis=1)

analyzers = []
for method in methods:
    analyzer = innvestigate.create_analyzer(method, model_without_softmax)
    analyzer.fit(
        analyzer_train_inputs,
        pattern_type='relu',
        batch_size=256,
        verbose=1,
    )
    analyzers.append(analyzer)



Epoch 1/1


In [20]:
# specify indices of reviews that we want to investigate
test_sample_indices = [1718, 726, 908, 1523, 454, 539]
test_sample_preds = [None] * len(test_sample_indices)

# a variable to store analysis results:
# analysis[i, a, 0, t] is the score of token position t of the i-th selected
# review under the a-th analyzer.
analysis = np.zeros([len(test_sample_indices), len(analyzers), 1, MAX_SEQ_LENGTH])

test_split = datasets['test']
single_review_shape = (1, 1, MAX_SEQ_LENGTH, embedding_weights.shape[1])

for i, ridx in enumerate(test_sample_indices):
    x = test_split['padded_reviews'][ridx]
    y = test_split['targets'][ridx]

    t_start = time.time()
    # the models expect a batch axis and the extra singleton axis
    x = x.reshape(single_review_shape)

    presm = model_without_softmax.predict_on_batch(x)[0]  # forward pass without softmax
    prob = model_with_softmax.predict_on_batch(x)[0]  # forward pass with softmax
    y_hat = prob.argmax()
    test_sample_preds[i] = y_hat

    for aidx, analyzer in enumerate(analyzers):
        # drop singleton axes, then sum over axis 1 to get one score per token
        # (assumes the analyzer output has the same shape as `x` - TODO confirm)
        a = np.squeeze(analyzer.analyze(x)).sum(axis=1)
        analysis[i, aidx] = a

    t_elapsed = time.time() - t_start
    print('Review %d (%.4fs)' % (ridx, t_elapsed))

Review 1718 (1.3727s)
Review 726 (0.0041s)
Review 908 (0.0036s)
Review 1523 (0.0034s)
Review 454 (0.0033s)
Review 539 (0.0033s)


## Visualization

In this part, we will visualize the analysis results as heatmaps in which each word is highlighted with a different color depending on its relevance score. For example, if a word is shaded in red, it means that the word positively influences the outcome. On the other hand, being shaded in blue indicates a negative influence.

The implementation is drawn from [Leila's LRP_for_LSTM][arras_lstm]. In brief, the visualization is simply a construction of HTML DOMs with shaded backgrounds as described previously.

[arras_lstm]: https://github.com/ArrasL/LRP_for_LSTM

In [21]:
# code in this block taken from https://github.com/ArrasL/LRP_for_LSTM

def rescale_score_by_abs(score, max_score, min_score):
    """
    rescale positive score to the range [0.5, 1.0], negative score to the range [0.0, 0.5],
    using the extremal scores max_score and min_score for normalization

    Args:
        score: the value to rescale.
        max_score: largest score of the sequence `score` belongs to.
        min_score: smallest score of the sequence `score` belongs to.

    Returns:
        A float where 0.5 is the neutral midpoint. When every score is zero
        (max_score == min_score == 0) the neutral value 0.5 is returned;
        previously this case fell through and returned None.
    """

    # CASE 1: positive AND negative scores occur --------------------
    if max_score > 0 and min_score < 0:
        # normalize by whichever extreme has the larger magnitude, so the
        # deepest color goes to the strongest score regardless of its sign
        scale = max(max_score, abs(min_score))
        return 0.5 + 0.5 * (score / scale)

    # CASE 2: ONLY positive scores occur -----------------------------
    if max_score > 0:
        if max_score == min_score:
            return 1.0
        return 0.5 + 0.5 * (score / max_score)

    # CASE 3: ONLY negative scores occur -----------------------------
    if min_score < 0:
        if max_score == min_score:
            return 0.0
        return 0.5 - 0.5 * (score / min_score)

    # CASE 4: all scores are zero -> nothing to highlight ------------
    return 0.5
        
def getRGB(c_tuple):
    """Format the first three components of `c_tuple` as a '#rrggbb' string.

    Each component is multiplied by 255 and truncated to an integer before
    being written as two lowercase hex digits; further components (such as
    an alpha channel) are ignored.
    """
    red, green, blue = (int(c_tuple[channel] * 255) for channel in range(3))
    return "#{:02x}{:02x}{:02x}".format(red, green, blue)

def span_word(word, normalized_score, raw_score, colormap, highlight=False, attribute="background-color"):
    """Render one word as an HTML ``<span>`` colored by its score.

    Args:
        word: the text to show; it is HTML-escaped so that tokens containing
            ``<``, ``>``, ``&`` or quotes cannot break the generated markup.
        normalized_score: value passed to `colormap` to pick the color.
        raw_score: value shown in the span's ``title`` (tooltip) text.
        colormap: callable mapping `normalized_score` to a color tuple whose
            first three components are consumed by ``getRGB``.
        highlight: unused; kept for interface compatibility.
        attribute: CSS property that receives the color.

    Returns:
        The HTML string for the span, followed by a single space.
    """
    import html  # local import: this helper is the only place that needs it

    return "<span style=\"{attribute}: {color}; padding: 1px;\" title=\"relevance {rel}\">{word}</span> ".format(
        attribute=attribute,
        rel=raw_score,
        color=getRGB(colormap(normalized_score)),
        word=html.escape(str(word)),
    )

def html_heatmap(method, words, scores, cmap_name="bwr", short_version=True):
    """Build an HTML heatmap in which every word is shaded by its score.

    Args:
        method: name shown in the label when `short_version` is false.
        words: sequence of words to render.
        scores: one score per word; must have the same length as `words`.
        cmap_name: name of the matplotlib colormap used for shading.
        short_version: if true, prefix the output with a short marker instead
            of the ``Heatmap(<method>)`` label.

    Returns:
        An ``HTML`` object holding the prefix followed by one span per word.
    """
    colormap = plt.get_cmap(cmap_name)

    assert len(words) == len(scores)
    highest, lowest = max(scores), min(scores)

    # one colored span (plus a separating space) per word
    spans = [
        span_word(word, rescale_score_by_abs(raw, highest, lowest), raw, colormap) + " "
        for word, raw in zip(words, scores)
    ]

    if short_version:
        prefix = '<b>> </b>'
    else:
        prefix = '<b>Heatmap(%s):</b> ' % method

    return HTML(prefix + "".join(spans))

In [22]:
# Traverse over the analysis results and visualize them.
for i, idx in enumerate(test_sample_indices):

    words = [decode_dict[t] for t in list(datasets['test']['reviews'][idx])]

    print('------------')
    print('Review %d' % idx)
    print('Sentence : %s' % ' '.join(words))
    y_true = datasets['test']['targets'][idx]
    y_pred = test_sample_preds[i]

    print("Pred class : %s %s" % (LABEL_TO_CLASS_NAME[y_pred], '✓' if y_pred == y_true else '✗ (%s)' % LABEL_TO_CLASS_NAME[y_true]))

    # Scores exist only for the first MAX_SEQ_LENGTH tokens, because longer
    # reviews were truncated when the model input was built. Truncate the words
    # the same way so html_heatmap receives as many words as scores.
    shown_words = words[:MAX_SEQ_LENGTH]

    for j, method in enumerate(methods):
        h = html_heatmap(method, shown_words, analysis[i, j][0, :len(shown_words)], short_version=False)
        display(h)

------------
Review 1718
Sentence : this may not have the dramatic gut-wrenching impact of other holocaust films , but it 's a compelling story , mainly because of the way it 's told by the people who were there .
Pred class : positive ✓


------------
Review 726
Sentence : without heavy-handedness , dong provides perspective with his intelligent grasp of human foibles and contradictions .
Pred class : positive ✓


------------
Review 908
Sentence : a frantic search for laughs , with a hit-to-miss ratio that does n't exactly favour the audience .
Pred class : negative ✓


------------
Review 1523
Sentence : what you would end up with if you took orwell , bradbury , kafka , george lucas and the wachowski brothers and threw them into a blender .
Pred class : positive ✓


------------
Review 454
Sentence : a charming yet poignant tale of the irrevocable ties that bind .
Pred class : positive ✓


------------
Review 539
Sentence : but he loses his focus when he concentrates on any single person .
Pred class : negative ✓
