In [1]:
%matplotlib inline
from netCDF4 import Dataset as ncread
import numpy as np
from scipy.io import loadmat
import pandas as pd
import h5py
import math

In [2]:
import matplotlib.pyplot as plt
from matplotlib import rcParams #For changing text properties
import cmocean #A package with beautiful colormaps
from cartopy import crs as ccrs #Useful for plotting maps
import cartopy.util #Requires separate import
from cartopy.util import add_cyclic_point
import cartopy.feature as cf
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
import matplotlib.path as mpath

In [3]:
import keras

Using TensorFlow backend.


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.utils import shuffle

In [5]:
import keras
from keras import layers
from keras import Input
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras import regularizers

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, LSTM, Dense, Dropout, Embedding, Bidirectional, GRU
from keras.optimizers import Adam
from keras import initializers, regularizers
from keras import optimizers
from keras.engine.topology import Layer
from keras import constraints

## LSTM with attention layer

In [6]:
# source: https://github.com/gentaiscool/lstm-attention/blob/58adc7e345b5b3a79638483049704802a66aa1f4/layers.py#L50
def dot_product(x, kernel):
    """
    Wrapper for the dot product operation, in order to be compatible with both
    Theano and Tensorflow.

    Args:
        x (): input
        kernel (): weights
    Returns:
        On the TensorFlow backend: `K.dot` of `x` with `kernel` after `kernel`
        has been given an extra trailing axis, with that trailing axis squeezed
        out of the result again. On any other backend: `K.dot(x, kernel)`.
    """
    if K.backend() == 'tensorflow':
        # Add a trailing axis to the kernel for K.dot, then drop it from the result.
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    return K.dot(x, kernel)
        
class AttentionWithContext(Layer):
    r"""
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows these equations:

    (1) u_t = tanh(W h_t + b)
    (2) \alpha_t = \frac{exp(u_t^T u)}{\sum_s(exp(u_s^T u))}, this is the attention weight
    (3) v_t = \alpha_t * h_t, v in time t

    The weighted steps are NOT summed here; the output keeps the time axis.

    # Arguments
        W_regularizer, u_regularizer, b_regularizer: regularizers (or their
            identifiers) for the weight matrix W, the context vector u and
            the bias b.
        W_constraint, u_constraint, b_constraint: constraints (or their
            identifiers) for W, u and b.
        bias: whether to add the bias b inside the tanh.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        3D tensor with shape: `(samples, steps, features)`.
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base constructor first: Keras' Layer.__init__ assigns
        # `supports_masking` itself, so setting the flag before this call
        # would be overwritten.
        super(AttentionWithContext, self).__init__(**kwargs)
        self.supports_masking = True

        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create W (features x features), optional b (features,) and u (features,)."""
        assert len(input_shape) == 3

        n_features = input_shape[-1]

        # `shape` is passed by keyword so the call does not depend on the
        # positional argument order of add_weight.
        self.W = self.add_weight(shape=(n_features, n_features,),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(n_features,),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(n_features,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return `x` with every time step scaled by its attention weight."""
        # (1) u_t = tanh(W h_t + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)

        # (2) unnormalised attention scores exp(u_t^T u)
        ait = dot_product(uit, self.u)
        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # In some cases, especially in the early stages of training, the sum may
        # be almost zero and this results in NaN's; a small epsilon is added to
        # the denominator as the workaround.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (3) v_t = alpha_t * h_t; add a trailing axis so the weights
        # broadcast over the feature axis of x.
        a = K.expand_dims(a)
        weighted_input = x * a

        return weighted_input

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[1], input_shape[2]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
class Addition(Layer):
    r"""
    Sums its input over the time axis (axis 1).
    It is kept separate from AttentionWithContext so that the per-step
    attention-weighted activations stay available as that layer's output.
    Follows this equation:
    (1) v = \sum_t(\alpha_t * h_t)

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    """

    def __init__(self, **kwargs):
        super(Addition, self).__init__(**kwargs)

    def build(self, input_shape):
        # Remember the feature size; this layer has no weights of its own.
        self.output_dim = input_shape[-1]
        super(Addition, self).build(input_shape)

    def call(self, x):
        # Collapse the steps axis: (samples, steps, features) -> (samples, features).
        return K.sum(x, axis=1)

    def compute_output_shape(self, input_shape):
        # Read the feature size from the argument so the result does not
        # depend on build() having run first.
        return (input_shape[0], input_shape[-1])

In [9]:
# LSTM with attention layer
mlen = 40    # number of time steps per sample (first axis of the model input)
mdim = 2000  # number of features per time step (second axis of the model input)
input_tensor = Input(shape=(mlen,mdim))
# NOTE(review): the first LSTM uses an L2 factor of 25 while the second uses 0.01 — confirm the 25 is intentional.
layer1 = layers.LSTM(10, return_sequences=True, kernel_regularizer=regularizers.l2(25))(input_tensor)
layer1 = layers.LSTM(10, return_sequences=True, kernel_regularizer=regularizers.l2(0.01))(layer1)
# Attention re-weights every time step; Addition then sums over the steps.
layer1 = AttentionWithContext()(layer1)
layer1 = Addition()(layer1)
layer1 = layers.Dense(10, activation="relu")(layer1)
output_tensor = layers.Dense(2,activation='softmax')(layer1)

# NOTE(review): absolute, machine-specific path — consider making it configurable.
callbacks_path = '/net/cfc/s2s/zhengwu/code/tmp/checkpoint_test'
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath=callbacks_path,
        monitor='val_acc',
        save_best_only=True,
    )
]
model = Model(input_tensor, output_tensor)
# The metric is requested as 'acc' so that it is logged as 'acc'/'val_acc',
# matching the `monitor='val_acc'` of the checkpoint above. Newer Keras
# releases log metrics under the exact string passed here, so 'accuracy'
# would be logged as 'val_accuracy' and the checkpoint would not find it.
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 40, 2000)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 40, 10)            80440     
_________________________________________________________________
lstm_4 (LSTM)                (None, 40, 10)            840       
_________________________________________________________________
attention_with_context_2 (At (None, 40, 10)            120       
_________________________________________________________________
addition_2 (Addition)        (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 22        
Total para

In [None]:
# Train the model and keep the returned history object.
# NOTE(review): X_train, Y_train, X_validation, Y_validation, epochs and
# batch_size are not defined in the cells shown here — they must already
# exist in the kernel before this cell runs.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks_list,
    validation_data=(X_validation, Y_validation),
)

In [None]:
# output attention weights
# model.layers is [input, LSTM, LSTM, AttentionWithContext, ...], so index 3 is
# the attention layer and index 2 is the LSTM feeding it.
# get_weights() returns [W, b, u], the order in which build() creates them
# (the layer is constructed with its default bias=True).
layer_weights = model.layers[3].get_weights()

# extract layer output before the attention layer
new_model = Model(inputs=model.input, outputs=model.layers[2].output)
output_before_att = new_model.predict(X_validation)
print(type(output_before_att),np.shape(output_before_att))

# weights from Yang et al. (2016) HAN
uit = np.tanh(np.dot(output_before_att, layer_weights[0]) + layer_weights[1])
eij = np.dot(uit, layer_weights[2])
print(uit.shape,eij.shape)
eij = eij.reshape((eij.shape[0], eij.shape[1]))
print(eij.shape)
ai = np.exp(eij)
# Normalise every sample's scores over the time axis so each row sums to 1.
weights = ai / np.sum(ai, axis=1, keepdims=True)
print(ai.shape,weights.shape)

In [None]:
# Per-sample weights scaled so that each row's maximum is 1.
# NOTE(review): weights_norm is not used by the plots below — the first figure
# draws `weights * 30` instead; confirm which one was intended.
weights_norm = weights/np.nanmax(weights, axis=(-1))[:,np.newaxis]

# Mean attention weight per time step, averaged over all samples.
mean_weights = np.mean(weights,axis=0)

# Figure 1: one line per sample (scaled by 30, as in the original cell).
fig, axs = plt.subplots(1, figsize=plt.figaspect(0.15))
for ii in range(len(weights)):
    axs.plot(weights[ii,:]*30)
axs.set_xlabel('time step')
axs.set_ylabel('attention weight x 30')

# Figure 2: mean attention weight per time step.
fig, axs = plt.subplots(1, figsize=plt.figaspect(0.15))
axs.plot(mean_weights)
axs.set_xlabel('time step')
axs.set_ylabel('mean attention weight')

# Figure 3: the same mean curve divided by its maximum.
fig, axs = plt.subplots(1, figsize=plt.figaspect(0.15))
axs.plot(mean_weights/np.nanmax(mean_weights),'k',linewidth=2)
axs.set_xlabel('time step')
axs.set_ylabel('mean attention weight / max');