In [1]:
import keras
import pandas as pd
import numpy as np

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
x_train = np.load('x_train.npy')
x_test = np.load('x_test.npy')
test_val = np.load('test_val.npy')
y_test = np.load('y_test.npy')
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')

In [3]:
x_train.shape

(37065, 1011)

In [4]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.layers import BatchNormalization, InputSpec, add
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers, activations
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.utils import Sequence


In [5]:
class DotProdSelfAttention(Layer):
    """The self-attention layer as in 'Attention is all you need'.
    paper reference: https://arxiv.org/abs/1706.03762
    
    """
    def __init__(self, units,
                 activation=None,
                 use_bias=False,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(DotProdSelfAttention, self).__init__(*kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) == 3
        input_dim = input_shape[-1]
        # We assume the output-dim of Q, K, V are the same
        self.kernels = dict.fromkeys(['Q', 'K', 'V'])
        for key, _ in self.kernels.items():
            self.kernels[key] = self.add_weight(shape=(input_dim, self.units),
                                                initializer=self.kernel_initializer,
                                                name='kernel_{}'.format(key),
                                                regularizer=self.kernel_regularizer,
                                                constraint=self.kernel_constraint)
        if self.use_bias:
            raise NotImplementedError
        super(DotProdSelfAttention, self).build(input_shape)
        
    def call(self, x):
        Q = K.dot(x, self.kernels['Q'])
        K_mat = K.dot(x, self.kernels['K'])
        V = K.dot(x, self.kernels['V'])
        attention = K.batch_dot(Q, K.permute_dimensions(K_mat, [0, 2, 1]))
        d_k = K.constant(self.units, dtype=K.floatx())
        attention = attention / K.sqrt(d_k)
        attention = K.batch_dot(K.softmax(attention, axis=-1), V)
        return attention
    
    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

In [6]:
class squash_function(Layer):
    def squash(s, axis=-1, epsilon=1e-7, name=None):
        with tf.name_scope(name, default_name="squash"):
            squared_norm = tf.reduce_sum(tf.square(s), axis=axis,
                                         keep_dims=True)
            safe_norm = tf.sqrt(squared_norm + epsilon)
            squash_factor = squared_norm / (1. + squared_norm)
            unit_vector = s / safe_norm
            return squash_factor * unit_vector

In [7]:
def encoder(input_tensor):
    

    """One encoder as in Attention Is All You Need"""

    # Sub-layer 1
    # Multi-Head Attention
    multiheads = []
    d_v = embed_size // n_heads
    for i in range(n_heads):
        multiheads.append(DotProdSelfAttention(d_v)(input_tensor))
    multiheads = concatenate(multiheads, axis=-1)
    #multiheads = Conv1D(64, kernel_size=1, padding="valid", kernel_initializer="glorot_uniform")(multiheads)
    multiheads = Dense(embed_size)(multiheads)
    #multiheads = Dropout(0.1)(multiheads)
    
    # Residual Connection
    res_con = add([input_tensor, multiheads])
    # Didn't use layer normalization, use Batch Normalization instead here
    res_con = squash_function()(res_con)
    
    # Sub-layer 2
    # 2 Feed forward layer
    ff1 = Dense(32, activation='relu')(res_con)
    ff2 = Dense(embed_size)(ff1)
    output = add([res_con, ff2])
    output = squash_function()(output)
    
    return output

In [8]:
import tensorflow as tf
from keras.layers import Dense, Input, Embedding, Lambda, Dropout, Activation, SpatialDropout1D, Reshape, GlobalAveragePooling1D, merge, Flatten, Bidirectional, CuDNNGRU, add, Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K

class AttentionWeightedAverage(Layer):

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

In [9]:
def Hamming_loss(y_true, y_pred):
    tmp = K.abs(y_true-y_pred)
    return K.mean(K.cast(K.greater(tmp,0.5),dtype=float))

In [10]:
filter_length = 128
num_classes = 20
max_words = 111396
maxlen = 1011
embed_size = 20
n_heads = 8
from sklearn.metrics import hamming_loss as hamming
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint


input1 = Input(shape=(maxlen,))
x = Embedding(max_words, 20, input_length=maxlen)(input1)

#x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp)
# Add positional encoding
#x = AddPositionalEncoding()(x)
#x = Dropout(0.1)(x)
for i in range(1):
    x = encoder(x)
# These are my own experiments
x = Conv1D(128, kernel_size=2, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
#conc = AttentionWeightedAverage()(x)
conc = concatenate([avg_pool, max_pool])
#conc = concatenate([conc,attn])
conc = squash_function()(conc)
conc = Dense(32, activation="relu")(conc)
#conc = Dropout(0.1)(conc)

preds = Dense(num_classes, activation="sigmoid")(conc)
 
model = keras.Model(input1, preds)
 
model.summary()
 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Hamming_loss])

# import tensorflow as tf
# gpu_options = tf.GPUOptions(allow_growth=True)
# session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
# model.fit(x_train, y_train, batch_size=8, epochs=4)




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1011)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1011, 20)     2227920     input_1[0][0]                    
__________________________________________________________________________________________________
dot_prod_self_attention_1 (DotP (None, 1011, 2)      120         embedding_1[0][0]                
__________________________________________________________________________________________________
dot_prod_self_attention_2 (DotP (None, 1011, 2)      120         embedding_1[0][0]                
__________________________________________________________________________________________________
dot_pro

In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

def _train_model(model, batch_size, train_x, train_y, test_val, y_val):
    num_labels = train_y.shape[1]
    patience = 7
    best_loss = -1
    best_weights = None
    best_epoch = 0
    
    current_epoch = 0
    
    while True:
        model.fit(x_train, y_train, batch_size=batch_size, epochs=1)
        y_pred = model.predict(test_val, batch_size=batch_size)

        total_loss = 0
        for j in range(num_labels):
            loss = log_loss(y_val[:, j], y_pred[:, j])
            total_loss += loss

        total_loss /= num_labels

        print("Epoch {0} loss {1} best_loss {2}".format(current_epoch, total_loss, best_loss))

        current_epoch += 1
        if total_loss < best_loss or best_loss == -1:
            best_loss = total_loss
            best_weights = model.get_weights()
            best_epoch = current_epoch
        else:
            if current_epoch - best_epoch == patience:
                break

    model.set_weights(best_weights)
    return model

In [12]:
import tensorflow as tf
gpu_options = tf.GPUOptions(allow_growth=True)
session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

batch_size = 8
history = _train_model(model,batch_size,x_train,y_train,test_val,y_val)


Epoch 1/1
Epoch 0 loss 0.38578275379417015 best_loss -1
Epoch 1/1
Epoch 1 loss 0.3731546902232906 best_loss 0.38578275379417015
Epoch 1/1
Epoch 2 loss 0.3706308835158708 best_loss 0.3731546902232906
Epoch 1/1
Epoch 3 loss 0.3721834573695413 best_loss 0.3706308835158708
Epoch 1/1
Epoch 4 loss 0.37841574437145165 best_loss 0.3706308835158708
Epoch 1/1
Epoch 5 loss 0.3865556740172814 best_loss 0.3706308835158708
Epoch 1/1
Epoch 6 loss nan best_loss 0.3706308835158708
Epoch 1/1
   16/37065 [..............................] - ETA: 5:17 - loss: 0.2466 - Hamming_loss: 0.0875

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


Epoch 7 loss nan best_loss 0.3706308835158708
Epoch 1/1
   16/37065 [..............................] - ETA: 5:02 - loss: 0.2107 - Hamming_loss: 0.0688

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


Epoch 8 loss nan best_loss 0.3706308835158708
Epoch 1/1
   16/37065 [..............................] - ETA: 5:38 - loss: 0.2631 - Hamming_loss: 0.1250

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)




  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


Epoch 9 loss nan best_loss 0.3706308835158708


In [13]:
y_pred1 = history.predict(x_test, batch_size=batch_size)
y_pred2 = history.predict(test_val, batch_size=batch_size)
y_pred1=pd.DataFrame(y_pred1)
y_pred2=pd.DataFrame(y_pred2)
y_pred1.to_pickle("y_pred_transformer_approach3_test.pkl")
y_pred2.to_pickle("y_pred_transformer_approach3_val.pkl")

In [15]:
y_pred_round = np.where(y_pred1>0.5,1,0)
y_pred_round1 = np.where(y_pred2>0.5,1,0)

In [16]:
from sklearn.metrics import jaccard_score,roc_auc_score,confusion_matrix,hamming_loss
print(jaccard_score(y_test,y_pred_round,average='micro'))
print(jaccard_score(y_test,y_pred_round,average='macro'))
print(roc_auc_score(y_test,y_pred_round))
print(hamming_loss(y_test,y_pred_round))

0.5163860440895678
0.3997940254773026
0.6778189264605493
0.17036199095022625


In [None]:
from keras.models import load_model
model.save('transformer_approach6.h5')

In [None]:
df = pd.read_pickle("./str+rad_text-combined.pkl")
df= df.drop(["SUBJECT_ID","text","HADM_ID"],axis=1)
x = df.iloc[:,:483]
y = df.iloc[:,483:]
from sklearn.model_selection import train_test_split
#train,test,val 
seed = 40
x_tra,x_test,y_tra,y_test = train_test_split(x,y,test_size=0.15,random_state=seed)
x_train,x_val,y_train,y_val = train_test_split(x_tra,y_tra,test_size = 0.2,random_state=seed)


%run helper_functions.ipynb

x_train_txt = np.load('x_train.npy') 
x_test_txt = np.load('x_test.npy')
test_val_txt = np.load('test_val.npy') 
y_test_txt = np.load('y_test.npy')
y_train_txt = np.load('y_train.npy')
y_val_txt = np.load('y_val.npy')
batch_size1 = 64

from keras.models import load_model
import tensorflow as tf
gpu_options = tf.GPUOptions(allow_growth=True)
session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
conv1 = load_model('conv1.h5',custom_objects={'squash_function': squash_function,
                                              'AttentionWeightedAverage':AttentionWeightedAverage,
                                              'Hamming_loss':Hamming_loss
                                             })

conv3 = load_model('conv3.h5',custom_objects={'squash_function': squash_function,
                                              'AttentionWeightedAverage':AttentionWeightedAverage,
                                              'Hamming_loss':Hamming_loss
                                             })

y_pred_conv3 = conv3.predict(test_val_txt, batch_size=batch_size1)

y_pred_conv1 = conv1.predict(test_val_txt, batch_size=batch_size1)


print(y_pred_conv3.shape,y_pred.shape,y_pred_lgb.shape)

# Structured models

from sklearn.externals import joblib
cat = joblib.load("/home/akshara/MIMIC-Project/Trained Models/CatBoostC.pkl")
lgb = joblib.load("/home/akshara/MIMIC-Project/try1/LGBM_40.pkl")

y_pred_cat = cat.predict(x_val)
y_pred_lgb = lgb.predict(x_val)

def corr(a, b):
    counts = 0
    for i in range(a.shape[0]):  
      # correlation
        print("Finding correlation row : {}".format(i))
      #print("Column to be measured: {}".format(prediction))
        cor = np.corrcoef(a[i,:],b[i,:])
        print("Pearson's correlation score: {}".format(cor))
        if(cor[0][1] < 0.8):  
            counts+=1
    return counts


#l = [corr(y_pred_conv3, y_pred_conv1),corr(y_pred_conv3, y_pred_cat),corr(y_pred_conv1, y_pred_cat),corr(y_pred_conv3, y_pred_lgb),corr(y_pred_conv1, y_pred_lgb)] 
l = [corr(y_pred_conv3, y_pred),corr(y_pred, y_pred_lgb),corr(y_pred_conv1, y_pred)] 

l

y_pred1 =0.6*y_pred+0.3*y_pred_lgb+0.1*y_pred_conv3
y_pred_round = np.where(y_pred1>0.5,1,0)

#y_pred_round = np.where(y_pred>0.5,1,0)
from sklearn.metrics import jaccard_score,roc_auc_score,confusion_matrix,hamming_loss
print(jaccard_score(y_val,y_pred_round,average='micro'))
print(jaccard_score(y_val,y_pred_round,average='macro'))
print(roc_auc_score(y_val,y_pred_round))
print(hamming_loss(y_val,y_pred_round))

