Meta-Embedding

This experiment assesses the feasibility of using meta-embeddings as word representations for clinical notes.
Reference Paper: https://arxiv.org/pdf/1804.05262.pdf

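- Meta-embeddings are built by averaging or concatenating different word2vec models; the cells below average a GoogleNews word2vec model with a word2vec model trained on clinical notes.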




In [20]:
import gensim
import numpy as np
import pandas as pd
from keras import regularizers
from keras.layers import Conv1D, MaxPooling1D, Embedding, Conv2D, Reshape, MaxPool2D, Concatenate
from keras.layers import Dense, Input, Flatten, Dropout, merge
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import gc

In [21]:
# Read the clinical-notes CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
notes_csv_path = r"C:\Users\61102\PSU-PhD\Holmusk\ClinNotes_Preprocess.csv"
medical_notes = pd.read_csv(notes_csv_path)

In [22]:
# Map the three specialty names in the 'category' column to integer codes.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `medical_notes['category']`: an in-place
# method on a column selection is chained assignment, which newer pandas
# versions warn about and which no longer updates the parent frame once
# Copy-on-Write is enabled.
category_codes = {'Cardiovascular / Pulmonary': 1, 'Neurology': 2, 'Gastroenterology': 3}
medical_notes['category'] = medical_notes['category'].replace(category_codes)

In [23]:
## split to train, test, val
from sklearn.model_selection import train_test_split

# 20% of the notes are held out for testing, then 10% of the remainder for
# validation; both splits use the same fixed random_state.
train_df, test_df = train_test_split(medical_notes, test_size=0.2, random_state=2018)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

# Take explicit copies before adding columns. Assigning a new column to the
# frames returned by train_test_split is what raised the
# "A value is trying to be set on a copy of a slice" warning.
train_df, test_df, val_df = train_df.copy(), test_df.copy(), val_df.copy()

# Regex word tokenizer, used only to build the vocabulary. It has its own name
# so it is not confused with the Keras Tokenizer created in a later cell.
word_tokenizer = RegexpTokenizer(r'\w+')
for split_df in (train_df, test_df, val_df):
    split_df["tokens"] = split_df["preprocess_notes"].apply(word_tokenizer.tokenize)

# Every token occurrence across the three splits, in train -> test -> val order.
all_words = [
    word
    for split_df in (train_df, test_df, val_df)
    for tokens in split_df["tokens"]
    for word in tokens
]
VOCAB = sorted(set(all_words))
print(len(VOCAB))


13617


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
# Number of columns in each embedding matrix (one row per word, EMBEDDING_DIM values per row).
EMBEDDING_DIM = 300
# Length every note's id sequence is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 500
# Count of distinct regex-tokenised words across train, test and val;
# passed to the Keras Tokenizer as num_words.
VOCAB_SIZE = len(VOCAB)

In [25]:
# Raw note texts per split, materialised once and reused for fitting and encoding.
train_texts = train_df["preprocess_notes"].tolist()
test_texts = test_df["preprocess_notes"].tolist()
val_texts = val_df["preprocess_notes"].tolist()

# The tokenizer's vocabulary is accumulated over all three splits, train first.
# NOTE(review): VOCAB_SIZE comes from the regex tokenizer (13617 in the recorded
# output) while this tokenizer reports 13626 unique tokens, so num_words may
# drop the few rarest words from the sequences -- confirm this is intended.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
for split_texts in (train_texts, test_texts, val_texts):
    tokenizer.fit_on_texts(split_texts)

# Convert each note into a list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 13626 unique tokens.


In [26]:
# Pad/truncate every id sequence to MAX_SEQUENCE_LENGTH so the splits form
# rectangular arrays.
train_X = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_X = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
val_X = pad_sequences(val_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Fit the label encoder ONCE, on the training labels, and reuse that mapping
# for test and validation. Re-fitting per split (fit_transform on each) gives
# every split its own class -> index mapping, so a split that happens to miss
# a class would be silently encoded with shifted indices.
labelencoder_Y = LabelEncoder()
labels_train = labelencoder_Y.fit_transform(train_df["category"])
labels_test = labelencoder_Y.transform(test_df["category"])
labels_val = labelencoder_Y.transform(val_df["category"])

# One-hot width follows the classes the encoder actually learned instead of a
# hard-coded 3.
num_classes = len(labelencoder_Y.classes_)
train_y = to_categorical(labels_train, num_classes=num_classes)
test_y = to_categorical(labels_test, num_classes=num_classes)
val_y = to_categorical(labels_val, num_classes=num_classes)

In [27]:
# First embedding source: the GoogleNews word2vec binary.
# NOTE(review): absolute, machine-specific path.
word2vec_path = r"C:\Users\61102\PSU-PhD\Holmusk\GoogleNews-vectors-negative300.bin"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, no_header=False,unicode_errors='ignore')

# One row per tokenizer index, plus one extra row; rows the loop never writes
# stay all-zero.
embedding_matrix_1 = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, index in word_index.items():
    if word in word2vec:
        # Known word: copy its pretrained vector.
        embedding_matrix_1[index, :] = word2vec[word]
    else:
        # Out-of-vocabulary word: uniform random vector in [0, 1).
        # NOTE(review): unseeded, so these rows differ between runs.
        embedding_matrix_1[index, :] = np.random.rand(EMBEDDING_DIM)

In [28]:
# Second embedding source: the ClinNotes word2vec binary.
# NOTE(review): absolute, machine-specific path.
word2vec_path = r"C:\Users\61102\PSU-PhD\Holmusk\ClinNotes_w2v.bin"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, no_header=False,unicode_errors='ignore')

# One row per tokenizer index, plus one extra row; rows the loop never writes
# stay all-zero.
embedding_matrix_2 = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, index in word_index.items():
    if word in word2vec:
        # Known word: copy its vector from the clinical-notes model.
        embedding_matrix_2[index, :] = word2vec[word]
    else:
        # Out-of-vocabulary word: uniform random vector in [0, 1).
        # NOTE(review): unseeded, so these rows differ between runs.
        embedding_matrix_2[index, :] = np.random.rand(EMBEDDING_DIM)

In [29]:
# Meta-embedding by averaging: stack the two matrices along a new leading axis
# and take the element-wise mean across it.
embedding_matrix = np.stack((embedding_matrix_1, embedding_matrix_2)).mean(axis=0)

# The per-source matrices are no longer needed; drop them and reclaim memory.
# (Re-running this cell alone will therefore fail until they are rebuilt.)
del embedding_matrix_1
del embedding_matrix_2
gc.collect()

embedding_matrix.shape

(13627, 300)

In [30]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build, compile and summarise a 1D-CNN text classifier over a pre-initialised embedding.

    Args:
        embeddings: initial weight matrix handed to the Embedding layer
            (expected to have num_words rows and embedding_dim columns).
        max_sequence_length: length of the integer id sequences fed to the model.
        num_words: number of rows (input dimension) of the Embedding layer.
        embedding_dim: number of columns (output dimension) of the Embedding layer.
        labels_index: number of units in the softmax output layer, i.e. the
            number of classes.
        trainable: whether the embedding weights are updated during training.
        extra_conv: if True, use three parallel convolutions with kernel sizes
            3, 4 and 5 (the multi-filter layout of Yoon Kim,
            https://arxiv.org/abs/1408.5882), each max-pooled and then
            concatenated along the sequence axis. If False, use a single
            kernel-size-3 convolution followed by max pooling.

    Returns:
        The compiled Keras Model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric). The model summary is printed as a side effect.
    """
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = Embedding(num_words,
                                   embedding_dim,
                                   weights=[embeddings],
                                   input_length=max_sequence_length,
                                   trainable=trainable)(sequence_input)

    # Only the selected feature extractor is constructed, so no unused layers
    # are created alongside the model.
    if extra_conv:
        # Multi-filter branch: one Conv1D + MaxPooling1D per kernel size.
        pooled_branches = []
        for filter_size in (3, 4, 5):
            branch = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            pooled_branches.append(MaxPooling1D(pool_size=3)(branch))
        # Branch lengths differ by kernel size, so they are joined along
        # axis 1 (the sequence axis), where only the channel count must match.
        features = Concatenate(axis=1)(pooled_branches)
    else:
        # Single-filter branch: one Conv1D with local (pool_size=3) max pooling.
        single_conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(single_conv)

    # Shared classification head.
    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    preds = Dense(labels_index, activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [35]:
# Output width is the number of distinct categories seen in the training split.
n_classes = len(train_df["category"].unique())
# The embedding layer needs one row per tokenizer index plus one extra row,
# matching the shape of embedding_matrix.
embedding_rows = len(word_index) + 1

# Fine-tune the averaged embeddings (trainable=True) and use the single
# kernel-size-3 convolution path (extra_conv=False).
model = ConvNet(
    embedding_matrix,
    MAX_SEQUENCE_LENGTH,
    embedding_rows,
    EMBEDDING_DIM,
    n_classes,
    trainable=True,
    extra_conv=False,
)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 500, 300)          4088100   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 498, 128)          115328    
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 166, 128)          0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 166, 128)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 21248)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               2719

In [36]:
# Train for 20 epochs, evaluating on the validation split after each epoch.
# NOTE(review): the recorded run trains on 588 samples, so batch_size=512
# yields only two batches per epoch -- confirm this is intended.
validation_pair = (val_X, val_y)
history = model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=20,
    validation_data=validation_pair,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 588 samples, validate on 66 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
# Class-probability predictions for the held-out test notes.
preds = model.predict(test_X)

# Collapse one-hot targets and probability rows to class indices once,
# then reuse them for every index-based metric.
y_true = np.argmax(test_y, axis=1)
y_pred = np.argmax(preds, axis=1)

print("Classification Report")
print(classification_report(y_true, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_true, y_pred))
print("Accuracy Score")
print(accuracy_score(y_true, y_pred, normalize=True))
# ROC-AUC is computed from the one-hot targets and raw probabilities (one-vs-rest).
print("ROC-AUC Score")
print(roc_auc_score(test_y, preds, multi_class='ovr'))

Classification Report
              precision    recall  f1-score   support

           0       0.51      1.00      0.67        83
           1       0.00      0.00      0.00        42
           2       0.00      0.00      0.00        39

    accuracy                           0.51       164
   macro avg       0.17      0.33      0.22       164
weighted avg       0.26      0.51      0.34       164

Confusion Matrix
[[83  0  0]
 [42  0  0]
 [39  0  0]]
Accuracy Score
0.5060975609756098
ROC-AUC Score
0.5502289652615547


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
