## Text Embedding Using Word2Vec

#### Load the Raw Data

In [1]:
import os
import pandas as pd
 
# Set the working directory for the project. The location can be overridden
# with the SEIS735_PROJECT_DIR environment variable so the notebook also runs
# on machines other than the original author's; the default is unchanged.
project_dir = os.environ.get('SEIS735_PROJECT_DIR',
                             'C://Users/Dane/Documents/GitHub/seis735_project/')
os.chdir(project_dir)

# Training variants
variants = pd.read_csv("data/raw/training_variants")

# Load the text data from file. Fields are separated by a literal "||", which
# must be given as a regex (hence the python engine). A raw string is used so
# that the backslashes reach the regex engine without triggering Python's
# invalid-escape-sequence warning. The first line of the file is skipped and
# the column names are supplied explicitly.
text = pd.read_csv("data/raw/training_text",
                   sep=r"\|\|",
                   header=None,
                   skiprows=1,
                   names=["ID", "Text"],
                   engine="python"
                  )

print(variants.shape)
print(text.shape)

(3321, 4)
(3321, 2)


#### Merge Variants and Text Files

In [2]:
# Inner-join the variants with their text on the shared ID column
merged = variants.merge(text, how="inner", on="ID")

# The two source frames are no longer needed once they have been combined
del text
del variants

# Report the dimensions and column types of the combined frame
print(merged.shape, merged.dtypes, sep="\n")

(3321, 5)
ID            int64
Gene         object
Variation    object
Class         int64
Text         object
dtype: object


In [3]:
# Preview the first five merged records
merged.head(n=5)

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


#### Split the Data into Train and Test

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the merged records as the test set; the fixed random_state
# keeps the partition repeatable between runs
train, test = train_test_split(
    merged,
    test_size=0.1,
    random_state=20171104,
)

# Report the size of each partition
print(train.shape, test.shape, sep="\n")

(2988, 5)
(333, 5)


#### Sentences into Tokens
Note: the vocabulary was defined and the texts were cleansed in an earlier step, so here we simply load the pre-cleansed texts.

In [5]:
import pickle

# Restore the pre-cleansed training texts from disk
with open('models/training_text.pickle', mode='rb') as train_file:
    texts_train = pickle.load(train_file)

# Restore the pre-cleansed test texts from disk
with open('models/test_text.pickle', mode='rb') as test_file:
    texts_test = pickle.load(test_file)

# Report how many documents were loaded; the training set should hold 2,988
print(len(texts_train), len(texts_test), sep="\n")

2988
333


Now we need to split the text into individual tokens.

In [6]:
# Break every document into a list of whitespace-delimited tokens
tokens_train, tokens_test = (
    [document.split() for document in corpus]
    for corpus in (texts_train, texts_test)
)

# The document counts must be unchanged by tokenisation
print(len(tokens_train), len(tokens_test), sep="\n")

2988
333


#### Use Gensim to Perform Text Embedding
First we train the word2vec model on our training data.

In [10]:
from gensim.models import Word2Vec

# Train our model on the training tokens (tokens_train). Words seen fewer than
# 10 times are dropped and each remaining word gets a 300-dimensional vector.
# A fixed seed (the same one used for the train/test split) makes the vector
# initialisation repeatable.
# NOTE(review): gensim documents that a fully deterministic run additionally
# needs workers=1 (and a fixed PYTHONHASHSEED); workers is left at its default
# here to keep training time unchanged.
model = Word2Vec(tokens_train, min_count=10, size=300, seed=20171104)

# Summarize the model
print(model)

# Save the model so later cells can reload it without retraining
model.save('models/word2vec_train.bin')

Word2Vec(vocab=23899, size=300, alpha=0.025)


Grab the vector weights from the trained embedding model. These weights will be used to create a Keras Embedding layer.

In [15]:
# Keep a reference to the raw weight matrix of the trained word2vec model.
# The visible cells below build `embedding_matrix` instead and do not read `weights`.
# NOTE(review): `syn0` is the legacy gensim attribute name; newer gensim
# releases expose the same array as `model.wv.vectors` - confirm against the
# installed gensim version.
weights = model.wv.syn0

Create an embedding matrix representation of the word2vec model.

In [50]:
import numpy as np

# Load our trained word2vec model
model = Word2Vec.load('models/word2vec_train.bin')

# Load the fitted tokenizer here so this cell does not depend on a later cell
# having been run first (on a fresh kernel `tokenizer` would be undefined).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('models/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Size of our vocabulary. One extra row is added because the tokenizer's word
# indices are used directly as row numbers and row 0 is kept as an all-zero
# vector (the value used for padding).
vocab_size = len(tokenizer.word_index) + 1

# Take the vector width from the trained model instead of hard-coding it
embedding_size = model.vector_size

# Convert the word vectors into a numpy matrix that is suitable for insertion
# into our TensorFlow and Keras models: row i holds the vector of the word the
# tokenizer maps to index i.
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, i in tokenizer.word_index.items():
    # Looking up an unknown word raises KeyError rather than returning None,
    # so test membership first; words missing from the word2vec vocabulary
    # (e.g. those filtered out by min_count) keep their all-zero row.
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]

print(embedding_matrix.shape)

(23900, 300)


Load our previously fitted tokenizer, which was trained during our bag-of-words processing. The embedding-matrix cell above also relies on this tokenizer.

In [18]:
# Restore the fitted tokenizer object from disk
with open('models/tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

Using TensorFlow backend.


Convert our text data into sequences, and pad the sequences.

In [36]:
# Map every document to its sequence of integer word indices
train_encoded, test_encoded = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (texts_train, texts_test)
)

In [85]:
from keras.preprocessing.sequence import pad_sequences

# Bring every encoded document to one fixed length. A fixed cap is used; an
# earlier attempt to derive the length from the longest training document was
# left disabled.
max_length = 20000

# Identical padding settings for both partitions: the padding is appended
# after the sequence content
pad_options = {'maxlen': max_length, 'padding': 'post'}
train_padded = pad_sequences(train_encoded, **pad_options)
test_padded = pad_sequences(test_encoded, **pad_options)

print(train_padded.shape)

(2988, 20000)


The final step before we start training our model is to one-hot encode the target classes (i.e. convert them into dummy variables).

In [52]:
# Convert the targets to matrix format.
# Use one shared, sorted list of class labels for both partitions so the
# one-hot columns always line up, even if a class happens to be missing from
# one of the splits.
class_labels = sorted(set(train['Class']).union(test['Class']))

# True class labels, taken from the Class column by name (selecting column 0
# by position would pick up the ID column instead).
y_train_true = train['Class'].values
# One-hot matrix with one column per entry of class_labels
y_train = pd.get_dummies(pd.Categorical(train['Class'], categories=class_labels),
                         prefix='y').values

y_test_true = test['Class'].values
y_test = pd.get_dummies(pd.Categorical(test['Class'], categories=class_labels),
                        prefix='y').values

print(y_train_true.shape)
print(y_train.shape)
print(y_test_true.shape)
print(y_test.shape)

(2988,)
(2988, 9)
(333,)
(333, 9)


#### Train a Sequential FF Neural Network

In [54]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

# Create the model architecture: a frozen, pre-trained embedding layer whose
# output is flattened and fed through one hidden dense layer into a 9-way
# softmax. The embedding width is read from the embedding matrix so it cannot
# drift out of sync with the word2vec model.
model = Sequential()
embedding_layer = Embedding(vocab_size,
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1000, activation='relu'))
model.add(Dense(9, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line).
model.summary()

# Early stopping: halt once val_loss has not improved by at least 0.001 for 10 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=True)

# Checkpoint - used to get the best weights during the model training process
checkpoint = ModelCheckpoint(filepath='models/best_weights.h5', monitor='val_loss', save_best_only=True)

# Fit the model
# NOTE(review): the held-out test set doubles as the validation set here, so
# early stopping and checkpoint selection see the test data - consider carving
# a separate validation split out of the training data.
model.fit(train_padded,
          y_train,
          validation_data=(test_padded, y_test),
          epochs=50,
          batch_size=32,
          callbacks=[early_stopping, checkpoint],
          verbose=2
         )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 6, 300)            7170000   
_________________________________________________________________
flatten_9 (Flatten)          (None, 1800)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 1000)              1801000   
_________________________________________________________________
dense_15 (Dense)             (None, 9)                 9009      
Total params: 8,980,009
Trainable params: 1,810,009
Non-trainable params: 7,170,000
_________________________________________________________________
None
Train on 2988 samples, validate on 333 samples
Epoch 1/50
 - 12s - loss: 2.2934 - acc: 0.4759 - val_loss: 1.7570 - val_acc: 0.5556
Epoch 2/50
 - 11s - loss: 1.0168 - acc: 0.7326 - val_loss: 1.4753 - val_acc: 0.5826
Epoch 3/50
 - 9s - loss: 0.7700 - ac

<keras.callbacks.History at 0x2135ae03470>

Performance of this model is poor. Can we get better performance from a convolutional neural network architecture?

In [None]:
from keras.layers import Input
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model

# Width of the word vectors, read from the embedding matrix instead of being
# hard-coded so it always matches the trained word2vec model
embedding_dim = embedding_matrix.shape[1]

# Initialize our embedding layer with the frozen, pre-trained word2vec weights
embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)

# Three Conv1D + MaxPooling1D stages over the embedded sequence, followed by a
# dense layer and a 9-way softmax
inputs = Input(shape=(max_length,))
embedding_seq = embedding(inputs)
x = Conv1D(128, 5, activation='relu')(embedding_seq)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(9, activation='softmax')(x)

model = Model(inputs, preds)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line).
model.summary()

# Early stopping: halt once val_acc has not improved by at least 0.001 for 10 epochs
early_stopping = EarlyStopping(monitor='val_acc', min_delta=0.001, patience=10, verbose=True)

# Checkpoint - used to get the best weights during the model training process
# NOTE(review): this writes to the same file as the feed-forward model's
# checkpoint, so enabling training below would overwrite those weights.
checkpoint = ModelCheckpoint(filepath='models/best_weights.h5', monitor='val_acc', save_best_only=True)

# Fit the model (training is currently disabled; uncomment to run)
#model.fit(train_padded, 
#          y_train, 
#          validation_data=(test_padded, y_test), 
#          epochs=50, 
#          batch_size=128, 
#          callbacks=[early_stopping, checkpoint], 
#          verbose=2
#         )