# Model estimation

In [1]:
import numpy as np
from random import shuffle
import sys,os, pickle, jellyfish
from collections import defaultdict
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils.np_utils import to_categorical
from keras import optimizers


# ------------------------ helper functions -------------------------- #

def get_parent_dir(directory):
    '''Returns the parent directory of the current one'''
    return os.path.dirname(directory)


# ------------------------ functions for processing the text ---------- #
def make_digit(word):
    '''Transforms string labels to digits'''
    if word == 'positive':
        return int(0)
    elif word == 'neutral':
        return int(1)
    elif word == 'negative':
        return int(2)
    else:
        '{} is neither positive, neutral or negative'.format(word)
        
        
def get_word_ids(sentence, vocabulary):
    '''Returns index in the vocabulary for each word of the sentence.'''
    words = text.text_to_word_sequence(sentence)
    return [vocabulary.index(x) if x in vocabulary else len(vocabulary) for x in words]

current_dirs_parent = get_parent_dir(os.getcwd())

Using TensorFlow backend.


### read in vocabulary and word-vector files

In [3]:
vocabulary = []
occurrences = {}

with open('/Users/lisabarcelo/Desktop/W266/food_drug_interaction/BioNLP/ri-3gram-400-tsv/vocab.tsv') as vocabulary_file:
    for line in vocabulary_file:
        word, occurrence = line.strip().split('\t')
        vocabulary.append(word)
        occurrences[word] = occurrence

print(len(vocabulary))

4929266


In [4]:
embeddings_index = {}
with open('/Users/lisabarcelo/Desktop/W266/food_drug_interaction/BioNLP/ri-3gram-400-tsv/vectors.tsv') as embedding_file:
    for i, line in enumerate(embedding_file):
        values = line.strip().split('\t')
        vector = np.asarray(values, dtype='float32')
        embeddings_index[vocabulary[i]] = vector

In [8]:
#pickle.dump(embeddings_index, open('/Users/lisabarcelo/Desktop/W266/food_drug_interaction/BioNLP/embeddings_index.pickle','wb'))
#embeddings_index = pickle.load(open('/Users/lisabarcelo/Desktop/W266/food_drug_interaction/BioNLP/embeddings_index.pickle','rb'))

### Read-in compounds in order to add them to vocabulary

In [14]:
compounds = Counter()
   
with open(current_dirs_parent + '/data/labeled_dataAll.tsv', 'r') as labelled_sents: 
    for num, line in enumerate(labelled_sents):
        _,label,_,_,compound, sent = line.strip().split('\t')
        compounds[compound] += 1
        # here we could canonicalize

In [15]:
print(compounds.most_common(10))
len(compounds)

[('pectin', 17), ('l-dopa', 16), ('styrene', 15), ('genistein', 14), ('glycyrrhizin', 14), ('potato', 13), ('rainbow trout', 13), ('procyanidin', 12), ('lard', 12), ('thiamine', 12)]


983

### pre-trained embeddings for the compounds

If there is more than one alternative (alternative specified by JW distance > .95), than take weighted average of the word vectors, weighted by number of occurences of the compound in the corpus.

In [16]:
i = 0
compound_index = {}
for compound in compounds:
    # the word vectors are 400 long
    compound_embedding = np.zeros(400)
    num_alternatives = 0
    occurrence_total = 0
    for i, word in enumerate(vocabulary):
        # take weighted average (element-wise) of the word vectors, weighted by the occurrences of the compound in corpus
        if jellyfish.jaro_winkler(compound, word) > 0.95:
            compound_embedding += embeddings_index[word]*int(occurrences[word])
            num_alternatives += 1
            occurrence_total += int(occurrences[word])
            
    compound_index[compound] = compound_embedding/(num_alternatives*occurrence_total)

In [8]:
#pickle.dump(compound_index, open('/media/adam/Data/BioNLP/compound_index.pickle','wb'))
#compound_index = pickle.load(open('/media/adam/Data/BioNLP/compound_index.pickle','rb'))

Should have paid more attention to zero division above, rewrite the vectors to 0s if they are np.nans 

In [17]:
temp_compounds = {}
for k,v in compound_index.items():
    if any(np.isnan(v)):
        continue
    temp_compounds[k] = v
len(temp_compounds)

780

Before finding alternative spellings, there were 581 compounds found of our 886, after JW distance alternatives, we have 693. As it doesn't take much time and improves visibility, in another step, let's take out those compounds which are in the top 20k of the vocabulary.

In [18]:
#Tried 30k words!
V = 30000
j = 0
final_compounds = {}
for compound, vector in temp_compounds.items():
    if compound not in vocabulary[:V]:
        final_compounds[compound] = vector
# top 20k words, remaining compounds not in the top 20k, and one extra line for the unknown words        
V_total = V + len(final_compounds) + 1

### Prepare embedding matrix

In [20]:
dim = 400

vocab_final = []
embedding_matrix = np.zeros((V, dim))
for i, word in enumerate(vocabulary[:V]):
    embedding_matrix[i] = embeddings_index[word]
    vocab_final.append(word)

    
final_compound_matrix = np.zeros((len(final_compounds), dim))
for i, item in enumerate(final_compounds.items()):
    compound, vector = item
    final_compound_matrix[i] = vector
    vocab_final.append(compound)

# stack top 20k words, compounds not found among the 20k words and a
# vector of zeros for the words not in the vocabulary    
embedding_matrix = np.vstack([embedding_matrix, final_compound_matrix, np.zeros(dim)])
assert embedding_matrix.shape == (V_total, dim)

In [21]:
'honey' in temp_compounds.keys()

True

### Prepare sentences for analysis
Most importantly, change the words in the sentences to indeces corresponding to the rows of the embedding matrix.

In [22]:
labels = defaultdict(list)
sent_classified = list()
sentences = defaultdict(list)
sent_label = list()
max_sentence_length = 0

with open(current_dirs_parent + '/data/labeled_dataAll.tsv', 'r') as labelled_sents: 
    for num, line in enumerate(labelled_sents):
        if num == 0: continue
        _,label,_,_,compound, sent = line.strip().split('\t')
        # include only those compounds that are found in the corpus
        if compound in temp_compounds.keys():
            labels[num-1] = label.lower()
            sentences[num-1] = sent.lower()
            try:
                sent_label.append(make_digit(label.strip()))
                max_sentence_length = np.max([max_sentence_length, len(sent.split(' '))])
                sent_classified.append(get_word_ids(sent, vocab_final))
            except KeyError:
                pass

## Trainable parameters/settings

In [23]:
LSTM_UNITS = 300
NUM_EPOCHS = 100
optim = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# change vocabulary size? maybe 20k is too much?

### Fit model

Here I pad the sentences to max sentence length but maybe we could truncate?

In [24]:
ids = list(range(len(sentences)))
shuffle(ids)
P_TRAIN = .75
N_TRAIN = int(round(P_TRAIN * len(sent_label),0))
train_ids = ids[:N_TRAIN]
test_ids = ids[(N_TRAIN + 1):]
train_sents, train_labels = zip(*[(sent_classified[train_id], sent_label[train_id]) for train_id in train_ids])
test_sents, test_labels = zip(*[(sent_classified[test_id], sent_label[test_id]) for test_id in test_ids])

# add padding so that all of the sentences have the same length
train_sents = sequence.pad_sequences(train_sents, maxlen=max_sentence_length)
test_sents = sequence.pad_sequences(test_sents, maxlen=max_sentence_length)
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

# create the model
model = Sequential()
model.add(Embedding(V_total, dim, weights=[embedding_matrix],
                    input_length=max_sentence_length, trainable=False))
model.add(Dropout(0.5))
model.add(LSTM(LSTM_UNITS))
model.add(Dense(3, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['categorical_accuracy'])
print(model.summary())

# Final evaluation of the model
model.fit(train_sents, train_labels, epochs=NUM_EPOCHS, batch_size=32)
scores = model.evaluate(test_sents, test_labels, verbose=0)
print("Test accuracy: {}".format(scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 665, 400)          12222400  
_________________________________________________________________
dropout_1 (Dropout)          (None, 665, 400)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               841200    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 903       
Total params: 13,064,503
Trainable params: 842,103
Non-trainable params: 12,222,400
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [25]:
training_prediction = model.predict(train_sents)

In [26]:
pred_classes = np.argmax(training_prediction, axis = 1)
p = 0
neu = 0
neg = 0
for i in pred_classes:
    if i == 0:
        p += 1
    elif i == 1:
        neu += 1
    else:
        neg += 1

In [27]:
[p,neu,neg]

[300, 1083, 158]

In [28]:
true_classes = np.argmax(train_labels, axis=1)
p = 0
neu = 0
neg = 0
for i in true_classes:
    if i == 0:
        p += 1
    elif i == 1:
        neu += 1
    else:
        neg += 1

In [29]:
[p, neu, neg]

[333, 980, 228]

In [30]:
with open('pred_true_30k.csv','w') as outfile:
    for i in range(len(true_classes)):
        if i == 0:
            print('true,predicted', file=outfile)
        print('{0},{1}'.format(true_classes[i],pred_classes[i]), file=outfile)

In [61]:
correct = 0
incorrect = 0
total = 0
for item in open('pred_true_30k.csv','r'):
    total += 1
    true, predicted = item.split(",")
    true = str(true[0])
    predicted = str(predicted[0])
    if true == predicted:
        correct += 1 
    else:
        #print (true, predicted, true==predicted)
        incorrect += 1
    
print (correct/total)

0.7769130998702983
