# Model estimation

In [178]:
import numpy as np
from random import shuffle
import sys,os, pickle, jellyfish
from collections import defaultdict
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils.np_utils import to_categorical
from keras import optimizers


# ------------------------ helper functions -------------------------- #

def get_parent_dir(directory):
    '''Returns the parent directory of the current one'''
    return os.path.dirname(directory)


# ------------------------ functions for processing the text ---------- #
def make_digit(word):
    '''Transforms string labels to digits'''
    if word == 'positive':
        return int(0)
    elif word == 'neutral':
        return int(1)
    elif word == 'negative':
        return int(2)
    else:
        '{} is neither positive, neutral or negative'.format(word)
        
def get_word_ids(sentence, vocabulary):
    '''Returns index in the vocabulary for each word of the sentence.'''
    words = text.text_to_word_sequence(sentence)
    return [vocabulary.index(x) if x in vocabulary else len(vocabulary) for x in words]

current_dirs_parent = get_parent_dir(os.getcwd())

### read in vocabulary and word-vector files

In [11]:
vocabulary = []
occurrences = {}

with open('/media/adam/Data/BioNLP/ri-3gram-400-tsv/vocab.tsv') as vocabulary_file:
    for line in vocabulary_file:
        word, occurrence = line.strip().split('\t')
        vocabulary.append(word)
        occurrences[word] = occurrence



print(len(vocabulary))


4929266


In [None]:
embeddings_index = {}
with open('/media/adam/Data/BioNLP/ri-3gram-400-tsv/vectors.tsv') as embedding_file:
    for i, line in enumerate(embedding_file):
        values = line.strip().split('\t')
        vector = np.asarray(values, dtype='float32')
        embeddings_index[vocabulary[i]] = vector

In [5]:
#pickle.dump(embeddings_index, open('/media/adam/Data/BioNLP/embeddings_index.pickle','wb'))
embeddings_index = pickle.load(open('/media/adam/Data/BioNLP/embeddings_index.pickle','rb'))

### Read-in compounds in order to add them to vocabulary

In [175]:
compounds = Counter()

       
with open(current_dirs_parent + '/data/labeled_dataAll.tsv', 'r') as labelled_sents: 
    for num, line in enumerate(labelled_sents):
        _,label,_,_,compound, sent = line.strip().split('\t')
        compounds[compound] += 1
        # here we could canonicalize


In [83]:
print(compounds.most_common(10))
len(compounds)

[('l-dopa', 16), ('styrene', 14), ('pectin', 14), ('glycyrrhizin', 13), ('sodium nitrite', 12), ('procyanidin', 12), ('vinegar', 12), ('lard', 12), ('genistein', 11), ('oncorhynchus', 11)]


886

### pre-trained embeddings for the compounds

If there is more than one alternative (alternative specified by JW distance > .95), than take weighted average of the word vectors, weighted by number of occurences of the compound in the corpus.

In [58]:
i = 0
compound_index = {}
for compound in compounds:
    # the word vectors are 400 long
    compound_embedding = np.zeros(400)
    num_alternatives = 0
    occurrence_total = 0
    for i, word in enumerate(vocabulary):
        # take weighted average (element-wise) of the word vectors, weighted by the occurrences of the compound in corpus
        if jellyfish.jaro_winkler(compound, word) > 0.95:
            compound_embedding += embeddings_index[word]*int(occurrences[word])
            num_alternatives += 1
            occurrence_total += int(occurrences[word])
            
    compound_index[compound] = compound_embedding/(num_alternatives*occurrence_total)


In [8]:
#pickle.dump(compound_index, open('/media/adam/Data/BioNLP/compound_index.pickle','wb'))
compound_index = pickle.load(open('/media/adam/Data/BioNLP/compound_index.pickle','rb'))

Should have paid more attention to zero division above, rewrite the vectors to 0s if they are np.nans 

In [9]:
temp_compounds = {}
for k,v in compound_index.items():
    if any(np.isnan(v)):
        continue
    temp_compounds[k] = v
len(temp_compounds)

693

Before finding alternative spellings, there were 581 compounds found of our 886, after JW distance alternatives, we have 693. As it doesn't take much time and improves visibility, in another step, let's take out those compounds which are in the top 20k of the vocabulary.

In [139]:
V = 20000
j = 0
final_compounds = {}
for compound, vector in temp_compounds.items():
    if compound not in vocabulary[:V]:
        final_compounds[compound] = vector
V_total = V + len(final_compounds) + 1

In [140]:
dim = 400

vocab_final = []
embedding_matrix = np.zeros((V, dim))
for i, word in enumerate(vocabulary[:V]):
    embedding_matrix[i] = embeddings_index[word]
    vocab_final.append(word)

    
final_compound_matrix = np.zeros((len(final_compounds), dim))
for i, item in enumerate(final_compounds.items()):
    compound, vector = item
    final_compound_matrix[i] = vector
    vocab_final.append(compound)

# stack top 20k words, compounds not found among the 20k words and a
# vector of zeros for the words not in the vocabulary    
embedding_matrix = np.vstack([embedding_matrix, final_compound_matrix, np.zeros(dim)])
assert embedding_matrix.shape == (V_total, dim)

### Prepare sentences for analysis
Most importantly, change the words in the sentences to indeces corresponding to the rows of the embedding matrix.

In [173]:
labels = defaultdict(list)
sent_classified = list()
sentences = defaultdict(list)
sent_label = list()
max_sentence_length = 0

with open(current_dirs_parent + '/data/labeled_dataAll.tsv', 'r') as labelled_sents: 
    for num, line in enumerate(labelled_sents):
        if num == 0: continue
        _,label,_,_,compound, sent = line.strip().split('\t')
        labels[num-1] = label.lower()
        sentences[num-1] = sent.lower()
        try:
            sent_label.append(make_digit(label.strip()))
            max_sentence_length = np.max([max_sentence_length, len(sent.split(' '))])
            sent_classified.append(get_word_ids(sent, vocab_final))
        except KeyError:
            pass

### Train-test split e.g., on 75%-25%, and we should cross validate but we have so few sentences, that i just wanted to check whether it works

In [207]:
ids = list(range(len(sentences)))
shuffle(ids)
P_TRAIN = .75
N_TRAIN = int(round(P_TRAIN * len(sent_label),0))
train_ids = ids[:N_TRAIN]
test_ids = ids[(N_TRAIN + 1):]
train_sents, train_labels = zip(*[(sent_classified[train_id], sent_label[train_id]) for train_id in train_ids])
test_sents, test_labels = zip(*[(sent_classified[test_id], sent_label[test_id]) for test_id in test_ids])

## Trainable parameters/settings

In [208]:
LSTM_UNITS = 300
NUM_EPOCHS = 5
optim = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

### Fit model

Here I pad the sentences to max sentence length but maybe we could truncate?

In [None]:
# add padding so that all of the sentences have the same length
train_sents = sequence.pad_sequences(train_sents, maxlen=max_sentence_length)
test_sents = sequence.pad_sequences(test_sents, maxlen=max_sentence_length)
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

# create the model
model = Sequential()
model.add(Embedding(V_total, dim, weights=[embedding_matrix],
                    input_length=max_sentence_length, trainable=False))
model.add(LSTM(LSTM_UNITS))
model.add(Dense(3, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
print(model.summary())

# Final evaluation of the model
model.fit(train_sents, train_labels, epochs=NUM_EPOCHS, batch_size=32)
scores = model.evaluate(test_sents, test_labels, verbose=0)
print("Test accuracy: {}".format(scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 665, 400)          8221600   
_________________________________________________________________
lstm_24 (LSTM)               (None, 300)               841200    
_________________________________________________________________
dense_24 (Dense)             (None, 3)                 903       
Total params: 9,063,703
Trainable params: 842,103
Non-trainable params: 8,221,600
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
 384/1853 [=====>........................] - ETA: 180s - loss: 0.8947 - acc: 0.6562