# Train Pubmed Embeddings with Skipgram

In [6]:
#https://github.com/nzw0301/keras-examples/blob/master/Skip-gram-with-NS.ipynb
import numpy as np
import glob, re, os, json, sys, codecs, pickle
import nltk

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import np_utils
#from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
import keras.preprocessing.text
from keras.preprocessing.sequence import skipgrams
from collections import Counter





# ------------------ Word processing functions -------------------------- #
def canonicalize_digits(word):
    """Replace the digits of a non-alphabetic token with the placeholder ``DG``.

    Tokens containing at least one alphabetic character are returned
    unchanged. Otherwise every digit becomes ``DG`` and, when the result
    starts with ``DG``, commas (thousands separators) are removed.

    Args:
        word: Token to canonicalize.

    Returns:
        The canonicalized token.
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string so that the backslash reaches the regex engine untouched.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lowercase a token and, optionally, restrict it to a known vocabulary.

    Args:
        word: Token to canonicalize.
        wordset: Optional collection of known words. When given, a token
            that is not in it is replaced by ``"<unk>"``.
        digits: When true, a token that is not already in ``wordset`` is
            passed through ``canonicalize_digits`` before the vocabulary
            check.

    Returns:
        The canonical token, or ``"<unk>"`` when ``wordset`` is given and
        the token is not in it.
    """
    word = word.lower()
    if digits:
        # Known words are returned as-is, before any digit rewriting.
        if wordset is not None and word in wordset:
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers
    if wordset is None or word in wordset:
        return word
    return "<unk>"  # unknown token

def canonicalize_words(words, **kw):
    """Canonicalize each token in ``words``.

    Keyword arguments are forwarded unchanged to ``canonicalize_word``.
    Returns a new list with one canonical token per input token.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

def flat_map(sentences):
    """Flatten one level of nesting: an iterable of iterables becomes one list."""
    flattened = []
    for group in sentences:
        flattened.extend(group)
    return flattened



# ------------------------ convenience functions ----------------------- #

def abstract_loader(name):
    """Read a UTF-8 JSON file and return the values of its top-level object.

    Args:
        name: Path of the JSON file to read.

    Returns:
        The ``values()`` view of the decoded JSON object.
    """
    with codecs.open(name, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file).values()

def get_parent_dir(directory):
    """Return the parent directory of ``directory``.

    Args:
        directory: Path whose parent is wanted.

    Returns:
        The result of ``os.path.dirname(directory)``.
    """
    # Relies on the notebook-level ``import os``; no local import is needed.
    return os.path.dirname(directory)

# Parent of the current working directory.
current_dirs_parent = get_parent_dir(directory=os.getcwd())

## Load data

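Here we load PubMed abstracts from the JSON files matching `pbabstract_total*`. (This cell was originally written to load 220 Open Access articles from the British Journal of Pharmacology as an example; that path is kept, commented out, below.)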

### 1. Read in data

First, load the text data

In [4]:


# NOTE(review): absolute, machine-specific path; consider making it configurable.
path = '/home/adam/Documents/MIDS/W266/Project/parsing/pbabstract_total*'
#path = '/media/adam/Data/PMC/Br_J_Pharmacol/*.txt'

files = glob.glob(path)

# Collect the values of every matching JSON file into one flat list.
text = []
for fle in files:
    text.extend(abstract_loader(fle))

# Split every entry into sentences, reporting progress on a single line.
sentences = []
n_skipped = 0
N_TEXT = len(text)
for i, t in enumerate(text):
    print('\rAbstract part:', i+1, '/', N_TEXT,end='')
    sys.stdout.flush()
    try:
        sentences.append(nltk.sent_tokenize(t))
    except TypeError:
        # Entries the tokenizer rejects with TypeError are skipped on purpose,
        # but counted so the loss of data is visible.
        n_skipped += 1

if n_skipped:
    print('\nSkipped {} entries that raised TypeError'.format(n_skipped))

# build a flat list of sentences
sents = flat_map(sentences)

Abstract part: 1229107 / 1229107

In [5]:
# Total number of sentences in the flattened list.
len(sents)

6422569

In [11]:
# Cache the sentence list on disk; the context manager closes the file even
# if pickling fails part-way.
with open('pubmed_sentences.pickle', 'wb') as pickle_file:
    pickle.dump(sents, pickle_file)

Then load and count the compounds from the labelled sentences (they are added to the vocabulary in step 3).

In [12]:

# Count how often each compound appears in the labelled sentences file.
compounds = Counter()

labelled_path = os.path.join(current_dirs_parent, 'data', 'labelled_sents.csv')
with open(labelled_path, 'r') as labelled_sents:
    for line in labelled_sents:
        # Each row is unpacked into three tab-separated fields.
        label, compound, sent = line.strip().split('\t')
        compounds[compound] += 1

print('Most common compounds: {}'.format(compounds.most_common(10)))
len(compounds)

Most common compounds: [('l-dopa', 16), ('styrene', 14), ('pectin', 14), ('glycyrrhizin', 13), ('sodium nitrite', 12), ('lard', 12), ('procyanidin', 12), ('vinegar', 12), ('genistein', 11), ('pleurotus ostreatus', 11)]


886

### 2. Build vocabulary 

In [13]:
# use if Python 2.7
#def text_to_word_sequence(text,
#                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
#                          lower=True, split=" "):
#    if lower: text = text.lower()
#    try :
#        text = unicode(text, "utf-8")
#    except TypeError:
#        pass
#    translate_table = {ord(c): ord(t) for c,t in  zip(filters, split*len(filters)) }
#    text = text.translate(translate_table)
#    seq = text.split(split)
#    return [i for i in seq if i]
    
#keras.preprocessing.text.text_to_word_sequence = text_to_word_sequence

# Build the corpus from the sentences that contain at least two space
# characters, then fit the tokenizer on it.
corpus = list(filter(lambda sentence: sentence.count(' ') >= 2, sents))
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Vocabulary size: Keras word indices start at 1 (index 0 is reserved),
# hence the extra slot.
V = 1 + len(tokenizer.word_index)
V


497272

### 3. Subset vocabulary

In [14]:
# Keep the words whose tokenizer index is below 20000, in canonical form.
wordset = [
    canonicalize_word(word, digits=False)
    for word, index in tokenizer.word_index.items()
    if index < 20000
]

Add the compounds to the vocabulary.

In [15]:
# Extend the vocabulary with every labelled compound. A set union works
# whether `wordset` is still a list or already a set, so this cell can be
# re-run safely.
wordset = set(wordset) | set(compounds)

Although we set 20000 above as the vocabulary size, it becomes a bit smaller because of canonicalization. This could be improved, but it is probably not a serious problem.

In [16]:
# Canonicalize every sentence that contains at least two space characters:
# split on whitespace, canonicalize each token against `wordset`, and join
# the tokens back together with single spaces.
corpus_canon = []
for sentence in sents:
    if sentence.count(' ') < 2:
        continue
    canon_tokens = canonicalize_words(sentence.split(), wordset=wordset, digits=False)
    corpus_canon.append(' '.join(canon_tokens))

# Fit a second tokenizer on the canonicalized corpus.
tokenizer_canon = Tokenizer()
tokenizer_canon.fit_on_texts(corpus_canon)
V_canon = len(tokenizer_canon.word_index) + 1
V_canon

20269

### 4. Define model to train embeddings

In [17]:
# From here on V is the size of the canonicalized vocabulary.
V = V_canon

# Length of each embedding vector.
dim_embedddings = 150

# inputs: a single word index per example, looked up in its own embedding table
w_inputs = Input(shape=(1, ), dtype='int32')
w = Embedding(V, dim_embedddings)(w_inputs)

# context: a single context-word index per example, with a separate embedding table
c_inputs = Input(shape=(1, ), dtype='int32')
c  = Embedding(V, dim_embedddings)(c_inputs)
# Dot product of the two embeddings over axis 2, reshaped to one value per
# example and passed through a sigmoid.
o = Dot(axes=2)([w, c])
o = Reshape((1,), input_shape=(1, 1))(o)
o = Activation('sigmoid')(o)

# Two-input model; compiled with a binary cross-entropy loss and the Adam optimizer.
SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=o)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 150)        3040350     input_1[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 150)        3040350     input_2[0][0]                    
___________________________________________________________________________________________

### 5. Start the training

In [None]:


# Convert the corpus to index sequences once: the result does not change
# between epochs, so there is no need to recompute it inside the loop.
sequences = tokenizer_canon.texts_to_sequences(corpus_canon)

for _ in range(5):
    # Accumulated training loss for this epoch.
    loss = 0.
    for doc in sequences:
        # Pairs and their labels generated from one sentence.
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=5, negative_samples=5.)
        # Transpose the list of pairs into one array per model input.
        x = [np.array(column) for column in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:  # empty when no pairs were generated for this sentence
            loss += SkipGram.train_on_batch(x, y)

    print(loss)



### 6. Save embedding vectors

In [55]:


# First weight array of the model (presumably the word-input embedding
# table; confirm against the layer order in the model summary).
vectors = SkipGram.get_weights()[0]

# Write a header line with the number of words and the embedding size,
# followed by one "word v1 v2 ..." line per vocabulary entry. The context
# manager closes the file even if a write fails.
with open('vectors_BrJP.txt' ,'w') as f:
    f.write('{} {}\n'.format(V-1, dim_embedddings))
    for word, i in tokenizer_canon.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))



In [53]:
# NOTE(review): the save cell above writes 'vectors_BrJP.txt', while this views 'vectors.txt' — confirm the intended file.
!less vectors.txt

9738 100
being 0.0593141 0.452268 -0.390305 0.107675 0.00720654 0.135836 0.0530281 0.4407 11 -0.0564856 0.130661 -0.458944 0.0229433 0.337225 0.136104 0.172514 0.336811 0 .164075 -0.05117 0.25103 -0.114255 0.0270379 -0.0600328 0.0808357 0.0611498 0.01 86546 0.088466 0.213662 0.07562 0.194044 -0.17165 -0.0736555 0.231973 -0.246112  -0.0715608 0.245144 -0.137682 0.127778 -0.128208 0.374055 0.0761042 0.0242863 -0 .187424 0.229611 0.0786641 0.128452 -0.0158993 -0.118428 -0.394745 0.012744 0.25 0862 -0.196155 0.0120631 -0.354271 -0.0190368 0.304903 -0.107094 -0.204871 0.057 992 0.064876 0.270369 -0.0747011 -0.174183 0.0653418 -0.283184 -0.0255197 -0.215 13 0.0202362 0.528384 -0.108962 0.113981 -0.125209 -0.117595 -0.0364177 0.097629 2 0.0885717 0.114088 -0.115811 -0.215144 0.547377 0.0633373 -0.241964 0.106268 - 0.158015 -0.1879 -0.0697495 0.149984 -0.0594108 -0.0208565 0.116233 0.194813 0.1 7322 0.341415 0.303702 -0.178648 -0.176093 -0.00402338 0.155215 0.00123318 -0.00 311036 