# Train Pubmed Embeddings with Skipgram

In [None]:
#https://github.com/nzw0301/keras-examples/blob/master/Skip-gram-with-NS.ipynb
import numpy as np
import glob
import re
import nltk

from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers.merge import Dot
from keras.utils import np_utils
#from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
import keras.preprocessing.text
from keras.preprocessing.sequence import skipgrams
from collections import Counter





# ------------------ Word processing functions -------------------------- #
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset): return word
    else: return "<unk>" # unknown token

def canonicalize_words(words, **kw):
    return [canonicalize_word(word, **kw) for word in words]

def flat_map(sentences):
    return [sent for s in sentences for sent in s]

current_dirs_parent = get_parent_dir(os.getcwd())

## load data

Here we load 220 Open Access articles from the British Journal of Pharmacology as an example

### 1. Read in data

First, load the text data

In [2]:

path = '/media/adam/Data/PMC/Br_J_Pharmacol/*.txt'
# 
files = glob.glob(path)
text = []
sentences = []
# iterate over the list to get each file 
for fle in files:
    # open the file and then call .read() to get the text 
    with codecs.open(fle, 'rb', encoding='utf-8') as f:
        text.append(f.read())
# 
for t in text:
    sentences.append(nltk.sent_tokenize(t))

# build a flat list of sentences
sents = flat_map(sentences)

then add the compounds from the labelled sentences.

In [None]:

compounds = Counter()

with open(current_dirs_parent + '/data/labelled_sents.csv', 'r') as labelled_sents: 
    for num, line in enumerate(labelled_sents):
        label, compound, sent = line.strip().split('\t')
        compounds[compound] += 1
        
print('Most common compounds: {}'.format(compounds.most_common(10)))
len(compounds)

### 2. Build vocabulary 

In [9]:
# use if Python 2.7
#def text_to_word_sequence(text,
#                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
#                          lower=True, split=" "):
#    if lower: text = text.lower()
#    try :
#        text = unicode(text, "utf-8")
#    except TypeError:
#        pass
#    translate_table = {ord(c): ord(t) for c,t in  zip(filters, split*len(filters)) }
#    text = text.translate(translate_table)
#    seq = text.split(split)
#    return [i for i in seq if i]
    
#keras.preprocessing.text.text_to_word_sequence = text_to_word_sequence

# build corpus
corpus = [sentence for sentence in sents if sentence.count(' ') >= 2]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# vocabulary size
V = len(tokenizer.word_index) + 1
V


118810

### 3. Subset vocabulary

In [11]:
wordset = []
for k, v in tokenizer.word_index.items():
    if v < 10000:
        wordset.append(canonicalize_word(k, digits=False))

although we set 10000 above as vocabulary size, it becomes a bit smaller because of canonicalization..this could be improved but maybe not a serious problem

In [12]:
corpus_canon = [' '.join(canonicalize_words(sentence.split(), wordset=wordset, digits=False)) for sentence in sents if sentence.count(' ') >= 2]
tokenizer_canon = Tokenizer()

tokenizer_canon.fit_on_texts(corpus_canon)
V_canon = len(tokenizer_canon.word_index) + 1
V_canon

9600

## TODO: add compounds to wordset before training

### 4. Define model to train embeddings

In [14]:
V = V_canon

dim_embedddings = 100

# inputs
w_inputs = Input(shape=(1, ), dtype='int32')
w = Embedding(V, dim_embedddings)(w_inputs)

# context
c_inputs = Input(shape=(1, ), dtype='int32')
c  = Embedding(V, dim_embedddings)(c_inputs)
o = Dot(axes=2)([w, c])
o = Reshape((1,), input_shape=(1, 1))(o)
o = Activation('sigmoid')(o)

SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=o)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')



____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 100)        960000      input_1[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 100)        960000      input_2[0][0]                    
___________________________________________________________________________________________

### 5. Start the training

In [None]:


for _ in range(5):
    loss = 0.
    for i, doc in enumerate(tokenizer_canon.texts_to_sequences(corpus_canon)):
        data, labels = skipgrams(sequence=doc, vocabulary_size=V, window_size=5, negative_samples=5.)
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:
            loss += SkipGram.train_on_batch(x, y)

    print(loss)



14921.2482462
14011.7568922


### 6. Save embedding vectors

In [None]:


f = open('vectors.txt' ,'w')
f.write('{} {}\n'.format(V-1, dim_embedddings))
vectors = SkipGram.get_weights()[0]
for word, i in tokenizer_canon.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()



In [11]:
!less vectors.txt

3919 128
limited -0.196793 0.083001 0.0139577 0.0825418 0.175225 -0.0393457 -0.0117669 -0 .0876988 -0.113816 0.146876 -0.159492 -0.0642036 0.0866598 0.104659 -0.0438206 - 0.00766364 0.0989949 -0.171952 -0.0387336 0.12589 -0.10796 0.0164086 0.117531 -0 .0330524 -0.130886 0.0488566 0.141625 0.0365962 0.0293824 0.0795345 0.00664139 0 .0416241 0.0827253 -0.0701966 -0.0899939 -0.0404796 -0.0572519 -0.100596 -0.0770 045 -0.11439 0.107152 -0.0600173 0.0407845 -0.0100802 0.0469915 0.116893 0.05898 3 -0.172963 0.00372644 0.220961 -0.0621724 0.00068234 0.0179163 -0.0652906 0.021 1658 -0.11143 -0.0318353 -0.118958 0.0968845 0.0745354 0.0688471 0.054137 0.1385 28 -0.0278073 0.101461 -0.0322092 -0.0041395 -0.0837396 0.0898334 0.0472739 -0.0 333762 0.037255 -0.095782 -0.0916879 -0.0380139 -0.0503423 0.0708811 0.139838 -0 .114322 -0.0624948 -0.0718154 0.0730689 0.0822579 0.0474888 -0.116145 0.0916795  0.0237692 -0.0564619 -0.116132 -0.0161411 -0.0160286 -0.0950811 -0.0837354 0.081 5025 -0