In [1]:
import os 
import sys
import numpy as np
import types
import pickle
import json
import gc

from scipy import sparse
import tarfile

# Make the project-local modules importable: first the ``pubmed_data``
# directory two levels up, then the parent directory of this notebook.
for _path_parts in (('..', '..', 'pubmed_data'), ('..',)):
    module_path = os.path.abspath(os.path.join(*_path_parts))
    if module_path not in sys.path:
        sys.path.append(module_path)

from generic_model import ModelAPI
import pubmed_helper as ph

from keras_new_text import regex_alfanum_tokenizer

# Load the tokenizer stored under the "hashtrick_full_tokens" mode.
ht_tokenizer = ph.load_tokenizer(mode="hashtrick_full_tokens")

# Drop the lookup tables that are not read in this notebook
# (presumably to save memory -- only ``word_index`` is used below).
for _unused_attribute in ("index_word", "index_docs", "word_counts", "word_docs"):
    delattr(ht_tokenizer, _unused_attribute)

# Width of every model input: one column per entry of ``word_index`` plus one.
TRIGRAM_VOC = 1 + len(ht_tokenizer.word_index)

saved_models_path = '/backup/saved_models'

Load hashtrick_full_tokens_tokenizer.p


## DSSM model
 - The hash-trick layer is still missing (the conversion is done outside the model).


In [2]:
#Create a HashTrick Layer
#this layer will map bag-of-word vector to a letter-trigram-vector
"""
from keras import backend as K
from keras.layers import Layer

#create a letter-trigram-matrix
letter_trigram_matrix = None

class TrigramProjection(Layer):

    def __init__(self, **kwargs):
        super(MyLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.letter_trigram_matrix =  K.constant(letter_trigram_matrix, 
                                                   shape=self.letter_trigram_matrix.shape, 
                                                   name="Letter-trigram-matrix")

        super(MyLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, x):
        #self.letter_trigram_matrix has TxV dimension
        #x must be Vx1
        return K.dot(self.letter_trigram_matrix, x)

    def compute_output_shape(self, input_shape):
        
        return (self.letter_trigram_matrix.shape[0], 1)
"""

#The letter_trigram_matrix would have a 1.3M x 70k dimension, which is a lot;
#instead of building the matrix, an on-the-fly conversion function was created.




'\nfrom keras import backend as K\nfrom keras.layers import Layer\n\n#create a letter-trigram-matrix\nletter_trigram_matrix = None\n\nclass TrigramProjection(Layer):\n\n    def __init__(self, **kwargs):\n        super(MyLayer, self).__init__(**kwargs)\n\n    def build(self, input_shape):\n        # Create a trainable weight variable for this layer.\n        self.letter_trigram_matrix =  K.constant(letter_trigram_matrix, \n                                                   shape=self.letter_trigram_matrix.shape, \n                                                   name="Letter-trigram-matrix")\n\n        super(MyLayer, self).build(input_shape)  # Be sure to call this at the end\n\n    def call(self, x):\n        #self.letter_trigram_matrix has TxV dimension\n        #x must be Vx1\n        return K.dot(self.letter_trigram_matrix, x)\n\n    def compute_output_shape(self, input_shape):\n        \n        return (self.letter_trigram_matrix.shape[0], 1)\n'

In [3]:
from tensorflow.keras.layers import Input, Dense, Dot, Activation, Concatenate
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, LambdaCallback
import tensorflow.keras.backend as K

def bag_of_trigram( texts):
    """Convert texts into a dense bag-of-trigram count matrix.

    Each text is split into words with ``regex_alfanum_tokenizer``; every word
    is then mapped by ``ht_tokenizer.texts_to_sequences`` to a list of column
    indices (presumably its letter-trigram ids -- confirm against the tokenizer).

    Args:
        texts: sequence of strings, one per row of the result.

    Returns:
        ``np.int8`` array of shape ``(len(texts), TRIGRAM_VOC)`` where entry
        ``[i, t]`` is the number of times index ``t`` was produced for text ``i``.
        NOTE: counts above 127 overflow the int8 storage, as they always did.
    """
    _matrix = np.zeros((len(texts),TRIGRAM_VOC), dtype=np.int8)

    for i,text in enumerate(texts):
        bag_of_word = regex_alfanum_tokenizer(text)
        for trigram_ids in ht_tokenizer.texts_to_sequences(bag_of_word):
            trigram_ids = np.asarray(trigram_ids, dtype=np.intp)
            if trigram_ids.size == 0:
                continue
            # ``_matrix[i][ids] += 1`` is a buffered operation: an id that is
            # repeated inside the same word would only be counted once.
            # ``np.add.at`` accumulates every occurrence.
            np.add.at(_matrix[i], trigram_ids, 1)

    return _matrix

class DSSM(ModelAPI):
    """Siamese DSSM-style model scoring a query against 1 positive and N negative documents.

    Every input is a vector of size ``vocabulary_size`` (a bag of trigrams).
    Queries and documents go through a stack of three tanh Dense layers
    (300, 300 and 128 units). The normalized dot product of the query with each
    document is concatenated and passed through a softmax whose first position
    corresponds to the positive document.

    Attributes created here:
        TRIGRAM_SIZE: width of every input vector.
        num_neg_examples: number of negative documents per training row.
        same_q_d_model: when True, query and document towers share their weights.
        only_title: when True, a document is represented by its title only.
        transform_document: callable turning an article dict into a string.
        dssm_model: compiled training model (also stored in ``model``).
        doc_sub_model / query_sub_model: towers that embed documents / queries.
    """

    def __init__(self, vocabulary_size, num_neg_examples=4, same_q_d_model = True, only_title=False ,show_model_arch=False):
        """Store the configuration and build the Keras models.

        Args:
            vocabulary_size: width of the bag-of-trigram input vectors.
            num_neg_examples: number of negative document inputs of the model.
            same_q_d_model: share the Dense layers between query and documents.
            only_title: use only the article title as document text.
            show_model_arch: print the Keras summaries while building.
        """
        super().__init__()

        self.TRIGRAM_SIZE = vocabulary_size
        self.num_neg_examples = num_neg_examples
        self.show_model_arch = show_model_arch

        self.only_title = only_title
        if self.only_title:
            self.transform_document = lambda x: x["title"]
        else:
            self.transform_document = lambda x: x["title"] + " " + x["abstract"]

        self.same_q_d_model = same_q_d_model

        # Must stay the last statement: the builder reads the attributes above.
        self.model = self.__build_model()

    @staticmethod
    def _new_dense_tower():
        """Create the three fresh Dense layers (300, 300, 128 units, tanh) of one tower."""
        return [Dense(units,
                      activation='tanh',
                      kernel_initializer='glorot_uniform',
                      bias_initializer='glorot_uniform')
                for units in (300, 300, 128)]

    @staticmethod
    def _apply_tower(tower, tensor):
        """Feed ``tensor`` through every layer of ``tower`` in order."""
        for layer in tower:
            tensor = layer(tensor)
        return tensor

    def __build_model(self):
        """Build and compile the Keras models.

        Sets ``doc_sub_model``, ``query_sub_model`` and ``dssm_model``.

        Returns:
            The compiled ``dssm_model`` (previously nothing was returned, which
            left ``self.model`` set to None).
        """
        # NOTE: this resets the global Keras session/graph.
        K.clear_session()

        # Inputs of the training model: the result of the hash-trick conversion.
        query = Input(shape = (self.TRIGRAM_SIZE,), name = "query_input")
        pos_doc = Input(shape = (self.TRIGRAM_SIZE,), name = "pos_doc_input")
        neg_docs = [Input(shape = (self.TRIGRAM_SIZE,), name = ("neg_doc_input_"+str(i))) for i in range(self.num_neg_examples)]

        # Inputs of the siamese sub-model (one query, one document).
        q_input = Input(shape = (self.TRIGRAM_SIZE,))
        doc_input = Input(shape = (self.TRIGRAM_SIZE,))

        query_tower = self._new_dense_tower()
        query_sub_model = self._apply_tower(query_tower, q_input)

        if self.same_q_d_model is False:
            # Separate weights for the documents.
            doc_tower = self._new_dense_tower()
        else:
            doc_tower = query_tower
        doc_sub_model = self._apply_tower(doc_tower, doc_input)

        # Model computing the continuous representation of a document.
        self.doc_sub_model = Model(inputs=[doc_input], outputs=[doc_sub_model], name="doc_sub_model")

        if self.same_q_d_model:
            self.query_sub_model = self.doc_sub_model
        else:
            self.query_sub_model = Model(inputs=[q_input], outputs=[query_sub_model], name="query_sub_model")

        # Similarity between the query and a document (normalized dot product).
        q_doc_sim = Dot(axes=1,normalize=True)([query_sub_model,doc_sub_model])

        sub_model = Model(inputs=[q_input,doc_input], outputs=[q_doc_sim])
        if self.show_model_arch:
            print("Sub model arch")
            sub_model.summary()

        # Softmax approximation over 1 positive and N negative documents.
        q_doc_pos_output = sub_model([query,pos_doc])
        q_doc_neg_output = [sub_model([query,neg_doc]) for neg_doc in neg_docs]

        concat = Concatenate(axis=1)([q_doc_pos_output]+q_doc_neg_output)

        # TODO: the smoothing factor of the softmax is still missing.
        prob = Activation("softmax")(concat)

        self.dssm_model = Model(inputs=[query,pos_doc]+neg_docs,outputs=prob)
        if self.show_model_arch:
            self.dssm_model.summary()

        self.dssm_model.compile(optimizer='sgd',
                                loss='categorical_crossentropy',
                                metrics=['accuracy'])

        return self.dssm_model

    def create_data_generator(self, data, articles, batch, only_title=False):
        """Create an endless Python generator feeding batches to the model.

        Args:
            data: re-iterable collection of queries; each query is a dict with
                a "body" string and a "documents" list of keys of the pmid
                mapping loaded below.
            articles: indexable document collection; each article is a dict
                read by ``self.transform_document``.
            batch: approximate number of rows per yielded batch (a batch is
                emitted as soon as at least ``batch`` rows were collected).
            only_title: accepted for backward compatibility; the document text
                is always produced by ``self.transform_document``.

        Returns:
            Generator yielding ``(X, Y)`` where ``X`` is the list
            ``[query, pos_doc, neg_doc_0, ...]`` of bag-of-trigram arrays and
            ``Y`` marks the first position as the positive one.

        Raises:
            ValueError: if ``data`` is an empty sized collection (the generator
                would otherwise loop forever without yielding).
        """
        if hasattr(data, "__len__") and len(data) == 0:
            raise ValueError("data must contain at least one query")

        def bag_of_trigram_list_of_list(g_texts):
            return [bag_of_trigram(texts) for texts in g_texts]

        def negative_random_index(low, high, selection, exclude):
            # Rejection sampling: cheap because ``exclude`` is tiny compared
            # with the number of articles.
            neg_random_indexs = np.random.randint(low, high, (selection,))
            while any(i in exclude for i in neg_random_indexs):
                neg_random_indexs = np.random.randint(low, high, (selection,))
            return neg_random_indexs

        # TODO: review this hard-coded path.
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open("/backup/saved_models/pmid_index_mapping.p","rb") as f:
            pmid_document_map = pickle.load(f)

        def training_generator(data, batch=batch, neg_examples=self.num_neg_examples, only_title=only_title):
            q_pos_neg_doc = []
            max_article_index = len(articles)

            while True:
                for query_data in data:
                    pos_doc_set = {pmid_document_map[document_pmid] for document_pmid in query_data["documents"]}

                    for index_article in pos_doc_set:
                        row = [query_data["body"],
                               self.transform_document(articles[index_article])]

                        neg_random_indexs = negative_random_index(0, max_article_index, neg_examples, pos_doc_set)
                        row.extend(self.transform_document(articles[neg_index]) for neg_index in neg_random_indexs)
                        q_pos_neg_doc.append(row)

                    # Flush AFTER handling the query. The previous version
                    # flushed instead of handling it, silently dropping one
                    # query per emitted batch.
                    if len(q_pos_neg_doc) >= batch:
                        b_tri = np.array(bag_of_trigram_list_of_list(q_pos_neg_doc))

                        q = b_tri[:,0,:]
                        pos_doc = b_tri[:,1,:]
                        neg_doc = [b_tri[:,i,:] for i in range(2,2+neg_examples)]
                        X = [q,pos_doc]+neg_doc

                        Y = np.array([[1]+[0]*neg_examples]*len(q_pos_neg_doc))

                        yield (X,Y)
                        q_pos_neg_doc = []

        return training_generator(data)

    def _training_process(self, data, **kwargs):
        """Train ``dssm_model`` with generated (query, pos_doc, neg_docs) batches.

        Args:
            data: the article collection handed to ``create_data_generator``.
            **kwargs:
                training_data (required): queries used for training.
                validation_data (required): queries used for validation.
                batch: approximate batch size, default 1024.
                epoach: number of epochs, default 20.
                only_title: only affects the checkpoint file name, default False.
                neg_examples: overrides ``self.num_neg_examples``.

        Raises:
            TypeError: if a required keyword is missing or an unknown one is given.
        """
        if 'training_data' not in kwargs or 'validation_data' not in kwargs:
            raise TypeError('training_data and validation_data must be suplied!')

        training_data = kwargs.pop('training_data')
        validation_data = kwargs.pop('validation_data')
        batch = kwargs.pop('batch', 1024)
        epoach = kwargs.pop('epoach', 20)
        only_title = kwargs.pop('only_title', False)

        if 'neg_examples' in kwargs:
            # NOTE(review): the Keras model was already built with the number of
            # negative inputs given to the constructor; a different value here
            # will not match the model inputs -- confirm the intended usage.
            self.num_neg_examples = kwargs.pop('neg_examples')

        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

        # At least one step, so small data sets do not produce zero-step epochs.
        training_samples = sum(len(q["documents"]) for q in training_data)
        train_steps = max(1, training_samples//batch)
        print("Train_steps:",train_steps)

        test_samples = sum(len(q["documents"]) for q in validation_data)
        test_steps = max(1, test_samples//batch)
        print("Test_steps:",test_steps)

        # Data generators.
        train_generator = self.create_data_generator(training_data,
                                                     data,
                                                     batch = batch,
                                                     only_title = only_title)

        validation_generator = self.create_data_generator(validation_data,
                                                          data,
                                                          batch = batch,
                                                          only_title = only_title)

        # Keep only the best checkpoint according to the validation accuracy.
        # ``self.saved_models_path`` is not set in this class (presumably
        # provided by ModelAPI -- confirm).
        save_best_file_name = "best_checkpoint_dssm_model_"+("title" if only_title else "") + ".h5"
        callback = ModelCheckpoint(os.path.join(self.saved_models_path,save_best_file_name), monitor='val_acc', verbose=0, save_best_only=True)

        print("Start dssm training")
        self.dssm_model.fit_generator(train_generator,
                                      epochs=epoach,  # was hard-coded to 20, ignoring the kwarg
                                      steps_per_epoch=train_steps,
                                      shuffle=True,
                                      callbacks = [callback],
                                      verbose=1,
                                      validation_data=validation_generator,
                                      validation_steps=test_steps)

    class Bag_of_Trigram_Generator(object):
        """Iterate over the saved sparse bag-of-trigram files as dense matrices."""

        def __init__(self, dir_name = "bag_of_trigrams"):
            """List, in sorted order, the files found in the ``dir_name`` folder."""
            #TODO: Include batch size option
            path = os.path.join("/backup/pubmed_archive_tokenized",dir_name)
            # A real list (not a one-shot ``map``) so the generator can be
            # iterated more than once and has a length.
            self.files = [os.path.join(path, name) for name in sorted(os.listdir(path))]

        def __iter__(self):
            for file in self.files:
                print("Open the file:",file)

                _matrix = sparse.load_npz(file).todense()
                yield _matrix

                # Release the dense matrix before loading the next file.
                del _matrix

        def __len__(self):
            """Number of files (the previous version read a nonexistent ``members``)."""
            return len(self.files)

    def build_document_representation(self):
        """Embed every bag-of-trigram file with ``doc_sub_model``.

        The per-file predictions are stacked into ``self.collection_representation``.
        """
        iter_generator = iter(self.Bag_of_Trigram_Generator())

        self.collection_representation = []

        for data in iter_generator:
            self.collection_representation.append(self.doc_sub_model.predict(data, batch_size = 2048, verbose=1))
            del data
            print("Force garbage collector",gc.collect())

        self.collection_representation = np.vstack(self.collection_representation)
        # An alternative based on ``predict_generator`` with a clean-up
        # LambdaCallback was tried here and left disabled.

    def _predict_process(self, queries):
        """Prediction is not implemented yet."""
        pass

    @staticmethod
    def load(f_name, path = '/backup/saved_models/'):
        """Rebuild a DSSM previously stored with ``save``.

        The architecture is rebuilt from the metadata and the weights are read
        from the saved HDF5 model, so the sub-models (which share their layers
        with ``dssm_model``) get the trained weights too. The optimizer state
        is not restored.

        Args:
            f_name: file name prefix used when saving.
            path: directory holding the saved files.

        Returns:
            The restored DSSM instance (the previous version returned nothing
            and failed while subscripting the Keras model).
        """
        file_name = os.path.join(path, f_name)

        print("Load metadata")
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(file_name+"_dssm_metadata.p","rb") as file:
            metadata = pickle.load(file)

        # Older metadata files only hold TRIGRAM_VOC and same_q_d_model.
        dssm = DSSM(metadata["TRIGRAM_VOC"],
                    num_neg_examples=metadata.get("num_neg_examples", 4),
                    same_q_d_model=metadata["same_q_d_model"],
                    only_title=metadata.get("only_title", False))

        print("Load document representation")
        dssm.collection_representation = np.load(file_name+"_dssm_doc_emb.npy")

        print("Load model")
        dssm.dssm_model.load_weights(file_name+"_dssm_model.h5")

        return dssm

    def save(self, **kwargs):
        """Store the document embeddings, the Keras model and the metadata.

        Args:
            **kwargs: ``f_name`` (required), the file name prefix used inside
                ``self.saved_models_path``.

        Raises:
            TypeError: if ``f_name`` is missing or an unknown keyword is given.
        """
        if "f_name" in kwargs:
            f_name = kwargs.pop("f_name")
        else:
            raise TypeError("f_name must be provided")

        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

        file_name = os.path.join(self.saved_models_path, f_name)

        # Save the document representation.
        np.save(file_name+"_dssm_doc_emb",self.collection_representation)

        # Save the neural model.
        self.dssm_model.save(file_name+"_dssm_model.h5")

        # Save what ``load`` needs to rebuild the same architecture.
        with open(file_name+"_dssm_metadata.p","wb") as file:
            pickle.dump({"TRIGRAM_VOC":self.TRIGRAM_SIZE,
                         "same_q_d_model":self.same_q_d_model,
                         "num_neg_examples":self.num_neg_examples,
                         "only_title":self.only_title},file)
        
        

In [4]:
# Build the model with the default options; TRIGRAM_VOC is the width of every input vector.
dssm = DSSM(TRIGRAM_VOC)

In [9]:
#articles_generator = ph.create_pubmed_collection_generator()()

# Build the document representation: every saved bag-of-trigram file is
# embedded with the document sub-model.
#articles = next(articles_generator)
gc.collect()
dssm.build_document_representation()

# Manual single-file check, left disabled:
#gen = DSSM.Bag_of_Trigram_Generator()
#iter_gen = iter(gen)
#data = next(iter_gen)

#print("DSSM predict")
#r_1 = dssm.doc_sub_model.predict(data)

Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_000_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_001_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_002_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_003_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_004_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_005_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_006_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubm

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_042_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_043_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_044_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_045_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_046_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_047_title_abs_pubmed.p.npz
Force garbage collector 15
Open the file: /backup/pubmed_archive_tokenized/bag_of_trigrams/bag_of_trigrams_file_048_title_abs_pubmed.p.npz
Force garbage collector 15


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# NOTE(review): `data` and `iter_gen` are only created by the commented-out
# lines of the previous cell, so this cell fails on a fresh kernel.
del data
gc.collect()
data = next(iter_gen)

# Embed the next bag-of-trigram file with the document sub-model.
print("DSSM predict")
r_2 = dssm.doc_sub_model.predict(data)

In [116]:
# SAVE BAG OF TRIGRAM
# NOTE(review): `DSSM.Collection_Generator` is not defined by the DSSM class shown in
# this notebook, and `articles` is only loaded by a later cell -- stale cell, confirm.
gen = DSSM.Collection_Generator(articles, lambda x:x["title"]+" "+x["abstract"],batch_size=int(1e6))




Using batch size of 1000000


### Data generator for training

In [5]:
#with open("/backup/saved_models/pmid_index_mapping.p","rb") as f:
#    pmid_document_map = pickle.load(f)

# Load the training and test splits; the files are opened in ``with`` blocks
# so the handles are closed as soon as the JSON is parsed.
bioASQ_data_path = "/backup/BioASQ-training7b/"

with open(os.path.join(bioASQ_data_path,"7b_train_split.json")) as split_file:
    bioASQ_data_train = json.load(split_file)

with open(os.path.join(bioASQ_data_path,"7b_test_split.json")) as split_file:
    bioASQ_data_test = json.load(split_file)

In [6]:
# Bring the whole article collection into memory, one chunk at a time.
articles_generator = ph.create_pubmed_collection_generator()
articles = []
for docs in articles_generator():
    articles += docs
  

Open /backup/pubmed_archive_json/pubmed_ready.tar.gz
Returning: 2721103 articles
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815
Returning: 2872502 articles
Force garbage collector 0
Open the file: pubmed_ready_13996816_to_18824354
Returning: 4827539 articles
Force garbage collector 0


In [7]:
## Train dssm
# `articles` is the document collection; the queries come from the BioASQ splits.
# "epoach" (sic) is the keyword name read by DSSM._training_process.

dssm.train(articles,
           epoach = 20,
           batch = 512,
           training_data=bioASQ_data_train,
           validation_data=bioASQ_data_test
           
           )

Train_steps: 47
Test_steps: 12
Start dssm training
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
# Save the trained Keras model (HDF5) in the current working directory.
dssm.dssm_model.save("model-20-epoach-regex-abs-title-final.h5")

## Data generator

In [21]:

def negative_random_index(low, high, selection, exclude):
    """Draw ``selection`` random integers from ``[low, high)``, none of them in ``exclude``.

    The whole draw is repeated until no index belongs to ``exclude``. This is
    simple rejection sampling and is only cheap while ``exclude`` is much
    smaller than the range.

    Args:
        low: smallest index that may be drawn (inclusive).
        high: upper bound of the indices (exclusive).
        selection: number of indices to draw.
        exclude: container of indices that must not be returned.

    Returns:
        1-D numpy integer array of length ``selection``.
    """
    # The bounds now come from the arguments; the previous version ignored
    # them and read the global ``articles`` instead.
    neg_random_indexs = np.random.randint(low, high, (selection,))

    while any(i in exclude for i in neg_random_indexs):
        neg_random_indexs = np.random.randint(low, high, (selection,))

    return neg_random_indexs
    

                
def training_generator(type_of_data,batch=128,neg_examples=4,only_title=False):
    """Endless generator of ``(X, Y)`` training batches.

    Relies on notebook globals: ``articles``, ``pmid_document_map``,
    ``negative_random_index`` and ``bag_of_trigram_list_of_list``.
    NOTE(review): no active notebook-level cell defines
    ``bag_of_trigram_list_of_list`` or ``pmid_document_map`` -- confirm they
    exist before running.

    Args:
        type_of_data: re-iterable collection of queries, each a dict with a
            "body" string and a "documents" list of keys of ``pmid_document_map``.
        batch: approximate number of rows per batch (a batch is emitted as soon
            as at least ``batch`` rows were collected).
        neg_examples: number of random negative documents per row.
        only_title: represent a document by its title only.

    Yields:
        ``(X, Y)`` where ``X`` is the list ``[query, pos_doc, neg_doc_0, ...]``
        and ``Y`` marks position 0 as the positive document.
    """
    if only_title:
        transform_article = lambda x:x["title"]
    else:
        transform_article = lambda x:x["title"]+ " " +x["abstract"]

    q_pos_neg_doc = []
    max_article_index = len(articles)

    while True:

        for query_data in type_of_data:
            pos_doc_set = {pmid_document_map[document_pmid] for document_pmid in query_data["documents"]}

            for index_article in pos_doc_set:
                row = [query_data["body"], transform_article(articles[index_article])]

                neg_random_indexs = negative_random_index(0, max_article_index, neg_examples, pos_doc_set)
                row.extend(transform_article(articles[neg_index]) for neg_index in neg_random_indexs)
                q_pos_neg_doc.append(row)

            # Flush AFTER handling the query. The previous version flushed
            # instead of handling it, silently dropping one query per batch.
            if len(q_pos_neg_doc) >= batch:
                b_tri = np.array(bag_of_trigram_list_of_list(q_pos_neg_doc))

                q = b_tri[:,0,:]
                pos_doc = b_tri[:,1,:]
                neg_doc = [b_tri[:,i,:] for i in range(2,2+neg_examples)]
                X = [q,pos_doc]+neg_doc

                Y = np.array([[1]+[0]*neg_examples]*len(q_pos_neg_doc))

                yield (X,Y)
                q_pos_neg_doc = []
                 
                    
                    
            
        
    

In [8]:
# Show the input and output tensors of the training model.
print(dssm.dssm_model.inputs)
print(dssm.dssm_model.outputs)

[<tf.Tensor 'query_input:0' shape=(?, 77301) dtype=float32>, <tf.Tensor 'pos_doc_input:0' shape=(?, 77301) dtype=float32>, <tf.Tensor 'neg_doc_input_0:0' shape=(?, 77301) dtype=float32>, <tf.Tensor 'neg_doc_input_1:0' shape=(?, 77301) dtype=float32>, <tf.Tensor 'neg_doc_input_2:0' shape=(?, 77301) dtype=float32>, <tf.Tensor 'neg_doc_input_3:0' shape=(?, 77301) dtype=float32>]
[<tf.Tensor 'activation/Softmax:0' shape=(?, 5) dtype=float32>]


In [22]:
from tensorflow.keras.callbacks import ModelCheckpoint

BATCH = 128

# One sample per (query, relevant document) pair; steps are whole batches.
training_samples = sum(len(query["documents"]) for query in bioASQ_data_train)
train_steps = training_samples // BATCH
print("train_steps:",train_steps)

test_samples = sum(len(query["documents"]) for query in bioASQ_data_test)
test_steps = test_samples // BATCH
print("test_steps:",test_steps)

# Write a checkpoint whenever the monitored validation accuracy improves.
callback = ModelCheckpoint('model-title-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5', monitor='val_acc', verbose=0, save_best_only=True)

# Title-only batches for training and validation.
title_train_batches = training_generator(bioASQ_data_train, batch=BATCH, only_title=True)
title_validation_batches = training_generator(bioASQ_data_test, batch=BATCH, only_title=True)

dssm.dssm_model.fit_generator(
    title_train_batches,
    epochs=20,
    steps_per_epoch=train_steps,
    shuffle=True,
    callbacks=[callback],
    verbose=1,
    validation_data=title_validation_batches,
    validation_steps=test_steps,
)

train_steps: 188
test_steps: 48
Epoch 1/20
Epoch 2/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
# Save the title-only Keras model (HDF5) in the current working directory.
dssm.dssm_model.save("model-title-final.h5")

#verify the generator




In [24]:
# Total number of (query, document) pairs in the training split.

sum(len(query["documents"]) for query in bioASQ_data_train)

24356

9655078

Open the pubmed tar.gz
Creating generator
Open the file: 0
Returning: 2776363 articles
Force garbage collector 0
Open the file: 1
Returning: 2743606 articles
Force garbage collector 0
Open the file: 2
Returning: 2721103 articles
Force garbage collector 0
Open the file: 3
Returning: 2883242 articles
Force garbage collector 0
Open the file: 4
Returning: 2872502 articles
Force garbage collector 0
Open the file: 5
Returning: 4827539 articles
Force garbage collector 0


## Code that I will probably not use

In [None]:
from multiprocessing import Process


    
def processJob(process_ID, queries):
    """Build the bag-of-trigram inputs for ``queries`` and pickle them to disk.

    For every relevant document of every query a row
    ``[query body, positive doc, 4 random negative docs]`` is created, the rows
    are converted with ``bag_of_trigram_list_of_list`` and the input list ``X``
    is written to ``bioASQ_bag_of_trigram_<process_ID>.p`` (the labels ``Y``
    are only printed, not saved).

    Relies on notebook globals: ``articles``, ``pmid_document_map``,
    ``transform_article``, ``negative_random_index`` and
    ``bag_of_trigram_list_of_list``.
    NOTE(review): ``transform_article`` has no notebook-level definition here -- confirm.
    """
    print("Start:",process_ID)
    neg_examples = 4
    max_article_index = len(articles)
    q_pos_neg_doc = []

    for query_data in queries:
        pos_doc_set = {pmid_document_map[pmid] for pmid in query_data["documents"]}

        for index_article in pos_doc_set:
            # Row layout: query text, positive document, then the negatives.
            row = [query_data["body"], transform_article(articles[index_article])]

            neg_random_indexs = negative_random_index(0, max_article_index, neg_examples, pos_doc_set)
            for neg_index in neg_random_indexs:
                row.append(transform_article(articles[neg_index]))

            q_pos_neg_doc.append(row)

    print("build bag of trigram:",process_ID,"len data",len(q_pos_neg_doc))
    b_tri = np.array(bag_of_trigram_list_of_list(q_pos_neg_doc))

    # Split the second axis into query / positive / negative inputs.
    q = b_tri[:,0,:]
    pos_doc = b_tri[:,1,:]
    neg_doc = [b_tri[:,column,:] for column in range(2,2+neg_examples)]
    X = [q,pos_doc]+neg_doc
    print(len(X))

    one_hot_row = [1]+[0]*neg_examples
    Y = np.array([one_hot_row]*len(q_pos_neg_doc))
    print(len(Y),Y.shape)

    file_name = "bioASQ_bag_of_trigram_{0:03}.p".format(process_ID)
    print("save:",file_name)
    with open("/backup/BioASQ-training7b/"+file_name,"wb") as f:
        pickle.dump(X,f)


# Pre-compute the training inputs in parallel: the training queries are handled
# in groups of `num_queries_per_itter`, and each group is split among
# `n_process` worker processes that run `processJob`.
num_queries_per_itter = 200

num_divissions = len(bioASQ_data_train)//num_queries_per_itter

n_process = 20

batch = range(0,len(bioASQ_data_train),num_queries_per_itter)

for i,j in enumerate(batch):
    data = bioASQ_data_train[j:j+num_queries_per_itter]

    n_queries = len(data)
    # At least one query per process: a step of 0 made `range` raise ValueError
    # when a group held fewer than `n_process` queries.
    n_queries_per_process = max(1, n_queries//n_process)

    # Slice boundaries: equal shares, with the last process taking the remainder.
    bounds = [min(k*n_queries_per_process, n_queries) for k in range(n_process)]
    bounds.append(n_queries)

    process = []

    for k in range(n_process):
        queries = data[bounds[k]:bounds[k+1]]
        if not queries:
            # Nothing left for this worker (group smaller than n_process).
            continue
        process.append(Process(target=processJob, args=(i*n_process + k, queries,)))

    print("Start",len(process),"working threads")
    for p in process:
        p.start()

    print("Wait",len(process),"working threads")
    for p in process:
        p.join()

    print(gc.collect())