In [1]:
import json
from os.path import join
import os
os.chdir("/home/tiagoalmeida/bioASQ-taskb/")

import sys
import pickle
import gc
import numpy as np
import tarfile
import random
from collections import defaultdict
from bisect import bisect


##add keras to the modules
module_path = os.path.abspath(os.path.join('pubmed_data'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pubmed_data import pubmed_helper as ph

import


## DeepRank
Reference PAPER :https://arxiv.org/pdf/1710.05649.pdf

### Network structure
 - [General Network Configuration](#var_def)
 - [Input Network](#input_net)
 - [Measure Network](#measure_net)
 - [Aggregation Network](#aggreation_net)
 - [Final Network](#final_net)

In [2]:
#Load tokenizer and the embedding matrix

MODE = "regex_full_tokens"
tk = ph.load_tokenizer(mode=MODE)
emb_dict = ph.load_embeddings(mode=MODE)

assert len(tk.word_counts) == len(emb_dict)

#Number of different words
VOCAB_SIZE = len(tk.word_counts)+1

#Dimension of embeddings
EMB_DIM = emb_dict[1].shape[0]

emb_matrix = np.zeros((VOCAB_SIZE, EMB_DIM))

for i,vector in emb_dict.items():
    emb_matrix[i] = vector


Load regex_full_tokens_tokenizer.p
Load regex_full_tokens_word_embedding.p


In [3]:
#Tokenized stopwords

biomedical_stop_words = ["a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at", "be", "because", "been", "before", "being", "between", "both", "but", "by", "can", "could", "did", "do", "does", "done", "due", "during", "each", "either", "enough", "especially", "etc", "for", "found", "from", "further", "had", "has", "have", "having", "here", "how", "however", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "kg", "km", "made", "mainly", "make", "may", "mg", "might", "ml", "mm", "most", "mostly", "must", "nearly", "neither", "no", "nor", "obtained", "of", "often", "on", "our", "overall", "perhaps", "pmid", "quite", "rather", "really", "regarding", "seem", "seen", "several", "should", "show", "showed", "shown", "shows", "significantly", "since", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "then", "there", "therefore", "these", "they", "this", "those", "through", "thus", "to", "upon", "use", "used", "using", "various", "very", "was", "we", "were", "what", "when", "which", "while", "with", "within", "without", "would"]
biomedical_stop_words_tokens = set(tk.texts_to_sequences([biomedical_stop_words])[0])


<a id='var_def'></a>
## General Network Configuration

In [4]:

from tensorflow import unstack, stack
##Test 
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, activations
from tensorflow.keras.initializers import Zeros, Ones, Constant
from tensorflow.keras.layers import Dense, Lambda, Bidirectional, Dot,Masking,Reshape, Concatenate, Layer, Embedding, Input, Conv2D, GlobalMaxPooling2D, Flatten, TimeDistributed, GRU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.activations import tanh, sigmoid


from tensorflow.keras.preprocessing.sequence import pad_sequences

from models.deep_model_for_ir.custom_layers import MaskedSelfAttention

K.clear_session()

#Number max of term per query
MAX_Q_TERM = 13

#Number max of the snippet terms
QUERY_CENTRIC_CONTEX = 15

#Number max of passages per query term
MAX_PASSAGES_PER_QUERY = 5

#Snippet position padding value
SNIPPET_POSITION_PADDING_VALUE = -1

#Mode for the creation of the S matrix
S_MATRIX_MODE = 0
#S_MATRIX_DIMENSION = EMB_DIM*2+1

#Train embedding weights
EMB_TRAINABLE = False

#Number of filters in CNN
CNN_FILTERS = 150
CNN_KERNELS = (5,5)

#RNN DIM
USE_BIDIRECTIONAL = False
GRU_REPRESENTATION_DIM = 58

ACTIVATION_FUNCTION = "selu"

REGULARIZATION = regularizers.l2(0.0001)

#Term gating network mode
TERM_GATING_MODE =  3#2- weigt fixed per position, 1 - DRMM like term gating

assert S_MATRIX_MODE in [0,1]
assert TERM_GATING_MODE in [0,1,2,3]

#MACRO STYLE
def S_MATRIX_3D_DIMENSION():
    if S_MATRIX_MODE==0:
        return 1
    elif S_MATRIX_MODE==1:
        return EMB_DIM*2+1

DEBUG = False

<a id='input_net'></a>
## Input Network


In [5]:

"""""""""""""""""""""""""""
  ---- Custom Layers ----
"""""""""""""""""""""""""""
class SimilarityMatrix(Layer):
    
    def __init__(self, query_max_term, snippet_max_term, interaction_mode=0, **kwargs):
        """
        interaction mode 0: only use similarity matrix
                    mode 1: similarity matrix + query and snippet embeddings
        """
        assert interaction_mode in [0,1] #only valid modes
        
        self.query_max_term = query_max_term
        self.snippet_max_term = snippet_max_term
        self.interaction_mode = interaction_mode
        
        super().__init__(**kwargs)
        
    def call(self,x):
        if self.interaction_mode==0:
            #sim => dot product (None, MAX_Q_TERM, EMB_DIM) x (None, MAX_Q_TERM, MAX_PASSAGE_PER_Q, EMB_DIM, QUERY_CENTRIC_CONTEX)
            query = K.expand_dims(x[0], axis=1) #(None, 1, MAX_Q_TERM, EMB_DIM)
            query = K.expand_dims(query, axis=1) #(None, 1, 1, MAX_Q_TERM, EMB_DIM)
            query = K.repeat_elements(query,x[1].shape[1],axis=1) #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, EMB_DIM)
            query = K.repeat_elements(query,x[1].shape[2],axis=2)
            s_matrix = K.batch_dot(query,x[1]) #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, EMB_DIM)
            
            s_matrix = K.expand_dims(s_matrix)
            
            return s_matrix #Add one more dimension #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, EMB_DIM, 1)
        elif self.interaction_mode==1:
            raise NotImplementedError("interaction mode of layer SimilarityMatrix is not implemented")
    """                  
    def compute_output_shape(self, input_shape):
        if self.interaction_mode==0:
            return (input_shape[0][0], input_shape[0][1], self.query_max_term, self.snippet_max_term, 1)
        elif self.interaction_mode==1:
            return (input_shape[0][0], input_shape[0][1], self.query_max_term, self.snippet_max_term, input_shape[0][2]*input_shape[1][2]+1) 
    """

"""""""""""""""""""""""""""
     ---- Layers ----
"""""""""""""""""""""""""""
#Embedding Layer
embedding = Embedding(VOCAB_SIZE,EMB_DIM, name="embedding_layer",weights=[emb_matrix], trainable=EMB_TRAINABLE)

#S matrix ref in the paper
similarity_matrix = SimilarityMatrix(MAX_Q_TERM, QUERY_CENTRIC_CONTEX, interaction_mode=S_MATRIX_MODE, name="query_snippet_similarity")

#transpose (None, QUERY_CENTRIC_CONTEX, EMB_DIM) => (None, EMB_DIM, QUERY_CENTRIC_CONTEX) 
transpose_layer = Lambda(lambda x:K.permute_dimensions(x,[0,1,2,4,3]), name="snippet_transpose") 

"""""""""""""""""""""""""""
 ---- Auxiliar Models ----
"""""""""""""""""""""""""""

#Snippet single embedding transformation
snippet_token_input = Input(shape = (MAX_Q_TERM, MAX_PASSAGES_PER_QUERY, QUERY_CENTRIC_CONTEX,), name = "snippet_token")
snippet_emb = embedding(snippet_token_input)
snippet_emb_transpose = transpose_layer(snippet_emb)
snippet_emb_model = Model(inputs = [snippet_token_input], outputs=[snippet_emb_transpose], name = "snippet_emb_model")
print("\n\nsnippet_emb_model summary")
snippet_emb_model.summary()

"""""""""""""""""""""""""""
  ---- Input Network ----
"""""""""""""""""""""""""""

if DEBUG:
    query_token_input = Input(shape=(MAX_Q_TERM,), name="query_tokens")


    snippets_tokens_input = Input(shape = (MAX_Q_TERM, MAX_PASSAGES_PER_QUERY, QUERY_CENTRIC_CONTEX), name = "snippet_tokens_ipmodel") 
    
    query_emb = embedding(query_token_input)

    snippet_emb = embedding(snippets_tokens_input)
    snippet_emb_transpose = transpose_layer(snippet_emb)
    
    sim_matrix_layer = similarity_matrix([query_emb,snippet_emb_transpose])
    
    
    
    input_model = Model(inputs = [query_token_input,snippets_tokens_input], outputs=[sim_matrix_layer], name="input_model")
    print("\n\ninput_model summary")
    input_model.summary()
    
    
    print("\nOutput tensor",sim_matrix_layer)
    



snippet_emb_model summary
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
snippet_token (InputLayer)   (None, 13, 5, 15)         0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 13, 5, 15, 200)    858359000 
_________________________________________________________________
snippet_transpose (Lambda)   (None, 13, 5, 200, 15)    0         
Total params: 858,359,000
Trainable params: 0
Non-trainable params: 858,359,000
_________________________________________________________________


<a id='measure_net'></a>
## Measure Network

In [6]:

    
class MaskedConv2D(Layer):
    
    def __init__(self, filters, kernel_size, activation, regularizer=None, **kargs):
        super(MaskedConv2D, self).__init__(**kargs)

        self.activation = activations.get(activation)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.filters = filters
        self.kernel_size = kernel_size
        self.activation = activation

    def build(self, input_shape):

        self.conv2dlayer = Conv2D( filters = self.filters, kernel_size=self.kernel_size, activation=self.activation, kernel_regularizer=self.regularizer )
        self.conv2dlayer.build(input_shape)
        self._trainable_weights = self.conv2dlayer.trainable_weights
        
        super(MaskedConv2D, self).build(input_shape)
    
    def call(self, x):
        
        condition = K.all(x) #if all the values are the same
        inv_condition = (1-K.cast(condition, K.floatx()))
        print(inv_condition)
        feature_maps = self.conv2dlayer(x)
        
        return feature_maps * inv_condition

"""""""""""""""""""""""""""
 ---- Auxiliar Models ----
"""""""""""""""""""""""""""

#Exctrate high-level features from query and snippet interactions with CNN
cnn_extraction_model = Sequential(name="cnn_extraction_model")
cnn_extraction_model.add(MaskedConv2D(input_shape = (MAX_Q_TERM, QUERY_CENTRIC_CONTEX, S_MATRIX_3D_DIMENSION()), filters = CNN_FILTERS, kernel_size=CNN_KERNELS, activation=ACTIVATION_FUNCTION ))
cnn_extraction_model.add(GlobalMaxPooling2D())
print("\n\ncnn_extraction_model summary")
cnn_extraction_model.summary()


td_cnn_extraction_model = Sequential(name="TD_cnn_extraction_model")
td_cnn_extraction_model.add(TimeDistributed(cnn_extraction_model, input_shape=(MAX_PASSAGES_PER_QUERY, MAX_Q_TERM, QUERY_CENTRIC_CONTEX, S_MATRIX_3D_DIMENSION())))
td_cnn_extraction_model.summary()

"""""""""""""""""""""""""""
     ---- Layers ----
"""""""""""""""""""""""""""
#concatenation layer over the last dimension
concat_snippet_position = Concatenate( name = "concat_snippet_position")

self_attention = MaskedSelfAttention(CNN_FILTERS+1)

#add dimension Layer
add_passage_dim = Lambda(lambda x:K.expand_dims(x,axis=1), name="add_passage_dim")#Reshape(target_shape=(1,GRU_REPRESENTATION_DIM))

#add last dimension Layer
add_dim = Lambda(lambda x:K.expand_dims(x), name="add_dim")

#reciprocal function
reciprocal_f = Lambda(lambda x:1/(x+2), name="reciprocal_function")

#concatenation layer over second dimension (passage dimension)
concat_representation = Concatenate(axis = 1,name = "concat_representation")

Tensor("masked_conv2d/sub:0", shape=(), dtype=float32)


cnn_extraction_model summary
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masked_conv2d (MaskedConv2D) (None, 9, 11, 150)        3900      
_________________________________________________________________
global_max_pooling2d (Global (None, 150)               0         
Total params: 3,900
Trainable params: 3,900
Non-trainable params: 0
_________________________________________________________________
Tensor("time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed (TimeDistri (None, 5, 150)            3900      
Total params: 3,900
Trainable params: 3,900
Non-trainable params: 0
_________________________________________________________________


<a id='measure_net'></a>
## Aggregation Network

In [7]:

"""""""""""""""""""""""""""
  ---- Custom Layers ----
"""""""""""""""""""""""""""

snippet_rnn_rep_dim = CNN_FILTERS+1


    
class TermGatingDRMM_FFN(Layer):
    
    def __init__(self, embedding_dim = EMB_DIM, rnn_dim = snippet_rnn_rep_dim ,activation=None, initializer='glorot_normal', regularizer=None):
        super(TermGatingDRMM_FFN, self).__init__()

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.emb_dim = embedding_dim
        self.rnn_dim = rnn_dim

    def build(self, input_shape):
        
        #term gating W
        self.W_query = self.add_variable(name = "term_gating_We",
                                   shape = [self.emb_dim,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.dense_score = Dense(1,kernel_regularizer = self.regularizer, activation=self.activation)
        
        dense_shape = input_shape[1]
        print(dense_shape)
        
        self.dense_score.build((dense_shape[0],dense_shape[2]))
        self._trainable_weights += self.dense_score.trainable_weights
        #self.ones = K.constant(np.ones((aggreation_dimension,1)))
        
        super(TermGatingDRMM_FFN, self).build(input_shape)
    
    def call(self, x):
        
        query_embeddings = x[0] #(None, MAX_Q_TERM, EMB_SIZE)
        snippet_representation_per_query = x[1] #(None, MAX_Q_TERM, BI_GRU_DIM)
        
        #compute gated weights
        gated_logits = K.squeeze(K.dot(query_embeddings, self.W_query), axis = -1 )
        #print(gated_logits)
        gated_distribution = K.expand_dims(K.softmax(gated_logits))
        #print(gated_distribution)
        #snippet projection
        
        weighted_score = K.sum(snippet_representation_per_query * gated_distribution,  axis = 1)
        print(weighted_score)
        
        return self.dense_score(weighted_score) # Replace with K.sum of all elements?

<a id='final_net'></a>
## Final Network

In [8]:

"""""""""""""""""""""""""""
  ---- Final Network ----
"""""""""""""""""""""""""""
query_token_input = Input(shape=(MAX_Q_TERM,), name="ds_query_tokens")
doc_score_snippet_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "ds_snippet_tokens")
doc_score_snippet_position_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY), name = "ds_snippet_position_tokens")


unstack_by_q_term = Lambda(lambda x:unstack(x,axis=1), name="unstack_query_term")

#doc_score_snippet_by_q_term = unstack_by_q_term(doc_score_snippet_input)
#doc_score_snippet_position_by_q_term = unstack_by_q_term(doc_score_snippet_position_input)

#INPUT in token format
#query_token_input = Input(shape=(MAX_Q_TERM,), name="query_tokens")
#snippets_tokens_input = [Input(shape = (MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "snippet_tokens_"+str(q)) for q in range(MAX_Q_TERM)]
#inputs_contex_position = [Input(shape = (MAX_PASSAGES_PER_QUERY,), name = "q_context_position_"+str(q)) for q in range(MAX_Q_TERM)]

query_emb = embedding(query_token_input)

doc_score_snippet_emb = embedding(doc_score_snippet_input)
doc_score_snippet_emb_transpose = transpose_layer(doc_score_snippet_emb)

query_snippets_s_matrix = similarity_matrix([query_emb,doc_score_snippet_emb_transpose])

list_of_s_matrix_by_q_term = unstack_by_q_term(query_snippets_s_matrix)
list_of_snippet_postion_by_q_term = unstack_by_q_term(doc_score_snippet_position_input)

relevance_representation = []
for i in range(MAX_Q_TERM):
    
    snippet_relative_position = reciprocal_f(list_of_snippet_postion_by_q_term[i])
    
    local_relevance = td_cnn_extraction_model(list_of_s_matrix_by_q_term[i])
    
    local_relevance_position = concat_snippet_position([local_relevance,add_dim(snippet_relative_position)])
    
    relevance_representation.append(add_passage_dim(self_attention(local_relevance_position)))

concat_relevance = concat_representation(relevance_representation)

if TERM_GATING_MODE==0:
    term_gating = TermGating(vocab_size=VOCAB_SIZE, activation=ACTIVATION_FUNCTION)
    document_score = term_gating([query_token_input,concat_relevance])
    
elif TERM_GATING_MODE==1:
    term_gating = TermGatingDRMM()
    document_score = term_gating([query_emb,concat_relevance])

elif TERM_GATING_MODE==2:
    term_gating = TermGatingDRMM_Projection()
    document_score = term_gating([query_emb,concat_relevance])

elif TERM_GATING_MODE==3:
    term_gating = TermGatingDRMM_FFN(activation=ACTIVATION_FUNCTION, regularizer=REGULARIZATION)
    document_score = term_gating([query_emb,concat_relevance])

document_score_model = Model(inputs = [query_token_input, doc_score_snippet_input, doc_score_snippet_position_input], outputs = [document_score], name="query_document_score")
document_score_model.summary()      



Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
condition Tensor("masked_self_attention/All:0", shape=(?, 5, 1), dtype=bool)
inv_condition Tensor("masked_self_attention/sub:0", shape=(?, 5, 1), dtype=float32)
x_projection Tensor("masked_self_attention/Reshape_2:0", shape=(?, 5, 151), dtype=float32)
x_tanh Tensor("masked_self_attention/Tanh:0", shape=(?, 5, 151), dtype=float32)
x_attention Tensor("masked_self_attention/Reshape_5:0", shape=(?, 5, 1), dtype=float32)
x_attention_maked Tensor("masked_self_attention/add:0", shape=(?, 5, 1), dtype=float32)
x_attention_softmax Tensor("masked_self_attention/transpose_3:0", shape=(?, 5, 1), dtype=float32)
x_scored_emb Tensor("masked_self_attention/mul_1:0", shape=(?, 5, 151), dtype=float32)
x_attention_rep Tensor("masked_self_attention/Sum:0", shape=(?, 151), dtype=float32)
Tensor("TD_cnn_extraction_model_1/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
condition Tensor("masked_sel

x_attention_softmax Tensor("masked_self_attention_9/transpose_3:0", shape=(?, 5, 1), dtype=float32)
x_scored_emb Tensor("masked_self_attention_9/mul_1:0", shape=(?, 5, 151), dtype=float32)
x_attention_rep Tensor("masked_self_attention_9/Sum:0", shape=(?, 151), dtype=float32)
Tensor("TD_cnn_extraction_model_10/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
condition Tensor("masked_self_attention_10/All:0", shape=(?, 5, 1), dtype=bool)
inv_condition Tensor("masked_self_attention_10/sub:0", shape=(?, 5, 1), dtype=float32)
x_projection Tensor("masked_self_attention_10/Reshape_2:0", shape=(?, 5, 151), dtype=float32)
x_tanh Tensor("masked_self_attention_10/Tanh:0", shape=(?, 5, 151), dtype=float32)
x_attention Tensor("masked_self_attention_10/Reshape_5:0", shape=(?, 5, 1), dtype=float32)
x_attention_maked Tensor("masked_self_attention_10/add:0", shape=(?, 5, 1), dtype=float32)
x_attention_softmax Tensor("masked_self_attention_10/transpose_3:0", shape=(?, 5, 1), dtype=float32

## FINAL Trainable arch

In [9]:


query_token_input = Input(shape=(MAX_Q_TERM,), name="dr_query_tokens")
positive_snippet_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "positive_snippet_tokens")
positive_snippet_position_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY), name = "positive_snippet_position_tokens")
negative_snippet_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "negative_snippet_tokens")
negative_snippet_position_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY), name = "negative_snippet_position_tokens")

positive_documents_score = document_score_model([query_token_input, positive_snippet_input, positive_snippet_position_input])
negative_documents_score = document_score_model([query_token_input, negative_snippet_input, negative_snippet_position_input])

#stack_socres = stack_scores_layer([positive_documents_score,negative_documents_score])



#pairwise_loss_layer = Lambda(pairwise_hinge_loss, name="pairwise_hinge")
#pairwise_loss = pairwise_loss_layer([positive_documents_score,negative_documents_score])



inputs = [query_token_input, positive_snippet_input, positive_snippet_position_input, negative_snippet_input, negative_snippet_position_input]

deepRank_model = Model(inputs = inputs, outputs = [positive_documents_score, negative_documents_score], name="deep_rank")


p_loss = K.mean(K.maximum(0.0, 1.0 - positive_documents_score + negative_documents_score))

deepRank_model.add_loss(p_loss)

deepRank_model.summary() 
#m.predict([Q, Q_t1_passage, Q_t2_passage, Q_t3_passage, Q_t1_passage_pos, Q_t2_passage_pos, Q_t3_passage_pos])

#deepRank_model.compile(loss=pairwise_hinge_loss, optimizer='sgd')

Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_1/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_2/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_3/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_4/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_5/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_6/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_7/time_distributed/masked_conv2d/sub:0", shape=(), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_mod

x_projection Tensor("query_document_score/masked_self_attention_8/Reshape_2:0", shape=(?, 5, 151), dtype=float32)
x_tanh Tensor("query_document_score/masked_self_attention_8/Tanh:0", shape=(?, 5, 151), dtype=float32)
x_attention Tensor("query_document_score/masked_self_attention_8/Reshape_5:0", shape=(?, 5, 1), dtype=float32)
x_attention_maked Tensor("query_document_score/masked_self_attention_8/add:0", shape=(?, 5, 1), dtype=float32)
x_attention_softmax Tensor("query_document_score/masked_self_attention_8/transpose_3:0", shape=(?, 5, 1), dtype=float32)
x_scored_emb Tensor("query_document_score/masked_self_attention_8/mul_1:0", shape=(?, 5, 151), dtype=float32)
x_attention_rep Tensor("query_document_score/masked_self_attention_8/Sum:0", shape=(?, 151), dtype=float32)
condition Tensor("query_document_score/masked_self_attention_9/All:0", shape=(?, 5, 1), dtype=bool)
inv_condition Tensor("query_document_score/masked_self_attention_9/sub:0", shape=(?, 5, 1), dtype=float32)
x_projection Te

x_projection Tensor("query_document_score_1/masked_self_attention_3/Reshape_2:0", shape=(?, 5, 151), dtype=float32)
x_tanh Tensor("query_document_score_1/masked_self_attention_3/Tanh:0", shape=(?, 5, 151), dtype=float32)
x_attention Tensor("query_document_score_1/masked_self_attention_3/Reshape_5:0", shape=(?, 5, 1), dtype=float32)
x_attention_maked Tensor("query_document_score_1/masked_self_attention_3/add:0", shape=(?, 5, 1), dtype=float32)
x_attention_softmax Tensor("query_document_score_1/masked_self_attention_3/transpose_3:0", shape=(?, 5, 1), dtype=float32)
x_scored_emb Tensor("query_document_score_1/masked_self_attention_3/mul_1:0", shape=(?, 5, 151), dtype=float32)
x_attention_rep Tensor("query_document_score_1/masked_self_attention_3/Sum:0", shape=(?, 151), dtype=float32)
condition Tensor("query_document_score_1/masked_self_attention_4/All:0", shape=(?, 5, 1), dtype=bool)
inv_condition Tensor("query_document_score_1/masked_self_attention_4/sub:0", shape=(?, 5, 1), dtype=float3

x_attention_softmax Tensor("query_document_score_1/masked_self_attention_12/transpose_3:0", shape=(?, 5, 1), dtype=float32)
x_scored_emb Tensor("query_document_score_1/masked_self_attention_12/mul_1:0", shape=(?, 5, 151), dtype=float32)
x_attention_rep Tensor("query_document_score_1/masked_self_attention_12/Sum:0", shape=(?, 151), dtype=float32)
Tensor("query_document_score_1/term_gating_drmm_ffn/Sum:0", shape=(?, 151), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
dr_query_tokens (InputLayer)    (None, 13)           0                                            
__________________________________________________________________________________________________
positive_snippet_tokens (InputL (None, 13, 5, 15)    0                                            
__________________________________________________________________

In [10]:
WRITE_GRAPH = False
if WRITE_GRAPH:
    from tensorflow.summary import FileWriter

    graph = K.get_session().graph
     # Your model implementation
    #with graph.as_default():
      # compile method actually creates the model in the graph.
      #deepRank_model.compile(loss=identity_loss, optimizer='adam', metrics=['accuracy'])
    writer = FileWriter(logdir='tensorboard/deepRank', graph=graph)
    writer.flush()




In [11]:
#document_score.predict([Q, Q_t1_passage, Q_t2_passage, Q_t3_passage, Q_t1_passage_pos, Q_t2_passage_pos, Q_t3_passage_pos])

In [12]:
path_dl_train = "/backup/results/fast_method_relevant_results/train_data_deep_models_v2.tar.gz"



tar = tarfile.open(path_dl_train)
#open
print("Open",path_dl_train)
m = tar.getmembers()[0]
f = tar.extractfile(m)
train_articles_collection = pickle.load(f)

f.close()

Open /backup/results/fast_method_relevant_results/train_data_deep_models_v2.tar.gz


In [13]:
PARTILLY_POSITIVE_SAMPLES = 2
NEGATIVE_SAMPLES = 3

class TrainDataGenerator(object):
    def __init__(self, article_collection, tokenizer, batch_queries_size):
        
        self.batch_size = batch_queries_size
        self.tokenizer = tokenizer
        
        self.train_data = article_collection["bioasq_data"]
        self.articles = article_collection["collection"]
        self.irrelevant_pmid = article_collection["irrelevant_pmid"]
        
        self.num_steps = len(self.train_data)//self.batch_size
    
    def __len__(self):
        return self.num_steps
    
    def __iter__(self):
        
        
        query = []
        query_positive_doc = []
        query_positive_doc_position = []
        query_negative_doc = []
        query_negative_doc_position = []
        
        while True:
            
            #stop condition
            if len(query)>=self.batch_size:
                #missing fill the gap for the missing query_terms
                query = np.array(query)
                p=np.random.permutation(query.shape[0])
                query = query[p]
                query_positive_doc = np.array(query_positive_doc)[p]
                query_positive_doc_position = np.array(query_positive_doc_position)[p]
                query_negative_doc = np.array(query_negative_doc)[p]
                query_negative_doc_position =  np.array(query_negative_doc_position)[p]
                
                X = [query, query_positive_doc, query_positive_doc_position, query_negative_doc, query_negative_doc_position]
                #Y = [np.zeros((len(query))),np.zeros((len(query)))]
                yield X


                #reset
                query = []
                query_positive_doc = []
                query_positive_doc_position = []
                query_negative_doc = []
                query_negative_doc_position = []
            
            #select a random question
            random_query_index = random.randint(0, len(self.train_data)-1) 
            query_data = self.train_data[random_query_index]
            
            #list of partilly relevant documents
            partilly_positive_pmid_docs = query_data["partilly_positive_pmid"]

            tokenized_query = query_data["query"][:MAX_Q_TERM]
            
            for j in range(PARTILLY_POSITIVE_SAMPLES+NEGATIVE_SAMPLES):
                #select a random positive
                random_doc_index = random.randint(0, len(query_data["positive_pmid"])-1) 
                doc_pmid = query_data["positive_pmid"][random_doc_index]

                tokenized_positive_doc = self.articles[doc_pmid]
                positive_snippets, positive_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_positive_doc)
                
                if j<PARTILLY_POSITIVE_SAMPLES:
                    #select the partilly posivite doc
                    random_ind = bisect(query_data["partially_positive_cumulative_prob"],random.random())
                    
                    #try invert the probs
                    random_ind = (len(query_data["partially_positive_cumulative_prob"])-1)-random_ind
                    
                    random_negative_doc_pmid = query_data["partilly_positive_pmid"][random_ind]
                    #print(self.__get_article(random_negative_doc_pmid))
                    tokenized_negative_doc = self.articles[random_negative_doc_pmid]
                    negative_snippets, negative_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_negative_doc)
                else:
                    #select a random negative
                    random_doc_index = random.randint(0, len(self.irrelevant_pmid)-1) 
                    doc_pmid = self.irrelevant_pmid[random_doc_index]
                    
                    tokenized_negative_doc = self.articles[doc_pmid]
                    negative_snippets, negative_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_negative_doc)
                
                
                ### add ###

                #not efficient
                query.append(tokenized_query)

                #positive doc
                query_positive_doc.append(positive_snippets)
                query_positive_doc_position.append(positive_snippets_position)

                #negative doc
                query_negative_doc.append(negative_snippets)
                query_negative_doc_position.append(negative_snippets_position)
            

            
    def __snippet_interaction(self, tokenized_query, tokenized_doc, snippet_length=QUERY_CENTRIC_CONTEX):
        
        snippets = []
        snippets_position = [] 

        half_size = snippet_length//2
        
        tokenized_doc = [ x for x in tokenized_doc if x not in biomedical_stop_words_tokens]
        
        #O(n^2) complexity, probably can do better with better data struct TODO see if is worthit
        for query_token in tokenized_query:
            
            snippets_per_token = []
            snippets_per_token_position = []
            
            if query_token != 0: #jump padded token
            
                for i,doc_token in enumerate(tokenized_doc):

                    if doc_token==query_token:

                        lower_index = i-half_size
                        lower_index = max(0,lower_index)

                        higher_index = i+half_size
                        higher_index = min(len(tokenized_doc),higher_index)

                        snippets_per_token.append(tokenized_doc[lower_index:higher_index])
                        snippets_per_token_position.append(i)
            
            if len(snippets_per_token)==0:
                snippets.append(np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32))
                snippets_position.append(np.zeros((MAX_PASSAGES_PER_QUERY), dtype=np.int32)+SNIPPET_POSITION_PADDING_VALUE)
                continue
                
            max_snippets_len = min(MAX_PASSAGES_PER_QUERY, len(snippets_per_token))
            
            ### snippets in matrix format
            #pad
            snippets_per_token = pad_sequences(snippets_per_token, maxlen = QUERY_CENTRIC_CONTEX, padding="post")
            #fill the gaps
            _temp = np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32)
            _temp[:max_snippets_len] = snippets_per_token[:max_snippets_len]
            snippets.append(_temp)
            
            ### snippets_position in matrix format
            #pad
            snippets_per_token_position = pad_sequences([snippets_per_token_position], maxlen = MAX_PASSAGES_PER_QUERY, padding="post", value=SNIPPET_POSITION_PADDING_VALUE)[0]
            snippets_position.append(snippets_per_token_position)
            
        return snippets, snippets_position
            
        
        


## Test (validation) data generator


In [14]:
path_dl_test = "/backup/results/fast_method_relevant_results/test_data_deep_models_v2.tar.gz"



tar = tarfile.open(path_dl_test)
#open
print("Open",path_dl_test)
m = tar.getmembers()[0]
f = tar.extractfile(m)
test_articles_collection = pickle.load(f)

f.close()

Open /backup/results/fast_method_relevant_results/test_data_deep_models_v2.tar.gz


In [15]:

class TestDataGenerator(object):
    def __init__(self, article_collection, tokenizer):
        
        self.tokenizer = tokenizer
        
        self.test_data = article_collection["bioasq_data"] 
        self.articles = article_collection["collection"]
        
        self.num_steps = len(self.test_data)
        
    
    def __get_article(self, pmid):
        return self.article_map(self.articles[pmid])
    
    def __len__(self):
        return self.num_steps
    
    def __iter__(self):
        
        
        query = []
        query_doc = []
        query_doc_position = []
        

        for query_data in self.test_data:

            #tokenized_query = self.tokenizer.texts_to_sequences([query_data["query"]])[0]
            tokenized_query = query_data["query"][:MAX_Q_TERM]
            #manualy remove the stopwords
            #tokenized_query = [ token for token in tokenized_query if token not in biomedical_stop_words_tokens]

            #tokenized_query = pad_sequences([tokenized_query], maxlen = MAX_Q_TERM, padding="post")[0]

            for doc_pmid in query_data["documents"]:
                #positive

                tokenized_doc = self.articles[doc_pmid]
                doc_snippets, doc_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_doc)

                ### add ###

                query.append(tokenized_query)

                #positive doc
                query_doc.append(doc_snippets)
                query_doc_position.append(doc_snippets_position)


            #missing fill the gap for the missing query_terms

            X = [np.array(query), np.array(query_doc), np.array(query_doc_position)]

            yield X

            #reset
            query = []
            query_doc = []
            query_doc_position = []

                
    def __snippet_interaction(self, tokenized_query, tokenized_doc, snippet_length=QUERY_CENTRIC_CONTEX):
        
        snippets = []
        snippets_position = [] 

        half_size = snippet_length//2
        
        tokenized_doc = [ x for x in tokenized_doc if x not in biomedical_stop_words_tokens]
        
        #O(n^2) complexity, probably can do better with better data struct TODO see if is worthit
        for query_token in tokenized_query:
            
            snippets_per_token = []
            snippets_per_token_position = []
            
            if query_token != 0: #jump padded token
                
                for i,doc_token in enumerate(tokenized_doc):

                    if doc_token==query_token:

                        lower_index = i-half_size
                        lower_index = max(0,lower_index)

                        higher_index = i+half_size
                        higher_index = min(len(tokenized_doc),higher_index)

                        snippets_per_token.append(tokenized_doc[lower_index:higher_index])
                        snippets_per_token_position.append(i)

            if len(snippets_per_token)==0:
                snippets.append(np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32))
                snippets_position.append(np.zeros((MAX_PASSAGES_PER_QUERY), dtype=np.int32)+SNIPPET_POSITION_PADDING_VALUE)
                continue
                
            max_snippets_len = min(MAX_PASSAGES_PER_QUERY, len(snippets_per_token))
            
            ### snippets in matrix format
            #pad
            snippets_per_token = pad_sequences(snippets_per_token, maxlen = QUERY_CENTRIC_CONTEX, padding="post")
            #fill the gaps
            _temp = np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32)
            _temp[:max_snippets_len] = snippets_per_token[:max_snippets_len]
            snippets.append(_temp)
            
            ### snippets_position in matrix format
            #pad
            snippets_per_token_position = pad_sequences([snippets_per_token_position], maxlen = MAX_PASSAGES_PER_QUERY, padding="post",value=SNIPPET_POSITION_PADDING_VALUE)[0]
            snippets_position.append(snippets_per_token_position)
            
        return snippets, snippets_position

In [16]:
validation_articles_collection = {"bioasq_data":[],"collection":test_articles_collection["collection"]}

_temp = test_articles_collection["bioasq_data"][:]
random.shuffle(_temp)
print(len(_temp))

validation_percentage = 0.08

split_index = int(len(_temp)*validation_percentage)
print(split_index)

validation_articles_collection["bioasq_data"] = _temp[:split_index]
#test_articles_collection["bioasq_data"] = _temp[split_index:]

print("validation size",len(validation_articles_collection["bioasq_data"]),"test size",len(test_articles_collection["bioasq_data"]))

549
43
validation size 43 test size 549


In [17]:
def validate_test_data(data):
    data_generator = TestDataGenerator(data, tk)
    data_generator = iter(data_generator)

    query_results = {}

    for i,X in enumerate(data_generator):
        print("Predict query:",i,end="\r")
        deep_ranking = document_score_model.predict(X)
        deep_ranking = map(lambda x:x[0],deep_ranking.tolist())
        bm25_results = data["bioasq_data"][i]["documents"]
        deep_ranking_pmid = list(zip(bm25_results,deep_ranking))
        deep_ranking_pmid.sort(key=lambda x:-x[1])
        query_results[data["bioasq_data"][i]["id"]] = {"result":deep_ranking_pmid,"goldstandard":data["bioasq_data"][i]["positive_pmid"]}
        #print("save query results:",i,end="\r")
        
    return query_results


In [18]:
#validate score

def validation_score(deep_rank_test_query_results):
    id_to_remove = []
    for k,v in deep_rank_test_query_results.items():
        if len(v["goldstandard"]) == 0:
            id_to_remove.append(k)

    for k in id_to_remove:
        del deep_rank_test_query_results[k]

    deep_rank_test_query_results = list(deep_rank_test_query_results.values())

    print("TEST set, len ",len(deep_rank_test_query_results))

    expectations = list(map(lambda x:x["goldstandard"],deep_rank_test_query_results))
    predictions = list(map(lambda x:x["result"],deep_rank_test_query_results))

    #print("Recall:",f_recall(predictions,expectations,at=1000))
    bioasq_map = f_map(predictions,expectations,bioASQ=True)
    print("MAP @10 bioASQ:", bioasq_map)
    print("MAP @25:",f_map(predictions,expectations, bioASQ=True, at=25))
    print("MAP @50:",f_map(predictions,expectations, bioASQ=True, at=50))
    print("MAP @100:",f_map(predictions,expectations, bioASQ=True, at=100))
    print("MAP @200:",f_map(predictions,expectations, bioASQ=True, at=200))
    print("MAP @300:",f_map(predictions,expectations, bioASQ=True, at=300))

    print("MAP:",f_map(predictions,expectations, use_len=True))
    
    print("RECALL@10:",f_recall(predictions,expectations, at=10))
    print("RECALL@50:",f_recall(predictions,expectations, at=50))
    print("RECALL@100:",f_recall(predictions,expectations, at=100))
    return bioasq_map, predictions, expectations

## Train 

In [19]:
from tensorflow.keras.optimizers import SGD, Adam,  Adadelta

#sgd = SGD(lr=0.001)
#adam = Adam(lr=0.001)
adadelta = Adadelta(lr=2)

deepRank_model.compile( optimizer=adadelta)



In [20]:
from models.generic_model import ModelAPI, f_recall, f_map

gen = TrainDataGenerator(train_articles_collection, tk, 256)

gen_iter = iter(gen)

loss = []

for i,line in enumerate(loss):
    
    print("Epoach:",i,"| avg loss:",np.mean(loss[i]),"| max loss:",np.max(loss[i]),"| min loss:",np.min(loss[i]))

import time

max_bio_map_val = 0.05
max_bio_map_test = 0

for epoach in range(1,200):
    loss_per_epoach = []
    for step in range(len(gen)):
        X = next(gen_iter)
        
        start = time.time()
        loss_per_epoach.append(deepRank_model.train_on_batch(X))
        print("Step:",step,"| loss:",loss_per_epoach[-1],"| current max loss:",np.max(loss_per_epoach),"| current min loss:",np.min(loss_per_epoach),"| time:",time.time()-start,end="\r")
        

    
    
    if epoach%10==0:
        print("")
        validate_query_results = validate_test_data(validation_articles_collection)
        print("")
        bio_map_val, _, _ = validation_score(validate_query_results)
        if bio_map_val >= max_bio_map_val-0.06:
            max_bio_map_val = bio_map_val
            print("")
            print("Run for the test set")
            test_query_results = validate_test_data(test_articles_collection)
            bio_map_test, _, _ = validation_score(test_query_results)

            if bio_map_test >= max_bio_map_test:
                max_bio_map_test = bio_map_test
                
                document_score_model.save_weights("deep_rank_v9_weights.h5")
                #deepRank_model.save("deep_rank_model.h5")
                
    loss.append(loss_per_epoach)
    print("",end="\r")#clear the line
    print("Epoach:",epoach,"| avg loss:",np.mean(loss[-1]),"| max loss:",np.max(loss[-1]),"| min loss:",np.min(loss[-1]))
#deepRank_model.fit_generator(gen_iter, steps_per_epoch=len(gen), verbose=1, epochs=27)

Epoach: 1 | avg loss: 0.8882006 | max loss: 1.025772 | min loss: 0.7128969 0.7128969 | time: 7.43687248229980532
Epoach: 2 | avg loss: 0.34196332 | max loss: 0.48080906 | min loss: 0.177340050.17734005 | time: 7.6594877243041996
Epoach: 3 | avg loss: 0.2557034 | max loss: 0.30212566 | min loss: 0.215079770.21507977 | time: 7.64337730407714815
Epoach: 4 | avg loss: 0.18034147 | max loss: 0.222534 | min loss: 0.1334560.133456 | time: 7.593686819076538899916
Epoach: 5 | avg loss: 0.18079163 | max loss: 0.25998604 | min loss: 0.1040057 0.1040057 | time: 7.4614498615264895
Epoach: 6 | avg loss: 0.18685608 | max loss: 0.2493649 | min loss: 0.1537939315379393 | time: 7.5812289714813239175
Epoach: 7 | avg loss: 0.16083944 | max loss: 0.20457028 | min loss: 0.11834845.11834845 | time: 7.64025330543518135
Step: 7 | loss: 0.10525398 | current max loss: 0.1778894 | current min loss: 0.1025156 | time: 7.65668797492981195
Predict query: 42
TEST set, len  42
MAP @10 bioASQ: 0.13290154950869237
MAP @2

Epoach: 47 | avg loss: 0.082646355 | max loss: 0.13201395 | min loss: 0.05153874751538747 | time: 7.5191085338592537
Epoach: 48 | avg loss: 0.0868948 | max loss: 0.17041183 | min loss: 0.047159310.04715931 | time: 7.5394632816314764
Epoach: 49 | avg loss: 0.106871985 | max loss: 0.14453365 | min loss: 0.08198213681982136 | time: 7.56011891365051314
Step: 7 | loss: 0.07064087 | current max loss: 0.12618406 | current min loss: 0.0669798 | time: 7.5262191295623787
Predict query: 42
TEST set, len  42
MAP @10 bioASQ: 0.16864417989417987
MAP @25: 0.24787088023384013
MAP @50: 0.31359462976773056
MAP @100: 0.32558428152764085
MAP @200: 0.3300914888528071
MAP @300: 0.3363064440647655
MAP: 0.3492994260746646
RECALL@10: 0.45495799288755495
RECALL@50: 0.7331034236803861
RECALL@100: 0.8128922559415489

Run for the test set
TEST set, len  542
MAP @10 bioASQ: 0.14712653019387342
MAP @25: 0.2138288601775964
MAP @50: 0.2526510410215959
MAP @100: 0.2733734529275443
MAP @200: 0.2828183168034176
MAP @300:

MAP @300: 0.34193284019208087
MAP: 0.35865253516341794
RECALL@10: 0.44083067529385084
RECALL@50: 0.766039931616894
RECALL@100: 0.839137198362962

Run for the test set
TEST set, len  542
MAP @10 bioASQ: 0.15431778246353892
MAP @25: 0.2256929341634719
MAP @50: 0.2657934006335234
MAP @100: 0.2861659607038726
MAP @200: 0.29534671386588957
MAP @300: 0.29847390495662424
MAP: 0.27632211315470706
RECALL@10: 0.3914295366413763
RECALL@50: 0.6769405014027358
RECALL@100: 0.7709743751702849
Epoach: 90 | avg loss: 0.0749875 | max loss: 0.09062836 | min loss: 0.060851634
Epoach: 91 | avg loss: 0.07664901 | max loss: 0.12221871 | min loss: 0.043770984043770984 | time: 7.859082221984863
Epoach: 92 | avg loss: 0.09703517 | max loss: 0.1302043 | min loss: 0.07485768.07485768 | time: 7.74547743797302251
Epoach: 93 | avg loss: 0.09396885 | max loss: 0.14644778 | min loss: 0.049152702.049152702 | time: 7.753264665603638
Epoach: 94 | avg loss: 0.09016445 | max loss: 0.11524065 | min loss: 0.0529537605295376 

KeyboardInterrupt: 

In [None]:

_ = validation_score(validate_test_data(test_articles_collection))



In [23]:
gen = TrainDataGenerator(train_articles_collection, tk, 256)

gen_iter = iter(gen)


for i,line in enumerate(loss):
    
    print("Epoach:",i,"| avg loss:",np.mean(loss[i]),"| max loss:",np.max(loss[i]),"| min loss:",np.min(loss[i]))

import time


for epoach in range(250,300):
    loss_per_epoach = []
    for step in range(len(gen)):
        X = next(gen_iter)
        
        start = time.time()
        loss_per_epoach.append(deepRank_model.train_on_batch(X))
        print("Step:",step,"| loss:",loss_per_epoach[-1],"| current max loss:",np.max(loss_per_epoach),"| current min loss:",np.min(loss_per_epoach),"| time:",time.time()-start,end="\r")
        

    

        print("")
        print("Run for the test set")
        test_query_results = validate_test_data(test_articles_collection)
        bio_map_test, _, _ = validation_score(test_query_results)

        if bio_map_test >= max_bio_map_test:
            max_bio_map_test = bio_map_test

            deepRank_model.save_weights("deep_rank_v6_weights.h5")
                #deepRank_model.save("deep_rank_model.h5")
                
    loss.append(loss_per_epoach)
    print("",end="\r")#clear the line
    print("Epoach:",epoach,"| avg loss:",np.mean(loss[-1]),"| max loss:",np.max(loss[-1]),"| min loss:",np.min(loss[-1]))

Epoach: 0 | avg loss: 0.9125016 | max loss: 1.0042224 | min loss: 0.80067205
Epoach: 1 | avg loss: 0.49317193 | max loss: 0.72874445 | min loss: 0.36090243
Epoach: 2 | avg loss: 0.28394535 | max loss: 0.30443656 | min loss: 0.22263007
Epoach: 3 | avg loss: 0.27021748 | max loss: 0.3453509 | min loss: 0.21942542
Epoach: 4 | avg loss: 0.22671619 | max loss: 0.29153928 | min loss: 0.17943613
Epoach: 5 | avg loss: 0.20183098 | max loss: 0.24140441 | min loss: 0.15858825
Epoach: 6 | avg loss: 0.17883258 | max loss: 0.21680002 | min loss: 0.14568402
Epoach: 7 | avg loss: 0.17802173 | max loss: 0.21380427 | min loss: 0.1308456
Epoach: 8 | avg loss: 0.16129185 | max loss: 0.18449171 | min loss: 0.14353858
Epoach: 9 | avg loss: 0.17940602 | max loss: 0.2193988 | min loss: 0.13944818
Epoach: 10 | avg loss: 0.17154312 | max loss: 0.19820029 | min loss: 0.13796973
Epoach: 11 | avg loss: 0.14994204 | max loss: 0.21805936 | min loss: 0.08608124
Epoach: 12 | avg loss: 0.15823033 | max loss: 0.1936159

Step: 0 | loss: 0.11017014 | current max loss: 0.11017014 | current min loss: 0.11017014 | time: 7.198953151702881
Run for the test set
TEST set, len  542
MAP @10 bioASQ: 0.15984449130205572
MAP @25: 0.23606571372261492
MAP @50: 0.27730569437429264
MAP @100: 0.29791470334666664
MAP @200: 0.306586992229299
MAP @300: 0.30988258322266754
MAP: 0.2777642714465491
RECALL@10: 0.3993093253566211
RECALL@50: 0.6919821718259653
RECALL@100: 0.7841027630008729
Step: 1 | loss: 0.13290612 | current max loss: 0.13290612 | current min loss: 0.11017014 | time: 7.3045594692230225
Run for the test set
TEST set, len  542
MAP @10 bioASQ: 0.16175386575294323
MAP @25: 0.23877232985998448
MAP @50: 0.279995892313945
MAP @100: 0.3006779991899093
MAP @200: 0.30938276512018537
MAP @300: 0.3124854489296801
MAP: 0.27772919079815606
RECALL@10: 0.40005204504457903
RECALL@50: 0.6942410418181969
RECALL@100: 0.7835968498027603
Step: 2 | loss: 0.07845879 | current max loss: 0.13290612 | current min loss: 0.07845879 | time

KeyboardInterrupt: 

In [29]:
path_save = "/backup/results/deep_rank"
path_save = os.path.join(path_save, "deep_rank_v2_17_1_test_data.p")

with open(path_save, "wb") as f:
    pickle.dump(test_query_results,f)

In [None]:
## PREPARE SUBMISSION


test_bioASQ_results_results = list(map(lambda k:{"id":k[0],"documents":list(map(lambda x:"http://www.ncbi.nlm.nih.gov/pubmed/"+str(x[0]), k[1]["result"]))[:10]}, test_bioASQ_results.items()))
_temp = []

for query in bioASQ_data:
    _jump = False
    for r_query in test_bioASQ_results_results:
        if query["id"] == r_query["id"]:
            _jump = True
    
    #no match so add
    if not _jump:
        _temp.append({"id":query["id"],"documents":[]})

test_bioASQ_results_results.extend(_temp)

print(len(test_bioASQ_results_results))
assert len(test_bioASQ_results_results) == 100
a = {"questions": test_bioASQ_results_results}
with open("5b_phaseA_01.json","w") as f:
    json.dump(a,f)
    

test_bioASQ_results_results[0]


In [26]:
query_to_test_index = 0

data_generator = TestDataGenerator(test_articles_collection, tk)
data_generator = iter(data_generator)
for _ in range(query_to_test_index+1):
    X = next(data_generator)


In [65]:
re_ranking = document_score_model.predict(X)

In [66]:
re_ranking = map(lambda x:x[0],re_ranking.tolist())

In [67]:
bm25_results = test_articles_collection["bioasq_data"][query_to_test_index]["documents"]
positive_docs = test_articles_collection["bioasq_data"][query_to_test_index]["positive_pmid"]

In [68]:
re_ranking_pmid = list(zip(bm25_results,re_ranking))

In [69]:
re_ranking_pmid.sort(key=lambda x:-x[1])

In [70]:
re_ranking_pmid[:10]

[('24794627', 5.3322014808654785),
 ('30251567', 5.313037872314453),
 ('28796422', 5.227417945861816),
 ('30114722', 5.093368053436279),
 ('29947303', 5.0901007652282715),
 ('30697454', 5.0804266929626465),
 ('30569414', 4.911670207977295),
 ('28901190', 4.814671039581299),
 ('24577791', 4.803395748138428),
 ('26907255', 4.67585563659668)]

In [71]:
test_articles_collection["bioasq_data"][query_to_test_index]["query"]

array([13502,    43,   478,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0], dtype=int32)

In [72]:
positive_docs

['24554704',
 '24784583',
 '24577791',
 '23197849',
 '24035588',
 '21060967',
 '25479728',
 '21755313',
 '24469711',
 '22512788',
 '24911883',
 '24794627',
 '21464439',
 '25059784']

In [73]:
positive_docs_ranked = []
for i,result in enumerate(re_ranking_pmid):
    if result[0] in set(positive_docs):
        positive_docs_ranked.append((i,result[0],result[1]))
        
true_ranked = []
for i,pmid in enumerate(bm25_results):
    if pmid in set(positive_docs):
        true_ranked.append((i,pmid))

print(positive_docs_ranked)
print(true_ranked)

[(0, '24794627', 5.3322014808654785), (8, '24577791', 4.803395748138428), (11, '24554704', 4.6346845626831055), (12, '23197849', 4.612618446350098), (16, '21060967', 4.446898460388184), (18, '24469711', 4.439567565917969), (19, '21755313', 4.431821823120117), (25, '22512788', 4.27816104888916), (34, '21464439', 4.063064098358154), (35, '25059784', 4.056085586547852), (38, '24784583', 4.003556728363037), (43, '25479728', 3.937878131866455), (50, '24035588', 3.7958528995513916), (343, '24911883', 2.838761806488037)]
[(3, '23197849'), (4, '21755313'), (6, '25479728'), (7, '24784583'), (8, '24577791'), (9, '24035588'), (15, '22512788'), (17, '24911883'), (19, '21464439'), (22, '24794627'), (126, '25059784'), (134, '24554704'), (466, '21060967'), (2793, '24469711')]


# Test with train set, check overfit

In [38]:
query_tokens = np.array([X[0][0]])
snippet_list = np.array([X[1][0]])

query_tokens = X[0][:2]
snippet_list = X[1][:2]

model_input = [query_tokens, snippet_list]

In [39]:

print(query_tokens.shape)
print(snippet_list.shape)

(2, 15)
(2, 15, 3, 15)


In [40]:
matrix = input_model.predict(model_input)
np.array(matrix).shape

(2, 15, 3, 15, 15, 1)

In [21]:
matrix[0][4][0].shape

(15, 15, 1)

In [42]:
np.squeeze(matrix[0][7][0])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.

In [148]:
query_tokens[0]

array([  165, 13502,    26,    61,     8,     1,    43,     2,   478],
      dtype=int32)

In [168]:
snippet_list[0][1][0]

array([  363,     5,  2386,    97,  7598,   774,    32, 13502,    18,
          15,     7, 11695,   117,    17,     0], dtype=int32)

In [169]:
emb_dict[13502]

array([ 0.05703647,  0.19594026,  0.03365219,  0.15514491,  0.00540348,
       -0.02335026, -0.06095085,  0.0226689 , -0.05668721,  0.01571985,
       -0.09896637,  0.13836679,  0.02710932,  0.06420047, -0.03692323,
        0.03899341,  0.00553868, -0.08639584, -0.05358738, -0.02609682,
        0.06495432, -0.00129713, -0.01882407, -0.10850747, -0.02421302,
        0.05556208,  0.00291283, -0.04882976,  0.01770345,  0.0035051 ,
        0.07192209, -0.00432884, -0.15161929, -0.07024549, -0.04793473,
        0.01823143,  0.10337584, -0.04076301,  0.01026187,  0.12004871,
        0.03939956, -0.03548966, -0.10689223, -0.16337523,  0.10883316,
        0.01135785,  0.03041399,  0.06011688, -0.09919181,  0.01741308,
       -0.04328503, -0.00256405, -0.11370766,  0.0522779 ,  0.0702537 ,
        0.01021139,  0.06773005,  0.01114117, -0.05878652,  0.0720681 ,
        0.05551391,  0.08731035,  0.07339004,  0.0031227 ,  0.10792159,
        0.12050318, -0.05851915, -0.08350374, -0.03341928,  0.12

In [147]:
data_generator = TrainDataGenerator(train_articles_collection, tk, 256)
data_generator = iter(data_generator)
for i in range(9):
    print(i,end="\r")
    X,Y = next(data_generator)

8

In [151]:
list(map(lambda x: len(list(filter(lambda y:y!=0,x))),X[0]))

[8,
 8,
 8,
 8,
 8,
 8,
 8,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 12,
 12,
 12,
 12,
 12,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 3,
 3,
 3,
 3

In [157]:
X[0][26]

array([ 988,  988,  279, 1208,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)

In [161]:
X[3][26]

array([[[  1075,      2,    986,      4,      6,    248,    988,   2250,
             44,    713,   6300,   6300,      0],
        [  3974,      2,    986,      4,      6,    248,    988,   2250,
             18,    713,   6300,   6300,      0],
        [    17,     17,  46298,  59753,   9143,   2106,    988,   2250,
             16,      6,    200,    174,      0]],

       [[  1075,      2,    986,      4,      6,    248,    988,   2250,
             44,    713,   6300,   6300,      0],
        [  3974,      2,    986,      4,      6,    248,    988,   2250,
             18,    713,   6300,   6300,      0],
        [    17,     17,  46298,  59753,   9143,   2106,    988,   2250,
             16,      6,    200,    174,      0]],

       [[     2,    248,    986,      4,      1,  10169,    279,      0,
              0,      0,      0,      0,      0],
        [     0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0],
        [   

In [164]:
pos,neg = deepRank_model.predict(X)

In [256]:
query_to_test_index = 1

data_generator = TestDataGenerator(train_articles_collection, tk)
data_generator = iter(data_generator)
for _ in range(query_to_test_index+1):
    X = next(data_generator)


KeyError: 'bioasq_data'

In [257]:
re_ranking = document_score_model.predict(X)

bm25_results = train_articles_collection["test_data"][query_to_test_index]["documents"]
positive_docs = train_articles_collection["test_data"][query_to_test_index]["positive_pmid"]

re_ranking_pmid = list(zip(bm25_results,re_ranking.tolist()))

re_ranking_pmid.sort(key=lambda x:-x[1])

re_ranking_pmid[:10]

[('26671317', 7.931817054748535),
 ('20975159', 7.902041435241699),
 ('20650709', 7.8478264808654785),
 ('19805301', 7.842199802398682),
 ('21731768', 7.818233013153076),
 ('24681619', 7.759010314941406),
 ('26631348', 7.714381217956543),
 ('22196114', 7.696432590484619),
 ('23817568', 7.648404598236084),
 ('26410599', 7.3919854164123535)]

In [216]:
print(tk.sequences_to_texts([train_articles_collection["test_data"][query_to_test_index]["query"]]))
positive_docs

['kind enzyme encoded proto oncogene abl1']


['21435002',
 '20841568',
 '9500553',
 '24012954',
 '18796434',
 '23842646',
 '18528425']

In [217]:
positive_docs_ranked = []
for i,result in enumerate(re_ranking_pmid):
    if result[0] in set(positive_docs):
        positive_docs_ranked.append((i,result[0],result[1]))
        
true_ranked = []
for i,pmid in enumerate(bm25_results):
    if pmid in set(positive_docs):
        true_ranked.append((i,pmid))

print(positive_docs_ranked)
print(true_ranked)

[(5, '24012954', 5.5485920906066895), (29, '9500553', 4.8889594078063965), (87, '21435002', 4.592401504516602), (112, '23842646', 4.525805473327637), (155, '18796434', 4.400295257568359), (342, '18528425', 4.139955043792725), (2244, '20841568', 3.1609909534454346)]
[(1, '9500553'), (29, '21435002'), (187, '24012954'), (309, '18528425'), (333, '23842646'), (610, '20841568'), (2354, '18796434')]


In [None]:
bm25_results