In [1]:
import json
from os.path import join
import os
os.chdir("/home/tiagoalmeida/bioASQ-taskb/")

import sys
import pickle
import gc
import numpy as np
import tarfile
import random
from collections import defaultdict
from bisect import bisect


##add keras to the modules
module_path = os.path.abspath(os.path.join('pubmed_data'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pubmed_data import pubmed_helper as ph

## DeepRank
Reference PAPER :https://arxiv.org/pdf/1710.05649.pdf

### Network structure
 - [General Network Configuration](#var_def)
 - [Input Network](#input_net)
 - [Measure Network](#measure_net)
 - [Aggregation Network](#aggreation_net)
 - [Final Network](#final_net)

In [2]:
#Load tokenizer and the embedding matrix

MODE = "regex_full_tokens"
tk = ph.load_tokenizer(mode=MODE)
emb_dict = ph.load_embeddings(mode=MODE)

assert len(tk.word_counts) == len(emb_dict)

#Number of different words
VOCAB_SIZE = len(tk.word_counts)+1

#Dimension of embeddings
EMB_DIM = emb_dict[1].shape[0]

emb_matrix = np.zeros((VOCAB_SIZE, EMB_DIM))

for i,vector in emb_dict.items():
    emb_matrix[i] = vector


Load regex_full_tokens_tokenizer.p
Load regex_full_tokens_word_embedding.p


In [3]:
#Tokenized stopwords

biomedical_stop_words = ["a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at", "be", "because", "been", "before", "being", "between", "both", "but", "by", "can", "could", "did", "do", "does", "done", "due", "during", "each", "either", "enough", "especially", "etc", "for", "found", "from", "further", "had", "has", "have", "having", "here", "how", "however", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "kg", "km", "made", "mainly", "make", "may", "mg", "might", "ml", "mm", "most", "mostly", "must", "nearly", "neither", "no", "nor", "obtained", "of", "often", "on", "our", "overall", "perhaps", "pmid", "quite", "rather", "really", "regarding", "seem", "seen", "several", "should", "show", "showed", "shown", "shows", "significantly", "since", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "then", "there", "therefore", "these", "they", "this", "those", "through", "thus", "to", "upon", "use", "used", "using", "various", "very", "was", "we", "were", "what", "when", "which", "while", "with", "within", "without", "would"]
biomedical_stop_words_tokens = set(tk.texts_to_sequences([biomedical_stop_words])[0])


<a id='var_def'></a>
## General Network Configuration

In [4]:

from tensorflow import unstack, stack
##Test 
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, activations
from tensorflow.keras.initializers import Zeros, Ones, Constant
from tensorflow.keras.layers import Dense, Lambda, Bidirectional, Dot,Masking,Reshape, Concatenate, Layer, Embedding, Input, Conv2D, GlobalMaxPooling2D, Flatten, TimeDistributed, GRU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.activations import tanh, sigmoid


from tensorflow.keras.preprocessing.sequence import pad_sequences

K.clear_session()

#Number max of term per query
MAX_Q_TERM = 13

#Number max of the snippet terms
QUERY_CENTRIC_CONTEX = 15

#Number max of passages per query term
MAX_PASSAGES_PER_QUERY = 5

#Snippet position padding value
SNIPPET_POSITION_PADDING_VALUE = -1

#Mode for the creation of the S matrix
S_MATRIX_MODE = 0
#S_MATRIX_DIMENSION = EMB_DIM*2+1

#Train embedding weights
EMB_TRAINABLE = False

#Number of filters in CNN
CNN_FILTERS = 100
CNN_KERNELS = (3,3)

#RNN DIM
USE_BIDIRECTIONAL = False
GRU_REPRESENTATION_DIM = 56

ACTIVATION_FUNCTION = "selu"

REGULARIZATION = regularizers.l2(0.0001)

#Term gating network mode
TERM_GATING_MODE =  3#2- weigt fixed per position, 1 - DRMM like term gating

assert S_MATRIX_MODE in [0,1]
assert TERM_GATING_MODE in [0,1,2,3]

#MACRO STYLE
def S_MATRIX_3D_DIMENSION():
    if S_MATRIX_MODE==0:
        return 1
    elif S_MATRIX_MODE==1:
        return EMB_DIM*2+1

DEBUG = False

<a id='input_net'></a>
## Input Network


In [5]:

"""""""""""""""""""""""""""
  ---- Custom Layers ----
"""""""""""""""""""""""""""
class SimilarityMatrix(Layer):
    
    def __init__(self, query_max_term, snippet_max_term, interaction_mode=0, **kwargs):
        """
        interaction mode 0: only use similarity matrix
                    mode 1: similarity matrix + query and snippet embeddings
        """
        assert interaction_mode in [0,1] #only valid modes
        
        self.query_max_term = query_max_term
        self.snippet_max_term = snippet_max_term
        self.interaction_mode = interaction_mode
        
        super().__init__(**kwargs)
        
    def call(self,x):
        if self.interaction_mode==0:
            #sim => dot product (None, MAX_Q_TERM, EMB_DIM) x (None, MAX_Q_TERM, MAX_PASSAGE_PER_Q, EMB_DIM, QUERY_CENTRIC_CONTEX)
            query = K.expand_dims(x[0], axis=1) #(None, 1, MAX_Q_TERM, EMB_DIM)
            query = K.expand_dims(query, axis=1) #(None, 1, 1, MAX_Q_TERM, EMB_DIM)
            query = K.repeat_elements(query,x[1].shape[1],axis=1) #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, EMB_DIM)
            query = K.repeat_elements(query,x[1].shape[2],axis=2)
            s_matrix = K.batch_dot(query,x[1]) #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, EMB_DIM)
            
            s_matrix = K.expand_dims(s_matrix)
            
            return s_matrix #Add one more dimension #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, #(None, MAX_PASSAGE_PER_Q, MAX_Q_TERM, EMB_DIM, 1)
        elif self.interaction_mode==1:
            raise NotImplementedError("interaction mode of layer SimilarityMatrix is not implemented")
    """                  
    def compute_output_shape(self, input_shape):
        if self.interaction_mode==0:
            return (input_shape[0][0], input_shape[0][1], self.query_max_term, self.snippet_max_term, 1)
        elif self.interaction_mode==1:
            return (input_shape[0][0], input_shape[0][1], self.query_max_term, self.snippet_max_term, input_shape[0][2]*input_shape[1][2]+1) 
    """

"""""""""""""""""""""""""""
     ---- Layers ----
"""""""""""""""""""""""""""
#Embedding Layer
embedding = Embedding(VOCAB_SIZE,EMB_DIM, name="embedding_layer",weights=[emb_matrix], trainable=EMB_TRAINABLE)

#S matrix ref in the paper
similarity_matrix = SimilarityMatrix(MAX_Q_TERM, QUERY_CENTRIC_CONTEX, interaction_mode=S_MATRIX_MODE, name="query_snippet_similarity")

#transpose (None, QUERY_CENTRIC_CONTEX, EMB_DIM) => (None, EMB_DIM, QUERY_CENTRIC_CONTEX) 
transpose_layer = Lambda(lambda x:K.permute_dimensions(x,[0,1,2,4,3]), name="snippet_transpose") 

"""""""""""""""""""""""""""
 ---- Auxiliar Models ----
"""""""""""""""""""""""""""

#Snippet single embedding transformation
snippet_token_input = Input(shape = (MAX_Q_TERM, MAX_PASSAGES_PER_QUERY, QUERY_CENTRIC_CONTEX,), name = "snippet_token")
snippet_emb = embedding(snippet_token_input)
snippet_emb_transpose = transpose_layer(snippet_emb)
snippet_emb_model = Model(inputs = [snippet_token_input], outputs=[snippet_emb_transpose], name = "snippet_emb_model")
print("\n\nsnippet_emb_model summary")
snippet_emb_model.summary()

"""""""""""""""""""""""""""
  ---- Input Network ----
"""""""""""""""""""""""""""

if DEBUG:
    query_token_input = Input(shape=(MAX_Q_TERM,), name="query_tokens")


    snippets_tokens_input = Input(shape = (MAX_Q_TERM, MAX_PASSAGES_PER_QUERY, QUERY_CENTRIC_CONTEX), name = "snippet_tokens_ipmodel") 
    
    query_emb = embedding(query_token_input)

    snippet_emb = embedding(snippets_tokens_input)
    snippet_emb_transpose = transpose_layer(snippet_emb)
    
    sim_matrix_layer = similarity_matrix([query_emb,snippet_emb_transpose])
    
    
    
    input_model = Model(inputs = [query_token_input,snippets_tokens_input], outputs=[sim_matrix_layer], name="input_model")
    print("\n\ninput_model summary")
    input_model.summary()
    
    
    print("\nOutput tensor",sim_matrix_layer)
    



snippet_emb_model summary
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
snippet_token (InputLayer)   (None, 13, 5, 15)         0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 13, 5, 15, 200)    858359000 
_________________________________________________________________
snippet_transpose (Lambda)   (None, 13, 5, 200, 15)    0         
Total params: 858,359,000
Trainable params: 0
Non-trainable params: 858,359,000
_________________________________________________________________


<a id='measure_net'></a>
## Measure Network

In [6]:
class FastMaskedConv2D(Layer):
    
    def __init__(self, filters, kernel_size, activation, initializer='glorot_normal', regularizer=None, **kargs):
        super(FastMaskedConv2D, self).__init__(**kargs)

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.filters = filters
        self.kernel_size = kernel_size
        #self.activation = activation

    def build(self, input_shape):
        input_filter = int(input_shape[-1])
        
        self.kernel_3_3 = self.add_variable(name = "conv_kernel_3_3",
                                   shape = (3,3,input_filter,CNN_FILTERS),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_5_1 = self.add_variable(name = "conv_kernel_3_1",
                                   shape = (5,1,input_filter,CNN_FILTERS),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_1_5 = self.add_variable(name = "conv_kernel_1_3",
                                   shape = (1,5,input_filter,CNN_FILTERS),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_3_3_bias = self.add_variable(name = "conv_kernel_3_3_bias",
                                   shape = (CNN_FILTERS,),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_5_1_bias = self.add_variable(name = "conv_kernel_3_1_bias",
                                   shape = (CNN_FILTERS,),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_1_5_bias = self.add_variable(name = "conv_kernel_1_3_bias",
                                   shape = (CNN_FILTERS,),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        #end dimensions = 7, 9, 100
        
        
        self.dense = self.add_variable(name = "dense_projection",
                                   shape = (CNN_FILTERS*3, GRU_REPRESENTATION_DIM),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.dense_bias = self.add_variable(name = "dense_projection_bias",
                                   shape = (GRU_REPRESENTATION_DIM,),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
       
        
        super(FastMaskedConv2D, self).build(input_shape)
    
    def call(self, x):
        
        #condition = K.all(x) #if all the values are the same
        #zero_out = K.repeat_elements(x[:,:self.convOutShape[1],:self.convOutShape[2],:],self.convOutShape[3],axis=-1)
        
        #conv
        kernel_3_3 = K.conv2d(x, self.kernel_3_3)
        kernel_3_3 = K.bias_add(kernel_3_3, self.kernel_3_3_bias)
        kernel_3_3 = self.activation(kernel_3_3)
        kernel_3_3_pool = K.pool2d(kernel_3_3,(11,13))
        
        kernel_5_1 = K.conv2d(x, self.kernel_5_1)
        kernel_5_1 = K.bias_add(kernel_5_1, self.kernel_5_1_bias)
        kernel_5_1 = self.activation(kernel_5_1)
        print(kernel_5_1)
        #kernel_5_1_pool = K.pool2d(kernel_5_1,(11,13))
        
        kernel_1_5 = K.conv2d(x, self.kernel_1_5)
        kernel_1_5 = K.bias_add(kernel_1_5, self.kernel_1_5_bias)
        kernel_1_5 = self.activation(kernel_1_5)
        print(kernel_1_5)
        #kernel_1_5_pool = K.pool2d(kernel_1_3,(11,13))
        print(kernel_3_3_pool)
        print(kernel_3_1_pool)
        print(kernel_1_3_pool)
        
        kernel_3_3_flat = K.reshape(kernel_3_3_pool,(-1,CNN_FILTERS))
        kernel_3_1_flat = K.reshape(kernel_3_1_pool,(-1,CNN_FILTERS))
        kernel_1_3_flat = K.reshape(kernel_1_3_pool,(-1,CNN_FILTERS))
        print(kernel_3_3_flat)
        concat =  K.concatenate([kernel_3_3_flat,kernel_3_1_flat,kernel_1_3_flat])
        print(concat)
        
        proj = K.dot(concat, self.dense)
        proj = K.bias_add(proj,self.dense_bias)
        proj = self.activation(proj)
        return proj

        #masked_conv = K.switch(condition, zero_out , self.conv2layer(x) )
        
        #return masked_conv
    
class MaskedConv2D(Layer):
    
    def __init__(self, filters, kernel_size, activation, initializer='glorot_normal', regularizer=None, **kargs):
        super(MaskedConv2D, self).__init__(**kargs)

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.filters = filters
        self.kernel_size = kernel_size

    def build(self, input_shape):

        input_filter = int(input_shape[-1])
        
        self.kernel_3_3 = self.add_variable(name = "conv_kernel_3_3",
                                   shape = (3,3,input_filter,CNN_FILTERS),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_5_1 = self.add_variable(name = "conv_kernel_5_1",
                                   shape = (5,1,input_filter,CNN_FILTERS),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_1_5 = self.add_variable(name = "conv_kernel_1_5",
                                   shape = (1,5,input_filter,CNN_FILTERS),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.kernel_3_3_bias = self.add_variable(name = "conv_kernel_3_3_bias",
                                   shape = (self.filters,),)
        
        self.kernel_5_1_bias = self.add_variable(name = "conv_kernel_5_1_bias",
                                   shape = (self.filters,),)
        
        self.kernel_1_5_bias = self.add_variable(name = "conv_kernel_1_5_bias",
                                   shape = (self.filters,),)
        
        #end dimensions = 7, 9, 100
        
        
        self.dense = self.add_variable(name = "dense_projection",
                                   shape = (self.filters*3, GRU_REPRESENTATION_DIM),
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.dense_bias = self.add_variable(name = "dense_projection_bias",
                                   shape = (GRU_REPRESENTATION_DIM,),)
        
        
        super(MaskedConv2D, self).build(input_shape)
    
    def call(self, x):
        
        condition = K.all(x) #if all the values are the same
        inv_condition = (1-K.cast(condition, K.floatx()))
        
        kernel_3_3 = K.conv2d(x, self.kernel_3_3)
        kernel_3_3 = K.bias_add(kernel_3_3, self.kernel_3_3_bias)
        kernel_3_3 = self.activation(kernel_3_3)
        kernel_3_3_pool = K.pool2d(kernel_3_3,(11,13))
        
        kernel_5_1 = K.conv2d(x, self.kernel_5_1)
        kernel_5_1 = K.bias_add(kernel_5_1, self.kernel_5_1_bias)
        kernel_5_1 = self.activation(kernel_5_1)
        kernel_5_1_pool = K.pool2d(kernel_5_1,(9,15))
        
        kernel_1_5 = K.conv2d(x, self.kernel_1_5)
        kernel_1_5 = K.bias_add(kernel_1_5, self.kernel_1_5_bias)
        kernel_1_5 = self.activation(kernel_1_5)
        kernel_1_5_pool = K.pool2d(kernel_1_5,(13,11))
        
        print(kernel_3_3_pool)
        print(kernel_5_1_pool)
        print(kernel_1_5_pool)
        
        kernel_3_3_flat = K.reshape(kernel_3_3_pool,(-1,self.filters))
        kernel_5_1_flat = K.reshape(kernel_5_1_pool,(-1,self.filters))
        kernel_1_5_flat = K.reshape(kernel_1_5_pool,(-1,self.filters))
        print(kernel_3_3_flat)
        print(kernel_5_1_flat)
        print(kernel_1_5_flat)
        
        concat =  K.concatenate([kernel_3_3_flat,kernel_5_1_flat,kernel_1_5_flat])
        print(concat)
        
        proj = K.dot(concat, self.dense)
        proj = K.bias_add(proj,self.dense_bias)
        proj = self.activation(proj)

        
        return proj * inv_condition

"""""""""""""""""""""""""""
 ---- Auxiliar Models ----
"""""""""""""""""""""""""""

#measure_layer = MaskedConv2D(filters = CNN_FILTERS, kernel_size=CNN_KERNELS, activation=ACTIVATION_FUNCTION)

cnn_extraction_model = Sequential(name="cnn_extraction_model")
cnn_extraction_model.add(MaskedConv2D(input_shape = (MAX_Q_TERM, QUERY_CENTRIC_CONTEX, S_MATRIX_3D_DIMENSION()), filters = CNN_FILTERS, kernel_size=CNN_KERNELS, activation=ACTIVATION_FUNCTION ))
print("\n\ncnn_extraction_model summary")
cnn_extraction_model.summary()

td_cnn_extraction_model = Sequential(name="TD_cnn_extraction_model")
td_cnn_extraction_model.add(TimeDistributed(cnn_extraction_model, input_shape=(MAX_PASSAGES_PER_QUERY, MAX_Q_TERM, QUERY_CENTRIC_CONTEX, S_MATRIX_3D_DIMENSION())))
td_cnn_extraction_model.summary()

"""""""""""""""""""""""""""
     ---- Layers ----
"""""""""""""""""""""""""""
#concatenation layer over the last dimension
concat_snippet_position = Concatenate( name = "concat_snippet_position")

#RNN using GRU units
if USE_BIDIRECTIONAL:
    rnn_instance = GRU(GRU_REPRESENTATION_DIM, kernel_regularizer=REGULARIZATION, activation=ACTIVATION_FUNCTION, name="aggregation_snippet_by_q_term")
    gru = Bidirectional(rnn_instance, merge_mode="concat")
else:
    gru = GRU(GRU_REPRESENTATION_DIM, kernel_regularizer=REGULARIZATION, activation=ACTIVATION_FUNCTION, name="aggregation_snippet_by_q_term")
    
#add dimension Layer
add_passage_dim = Lambda(lambda x:K.expand_dims(x,axis=1), name="add_passage_dim")#Reshape(target_shape=(1,GRU_REPRESENTATION_DIM))

#add last dimension Layer
add_dim = Lambda(lambda x:K.expand_dims(x), name="add_dim")

#reciprocal function
reciprocal_f = Lambda(lambda x:1/(x+2), name="reciprocal_function")

#concatenation layer over second dimension (passage dimension)
concat_representation = Concatenate(axis = 1,name = "concat_representation")

Tensor("masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("masked_conv2d/MaxPool_2:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("masked_conv2d/Reshape:0", shape=(?, 100), dtype=float32)
Tensor("masked_conv2d/Reshape_1:0", shape=(?, 100), dtype=float32)
Tensor("masked_conv2d/Reshape_2:0", shape=(?, 100), dtype=float32)
Tensor("masked_conv2d/concat:0", shape=(?, 300), dtype=float32)


cnn_extraction_model summary
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masked_conv2d (MaskedConv2D) (None, 56)                19056     
Total params: 19,056
Trainable params: 19,056
Non-trainable params: 0
_________________________________________________________________
Tensor("time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1

<a id='measure_net'></a>
## Aggregation Network

In [7]:

"""""""""""""""""""""""""""
  ---- Custom Layers ----
"""""""""""""""""""""""""""

if USE_BIDIRECTIONAL:
    snippet_rnn_rep_dim = GRU_REPRESENTATION_DIM*2
else:
    snippet_rnn_rep_dim = GRU_REPRESENTATION_DIM

class TermGating(Layer):
    
    def __init__(self, vocab_size,activation=None, initializer='glorot_normal', regularizer=None):
        super(TermGating, self).__init__()

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.vocab_size = vocab_size

    def build(self, input_shape):
        
        #term gating W
        self.We = self.add_variable(name = "term_gating_We",
                                   shape = [self.vocab_size,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        #self.ones = K.constant(np.ones((aggreation_dimension,1)))
       
        """ self.fully_connected_W = self.add_variable(name = "fully_connect_We",
                                   shape = [self.vocab_size,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        """
        self.dense_score = Dense(1,kernel_regularizer = self.regularizer, activation=self.activation)
        self.dense_score.build(input_shape[1])
        self._trainable_weights += self.dense_score.trainable_weights
        
        super(TermGating, self).build(input_shape)
    
    def call(self, x):
        #broadcast = self.We * self.ones
        
        query_token = K.cast(x[0],'int32') #(None, MAX_Q_TERM)
        snippet_representation_per_query = x[1] #(None, MAX_Q_TERM, BI_GRU_DIM)
        
        query_token_onehot = K.one_hot(query_token, self.vocab_size)
        print(query_token_onehot)
        
        query_vars = K.squeeze(K.dot(query_token_onehot,self.We),axis=-1)
        print(query_vars) #(None, MAX_Q_TERM)
        
        query_term_prob = K.softmax(query_vars) 
        print(query_term_prob) #(None, MAX_Q_TERM)
        
        snippet_representation = K.expand_dims(query_term_prob) * snippet_representation_per_query
        snippet_representation_sum = K.sum(snippet_representation, axis= 1)
        #self.dense_score(snippet_representation_sum)
        
        return self.dense_score(snippet_representation_sum)
    


class TermGatingDRMM(Layer):
    
    def __init__(self, embedding_dim = EMB_DIM, activation=None, initializer='glorot_normal', regularizer=None):
        super(TermGatingDRMM, self).__init__()

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.emb_dim = embedding_dim
        
    def build(self, input_shape):
        
        #term gating W
        self.We = self.add_variable(name = "term_gating_We",
                                   shape = [self.emb_dim,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        #self.ones = K.constant(np.ones((aggreation_dimension,1)))
        
        super(TermGatingDRMM, self).build(input_shape)
    
    def call(self, x):
        
        query_embeddings = x[0] #(None, MAX_Q_TERM, EMB_SIZE)
        snippet_representation_per_query = x[1] #(None, MAX_Q_TERM, BI_GRU_DIM)
        
        #compute gated weights
        gated_logits = K.dot(query_embeddings, self.We) 
        gated_distribution = K.softmax(gated_logits)
        
        #apply the gated weights
        #print(snippet_representation_per_query)
        #print(gated_distribution)
        
        #normalization?
        
        return K.sum(K.sum(snippet_representation_per_query * gated_distribution, axis = -1),  axis = -1) # Replace with K.sum of all elements?



class TermGatingDRMM_Projection(Layer):
    
    def __init__(self, embedding_dim = EMB_DIM, rnn_dim = snippet_rnn_rep_dim ,activation=None, initializer='glorot_normal', regularizer=None):
        super(TermGatingDRMM_Projection, self).__init__()

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.emb_dim = embedding_dim
        self.rnn_dim = rnn_dim

    def build(self, input_shape):
        
        #term gating W
        self.W_query = self.add_variable(name = "term_gating_We",
                                   shape = [self.emb_dim,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.W_snippets_projection_score = self.add_variable(name = "snippets_projection_score",
                                   shape = [self.rnn_dim,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
    
        #self.ones = K.constant(np.ones((aggreation_dimension,1)))
        
        super(TermGatingDRMM_Projection, self).build(input_shape)
    
    def call(self, x):
        
        query_embeddings = x[0] #(None, MAX_Q_TERM, EMB_SIZE)
        snippet_representation_per_query = x[1] #(None, MAX_Q_TERM, BI_GRU_DIM)
        
        #compute gated weights
        gated_logits = K.squeeze(K.dot(query_embeddings, self.W_query), axis = -1 )
        #print(gated_logits)
        gated_distribution = K.softmax(gated_logits)
        #print(gated_distribution)
        #snippet projection
        snippets_score = K.squeeze(K.dot(snippet_representation_per_query, self.W_snippets_projection_score), axis = -1)
        #snippets_score_activation = tanh(snippets_score)
        
        weighted_score = snippets_score * gated_distribution
        
        return K.sum(weighted_score,  axis = 1) # Replace with K.sum of all elements?

    
class TermGatingDRMM_FFN(Layer):
    
    def __init__(self, embedding_dim = EMB_DIM, rnn_dim = snippet_rnn_rep_dim ,activation=None, initializer='glorot_normal', regularizer=None):
        super(TermGatingDRMM_FFN, self).__init__()

        self.activation = activations.get(activation)
        self.initializer = initializers.get(initializer)
        
        if regularizer is None or isinstance(regularizer,str):
            self.regularizer = regularizers.get(regularizer)
        else:
            self.regularizer = regularizer
        
        self.emb_dim = embedding_dim
        self.rnn_dim = rnn_dim

    def build(self, input_shape):
        
        #term gating W
        self.W_query = self.add_variable(name = "term_gating_We",
                                   shape = [self.emb_dim,1],
                                   initializer = self.initializer,
                                   regularizer = self.regularizer,)
        
        self.dense_score = Dense(1,kernel_regularizer = self.regularizer, activation=self.activation)
        
        dense_shape = input_shape[1]
        print(dense_shape)
        
        self.dense_score.build((dense_shape[0],dense_shape[2]))
        self._trainable_weights += self.dense_score.trainable_weights
        #self.ones = K.constant(np.ones((aggreation_dimension,1)))
        
        super(TermGatingDRMM_FFN, self).build(input_shape)
    
    def call(self, x):
        
        query_embeddings = x[0] #(None, MAX_Q_TERM, EMB_SIZE)
        snippet_representation_per_query = x[1] #(None, MAX_Q_TERM, BI_GRU_DIM)
        
        #compute gated weights
        gated_logits = K.squeeze(K.dot(query_embeddings, self.W_query), axis = -1 )
        #print(gated_logits)
        gated_distribution = K.expand_dims(K.softmax(gated_logits))
        #print(gated_distribution)
        #snippet projection
        
        weighted_score = K.sum(snippet_representation_per_query * gated_distribution,  axis = 1)
        print(weighted_score)
        
        return self.dense_score(weighted_score) # Replace with K.sum of all elements?

<a id='final_net'></a>
## Final Network

In [8]:

"""""""""""""""""""""""""""
  ---- Final Network ----
"""""""""""""""""""""""""""
query_token_input = Input(shape=(MAX_Q_TERM,), name="ds_query_tokens")
doc_score_snippet_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "ds_snippet_tokens")
doc_score_snippet_position_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY), name = "ds_snippet_position_tokens")


unstack_by_q_term = Lambda(lambda x:unstack(x,axis=1), name="unstack_query_term")

#doc_score_snippet_by_q_term = unstack_by_q_term(doc_score_snippet_input)
#doc_score_snippet_position_by_q_term = unstack_by_q_term(doc_score_snippet_position_input)

#INPUT in token format
#query_token_input = Input(shape=(MAX_Q_TERM,), name="query_tokens")
#snippets_tokens_input = [Input(shape = (MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "snippet_tokens_"+str(q)) for q in range(MAX_Q_TERM)]
#inputs_contex_position = [Input(shape = (MAX_PASSAGES_PER_QUERY,), name = "q_context_position_"+str(q)) for q in range(MAX_Q_TERM)]

query_emb = embedding(query_token_input)

doc_score_snippet_emb = embedding(doc_score_snippet_input)
doc_score_snippet_emb_transpose = transpose_layer(doc_score_snippet_emb)

query_snippets_s_matrix = similarity_matrix([query_emb,doc_score_snippet_emb_transpose])

list_of_s_matrix_by_q_term = unstack_by_q_term(query_snippets_s_matrix)
list_of_snippet_postion_by_q_term = unstack_by_q_term(doc_score_snippet_position_input)

relevance_representation = []
for i in range(MAX_Q_TERM):
    
    snippet_relative_position = reciprocal_f(list_of_snippet_postion_by_q_term[i])
    
    local_relevance = td_cnn_extraction_model(list_of_s_matrix_by_q_term[i])
    
    local_relevance_position = concat_snippet_position([local_relevance,add_dim(snippet_relative_position)])
    
    relevance_representation.append(add_passage_dim(gru(local_relevance_position)))

concat_relevance = concat_representation(relevance_representation)

if TERM_GATING_MODE==0:
    term_gating = TermGating(vocab_size=VOCAB_SIZE, activation=ACTIVATION_FUNCTION)
    document_score = term_gating([query_token_input,concat_relevance])
    
elif TERM_GATING_MODE==1:
    term_gating = TermGatingDRMM()
    document_score = term_gating([query_emb,concat_relevance])

elif TERM_GATING_MODE==2:
    term_gating = TermGatingDRMM_Projection()
    document_score = term_gating([query_emb,concat_relevance])

elif TERM_GATING_MODE==3:
    term_gating = TermGatingDRMM_FFN(activation=ACTIVATION_FUNCTION, regularizer=REGULARIZATION)
    document_score = term_gating([query_emb,concat_relevance])

document_score_model = Model(inputs = [query_token_input, doc_score_snippet_input, doc_score_snippet_position_input], outputs = [document_score], name="query_document_score")
document_score_model.summary()      



Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/MaxPool_2:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/Reshape:0", shape=(?, 100), dtype=float32)
Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/Reshape_1:0", shape=(?, 100), dtype=float32)
Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/Reshape_2:0", shape=(?, 100), dtype=float32)
Tensor("TD_cnn_extraction_model/time_distributed/masked_conv2d/concat:0", shape=(?, 300), dtype=float32)
Tensor("TD_cnn_extraction_model_1/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_1/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Ten

Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/MaxPool_2:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/Reshape:0", shape=(?, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/Reshape_1:0", shape=(?, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/Reshape_2:0", shape=(?, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_11/time_distributed/masked_conv2d/concat:0", shape=(?, 300), dtype=float32)
Tensor("TD_cnn_extraction_model_12/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("TD_cnn_extraction_model_12/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 1

## FINAL Trainable arch

In [9]:


query_token_input = Input(shape=(MAX_Q_TERM,), name="dr_query_tokens")
positive_snippet_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "positive_snippet_tokens")
positive_snippet_position_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY), name = "positive_snippet_position_tokens")
negative_snippet_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), name = "negative_snippet_tokens")
negative_snippet_position_input = Input(shape = (MAX_Q_TERM,MAX_PASSAGES_PER_QUERY), name = "negative_snippet_position_tokens")

positive_documents_score = document_score_model([query_token_input, positive_snippet_input, positive_snippet_position_input])
negative_documents_score = document_score_model([query_token_input, negative_snippet_input, negative_snippet_position_input])

#stack_socres = stack_scores_layer([positive_documents_score,negative_documents_score])



#pairwise_loss_layer = Lambda(pairwise_hinge_loss, name="pairwise_hinge")
#pairwise_loss = pairwise_loss_layer([positive_documents_score,negative_documents_score])



inputs = [query_token_input, positive_snippet_input, positive_snippet_position_input, negative_snippet_input, negative_snippet_position_input]

deepRank_model = Model(inputs = inputs, outputs = [positive_documents_score, negative_documents_score], name="deep_rank")


p_loss = K.mean(K.maximum(0.0, 1.0 - positive_documents_score + negative_documents_score))

deepRank_model.add_loss(p_loss)

deepRank_model.summary() 
#m.predict([Q, Q_t1_passage, Q_t2_passage, Q_t3_passage, Q_t1_passage_pos, Q_t2_passage_pos, Q_t3_passage_pos])

#deepRank_model.compile(loss=pairwise_hinge_loss, optimizer='sgd')

Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/MaxPool_2:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/Reshape:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/Reshape_1:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/Reshape_2:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model/time_distributed/masked_conv2d/concat:0", shape=(?, 300), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_1/time_distributed/masked_conv2d/

Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/MaxPool_2:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/Reshape:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/Reshape_1:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/Reshape_2:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score/TD_cnn_extraction_model_12/time_distributed/masked_conv2d/concat:0", shape=(?, 300), dtype=float32)
Tensor("query_document_score/term_gating_drmm_ffn/Sum:0", shape=(

Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/MaxPool:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/MaxPool_1:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/MaxPool_2:0", shape=(?, 1, 1, 100), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/Reshape:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/Reshape_1:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/Reshape_2:0", shape=(?, 100), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_model_10/time_distributed/masked_conv2d/concat:0", shape=(?, 300), dtype=float32)
Tensor("query_document_score_1/TD_cnn_extraction_mo

In [10]:
WRITE_GRAPH = False
if WRITE_GRAPH:
    from tensorflow.summary import FileWriter

    graph = K.get_session().graph
     # Your model implementation
    #with graph.as_default():
      # compile method actually creates the model in the graph.
      #deepRank_model.compile(loss=identity_loss, optimizer='adam', metrics=['accuracy'])
    writer = FileWriter(logdir='tensorboard/deepRank', graph=graph)
    writer.flush()




In [11]:
#document_score.predict([Q, Q_t1_passage, Q_t2_passage, Q_t3_passage, Q_t1_passage_pos, Q_t2_passage_pos, Q_t3_passage_pos])

In [12]:
path_dl_train = "/backup/results/fast_method_relevant_results/train_data_deep_models_v2.tar.gz"



tar = tarfile.open(path_dl_train)
#open
print("Open",path_dl_train)
m = tar.getmembers()[0]
f = tar.extractfile(m)
train_articles_collection = pickle.load(f)

f.close()

Open /backup/results/fast_method_relevant_results/train_data_deep_models_v2.tar.gz


In [28]:
PARTILLY_POSITIVE_SAMPLES = 3
NEGATIVE_SAMPLES = 1

class TrainDataGenerator(object):
    def __init__(self, article_collection, tokenizer, batch_queries_size):
        
        self.batch_size = batch_queries_size
        self.tokenizer = tokenizer
        
        self.train_data = article_collection["bioasq_data"]
        self.articles = article_collection["collection"]
        self.irrelevant_pmid = article_collection["irrelevant_pmid"]
        
        self.num_steps = len(self.train_data)//self.batch_size
    
    def __len__(self):
        return self.num_steps
    
    def __iter__(self):
        
        
        query = []
        query_positive_doc = []
        query_positive_doc_position = []
        query_negative_doc = []
        query_negative_doc_position = []
        
        while True:
            
            #stop condition
            if len(query)>=self.batch_size:
                #missing fill the gap for the missing query_terms
                query = np.array(query)
                p=np.random.permutation(query.shape[0])
                query = query[p]
                query_positive_doc = np.array(query_positive_doc)[p]
                query_positive_doc_position = np.array(query_positive_doc_position)[p]
                query_negative_doc = np.array(query_negative_doc)[p]
                query_negative_doc_position =  np.array(query_negative_doc_position)[p]
                
                X = [query, query_positive_doc, query_positive_doc_position, query_negative_doc, query_negative_doc_position]
                #Y = [np.zeros((len(query))),np.zeros((len(query)))]
                yield X


                #reset
                query = []
                query_positive_doc = []
                query_positive_doc_position = []
                query_negative_doc = []
                query_negative_doc_position = []
            
            #select a random question
            random_query_index = random.randint(0, len(self.train_data)-1) 
            query_data = self.train_data[random_query_index]
            
            #list of partilly relevant documents
            partilly_positive_pmid_docs = query_data["partilly_positive_pmid"]

            tokenized_query = query_data["query"][:MAX_Q_TERM]
            
            for j in range(PARTILLY_POSITIVE_SAMPLES+NEGATIVE_SAMPLES):
                #select a random positive
                random_doc_index = random.randint(0, len(query_data["positive_pmid"])-1) 
                doc_pmid = query_data["positive_pmid"][random_doc_index]

                tokenized_positive_doc = self.articles[doc_pmid]
                positive_snippets, positive_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_positive_doc)
                
                if j<PARTILLY_POSITIVE_SAMPLES:
                    #select the partilly posivite doc
                    random_ind = bisect(query_data["partially_positive_cumulative_prob"],random.random())
                    random_negative_doc_pmid = query_data["partilly_positive_pmid"][random_ind]
                    #print(self.__get_article(random_negative_doc_pmid))
                    tokenized_negative_doc = self.articles[random_negative_doc_pmid]
                    negative_snippets, negative_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_negative_doc)
                else:
                    #select a random negative
                    random_doc_index = random.randint(0, len(self.irrelevant_pmid)-1) 
                    doc_pmid = self.irrelevant_pmid[random_doc_index]
                    
                    tokenized_negative_doc = self.articles[doc_pmid]
                    negative_snippets, negative_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_negative_doc)
                
                
                ### add ###

                #not efficient
                query.append(tokenized_query)

                #positive doc
                query_positive_doc.append(positive_snippets)
                query_positive_doc_position.append(positive_snippets_position)

                #negative doc
                query_negative_doc.append(negative_snippets)
                query_negative_doc_position.append(negative_snippets_position)
            

            
    def __snippet_interaction(self, tokenized_query, tokenized_doc, snippet_length=QUERY_CENTRIC_CONTEX):
        
        snippets = []
        snippets_position = [] 

        half_size = snippet_length//2
        
        #O(n^2) complexity, probably can do better with better data struct TODO see if is worthit
        for query_token in tokenized_query:
            
            snippets_per_token = []
            snippets_per_token_position = []
            
            if query_token != 0: #jump padded token
            
                for i,doc_token in enumerate(tokenized_doc):

                    if doc_token==query_token:

                        lower_index = i-half_size
                        lower_index = max(0,lower_index)

                        higher_index = i+half_size
                        higher_index = min(len(tokenized_doc),higher_index)

                        snippets_per_token.append(tokenized_doc[lower_index:higher_index])
                        snippets_per_token_position.append(i)
            
            if len(snippets_per_token)==0:
                snippets.append(np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32))
                snippets_position.append(np.zeros((MAX_PASSAGES_PER_QUERY), dtype=np.int32)+SNIPPET_POSITION_PADDING_VALUE)
                continue
                
            max_snippets_len = min(MAX_PASSAGES_PER_QUERY, len(snippets_per_token))
            
            ### snippets in matrix format
            #pad
            snippets_per_token = pad_sequences(snippets_per_token, maxlen = QUERY_CENTRIC_CONTEX, padding="post")
            #fill the gaps
            _temp = np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32)
            _temp[:max_snippets_len] = snippets_per_token[:max_snippets_len]
            snippets.append(_temp)
            
            ### snippets_position in matrix format
            #pad
            snippets_per_token_position = pad_sequences([snippets_per_token_position], maxlen = MAX_PASSAGES_PER_QUERY, padding="post", value=SNIPPET_POSITION_PADDING_VALUE)[0]
            snippets_position.append(snippets_per_token_position)
            
        return snippets, snippets_position
            
        
        


In [14]:
#X,Y = next(data_generator)

TESTE = False
if TESTE:
    data_generator = TrainDataGenerator(train_articles_collection, tk, 1)
    data_generator = iter(data_generator)
    X,Y = next(data_generator)
    
    print(X[0].shape)
    print(X[1].shape)
    print(X[2].shape)
    print(X[3].shape)
    print(X[4].shape)
    print(Y[0].shape)
    print(Y[1].shape)
    deepRank_model.predict(X)

In [15]:
from tensorflow.keras.optimizers import SGD, Adam,  Adadelta

#sgd = SGD(lr=0.001)
#adam = Adam(lr=0.001)
adadelta = Adadelta(lr=2)

deepRank_model.compile( optimizer=adadelta)



In [29]:
gen = TrainDataGenerator(train_articles_collection, tk, 256)

gen_iter = iter(gen)

# SE DER ERRO POR CAUSA DA SHAPE A 1º AMOSTRA DAS QUERIES NAO TEM PADDING BURRODE MERDA
# MUDA ISSO
#loss = []

for i,line in enumerate(loss):
    
    print("Epoach:",i,"| avg loss:",np.mean(loss[i]),"| max loss:",np.max(loss[i]),"| min loss:",np.min(loss[i]))

import time


for epoach in range(600,800):
    loss_per_epoach = []
    for step in range(len(gen)):
        X = next(gen_iter)
        
        start = time.time()
        loss_per_epoach.append(deepRank_model.train_on_batch(X))
        print("Step:",step,"| loss:",loss_per_epoach[-1],"| current max loss:",np.max(loss_per_epoach),"| current min loss:",np.min(loss_per_epoach),"| time:",time.time()-start,end="\r")
    
    loss.append(loss_per_epoach)
    print("",end="\r")#clear the line
    print("Epoach:",epoach,"| avg loss:",np.mean(loss[-1]),"| max loss:",np.max(loss[-1]),"| min loss:",np.min(loss[-1]))
#deepRank_model.fit_generator(gen_iter, steps_per_epoch=len(gen), verbose=1, epochs=27)

Epoach: 0 | avg loss: 0.7913 | max loss: 1.0618795 | min loss: 0.46426705
Epoach: 1 | avg loss: 0.28404543 | max loss: 0.41293472 | min loss: 0.15731314
Epoach: 2 | avg loss: 0.1978395 | max loss: 0.26958653 | min loss: 0.12772673
Epoach: 3 | avg loss: 0.17477201 | max loss: 0.22347149 | min loss: 0.13149385
Epoach: 4 | avg loss: 0.17481062 | max loss: 0.21199152 | min loss: 0.15189683
Epoach: 5 | avg loss: 0.1647676 | max loss: 0.2269466 | min loss: 0.13477898
Epoach: 6 | avg loss: 0.1689105 | max loss: 0.23404966 | min loss: 0.1150029
Epoach: 7 | avg loss: 0.14810923 | max loss: 0.18586242 | min loss: 0.12309752
Epoach: 8 | avg loss: 0.13462949 | max loss: 0.17335048 | min loss: 0.077673875
Epoach: 9 | avg loss: 0.12295155 | max loss: 0.16165823 | min loss: 0.087628536
Epoach: 10 | avg loss: 0.13260557 | max loss: 0.16866186 | min loss: 0.09964288
Epoach: 11 | avg loss: 0.14011426 | max loss: 0.1801532 | min loss: 0.09014022
Epoach: 12 | avg loss: 0.11646027 | max loss: 0.15067527 | 

Epoach: 253 | avg loss: 0.12833104 | max loss: 0.24614295 | min loss: 0.0934112
Epoach: 254 | avg loss: 0.11724376 | max loss: 0.17380525 | min loss: 0.07709921
Epoach: 255 | avg loss: 0.112171225 | max loss: 0.18124862 | min loss: 0.055527654
Epoach: 256 | avg loss: 0.11262966 | max loss: 0.19989505 | min loss: 0.065707095
Epoach: 257 | avg loss: 0.11024994 | max loss: 0.17043734 | min loss: 0.05789831
Epoach: 258 | avg loss: 0.123399734 | max loss: 0.1557032 | min loss: 0.0762832
Epoach: 259 | avg loss: 0.101884246 | max loss: 0.1473418 | min loss: 0.06542783
Epoach: 260 | avg loss: 0.10480735 | max loss: 0.15775296 | min loss: 0.07111898
Epoach: 261 | avg loss: 0.09542991 | max loss: 0.12660466 | min loss: 0.075599164
Epoach: 262 | avg loss: 0.1210182 | max loss: 0.17749389 | min loss: 0.06298205
Epoach: 263 | avg loss: 0.1021522 | max loss: 0.1728346 | min loss: 0.065358765
Epoach: 264 | avg loss: 0.10175727 | max loss: 0.14613143 | min loss: 0.07817303
Epoach: 265 | avg loss: 0.09

Epoach: 464 | avg loss: 0.11019301 | max loss: 0.18256463 | min loss: 0.0714548
Epoach: 465 | avg loss: 0.120230906 | max loss: 0.14045869 | min loss: 0.09243393
Epoach: 466 | avg loss: 0.11689432 | max loss: 0.15204023 | min loss: 0.08564075
Epoach: 467 | avg loss: 0.100148946 | max loss: 0.14812148 | min loss: 0.06808079
Epoach: 468 | avg loss: 0.10733463 | max loss: 0.15563832 | min loss: 0.083371356
Epoach: 469 | avg loss: 0.10300331 | max loss: 0.13063261 | min loss: 0.07354251
Epoach: 470 | avg loss: 0.10525713 | max loss: 0.13429189 | min loss: 0.058120422
Epoach: 471 | avg loss: 0.11827902 | max loss: 0.15857972 | min loss: 0.09311345
Epoach: 472 | avg loss: 0.11514536 | max loss: 0.14596418 | min loss: 0.05639213
Epoach: 473 | avg loss: 0.10698302 | max loss: 0.17552999 | min loss: 0.04539504
Epoach: 474 | avg loss: 0.103786275 | max loss: 0.12186554 | min loss: 0.08961175
Epoach: 475 | avg loss: 0.10541556 | max loss: 0.13271034 | min loss: 0.074856214
Epoach: 476 | avg loss:

Epoach: 600 | avg loss: 0.18017107 | max loss: 0.21906209 | min loss: 0.127669722766972 | time: 14.4718029499053961
Epoach: 601 | avg loss: 0.2051513 | max loss: 0.2984446 | min loss: 0.16318712.16318712 | time: 14.4547514915466333
Epoach: 602 | avg loss: 0.19760701 | max loss: 0.2505924 | min loss: 0.1334693.1334693 | time: 14.53653073310852213
Epoach: 603 | avg loss: 0.19505942 | max loss: 0.24717616 | min loss: 0.1134258511342585 | time: 14.437435150146484
Epoach: 604 | avg loss: 0.18049185 | max loss: 0.23993 | min loss: 0.1339868813398688 | time: 14.395848274230957
Epoach: 605 | avg loss: 0.20090729 | max loss: 0.24189638 | min loss: 0.1143363.1143363 | time: 14.4685728549957284
Epoach: 606 | avg loss: 0.20795703 | max loss: 0.2542423 | min loss: 0.1577203915772039 | time: 14.3963932991027836
Epoach: 607 | avg loss: 0.1835829 | max loss: 0.21797037 | min loss: 0.13431786.13431786 | time: 14.445376873016357
Epoach: 608 | avg loss: 0.2178486 | max loss: 0.30314606 | min loss: 0.1509

Epoach: 740 | avg loss: 0.18280111 | max loss: 0.21016264 | min loss: 0.1386526.1386526 | time: 14.3188416957855222
Epoach: 741 | avg loss: 0.18429065 | max loss: 0.22245573 | min loss: 0.153061825306182 | time: 14.3206667900085455
Epoach: 742 | avg loss: 0.21404672 | max loss: 0.27120417 | min loss: 0.1029392510293925 | time: 14.286118030548096
Epoach: 743 | avg loss: 0.19454741 | max loss: 0.27496868 | min loss: 0.1196507.1196507 | time: 14.301302194595337
Epoach: 744 | avg loss: 0.21039118 | max loss: 0.26420984 | min loss: 0.1495484614954846 | time: 14.294727087020874
Epoach: 745 | avg loss: 0.21562502 | max loss: 0.27349216 | min loss: 0.12359588623595886 | time: 14.410547733306885
Epoach: 746 | avg loss: 0.2038736 | max loss: 0.2622284 | min loss: 0.15408482.15408482 | time: 14.2936491966247568
Epoach: 747 | avg loss: 0.19341321 | max loss: 0.22901432 | min loss: 0.158348045834804 | time: 14.2889642715454196
Epoach: 748 | avg loss: 0.18538025 | max loss: 0.23477581 | min loss: 0.

In [21]:
data_generator = TrainDataGenerator(train_articles_collection, tk, 20)
data_generator = iter(data_generator)
X = next(data_generator)

#pos, neg = deepRank_model.predict(X)

In [22]:
pos,neg = deepRank_model.predict(X)

## Test (validation) data generator


In [17]:
path_dl_test = "/backup/results/fast_method_relevant_results/test_data_deep_models_v2.tar.gz"



tar = tarfile.open(path_dl_test)
#open
print("Open",path_dl_test)
m = tar.getmembers()[0]
f = tar.extractfile(m)
test_articles_collection = pickle.load(f)

f.close()

Open /backup/results/fast_method_relevant_results/test_data_deep_models_v2.tar.gz


In [18]:

class TestDataGenerator(object):
    def __init__(self, article_collection, tokenizer):
        
        self.tokenizer = tokenizer
        
        self.test_data = article_collection["bioasq_data"] 
        self.articles = article_collection["collection"]
        
        self.num_steps = len(self.test_data)
        
    
    def __get_article(self, pmid):
        return self.article_map(self.articles[pmid])
    
    def __len__(self):
        return self.num_steps
    
    def __iter__(self):
        
        
        query = []
        query_doc = []
        query_doc_position = []
        

        for query_data in self.test_data:

            #tokenized_query = self.tokenizer.texts_to_sequences([query_data["query"]])[0]
            tokenized_query = query_data["query"][:MAX_Q_TERM]
            #manualy remove the stopwords
            #tokenized_query = [ token for token in tokenized_query if token not in biomedical_stop_words_tokens]

            #tokenized_query = pad_sequences([tokenized_query], maxlen = MAX_Q_TERM, padding="post")[0]

            for doc_pmid in query_data["documents"]:
                #positive

                tokenized_doc = self.articles[doc_pmid]
                doc_snippets, doc_snippets_position = self.__snippet_interaction(tokenized_query, tokenized_doc)

                ### add ###

                query.append(tokenized_query)

                #positive doc
                query_doc.append(doc_snippets)
                query_doc_position.append(doc_snippets_position)


            #missing fill the gap for the missing query_terms

            X = [np.array(query), np.array(query_doc), np.array(query_doc_position)]

            yield X

            #reset
            query = []
            query_doc = []
            query_doc_position = []

                
    def __snippet_interaction(self, tokenized_query, tokenized_doc, snippet_length=QUERY_CENTRIC_CONTEX):
        
        snippets = []
        snippets_position = [] 

        half_size = snippet_length//2
        
        #O(n^2) complexity, probably can do better with better data struct TODO see if is worthit
        for query_token in tokenized_query:
            
            snippets_per_token = []
            snippets_per_token_position = []
            
            if query_token != 0: #jump padded token
                
                for i,doc_token in enumerate(tokenized_doc):

                    if doc_token==query_token:

                        lower_index = i-half_size
                        lower_index = max(0,lower_index)

                        higher_index = i+half_size
                        higher_index = min(len(tokenized_doc),higher_index)

                        snippets_per_token.append(tokenized_doc[lower_index:higher_index])
                        snippets_per_token_position.append(i)

            if len(snippets_per_token)==0:
                snippets.append(np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32))
                snippets_position.append(np.zeros((MAX_PASSAGES_PER_QUERY), dtype=np.int32)+SNIPPET_POSITION_PADDING_VALUE)
                continue
                
            max_snippets_len = min(MAX_PASSAGES_PER_QUERY, len(snippets_per_token))
            
            ### snippets in matrix format
            #pad
            snippets_per_token = pad_sequences(snippets_per_token, maxlen = QUERY_CENTRIC_CONTEX, padding="post")
            #fill the gaps
            _temp = np.zeros((MAX_PASSAGES_PER_QUERY,QUERY_CENTRIC_CONTEX), dtype=np.int32)
            _temp[:max_snippets_len] = snippets_per_token[:max_snippets_len]
            snippets.append(_temp)
            
            ### snippets_position in matrix format
            #pad
            snippets_per_token_position = pad_sequences([snippets_per_token_position], maxlen = MAX_PASSAGES_PER_QUERY, padding="post",value=SNIPPET_POSITION_PADDING_VALUE)[0]
            snippets_position.append(snippets_per_token_position)
            
        return snippets, snippets_position

In [30]:
def validate_test_data(data):
    data_generator = TestDataGenerator(data, tk)
    data_generator = iter(data_generator)

    query_results = {}

    for i,X in enumerate(data_generator):
        print("Predict query:",i,end="\r")
        deep_ranking = document_score_model.predict(X)
        deep_ranking = map(lambda x:x[0],deep_ranking.tolist())
        bm25_results = data["bioasq_data"][i]["documents"]
        deep_ranking_pmid = list(zip(bm25_results,deep_ranking))
        deep_ranking_pmid.sort(key=lambda x:-x[1])
        query_results[data["bioasq_data"][i]["id"]] = {"result":deep_ranking_pmid,"goldstandard":data["bioasq_data"][i]["positive_pmid"]}
        #print("save query results:",i,end="\r")
        
    return query_results

test_query_results = validate_test_data(test_articles_collection)

Predict query: 548

In [31]:
path_save = "/backup/results/deep_rank"
path_save = os.path.join(path_save, "deep_rank_v3_test_data.p")

with open(path_save, "wb") as f:
    pickle.dump(test_query_results,f)

In [26]:
query_to_test_index = 0

data_generator = TestDataGenerator(test_articles_collection, tk)
data_generator = iter(data_generator)
for _ in range(query_to_test_index+1):
    X = next(data_generator)


In [65]:
re_ranking = document_score_model.predict(X)

In [66]:
re_ranking = map(lambda x:x[0],re_ranking.tolist())

In [67]:
bm25_results = test_articles_collection["bioasq_data"][query_to_test_index]["documents"]
positive_docs = test_articles_collection["bioasq_data"][query_to_test_index]["positive_pmid"]

In [68]:
re_ranking_pmid = list(zip(bm25_results,re_ranking))

In [69]:
re_ranking_pmid.sort(key=lambda x:-x[1])

In [70]:
re_ranking_pmid[:10]

[('24794627', 5.3322014808654785),
 ('30251567', 5.313037872314453),
 ('28796422', 5.227417945861816),
 ('30114722', 5.093368053436279),
 ('29947303', 5.0901007652282715),
 ('30697454', 5.0804266929626465),
 ('30569414', 4.911670207977295),
 ('28901190', 4.814671039581299),
 ('24577791', 4.803395748138428),
 ('26907255', 4.67585563659668)]

In [71]:
test_articles_collection["bioasq_data"][query_to_test_index]["query"]

array([13502,    43,   478,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0], dtype=int32)

In [72]:
positive_docs

['24554704',
 '24784583',
 '24577791',
 '23197849',
 '24035588',
 '21060967',
 '25479728',
 '21755313',
 '24469711',
 '22512788',
 '24911883',
 '24794627',
 '21464439',
 '25059784']

In [73]:
positive_docs_ranked = []
for i,result in enumerate(re_ranking_pmid):
    if result[0] in set(positive_docs):
        positive_docs_ranked.append((i,result[0],result[1]))
        
true_ranked = []
for i,pmid in enumerate(bm25_results):
    if pmid in set(positive_docs):
        true_ranked.append((i,pmid))

print(positive_docs_ranked)
print(true_ranked)

[(0, '24794627', 5.3322014808654785), (8, '24577791', 4.803395748138428), (11, '24554704', 4.6346845626831055), (12, '23197849', 4.612618446350098), (16, '21060967', 4.446898460388184), (18, '24469711', 4.439567565917969), (19, '21755313', 4.431821823120117), (25, '22512788', 4.27816104888916), (34, '21464439', 4.063064098358154), (35, '25059784', 4.056085586547852), (38, '24784583', 4.003556728363037), (43, '25479728', 3.937878131866455), (50, '24035588', 3.7958528995513916), (343, '24911883', 2.838761806488037)]
[(3, '23197849'), (4, '21755313'), (6, '25479728'), (7, '24784583'), (8, '24577791'), (9, '24035588'), (15, '22512788'), (17, '24911883'), (19, '21464439'), (22, '24794627'), (126, '25059784'), (134, '24554704'), (466, '21060967'), (2793, '24469711')]


# Test with train set, check overfit

In [38]:
query_tokens = np.array([X[0][0]])
snippet_list = np.array([X[1][0]])

query_tokens = X[0][:2]
snippet_list = X[1][:2]

model_input = [query_tokens, snippet_list]

In [39]:

print(query_tokens.shape)
print(snippet_list.shape)

(2, 15)
(2, 15, 3, 15)


In [40]:
matrix = input_model.predict(model_input)
np.array(matrix).shape

(2, 15, 3, 15, 15, 1)

In [21]:
matrix[0][4][0].shape

(15, 15, 1)

In [42]:
np.squeeze(matrix[0][7][0])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.

In [148]:
query_tokens[0]

array([  165, 13502,    26,    61,     8,     1,    43,     2,   478],
      dtype=int32)

In [168]:
snippet_list[0][1][0]

array([  363,     5,  2386,    97,  7598,   774,    32, 13502,    18,
          15,     7, 11695,   117,    17,     0], dtype=int32)

In [169]:
emb_dict[13502]

array([ 0.05703647,  0.19594026,  0.03365219,  0.15514491,  0.00540348,
       -0.02335026, -0.06095085,  0.0226689 , -0.05668721,  0.01571985,
       -0.09896637,  0.13836679,  0.02710932,  0.06420047, -0.03692323,
        0.03899341,  0.00553868, -0.08639584, -0.05358738, -0.02609682,
        0.06495432, -0.00129713, -0.01882407, -0.10850747, -0.02421302,
        0.05556208,  0.00291283, -0.04882976,  0.01770345,  0.0035051 ,
        0.07192209, -0.00432884, -0.15161929, -0.07024549, -0.04793473,
        0.01823143,  0.10337584, -0.04076301,  0.01026187,  0.12004871,
        0.03939956, -0.03548966, -0.10689223, -0.16337523,  0.10883316,
        0.01135785,  0.03041399,  0.06011688, -0.09919181,  0.01741308,
       -0.04328503, -0.00256405, -0.11370766,  0.0522779 ,  0.0702537 ,
        0.01021139,  0.06773005,  0.01114117, -0.05878652,  0.0720681 ,
        0.05551391,  0.08731035,  0.07339004,  0.0031227 ,  0.10792159,
        0.12050318, -0.05851915, -0.08350374, -0.03341928,  0.12

In [147]:
data_generator = TrainDataGenerator(train_articles_collection, tk, 256)
data_generator = iter(data_generator)
for i in range(9):
    print(i,end="\r")
    X,Y = next(data_generator)

8

In [151]:
list(map(lambda x: len(list(filter(lambda y:y!=0,x))),X[0]))

[8,
 8,
 8,
 8,
 8,
 8,
 8,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 12,
 12,
 12,
 12,
 12,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 3,
 3,
 3,
 3

In [157]:
X[0][26]

array([ 988,  988,  279, 1208,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)

In [161]:
X[3][26]

array([[[  1075,      2,    986,      4,      6,    248,    988,   2250,
             44,    713,   6300,   6300,      0],
        [  3974,      2,    986,      4,      6,    248,    988,   2250,
             18,    713,   6300,   6300,      0],
        [    17,     17,  46298,  59753,   9143,   2106,    988,   2250,
             16,      6,    200,    174,      0]],

       [[  1075,      2,    986,      4,      6,    248,    988,   2250,
             44,    713,   6300,   6300,      0],
        [  3974,      2,    986,      4,      6,    248,    988,   2250,
             18,    713,   6300,   6300,      0],
        [    17,     17,  46298,  59753,   9143,   2106,    988,   2250,
             16,      6,    200,    174,      0]],

       [[     2,    248,    986,      4,      1,  10169,    279,      0,
              0,      0,      0,      0,      0],
        [     0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0],
        [   

In [164]:
pos,neg = deepRank_model.predict(X)

In [256]:
query_to_test_index = 1

data_generator = TestDataGenerator(train_articles_collection, tk)
data_generator = iter(data_generator)
for _ in range(query_to_test_index+1):
    X = next(data_generator)


KeyError: 'bioasq_data'

In [257]:
re_ranking = document_score_model.predict(X)

bm25_results = train_articles_collection["test_data"][query_to_test_index]["documents"]
positive_docs = train_articles_collection["test_data"][query_to_test_index]["positive_pmid"]

re_ranking_pmid = list(zip(bm25_results,re_ranking.tolist()))

re_ranking_pmid.sort(key=lambda x:-x[1])

re_ranking_pmid[:10]

[('26671317', 7.931817054748535),
 ('20975159', 7.902041435241699),
 ('20650709', 7.8478264808654785),
 ('19805301', 7.842199802398682),
 ('21731768', 7.818233013153076),
 ('24681619', 7.759010314941406),
 ('26631348', 7.714381217956543),
 ('22196114', 7.696432590484619),
 ('23817568', 7.648404598236084),
 ('26410599', 7.3919854164123535)]

In [216]:
print(tk.sequences_to_texts([train_articles_collection["test_data"][query_to_test_index]["query"]]))
positive_docs

['kind enzyme encoded proto oncogene abl1']


['21435002',
 '20841568',
 '9500553',
 '24012954',
 '18796434',
 '23842646',
 '18528425']

In [217]:
positive_docs_ranked = []
for i,result in enumerate(re_ranking_pmid):
    if result[0] in set(positive_docs):
        positive_docs_ranked.append((i,result[0],result[1]))
        
true_ranked = []
for i,pmid in enumerate(bm25_results):
    if pmid in set(positive_docs):
        true_ranked.append((i,pmid))

print(positive_docs_ranked)
print(true_ranked)

[(5, '24012954', 5.5485920906066895), (29, '9500553', 4.8889594078063965), (87, '21435002', 4.592401504516602), (112, '23842646', 4.525805473327637), (155, '18796434', 4.400295257568359), (342, '18528425', 4.139955043792725), (2244, '20841568', 3.1609909534454346)]
[(1, '9500553'), (29, '21435002'), (187, '24012954'), (309, '18528425'), (333, '23842646'), (610, '20841568'), (2354, '18796434')]


In [None]:
bm25_results