In [134]:
from utils import yaml_loader, load_model_weights
from os.path import join
import numpy as np
# load config
config = yaml_loader("config/Attn-BioDeepRank_7b.yaml")


## Since the objective is only to load Deeprank, lest only select the DeepRank configuration

cache_folder_name = config["cache_folder"]
prefix_name = config["corpora"]["name"]
# get deeprank config
deeprank_config = config["pipeline"][1]["DeepRank"]

In [135]:
from models.DeepRank import DeepRank

deeprank = DeepRank(cache_folder=cache_folder_name,
                    prefix_name=prefix_name,
                    **deeprank_config)


[LOAD FROM CACHE] Load tokenizer from /backup/IR/cache/bioasq_Regex.json
DEBUG created tokenizer bioasq_Regex
True False
[LOAD FROM CACHE] Load embedding matrix from /backup/IR/cache/embedding_BioWordVec_PubMed_MIMICIII_d200_bioasq_Regex


In [136]:
# execute the build and build network dependencies 

deeprank.build()
deeprank.build_network(**deeprank_config)

[TensorShape([Dimension(None), Dimension(13), Dimension(200)]), TensorShape([Dimension(None), Dimension(13), Dimension(101)])]


In [137]:
# manually load the weights
weight_file_name = "last_weights_"+deeprank.name+".h5"
print(weight_file_name)
load_model_weights(join(cache_folder_name,weight_file_name), deeprank.deeprank_model)

last_weights_DeepRank_bioasq_Regex_True_False_embedding_BioWordVec_PubMed_MIMICIII_d200_bioasq_Regex_selu_256_100_0.0001_3_2_2_adadelta_5_13_15_selu_58_100_3_3_.h5


## Minimalistic example to show the data format for inference

In [138]:

query_id = "5a43a139966455904c000008"
query = "What is measured through the NOMe-Seq methodology?"

positive_document = {"original": "CAME: identification of chromatin accessibility from nucleosome occupancy and methylome sequencing. Motivation\nChromatin accessibility plays a key role in epigenetic regulation of gene activation and silencing. Open chromatin regions allow regulatory elements such as transcription factors and polymerases to bind for gene expression while closed chromatin regions prevent the activity of transcriptional machinery. Recently, Methyltransferase Accessibility Protocol for individual templates-Bisulfite Genome Sequencing (MAPit-BGS) and nucleosome occupancy and methylome sequencing (NOMe-seq) have been developed for simultaneously profiling chromatin accessibility and DNA methylation on single molecules. Therefore, there is a great demand in developing computational methods to identify chromatin accessibility from MAPit-BGS and NOMe-seq.\n\n\nResults\nIn this article, we present CAME (Chromatin Accessibility and Methylation), a seed-extension based approach that identifies chromatin accessibility from NOMe-seq. The efficiency and effectiveness of CAME were demonstrated through comparisons with other existing techniques on both simulated and real data, and the results show that our method not only can precisely identify chromatin accessibility but also outperforms other methods.\n\n\nAvailability and Implementation\nCAME is implemented in java and the program is freely available online at http://sourceforge.net/projects/came/.\n\n\nContacts\njechoi@gru.edu or khryu@dblab.chungbuk.ac.kr.\n\n\nSupplementary information\nSupplementary data are available at Bioinformatics online.",
                     "title": "CAME: identification of chromatin accessibility from nucleosome occupancy and methylome sequencing.",
                     "id": "28035030"}

negative_document = {"original": "Built environment, physical activity, and obesity: what have we learned from reviewing the literature? To evaluate the growing literature on the built environment and physical activity/obesity, we conducted a review of review papers. Through a systematic search, we identified 36 reviews that met the inclusion criteria and evaluated these reviews based on key information provided, review methodology, and specificity regarding measurement. We also analyzed research gaps and areas of improvement identified by previous reviews and propose a research agenda. Future studies should develop complex conceptual and statistical models that include moderators and mediators, improve objective and perceived measures of the built environment, and strengthen evidence of causality through better research designs.",
                     "title": "Built environment, physical activity, and obesity: what have we learned from reviewing the literature?",
                     "id": "21983062"}


In [139]:
# prepare the list of 2 documents to be ranked by the Attn-DeepRank

_data = {"query": query, "documents": [positive_document, negative_document]}

data = {query_id: _data}

In [140]:

validation_scores = deeprank.inference(data_to_infer=data, train=False, **deeprank_config)["retrieved"]
print(list(map(lambda x:(x["id"],x["score"]),validation_scores[query_id]["documents"])))

[('28035030', 3.9531190395355225), ('21983062', -0.3284663259983063)]


In [87]:
# the positive document has a score a lot higher than a partially positive

## Passage extraction for the documents

In [141]:
from tensorflow.keras import backend as K

# ADD a function to the computational GRAPH to exctact the models attention weights

snippet_attention_tensor = deeprank.deeprank_model.layers[4].layers[4].attention_weights
q_term_attention_tensor = deeprank.deeprank_model.layers[5].layers[1].attention_weights

get_attn = K.function(deeprank.deeprank_model.input, snippet_attention_tensor + [q_term_attention_tensor])

In [142]:
# generate the batch data to fed to the new function

X, docs, query_id, query = next(deeprank.inference_generator(inference_data=data, train=False, **deeprank_config))


In [143]:
attn = get_attn(X)

$$\underset{M\times 1}{\vec{c}}=\sum_{u_k\ \in\ q}  \left (\underset{1\times 1}{a_{u_k}} \times \sum_{p_i\ \in\ D({u_k})}  \left (\underset{1\times 1}{a_{p_i}} \times \underset{M\times 1}{\vec{h}_{p_i}} \right ) \right ) = \sum_{u_k\ \in\ q}  \left ( \sum_{p_i\ \in\ D({u_k})}  \left (\underbrace{\underset{1\times 1}{a_{u_k}} \times \underset{1\times 1}{a_{p_i}}}_{global\ attention} \times \underset{M\times 1}{\vec{h}_{p_i}} \right ) \right )$$

In [144]:
snippet_attention = np.moveaxis(np.squeeze(np.array(attn[:-1])), 0, 1)
query_attention = np.array(attn[-1])

# global attention is extracted based on the following simplification
global_attention = snippet_attention * query_attention

In [145]:
TOP_SNIPPETS_PER_DOC = 5

for i in range(global_attention.shape[0]):

    g_s = global_attention[i].ravel()
    indexs = g_s.argsort()[-TOP_SNIPPETS_PER_DOC:][::-1]
    g_s_normalized = g_s[indexs]/sum(g_s[indexs])
    
    for j in range(len(g_s_normalized)):
        snippet_tokens = X[1][0][indexs[j]//5][indexs[j]%5]
        print(deeprank.tokenizer.sequences_to_texts([snippet_tokens])[0])


chromatin accessibility from mapit bgs and nome seq results in this article we present
approach that identifies chromatin accessibility from nome seq the efficiency and effectiveness of came
and nucleosome occupancy and methylome sequencing nome seq have been developed for simultaneously profiling
identify chromatin accessibility from mapit bgs and nome seq results in this article we
based approach that identifies chromatin accessibility from nome seq the efficiency and effectiveness of

approach that identifies chromatin accessibility from nome seq the efficiency and effectiveness of came
and nucleosome occupancy and methylome sequencing nome seq have been developed for simultaneously profiling
chromatin accessibility from mapit bgs and nome seq results in this article we present



## Example of application

In [146]:
from IPython.display import HTML as html_print

def red_percentage_print(s, percentage):
    rescale = 100-int(percentage*100)
    return "<text style=background-color:hsl(0,100%,{}%);>{}</text>".format(rescale, s)

def blue_percentage_print(s, percentage):
    rescale = 100-int(percentage*100)
    return "<text style=background-color:hsl(220,100%,{}%);>{}</text>".format(rescale, s)

def highlight_snippets(query, document, TOP_SNIPPETS = 5, TOP_QUERY = 5):
    query_id = "_manual"
    document = {"original": document,
                "title": "_",
                "id": "_"}
    
    _data = {"query": query, "documents": [document]}
    data = {query_id: _data}
    
    X, docs, query_id, query = next(deeprank.inference_generator(inference_data=data, train=False, **deeprank_config))
    
    attn = get_attn(X)
    snippet_attention = np.squeeze(np.array(attn[:-1]))
    query_attention = np.array(attn[-1])
    global_attention = snippet_attention * query_attention

    global_attention = global_attention.ravel()

    #top 5 snippet index
    snippet_indexs = global_attention.argsort()[-TOP_SNIPPETS:][::-1]
    snippet_attention_normalized = global_attention[snippet_indexs]/sum(global_attention[snippet_indexs])
    
    query_list_string = [deeprank.tokenizer.index_word[x] for x in X[0][0] if x != 0]
    
    query_attention = query_attention.ravel()
    TOP_QUERY = min(len(query_list_string),TOP_QUERY)
    top_5_q_terms = query_attention.argsort()[-TOP_QUERY:][::-1]
    highlight_terms = query_attention[top_5_q_terms]/sum(query_attention[top_5_q_terms])

    
    for count,index in enumerate(top_5_q_terms):
        if index>=len(query_list_string):
            continue
            
        query_list_string[index] = red_percentage_print(query_list_string[index],highlight_terms[count])

    doc_tokens = deeprank.tokenizer.texts_to_sequences([document["original"]])[0]
    doc_list_string = [deeprank.tokenizer.index_word[x]+" " for x in doc_tokens if x != 0]

    snippet_position = X[2][0].ravel()

    for count,index in enumerate(snippet_indexs):

        index = snippet_position[index]
        if index==-1:
            continue
        low_index = max(0,index-7)
        high_index = max(0,index+7)

        doc_list_string[low_index:high_index] = list(map(lambda x:blue_percentage_print(x, snippet_attention_normalized[count]), doc_list_string[low_index:high_index]))


    return query_list_string, doc_list_string

In [147]:

query_list_string, doc_list_string = highlight_snippets(query, positive_document["original"])

html_print("<h3>Tokenized query</h3><p>"+" ".join(query_list_string)+"</p><h3>Tokenized document</h3><p>"+"".join(doc_list_string)+"</p>")

#### The previous cell uses html in order to highlight the snippet. The following image correspond to the correct render of the cell with the html tags. (this image is just a precausion to give a visulization even if the render fails)


![title](output.PNG)