In [1]:
from utils import yaml_loader, load_model_weights
from os.path import join
# Read the experiment configuration from its YAML file.
config = yaml_loader("config/attention_sw_split_6b.yaml")

# Cache directory and corpus name, both taken from the config.
cache_folder_name, prefix_name = config["cache_folder"], config["corpora"]["name"]

# Settings for the DeepRankDotSplit model: second entry of the "pipeline" list.
deeprank_config = config["pipeline"][1]["DeepRankDotSplit"]

In [2]:
from models.DeepRankDotSplit import DeepRankDotSplit

# Instantiate the model with the cache location, the corpus prefix and every
# model-specific option read from the config.
deeprank = DeepRankDotSplit(
    cache_folder=cache_folder_name,
    prefix_name=prefix_name,
    **deeprank_config
)

# Token id the tokenizer assigns to "."; stored on the model as `split_token`.
deeprank.split_token = deeprank.tokenizer.texts_to_sequences(["."])[0][0]

[LOAD FROM CACHE] Load tokenizer from /backup/IR/cache/pubmed_2018_Regex3.json
DEBUG created tokenizer pubmed_2018_Regex3
True True
[LOAD FROM CACHE] Load embedding matrix from /backup/IR/cache/embedding_BioWordVec_PubMed_MIMICIII_d200_pubmed_2018_Regex3


In [22]:
# Build the model, then build its network with the same DeepRankDotSplit
# settings that were passed to the constructor.
deeprank.build()
deeprank.build_network(**deeprank_config)

[TensorShape([Dimension(None), Dimension(13), Dimension(200)]), TensorShape([Dimension(None), Dimension(13), Dimension(58)])]


In [29]:
# Load previously saved weights into the Keras model held by `deeprank`.
# The file is looked up under <cache_folder>/deeprank_xvalidation_weights/.
weight_file_name = "0_last_weights_DeepRank_bioasq_Regex_embedding_BioWordVec_PubMed_MIMICIII_d200_bioasq_Regex_selu_False_58_100_3_3_256_adadelta_2_120_0.0001_5_2_3_selu_15_13_5_.h5"
load_model_weights(
    join(cache_folder_name, "deeprank_xvalidation_weights", weight_file_name),
    deeprank.deeprank_model,
)

In [33]:
import pickle
import json

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.

# Validation folds; a later cell uses element 0 as the list of query ids.
with open("validation_fold.p","rb") as f:
    validation = pickle.load(f)

# Cached BM25 retrieval results; a later cell indexes them as bm25_top["train"][query_id].
with open("/backup/IR/cache/full_data_validation_BM25_with_bioasq_stem_Bllip_2500_retrieved_results.p","rb") as f:
    bm25_top = pickle.load(f)

# Full dataset: an iterable of records with "query_id", "query" and "documents".
with open("/backup/BioASQ-training7b/full_data.json", "r") as f:
    dataset = json.load(f)

# Map each query id straight to its value. The previous `{x["query"]}` form was
# a set literal: it wrapped every query in a one-element set, and would raise
# TypeError (unhashable) for a list of documents.
queries = {entry["query_id"]: entry["query"] for entry in dataset}
gold_standard = {entry["query_id"]: entry["documents"] for entry in dataset}
    

In [41]:
# Query ids of the first validation fold.
validation_keys = validation[0]

# For every validation query: its positive plus partially positive document ids,
# together with the query itself.
# NOTE(review): `training_data` is not defined in the visible part of this
# notebook - confirm where it comes from before running on a fresh kernel.
k_fold_validation_data = dict(
    (
        query_id,
        {
            "documents": (
                training_data["train"][query_id]["positive_ids"]
                + training_data["train"][query_id]["partially_positive_ids"]
            ),
            "query": queries[query_id],
        },
    )
    for query_id in validation_keys
)


In [45]:
# Validation input: the BM25 results of every validation query.
k_fold_validation_data = {
    query_id: bm25_top["train"][query_id]
    for query_id in validation_keys
}

# Run the model in inference mode and keep only the "retrieved" part of its output.
validation_scores = deeprank.inference(
    data_to_infer=k_fold_validation_data,
    train=False,
    **deeprank_config
)["retrieved"]

print("Metrics on the full validation set")
# Sizes of the scored set, the input set and the gold standard, one per line.
print(len(validation_scores), len(k_fold_validation_data), len(gold_standard), sep="\n")
deeprank.show_evaluation(validation_scores, gold_standard)

Metrics on the full validation set
544
544
2747


NameError: name 'self' is not defined

In [3]:
# test

# A hand-made example: one short query and one abstract.
query = "sequences enhancers"
abstract = "The mechanism of specific expression of glutathione transferase P gene during hepatocarcinogenesis of the rat has been investigated by cloning the gene and determining the upstream regulatory sequences. Two enhancers and a silencer are located within 3 kb upstream of the promoter. The stronger enhancer designated GPEI has two TPA (12-O-tetradecanoyl phorbol 13-acetate)-response element (TRE)-like sequences arranged in a palindrome at a 3 base pairs spacing. This special combination was found to form a very strong enhancer which could act efficiently even in F9 cells where the collagenase enhancer with a singlet TRE cannot work due to the low c-jun content. Whether this structure is operating with a very low concentration of c-jun/c-fos heterodimer or with any other proteins remains to be determined. These findings suggest that new and more efficient enhancers evolve by a combination of basic enhancer elements. The silencer region consists of several sequences that can bind specific protein(s) and works cooperatively."

# One query (id 0) paired with a single candidate document (id 1, empty title).
data = {
    0: {
        "query": query,
        "documents": [
            {"id": 1, "original": abstract, "title": ""},
        ],
    },
}
gen = deeprank.inference_generator(data, train=True)

In [4]:
# Take the first item produced by the inference generator built from the test example.
X = next(gen)

In [5]:
# Shape of the second array inside the first element of X.
# The recorded output is (1, 13, 5, 15); 13, 5 and 15 are also the Q, P and S
# values passed to test_split further down - presumably (batch, Q, P, S), TODO confirm.
X[0][1].shape

(1, 13, 5, 15)

In [28]:
# Index 0 on the first two axes and index 2 on the third, leaving one vector of
# token ids (it is decoded back to text in the next cell).
p = X[0][1][0][0][2]

In [29]:
# Turn the token ids in `p` back into text to inspect the extracted snippet.
deeprank.tokenizer.sequences_to_texts([p])[0]

'silencer region consists sequences bind specific protein s works cooperatively'

In [18]:
# Tokenize the test query and abstract with the model's tokenizer.
# The results are passed to test_split, which iterates over them and compares
# their elements with each other - presumably sequences of token ids.
q_tokenized = deeprank.tokenizer.tokenize_query(query)
a_tokenized = deeprank.tokenizer.tokenize_article(abstract)

In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np 

def test_split(self, tokenized_query, tokenized_article, Q, P, S):

    snippets = []
    snippets_position = []

    half_size = S//2

    # O(n^2) complexity, probably can do better with better data struct TODO see if is worthit
    for query_token in tokenized_query:
        snippets_per_token = []
        snippets_per_token_position = []
        if query_token != 0:  # jump padded token
            for i, article_token in enumerate(tokenized_article):
                if article_token == query_token:

                    lower_index = i-half_size
                    lower_index = max(0, lower_index)

                    higher_index = i+half_size
                    higher_index = min(len(tokenized_article), higher_index)

                    sentence = []

                    for _i in range(lower_index, higher_index):
                        token = tokenized_article[_i]
                        print(token, self.split_token)
                        if token == self.split_token:
                            if _i < i:
                                sentence = []
                                continue
                            else:
                                break

                        sentence.append(token)

                    snippets_per_token.append(sentence)
                    snippets_per_token_position.append(i)

        if len(snippets_per_token) == 0:  # zero pad
            snippets.append(np.zeros((P, S), dtype=np.int32))
            snippets_position.append(np.zeros((P), dtype=np.int32) + self.SNIPPET_POSITION_PADDING_VALUE)
            continue

        max_snippets_len = min(P, len(snippets_per_token))

        # snippets in matrix format
        # pad
        snippets_per_token = pad_sequences(snippets_per_token, maxlen=S, padding="post")
        # fill the gaps
        _temp = np.zeros((P, S), dtype=np.int32)
        _temp[:max_snippets_len] = snippets_per_token[:max_snippets_len]
        snippets.append(_temp)

        # snippets_position in matrix format
        # pad
        snippets_per_token_position = pad_sequences([snippets_per_token_position], maxlen=P, padding="post", value=self.SNIPPET_POSITION_PADDING_VALUE)[0]
        snippets_position.append(snippets_per_token_position)

    return snippets, snippets_position

# Token id of "." according to the model tokenizer; test_split reads it as `self.split_token`.
deeprank.split_token = deeprank.tokenizer.texts_to_sequences(["."])[0][0]

# Run the snippet extraction on the tokenized example, passing the model as `self`.
test_split(
    deeprank,
    tokenized_query=q_tokenized,
    tokenized_article=a_tokenized,
    Q=13,
    P=5,
    S=15,
)


340 2
282 2
3755 2
107 2
1897 2
3735 2
1083 2
823 2
2 2
5771 2
374 2
2287 2
115 2
1920 2
20644 2
386 2
823 2
7928 2
39672 2
35 2
1224 2
2153 2
9511 2
1366 2
5362 2
1328 2
2 2
29395 2
390 2
2922 2
823 2
2267 2
117 2
64 2
188 2
7079 2
18288 2
107 2
1897 2
3735 2
1083 2
823 2
2 2
44 2
11634 2
29395 2
1236 2
35 2
3082 2
3735 2
1114 2
302 2
2 2
192 2
195 2
133 2
62 2
1151 2
11634 2
8992 2
514 2
1366 2
5362 2
1328 2
2 2


([array([[  340,   282,  3755,   107,  1897,  3735,  1083,   823,     0,
              0,     0,     0,     0,     0,     0],
         [ 5771,   374,  2287,   115,  1920, 20644,   386,   823,  7928,
          39672,    35,  1224,  2153,  9511,     0],
         [29395,   390,  2922,   823,  2267,   117,    64,   188,  7079,
          18288,     0,     0,     0,     0,     0],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0]], dtype=int32),
  array([[   44, 11634, 29395,  1236,    35,  3082,  3735,  1114,     0,
              0,     0,     0,     0,     0,     0],
         [  192,   195,   133,    62,  1151, 11634,  8992,   514,  1366,
           5362,  1328,     0,     0,     0,     0],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,