In [1]:
import json
from collections import defaultdict
from elasticsearch import Elasticsearch, helpers
from mmnrm.utils import set_random_seed, overlap, index_from_list, load_model
from mmnrm.dataset import TrainCollectionV2, TestCollectionV2, BaseCollection

from nltk.tokenize.punkt import PunktSentenceTokenizer

from utils import *

# Elasticsearch client handed to execute_search in the retrieval cell.
# NOTE(review): the endpoint is hard-coded; consider reading it from configuration.
es = Elasticsearch(["http://193.136.175.98:8125"])

# Name of the index handed to execute_search in the retrieval cell.
index_name = "bioasq_9b"


KeyboardInterrupt: 

In [2]:
def load_data(batch_index, training_data_path = "training9b_wDates.json"):
    """
    Load the questions and split them into a training and a validation part.

    The validation part holds the questions whose id is listed in
    `yearly_data/8B{batch_index+1}_golden.json`; every other question loaded
    from `training_data_path` goes to the training part.

    Parameters
    ----------
    batch_index : int
        Position (0..4) of the 8B golden batch used for validation.
    training_data_path : str
        File handed to `load_queries` (the "body" field is mapped to "query").

    Returns
    -------
    (train_queries, train_gs, validation_queries, validation_gs)
        `*_queries` are lists of dicts restricted to the keys "id", "query"
        and "limit_date". `*_gs` map query id -> {doc id -> [snippet dicts]},
        keeping only documents with at least one non-empty snippet and only
        queries with at least one such document.

    Side effects: prints the total number of 8B golden questions and, for each
    part, the number of dropped queries and of documents without snippets.
    """

    def get_snippets_by_docid(snippets, doc_id):
        # snippets that belong to `doc_id` and span at least one character
        return [snippet for snippet in snippets
                if snippet["document"].split("/")[-1] == doc_id
                and (snippet["offsetInEndSection"] - snippet["offsetInBeginSection"]) > 0]

    def separate_queries_goldstandard(query_list, additional_keys=None):
        # None instead of a mutable default argument
        kept_keys = ["id", "query"] + list(additional_keys or [])

        clean_queries = []
        gs = {}
        total_empty = 0        # documents that have no usable snippet
        total_empty_query = 0  # queries left without any document

        for question in query_list:
            gs[question["id"]] = {}
            for doc_url in question["documents"]:
                doc_id = doc_url.split("/")[-1]
                snippets_of_docid = get_snippets_by_docid(question["snippets"], doc_id)

                if len(snippets_of_docid) > 0:
                    gs[question["id"]][doc_id] = snippets_of_docid
                else:
                    total_empty += 1

            if len(gs[question["id"]]) > 0:
                clean_queries.append({key: question[key] for key in kept_keys})
            else:
                # no document with snippets: drop this query
                total_empty_query += 1
                del gs[question["id"]]

        print("Num empty queries",total_empty_query)
        print("Num docs without snippets",total_empty)

        return clean_queries, gs

    queries = load_queries(training_data_path, maps=[("body","query")])

    # question ids of each of the five 8B golden batches
    test_8b = []
    for i in range(1,6):
        with open(f"yearly_data/8B{i}_golden.json","r") as f:
            test_8b.append({q["id"] for q in json.load(f)["questions"]})

    print(sum([len(x) for x in test_8b]))

    validations_ids = test_8b[batch_index]
    train_ids = {x["id"] for x in queries} - validations_ids

    train_data = subset_byId(queries, train_ids)
    validation_data = subset_byId(queries, validations_ids)

    train_data_queries, train_data_gs = separate_queries_goldstandard(train_data, additional_keys=["limit_date"])
    validation_data_queries, validation_data_gs = separate_queries_goldstandard(validation_data, additional_keys=["limit_date"])

    return train_data_queries, train_data_gs, validation_data_queries, validation_data_gs

In [3]:
# Build the train/validation split, validating on the first 8B batch (batch_index=0).
train_data_queries, train_data_gs, validation_data_queries, validation_data_gs = load_data(0)

500
Num empty queries 0
Num docs without snippets 6863
Num empty queries 0
Num docs without snippets 13


In [4]:
# BM25 parameters (k1, b) and the number of documents requested per query.
K1,BETA,TOP_N = (0.4, 0.14, 250)
# Run the retrieval for every training query.
# NOTE(review): execute_search is presumably provided by `from utils import *`;
# its return format is not visible in this notebook.
query_results = execute_search(es, train_data_queries, TOP_N, index_name, k1=K1, b=BETA)

Setting the k1 and b for BM25
The inquery limit_date will be used
Running query: 3640

In [38]:
import random 
import numpy as np
import tensorflow as tf

class TrainSnippetsCollection(BaseCollection):
    """
    Training collection that samples (query, positive document, sentence labels,
    negative document) batches, where every document is split into sentences and
    the sentences of a positive document carry a relevance label derived from the
    gold-standard snippets.
    """
    def __init__(self, 
                 query_list, 
                 goldstandard, 
                 query_docs_subset = None,
                 find_relevant_snippets = None, # default use the self contained function 
                 use_soft_label = True,
                 verbose=True, 
                 **kwargs):
        """
        query_list - must be a list with the following format :
                     [
                         {
                             id: <str>
                             query: <str>
                         },
                         ...
                     ]
                     NOTE: queries that end up skipped are deleted from this
                     list IN PLACE (the caller's list is modified).
        
        goldstandard - must be a dictionary with the following format:
                       {
                           <query id>: {
                               <doc id>: [{<bioasq snippet data>}, ...],
                               ...
                           },
                           ...
                       }
                       The default labelling function reads the snippet keys
                       "text", "beginSection", "offsetInBeginSection" and
                       "offsetInEndSection".
                       
        query_docs_subset (optional) - documents previously retrieved for each query
                                       {
                                           <query id>: [{
                                               id: <str>
                                               text: <str>
                                               title: <str>
                                               score: <float>
                                           }, ...],
                                           ...
                                       }
                                       When None nothing is built and the state is
                                       expected in the "sub_set_goldstandard" and
                                       "collection" kwargs.

        find_relevant_snippets (optional) - callable(doc_to_sentence, gs_doc_snippets)
                                            returning one label per sentence; defaults
                                            to the built-in matching function.

        use_soft_label - when True the labels are floats (noise in [0, 0.3) for
                         unmatched sentences, [0.5, 1] for matched ones), otherwise 0/1.

        verbose - print statistics about the built sub set.

        **kwargs - forwarded to BaseCollection.
        """
        super(TrainSnippetsCollection, self).__init__(**kwargs)
        self.query_list = query_list # [{query data}]
        self.goldstandard = goldstandard # {query_id: {doc_id: [snippets]}}
        self.use_soft_label = use_soft_label
        
        # default self contained function association
        self.find_relevant_snippets = self.__find_relevant_snippets if find_relevant_snippets is None else find_relevant_snippets
        
        self.verbose = verbose
        
        # previously built state (note: kwargs were already forwarded to the base class above)
        if "sub_set_goldstandard" in kwargs:
            self.sub_set_goldstandard = kwargs.pop("sub_set_goldstandard")
        else:
            self.sub_set_goldstandard = None
        
        if "collection" in kwargs:
            self.collection = kwargs.pop("collection")
        else:
            self.collection = None
        
        self.skipped_queries = []

        self.__build(query_docs_subset)
    

    
    def __find_relevant_snippets(self, doc_to_sentence, gs_doc_snippets):
        """
        Compute one relevance label per sentence of a relevant document.

        doc_to_sentence - list of sentence dicts ("text", "start", "end", "is_title")
        gs_doc_snippets - gold-standard snippets of that document

        Returns a list with len(doc_to_sentence) numbers. Sentences without a
        match keep their initial value (random noise in [0, 0.3) with soft labels,
        0 otherwise); matched sentences get 0.5 + 0.5 * matched-word ratio with
        soft labels, 1 otherwise. A later gold snippet overwrites the label set by
        an earlier one for the same sentence.
        """
        
        if self.use_soft_label:
            positive_snippet_index = [  random.random()*0.3 for _ in range(len(doc_to_sentence))]
        else:
            positive_snippet_index = [  0 for _ in range(len(doc_to_sentence))]
            
        # a sentence is relevant if it contains some portion of the gs_snippet

        for snippet in gs_doc_snippets:
            snippet_words = snippet["text"].split(" ")
            bag_do_word_snippet = set(snippet_words)
            
            for index, doc_snippet in enumerate(doc_to_sentence):
                
                # only compare title snippets with the title sentence and
                # abstract snippets with abstract sentences
                if not((snippet["beginSection"] == "title") ^ doc_snippet["is_title"]):
                    # offset overlap, normalised by the sentence length in characters
                    # (assumes mmnrm's `overlap` returns the size of the shared span — TODO confirm)
                    _overlap = overlap((doc_snippet["start"], doc_snippet["end"]),
                                   (snippet["offsetInBeginSection"], snippet["offsetInEndSection"]))/(doc_snippet["end"]+1-doc_snippet["start"])
                    
                    # isolate line breaks and full stops as separate "words"
                    doc_snippet_words = doc_snippet["text"].replace('\n',' \n ').replace('.',' . ').split(" ")
                    bag_do_word_doc_snippet = set(doc_snippet_words)
                    
                    # fraction of the distinct sentence words that also occur in the gold snippet
                    words_overlap = 1-(len(bag_do_word_doc_snippet-bag_do_word_snippet)/len(bag_do_word_doc_snippet))
                    
                   
                    # minimum length of a word run to be accepted; shorter when the offsets already overlap
                    threashold = 6 if _overlap>0 else 9
                    
                    # TODO (translated): switch this `if` to words_overlap and use _overlap
                    # as the threshold of minimum words to accept the sentence as gold standard
                    if words_overlap>0:
                        
                        # self correcting algorithm: scan the sentence words against the
                        # gold snippet words looking for a run of matching words
                        index_doc_snippet = 0
                        index_snippet = 0
                        start_index = -1
                        end_index = -1
                        early_stop = False
                        while index_doc_snippet<len(doc_snippet_words) :
                            
                            index_snippet = 0
                            while index_snippet<len(snippet_words) and index_doc_snippet<len(doc_snippet_words):

                                if doc_snippet_words[index_doc_snippet] == snippet_words[index_snippet]:
                                    # match: open a run if none is open and advance in the sentence
                                    if start_index == -1:
                                        start_index = index_doc_snippet
                                    index_doc_snippet += 1 
                                elif start_index != -1 and (index_doc_snippet-start_index)<threashold:
                                    # mismatch on a run that is still too short: discard the run
                                    start_index = -1
                                elif start_index != -1:
                                    # mismatch after a long enough run: accept it and stop scanning
                                    early_stop = True
                                    break
                                index_snippet += 1
                                                            
                            index_doc_snippet +=1
                            
                            if early_stop:
                                break
                        
                        # a run that reached the end of the scan must also satisfy the threshold
                        if (index_doc_snippet-start_index)<threashold:
                            start_index = -1
                            end_index = -1
                        
                        if start_index != -1:
                            end_index = index_doc_snippet - 1
                        
                        # length of the accepted run relative to the sentence length (0 when no run)
                        snp_relevance = (end_index-start_index)/len(doc_snippet_words)
                        
                        if snp_relevance>0:
                            
                            if self.use_soft_label:
                                snp_relevance = snp_relevance*0.5+0.5
                            else:
                                snp_relevance = 1
                            
                            positive_snippet_index[index] = snp_relevance
                
                        
        # one label per sentence (floats with soft labels, 0/1 otherwise)
        return positive_snippet_index
    
    def __build(self, query_docs_subset):
        """
        Build `self.collection` (doc id -> list of sentence dicts) and
        `self.sub_set_goldstandard` (query id -> {1: [relevant docs], 0: [other docs]})
        from the retrieved documents, then delete the skipped queries from
        `self.query_list` in place. Does nothing when `query_docs_subset` is None.
        """
        
        if query_docs_subset is None:
            return 
        
        self.sub_set_goldstandard = {}
        self.collection = {}
        progress = 0
        # build the sentence splitter once instead of once per retrieved document
        sentence_tokenizer = PunktSentenceTokenizer()
        # filter the goldstandard
        for _id, relevance in query_docs_subset.items():
            print("running query:", progress, end="\r")
            progress+=1
            if _id not in self.goldstandard:
                self.skipped_queries.append(_id)
                continue
            
            # do not use queries without true positives
            # this adds an overhead that can be avoided by refactoring the following for loop!
            unique_relevants = set(self.goldstandard[_id].keys())
            if all([doc["id"] not in unique_relevants for doc in relevance ]):
                self.skipped_queries.append(_id)
                continue
            
            self.sub_set_goldstandard[_id] = defaultdict(list)
            
            for doc in relevance: # for each document that was retrieved
                
                # Splitting and saving the document
                doc_to_sentences = []
                
                for _itter, position in enumerate(sentence_tokenizer.span_tokenize(doc["text"])):
                    start, end = position 
                    _text = doc["text"][start:end]
                    # the first sentence of doc["text"] is treated as the title
                    is_title = True
                    if _itter>0: # fix the start and end position for the abstract
                        # NOTE(review): this shifts by len(title)-1, while the exploratory
                        # cells of this notebook subtract len(title)+1; confirm which one
                        # matches the BioASQ abstract offsets.
                        start = start-(len(doc["title"])-1)
                        end = end-(len(doc["title"])-1)
                        is_title = False
                        
                    _doc = {"text":_text,
                            "start":start,
                            "end":end-1, # inclusive end offset
                            "is_title": is_title}
                    
                    doc_to_sentences.append(_doc)
                
                self.collection[doc["id"]] = doc_to_sentences
                
                # goldstandard should store the doc_id and the labels of its sentences
                if doc["id"] in unique_relevants: # if it is relevant document
                    _doc_snippets = {
                        "id": doc["id"],
                        "score":doc["score"],
                        "pos": doc_to_sentences,
                        "snippet_index": self.find_relevant_snippets(doc_to_sentences, self.goldstandard[_id][doc["id"]])
                    }
                    self.sub_set_goldstandard[_id][1].append(_doc_snippets)
                else:
                    # if the document is not relevant we do not assume the relevance of the snippets
                    
                    _doc_snippets = {
                        "doc_text": doc["text"],
                        "id": doc["id"],
                        "score":doc["score"]
                    }
                    self.sub_set_goldstandard[_id][0].append(_doc_snippets)
                
        # remove the skipped queries from the data
        index_to_remove = []
        
        for skipped in self.skipped_queries:
            _index = index_from_list(self.query_list, lambda x: x["id"]==skipped)
            if _index>-1:
                index_to_remove.append(_index)
        # start removing from the tail so the remaining indexes stay valid
        index_to_remove.sort(reverse=True)
        print("index to remove length", len(index_to_remove))
        for _index in index_to_remove:
            del self.query_list[_index]
        
        # stats
        if self.verbose:
            # the per relevance statistics are undefined when every query was skipped
            if len(self.sub_set_goldstandard)>0:
                max_keys = max(map(lambda x:max(x.keys()), self.sub_set_goldstandard.values()))
            
                for k in range(max_keys+1):
                    print("Minimum number of relevance type({}) in the queries of the goldstandard sub set: {}".format(k, min(map(lambda x: len(x[k]), self.sub_set_goldstandard.values()))))
                
                    print("Mean number of relevance type({}) in the queries of the goldstandard sub set: {}".format(k, sum(map(lambda x: len(x[k]), self.sub_set_goldstandard.values()))/len(self.sub_set_goldstandard)))
            
            print("Sub Collection size", len(self.collection))
            print("Number of skipped question, due to lack of true positives", len(self.skipped_queries))
    
    def get_steps(self):
        """Number of batches per epoch: total number of positive documents // batch size."""

        # an epoch will be defined with respect to the total number of positive pairs
        total_positives = sum(map(lambda x: sum([ len(x[k]) for k in x.keys() if k>0]), self.sub_set_goldstandard.values()))
          
        return total_positives//self.b_size

    def _generate(self, collection=None, **kwargs):
        """
        Endless generator of training batches.

        Every batch samples `self.b_size` distinct queries and, for each one, a
        random positive and a random negative document. Yields the tuple
        (queries, positive docs, positive sentence labels, negative docs) where the
        docs are dicts {"snippets": [sentence dicts], "score": <float>}.
        The sentence labels are always a 1-D object array holding one list per
        document, whatever the number of sentences of each document.
        """
        
        # sanity check
        assert not (self.sub_set_goldstandard is None and collection is None)
        
        training_data = self.sub_set_goldstandard
        
        # TODO this condition is dependent on the previous
        if collection is None:
            collection = self.collection
            
        while True:
            # TODO check if it is worth it to use numpy to vectorize these operations
            
            y_query = []
            y_pos_doc = []
            y_pos_doc_snippet_label = []
            y_neg_doc = []

            # build $batch_size triples and yield
            query_indexes = random.sample(population=list(range(len(self.query_list))), k=self.b_size)
            for q_i in query_indexes:
                selected_query = self.query_list[q_i]
                
                # index of the positive documents
                relevance_group = 1
                
                _pos_len = len(training_data[selected_query["id"]][relevance_group])
                pos_doc_index = random.randint(0, _pos_len-1) if _pos_len>1 else 0
                pos_doc_id = training_data[selected_query["id"]][relevance_group][pos_doc_index]
                pos_doc = {"snippets":collection[pos_doc_id["id"]], "score":pos_doc_id["score"]}
                pos_doc_snippet_label = pos_doc_id["snippet_index"]
                
                _neg_len = len(training_data[selected_query["id"]][relevance_group-1])
                neg_doc_index = random.randint(0, _neg_len-1) if _neg_len>1 else 0
                neg_doc_id = training_data[selected_query["id"]][relevance_group-1][neg_doc_index]
                neg_doc = {"snippets":collection[neg_doc_id["id"]], "score":neg_doc_id["score"]}
                
                y_query.append(selected_query["query"])
                y_pos_doc.append(pos_doc)
                y_pos_doc_snippet_label.append(pos_doc_snippet_label)
                y_neg_doc.append(neg_doc)
            
            # np.array() on these lists is shape-unstable: a 2-D float array when every
            # document has the same number of sentences, a ragged object array otherwise
            # (an error on recent NumPy). Fill an object array element by element instead.
            snippet_labels = np.empty(len(y_pos_doc_snippet_label), dtype=object)
            for label_index, labels in enumerate(y_pos_doc_snippet_label):
                snippet_labels[label_index] = labels
            
            yield (np.array(y_query), np.array(y_pos_doc), snippet_labels, np.array(y_neg_doc))
    
    def get_config(self):
        """Return the base class configuration extended with the state of this collection."""
        super_config = super().get_config()
        
        data_json = {
            "query_list": self.query_list,
            "goldstandard": self.goldstandard,
            "verbose": self.verbose,
            "sub_set_goldstandard": self.sub_set_goldstandard,
            "collection": self.collection,
            "use_soft_label": self.use_soft_label,
        } 
        
        return dict(data_json, **super_config) #fast dict merge
    
    
def build_data_generators(tokenizer, queries_sw=None, docs_sw=None):
    """
    Build the generator function that tokenizes the raw training batches.

    Parameters
    ----------
    tokenizer : object exposing `texts_to_sequences(texts) -> list of token lists`
    queries_sw : container of tokens removed from every query (optional)
    docs_sw : container of tokens removed from every document sentence (optional)

    Returns
    -------
    (train_generator, None)
        `train_generator(data_generator)` consumes batches
        (query, pos_docs, pos_label, neg_docs) and yields
        (tokenized_query, pos_docs, pos_label, neg_docs), where every document
        dict received a "tokens" entry (one token list per sentence).
        Batches containing a positive document without any token are dropped.
    """

    def maybe_tokenize(documents):
        # tokenize only once, the result is cached in the document dict itself
        if "tokens" not in documents:
            documents["tokens"] = tokenizer.texts_to_sequences(map(lambda x: x["text"],documents["snippets"]))
            if docs_sw is not None:
                # build new lists: rebinding the loop variable would leave the tokens untouched
                documents["tokens"] = [[token for token in tokenized_sentence if token not in docs_sw]
                                       for tokenized_sentence in documents["tokens"]]

    def train_generator(data_generator):
        # iterate instead of calling next(): the generator then ends cleanly when
        # the source is exhausted instead of raising RuntimeError (PEP 479)
        for query, pos_docs, pos_label, neg_docs in data_generator:

            # tokenization, this can be cached for efficiency purposes NOTE!!
            tokenized_query = tokenizer.texts_to_sequences(query)

            if queries_sw is not None:
                # filter every query of the batch and keep the batch structure
                tokenized_query = [[token for token in tokens if token not in queries_sw]
                                   for tokens in tokenized_query]

            saveReturn = True

            for batch_index in range(len(pos_docs)):

                # tokenizer with cache in [batch_index][tokens]
                maybe_tokenize(pos_docs[batch_index])

                # a positive document without tokens cannot be used: reject the
                # whole batch and wait for the next sample (quick fix, to be redone)
                if all([ len(sentence)==0  for sentence in pos_docs[batch_index]["tokens"]]):
                    saveReturn = False
                    break

                maybe_tokenize(neg_docs[batch_index])

            if saveReturn: # False when the batch was rejected
                yield tokenized_query, pos_docs, pos_label, neg_docs

    return train_generator, None

def train_generator_for_model(model):
    """
    Build the input transformation that turns raw training batches into the
    padded arrays consumed by `model`.

    Parameters
    ----------
    model : object exposing `savable_config["model"]` (with the keys
            "max_q_terms", "max_passages" and "max_p_terms") and `tokenizer`.

    Returns
    -------
    train_generator : callable(data_generator) -> generator yielding
        ([query, pos_docs, pos_docs_mask, pos_snippets_labels],
         [query, neg_docs, neg_docs_mask])

    Raises
    ------
    KeyError
        When `model.savable_config` has no "model" entry (previously this only
        surfaced later as a NameError on the first generated batch).
    """

    if "model" not in model.savable_config:
        raise KeyError("model.savable_config must contain a 'model' entry")
    cfg = model.savable_config["model"]
    
    train_gen, test_gen = build_data_generators(model.tokenizer)
    
    def pad_tokens(x, max_len, dtype='int32'):
        # pad/truncate every token sequence at the end to max_len, filling with 0
        return tf.keras.preprocessing.sequence.pad_sequences(x, 
                                                             maxlen=max_len,
                                                             dtype=dtype, 
                                                             padding='post', 
                                                             truncating='post', 
                                                             value=0)

    def pad_sentences(x, max_lim, dtype='int32'):
        # keep at most max_lim sentences and complete with empty sentences
        return x[:max_lim] + [[]]*(max_lim-len(x))
    
    def pad_labels(x, max_lim, dtype='int32'):
        # keep at most max_lim labels and complete with 0.
        # list(...) matters: when the labels arrive as a numpy row, `+` would be an
        # element-wise addition (broadcast error) instead of a concatenation
        return list(x[:max_lim]) + [0]*(max_lim-len(x))
    
    def maybe_padding(document, labels = None):
        # "tokens" is still a list only when the document was not padded yet
        if isinstance(document["tokens"], list):
            #overflow prevention
            bounded_doc_passage = min(cfg["max_passages"],len(document["tokens"]))
            # True for the real sentences, False for the padding ones
            document["sentences_mask"] = [True] * bounded_doc_passage + [False] * (cfg["max_passages"]-bounded_doc_passage)
            document["tokens"] = pad_tokens(pad_sentences(document["tokens"], cfg["max_passages"]), cfg["max_p_terms"])
            if labels is not None:
                document["sentences_labels"] = pad_labels(labels, cfg["max_passages"])
            
    def train_generator(data_generator):
 
        for query, pos_docs, pos_label, neg_docs in train_gen(data_generator):
            
            query = pad_tokens(query, cfg["max_q_terms"])
            
            pos_docs_array = []
            pos_snippets_labels = []
            pos_docs_mask_array = []
            neg_docs_array = []
            neg_docs_mask_array = []
            
            # pad docs, use cache here
            for batch_index in range(len(pos_docs)):
                maybe_padding(pos_docs[batch_index], pos_label[batch_index])
                pos_docs_array.append(pos_docs[batch_index]["tokens"])
                pos_snippets_labels.append(pos_docs[batch_index]["sentences_labels"])
                pos_docs_mask_array.append(pos_docs[batch_index]["sentences_mask"])
                maybe_padding(neg_docs[batch_index])
                neg_docs_array.append(neg_docs[batch_index]["tokens"])
                neg_docs_mask_array.append(neg_docs[batch_index]["sentences_mask"])
            
            yield [query, np.array(pos_docs_array), np.array(pos_docs_mask_array), np.array(pos_snippets_labels)], [query, np.array(neg_docs_array), np.array(neg_docs_mask_array)]
            
    return train_generator

In [23]:
# Build the sentence-level training collection from the retrieved documents,
# using soft labels and batches of 32 queries.
# `use_relevance_groups` is not a named parameter of TrainSnippetsCollection, so
# it travels through **kwargs to BaseCollection (NOTE(review): confirm it is needed).
# NOTE: the constructor deletes the skipped queries from `train_data_queries` in place.
t_collection = TrainSnippetsCollection(train_data_queries, 
                                   train_data_gs, 
                                   query_results,
                                   use_soft_label = True,
                                   use_relevance_groups=False)\
                            .batch_size(32)

index to remove length 0
Minimum number of relevance type(0) in the queries of the goldstandard sub set: 189
Mean number of relevance type(0) in the queries of the goldstandard sub set: 243.6429398148148
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 6.357060185185185
Sub Collection size 654830
Number of skipped question, due to lack of true positives 187


In [None]:
# Load a previously trained model; its savable_config and tokenizer are read by
# train_generator_for_model.
model = load_model("trained_models/earthy-glade-11_val_collection0_map@10")

In [39]:


# Build the tokenize + pad transformation for this model ...
train_input_generator = train_generator_for_model(model)

# ... and attach it to the collection so its generator yields model-ready arrays.
t_collection.set_transform_inputs_fn(train_input_generator)
#t_collection.save("")

<__main__.TrainSnippetsCollection at 0x7fe0004a36a0>

In [41]:
# Draw one transformed batch: the positive and the negative model inputs.
Y_pos, Y_neg = next(t_collection.generator())




In [42]:
# Inspect the positive inputs of the sampled batch.
Y_pos

[array([[1328,   11,    1, ...,    0,    0,    0],
        [4079,   60, 2214, ...,    0,    0,    0],
        [  11,  101,  199, ...,    0,    0,    0],
        ...,
        [  37,   23,    1, ...,    0,    0,    0],
        [1328,   11,    1, ...,    0,    0,    0],
        [  11, 4953, 3531, ...,    0,    0,    0]], dtype=int32),
 array([[[    231,    6757,   34530, ...,       0,       0,       0],
         [  41324,     359,    4881, ...,       0,       0,       0],
         [1877509,    1312,    1525, ...,       0,       0,       0],
         ...,
         [      0,       0,       0, ...,       0,       0,       0],
         [      0,       0,       0, ...,       0,       0,       0],
         [      0,       0,       0, ...,       0,       0,       0]],
 
        [[    963,       3,     765, ...,       0,       0,       0],
         [    183,    1069,      11, ...,       0,       0,       0],
         [     14,     555,   12135, ...,       0,       0,       0],
         ...,
     

In [438]:
## Test Collection




1122


[(0.7228915662650602, 0.22580645161290325),
 (0.7846153846153846, 0.045454545454545414),
 (0.5470085470085471, 0.23076923076923073),
 (0.9929577464788732, 0.23809523809523814),
 (1.0, 0.08333333333333337),
 (0.937007874015748, 0.11764705882352944),
 (0.8634686346863468, 0.23529411764705888),
 (1.0, 0.0),
 (1.0, 0.25),
 (0.6688963210702341, 0.125),
 (0.9245283018867925, 0.19999999999999996),
 (0.7391304347826086, 0.23529411764705888),
 (0.8413793103448276, 0.25),
 (0.8653846153846154, 0.13043478260869568),
 (1.0, 0.18181818181818177),
 (1.0, 0.17647058823529416),
 (0.675, 0.10526315789473684),
 (1.0, 0.08695652173913049),
 (0.7663551401869159, 0.23529411764705888),
 (1.0, 0.10526315789473684),
 (0.7606382978723404, 0.2857142857142857),
 (0.6727272727272727, 0.23809523809523814),
 (0.5578231292517006, 0.0625),
 (0.627906976744186, 0.15000000000000002),
 (0.9883720930232558, 0.23809523809523814),
 (0.8852459016393442, 0.25),
 (1.0, 0.2222222222222222),
 (0.9808917197452229, 0.099999999999



ValueError: operands could not be broadcast together with shapes (30,) (0,) 

In [522]:
# Inspect the sentence split of the document at index 7 of the batch.
# NOTE: `pos_doc` is not defined in any visible cell (hidden kernel state).
pos_doc[7]["snippets"]

[{'text': 'The Tzs protein and exogenous cytokinin affect virulence gene expression and bacterial growth of Agrobacterium tumefaciens.',
  'start': 0,
  'end': 122,
  'is_title': True},
 {'text': 'The soil phytopathogen Agrobacterium tumefaciens causes crown gall disease in a wide range of plant species.',
  'start': 2,
  'end': 109,
  'is_title': False},
 {'text': 'The neoplastic growth at the infection sites is caused by transferring, integrating, and expressing transfer DNA (T-DNA) from A. tumefaciens into plant cells.',
  'start': 111,
  'end': 268,
  'is_title': False},
 {'text': 'A trans-zeatin synthesizing (tzs) gene is located in the nopaline-type tumor-inducing plasmid and causes trans-zeatin production in A. tumefaciens.',
  'start': 270,
  'end': 416,
  'is_title': False},
 {'text': 'Similar to known virulence (Vir) proteins that are induced by the vir gene inducer acetosyringone (AS) at acidic pH 5.5, Tzs protein is highly induced by AS under this growth condition but also 

In [517]:
# Sentence labels at index 7 of the batch.
# NOTE: `pos_snippet` is not defined in any visible cell (hidden kernel state).
pos_snippet[7]

[0.2733961088491816,
 0.14364575390759185,
 0.9791666666666667,
 0.17308270440886744,
 0.13151067921357903,
 0.15169245897652603,
 0.09091587420313646,
 0.06945705243277091,
 0.18897426932834616]

In [504]:
# NOTE: `doc_title` is not defined in any visible cell (hidden kernel state).
doc_title[13]

'Angiogenic growth factors and their inhibitors in diabetic retinopathy.'

In [505]:
# NOTE: `doc_text` is not defined in any visible cell (hidden kernel state).
doc_text[13]

'Angiogenic growth factors and their inhibitors in diabetic retinopathy. Diabetic retinopathy is considered one of the vision-threatening diseases among working-age population. The pathogenesis of the disease is regarded multifactorial and complex: capillary basement membrane thickening, loss of pericytes, microaneuryms, loss of endothelial cells, blood retinal barrier breakdown and other anatomic lesions might contribute to macular edema and/or neovascularization the two major and sight threatening complications of diabetic retinopathy. A number of proangiogenic, angiogenic and antiangiogenic factors are involved in the pathogenesis and progression of diabetic retinal disease, Vascular Endothelial Growth Factor (VEGF) being one of the most important. Other growth factors, which are known to participate in the pathogenesis of the disease, are: Platelet Derived Growth Factor (PDGF), Fibroblast Growth Factor (FGF), Hepatocyte Growth Factor (HGF), Transforming Growth Factor (TGF), Placent

In [506]:
# Check how the untrained Punkt tokenizer splits document 13 into sentences.
# NOTE(review): mid-notebook import; it belongs with the imports of the first cell.
import nltk
nltk.PunktSentenceTokenizer().tokenize(doc_text[13])

['Angiogenic growth factors and their inhibitors in diabetic retinopathy.',
 'Diabetic retinopathy is considered one of the vision-threatening diseases among working-age population.',
 'The pathogenesis of the disease is regarded multifactorial and complex: capillary basement membrane thickening, loss of pericytes, microaneuryms, loss of endothelial cells, blood retinal barrier breakdown and other anatomic lesions might contribute to macular edema and/or neovascularization the two major and sight threatening complications of diabetic retinopathy.',
 'A number of proangiogenic, angiogenic and antiangiogenic factors are involved in the pathogenesis and progression of diabetic retinal disease, Vascular Endothelial Growth Factor (VEGF) being one of the most important.',
 'Other growth factors, which are known to participate in the pathogenesis of the disease, are: Platelet Derived Growth Factor (PDGF), Fibroblast Growth Factor (FGF), Hepatocyte Growth Factor (HGF), Transforming Growth Fact

In [510]:
# Gold-standard snippets of one (question id, document id) pair.
train_data_gs["53124e84e3eabad02100000c"]["20594164"]

[{'offsetInBeginSection': 1452,
  'offsetInEndSection': 1549,
  'text': 'antioangiogenic factors such as pigment epithelial derived factor (PEDF), angiostatin, endostatin',
  'beginSection': 'abstract',
  'document': 'http://www.ncbi.nlm.nih.gov/pubmed/20594164',
  'endSection': 'abstract'}]

In [509]:
# Debug copy of the word-run scan used by TrainSnippetsCollection, with a fixed
# threshold of 6 and a trace print for every comparison.
# NOTE(review): the sentence comes from doc_text[13], while the gold snippet is
# read from another question/document id than the one inspected in the lookup
# cell; the printed words confirm that the two texts are unrelated.
doc_snippet_words = nltk.PunktSentenceTokenizer().tokenize(doc_text[13])[6].replace('\n',' \n ').replace('.',' . ').split(" ")
snippet_words = train_data_gs["532f0c4ed6d3ac6a3400002e"]["17597390"][0]["text"].split(" ")



print(doc_snippet_words)
print(doc_snippet_words[15:25])
print(snippet_words)

# self correcting algorithm
index_doc_snippet = 0
index_snippet = 0
start_index = -1
end_index = -1
early_stop = False
while index_doc_snippet<len(doc_snippet_words) :

    index_snippet = 0
    while index_snippet<len(snippet_words) and index_doc_snippet<len(doc_snippet_words):
        print("i", start_index, end_index, index_doc_snippet, index_snippet)
        if doc_snippet_words[index_doc_snippet] == snippet_words[index_snippet]:
            # match: open a run if none is open and advance in the sentence
            if start_index == -1:
                start_index = index_doc_snippet
            index_doc_snippet += 1 
        elif start_index != -1 and (index_doc_snippet-start_index)<6:
            # mismatch on a run that is still too short: discard the run
            start_index = -1
        elif start_index != -1:
            # mismatch after a long enough run: accept it and stop
            early_stop = True
            break
        index_snippet += 1

    index_doc_snippet +=1

    if early_stop:
        break

# a run that reached the end of the scan must also satisfy the threshold
if (index_doc_snippet-start_index)<6:
    start_index = -1
    end_index = -1
        
if start_index != -1:
    end_index = index_doc_snippet - 1

print(start_index, end_index)

['However,', 'the', 'intraocular', 'concentration', 'of', 'angiogenic', 'factors', 'is', 'counterbalanced', 'by', 'the', 'ocular', 'synthesis', 'of', 'several', 'antioangiogenic', 'factors', 'such', 'as', 'pigment', 'epithelial', 'derived', 'factor', '(PEDF),', 'angiostatin,', 'endostatin,', 'thrombospondin,', 'steroids,', 'atrial', 'natriuretic', 'peptide', '(ANP),', 'inteferon,', 'aptamer,', 'monoclonal', 'antibodies,', 'VEGF', 'receptor', 'blocker,', 'VEGF', 'gene', 'suppressors,', 'intracellular', 'signal', 'transduction', 'inhibitors,', 'and', 'extracellular', 'matrix', 'antagonists', '.', '']
['antioangiogenic', 'factors', 'such', 'as', 'pigment', 'epithelial', 'derived', 'factor', '(PEDF),', 'angiostatin,']
["Marfan's", 'patients', 'carry', 'increased', 'risk', 'for', 'cardiac', 'arrhythmias.', '']
i -1 -1 0 0
i -1 -1 0 1
i -1 -1 0 2
i -1 -1 0 3
i -1 -1 0 4
i -1 -1 0 5
i -1 -1 0 6
i -1 -1 0 7
i -1 -1 0 8
i -1 -1 1 0
i -1 -1 1 1
i -1 -1 1 2
i -1 -1 1 3
i -1 -1 1 4
i -1 -1 1 5
i -

In [343]:
# Compare the same character window of two texts.
# NOTE: `_text` and `doc_text` are not defined in any visible cell (hidden kernel state).
print(_text[67:][819:1088])
print(doc_text[19][67:][819:1088])

ministration (FDA) approval. Positive clinical trial results have also been reported for several other JAK inhibitors including baricitinib. Several other JAK inhibitors and other small molecular entities are also being developed in studies ranging from preclinical mod
gs Administration (FDA) approval. Positive clinical trial results have also been reported for several other JAK inhibitors including baricitinib. Several other JAK inhibitors and other small molecular entities are also being developed in studies ranging from preclinica


In [217]:
# Raw sentence spans of the full text (title followed by the abstract).
for start, end in PunktSentenceTokenizer().span_tokenize(doc_text[0]):
    print(start,end)
print()
# Same spans shifted by len(title)+1; for every sentence after the first one the
# end is decreased by one more.
i = 0
for start, end in PunktSentenceTokenizer().span_tokenize(doc_text[0]):
    end = end-len(doc_title[0])-1
    if i>0:
        end -= 1
    print(start-len(doc_title[0])-1,end)
    i+=1

0 42
43 136
137 295
296 403
404 522
523 613
614 726
727 903

-43 -1
0 92
94 251
253 359
361 478
480 569
571 682
684 859


In [221]:
# Sanity check: the overlap ratio of two disjoint spans.
overlap((253, 359),(361, 683))/(359+1-253)

0.0

In [214]:
# Text covered by the raw span (404, 522), plus one extra character.
doc_text[0][404:522+1]

'PARPi has been designed and tested for many years and became a potential supplement for the conventional chemotherapy. '

In [136]:
# Text of document 1 between the raw offsets 72 and 234.
doc_text[1][72:234]

"Ten years after Fire and Melo's Nobel Prize for discovery of gene silencing by double-stranded RNA, a remarkable progress was achieved in RNA interference (RNAi)."

In [142]:

# Map the offsets (323, 434) back to the full text by adding len(title)+1.
doc_text[1][323+len(doc_title[1])+1:434+len(doc_title[1])+1]


'The attention of pharmaceutical industry rapidly turned to RNAi, as an opportunity to explore new drug targets.'

In [358]:
# All gold-standard documents and snippets of one question.
train_data_gs["52fb7c512059c6d71c000069"]

{'15890322': [{'offsetInBeginSection': 0,
   'offsetInEndSection': 305,
   'text': 'The short QT syndrome constitutes a new clinical entity that is associated with a high incidence of sudden cardiac death, syncope, and/or atrial fibrillation even in young patients and newborns. Patients with this congenital electrical abnormality are characterized by rate-corrected QT intervals<320 ms. ',
   'beginSection': 'abstract',
   'document': 'http://www.ncbi.nlm.nih.gov/pubmed/15890322',
   'endSection': 'abstract'}],
 '15569843': [{'offsetInBeginSection': 12,
   'offsetInEndSection': 183,
   'text': 'The short-QT syndrome is a new clinical entity characterized by corrected QT intervals <300 ms and a high incidence of ventricular tachycardia (VT) and fibrillation (VF). ',
   'beginSection': 'abstract',
   'document': 'http://www.ncbi.nlm.nih.gov/pubmed/15569843',
   'endSection': 'abstract'}],
 '16482041': [{'offsetInBeginSection': 1114,
   'offsetInEndSection': 1294,
   'text': 'QT interval i

In [175]:
# NOTE: `q` is not defined in any visible cell (hidden kernel state).
q[0]

'What is the doRiNA database?'

In [501]:
# Print the id of every training query whose text equals q[13].
# NOTE: `q` is not defined in any visible cell (hidden kernel state).
for _q in train_data_queries:
    if _q["query"] == q[13]:
        print(_q["id"])

53124e84e3eabad02100000c
