In [1]:
from mmnrm.utils import set_random_seed, load_neural_model, load_model, flat_list
from mmnrm.dataset import TestCollectionV2, sentence_splitter_builderV2
from mmnrm.evaluation import BioASQ_Evaluator

from collections import defaultdict
import os
import pickle
import numpy as np
import sys
import math
import time
import tensorflow as tf
from nltk.tokenize.punkt import PunktSentenceTokenizer

from utils import *

In [2]:
import nltk
# builders for the test-time input generators (tokenization and padding)
def build_data_generators(tokenizer, use_joint=True, queries_sw=None, docs_sw=None):
    """Build a generator transform that tokenizes queries and documents.

    Parameters
    ----------
    tokenizer
        Object exposing ``texts_to_sequences(list_of_str)``.
    use_joint : bool
        When True, ``documents["text"]`` is split with Punkt ``span_tokenize``
        and a ``"spans"`` list (one dict per sentence) is attached to the
        document. When False, ``nltk.sent_tokenize`` is used and no spans
        are produced.
    queries_sw, docs_sw
        Optional containers of token ids to drop from queries / documents.

    Returns
    -------
    callable
        ``test_generator(data_generator)``, which iterates over
        ``(_id, query, docs)`` batches and yields
        ``(_id, tokenized_queries, docs)`` with every document tokenized
        in place.
    """

    punkt_sent_tokenizer = PunktSentenceTokenizer().span_tokenize

    def maybe_tokenize(documents):
        # Tokenization is cached on the document dict itself.
        if "tokens" in documents:
            return

        if use_joint:
            split = []
            spans = []
            diff = 0
            for _itter, (start, end) in enumerate(punkt_sent_tokenizer(documents["text"])):
                _text = documents["text"][start:end]

                # The first sentence is treated as the title; the remaining
                # ones are re-based so that the second sentence starts at 0.
                is_title = _itter == 0
                if not is_title:
                    if _itter == 1:
                        # The original expression
                        # (len(title)-1)+(start-(len(title)-1)) reduces to
                        # `start`, so the title length is not needed.
                        diff = start
                    start = start-diff
                    end = end-diff

                split.append(_text)
                spans.append({"start":start,
                              "end":end-1,  # inclusive end offset
                              "text":_text,
                              "is_title":is_title,
                              "snippet_id":documents["id"]+"_"+str(_itter),
                              "doc_id":documents["id"]})
            documents["spans"] = spans
        else:
            split = nltk.sent_tokenize(documents["text"])

        tokens = tokenizer.texts_to_sequences(split)
        if docs_sw is not None:
            # Build new lists: rebinding the loop variable (as the previous
            # version did) left the stored tokens unfiltered.
            tokens = [[token for token in sentence if token not in docs_sw]
                      for sentence in tokens]
        documents["tokens"] = tokens

    def test_generator(data_generator):
        for _id, query, docs in data_generator:
            tokenized_queries = []
            for i in range(len(_id)):
                # tokenization
                tokenized_query = tokenizer.texts_to_sequences([query[i]])[0]

                if queries_sw is not None:
                    tokenized_query = [token for token in tokenized_query if token not in queries_sw]

                tokenized_queries.append(tokenized_query)

                for doc in docs[i]:
                    maybe_tokenize(doc)

            yield _id, tokenized_queries, docs

    return test_generator

def get_test_generator_for_model(model, use_joint):
    """Build the transform that turns raw batches into padded model inputs.

    Parameters
    ----------
    model
        Object exposing ``tokenizer`` and ``savable_config``; the ``"model"``
        entry of ``savable_config`` must provide ``max_passages``,
        ``max_p_terms`` and ``max_q_terms``.
    use_joint : bool
        Forwarded to ``build_data_generators``.

    Returns
    -------
    callable
        ``test_generator(data_generator)`` yielding
        ``(query_ids, [queries, docs, docs_mask], docs_ids, docs_spans)``
        where the three model inputs are numpy arrays.

    Raises
    ------
    ValueError
        If ``model.savable_config`` has no ``"model"`` entry. Previously this
        surfaced later as a ``NameError`` on ``cfg``.
    """

    if "model" not in model.savable_config:
        raise ValueError('model.savable_config has no "model" entry; cannot read the padding limits')
    cfg = model.savable_config["model"]

    test_gen = build_data_generators(model.tokenizer, use_joint)

    def pad_tokens(x, max_len, dtype='int32'):
        # Pad / truncate every token sequence at the end to exactly max_len.
        return tf.keras.preprocessing.sequence.pad_sequences(x,
                                                             maxlen=max_len,
                                                             dtype=dtype,
                                                             padding='post',
                                                             truncating='post',
                                                             value=0)

    def pad_sentences(x, max_lim):
        # Truncate to max_lim sentences, or fill up with empty sentences.
        return x[:max_lim] + [[]]*(max_lim-len(x))

    def maybe_padding(document):
        # Tokens are still a python list only before the first padding;
        # afterwards they are an array and the document is reused as is.
        if isinstance(document["tokens"], list):
            max_passages = cfg["max_passages"]
            #overflow prevention
            bounded_doc_passage = min(max_passages, len(document["tokens"]))
            document["sentences_mask"] = [True] * bounded_doc_passage + [False] * (max_passages-bounded_doc_passage)
            document["tokens"] = pad_tokens(pad_sentences(document["tokens"], max_passages), cfg["max_p_terms"])
            # No spans exist when the documents were split without
            # use_joint; fall back to an empty list instead of a KeyError.
            document["spans"] = document.get("spans", [])[:max_passages]

    def test_generator(data_generator):

        for ids, query, docs in test_gen(data_generator):

            docs_spans = []
            docs_ids = []
            docs_array = []
            docs_mask_array = []
            query_array = []
            query_ids = []

            for i in range(len(ids)):

                for doc in docs[i]:
                    # pad docs, use cache here
                    maybe_padding(doc)
                    docs_array.append(doc["tokens"])
                    docs_mask_array.append(doc["sentences_mask"])
                    docs_ids.append(doc["id"])
                    docs_spans.append(doc["spans"])

                # The padded query is repeated once per candidate document.
                query_tokens = pad_tokens([query[i]], cfg["max_q_terms"])[0]
                query_tokens = [query_tokens] * len(docs[i])
                query_array.append(query_tokens)

                query_ids.append([ids[i]]*len(docs[i]))

            yield flat_list(query_ids), [np.array(flat_list(query_array)), np.array(docs_array), np.array(docs_mask_array)], docs_ids, docs_spans

    return test_generator

In [3]:
def rank(model, t_collection):
    """Score every batch of ``t_collection`` and return per-query rankings.

    Each batch from ``t_collection.generator()`` is a tuple
    ``(query_ids, inputs, doc_ids, doc_spans)``. ``model.predict(inputs)``
    must return an indexable result whose item 0 holds the document scores
    (column 0 is used) and whose item 1 holds the snippet scores.

    Every span dict receives a ``"score"`` key in place. The result maps a
    query id to a list of ``{"id", "score", "snippets"}`` dicts sorted by
    decreasing document score.
    """
    ranking = defaultdict(list)

    for batch_query_ids, batch_inputs, batch_doc_ids, batch_spans in t_collection.generator():
        s_time = time.time()

        predictions = model.predict(batch_inputs)
        doc_scores = predictions[0][:,0].tolist()
        snippets_scores = predictions[1].tolist()

        for idx, doc_score in enumerate(doc_scores):
            spans = batch_spans[idx]
            snippet_rows = snippets_scores[idx]

            # attach the snippet score to every span of this document
            for pos, span in enumerate(spans):
                span["score"] = snippet_rows[pos][0]

            entry = {"id":batch_doc_ids[idx],
                     "score":doc_score,
                     "snippets":spans}
            ranking[batch_query_ids[idx]].append(entry)

        print("\rEvaluation {} | time {}".format(len(ranking), time.time()-s_time), end="\r")

    # sort the rankings, best document first
    for ranked_docs in ranking.values():
        ranked_docs.sort(key=lambda entry: -entry["score"])

    return ranking

def snippetRank_byThreshold(results, threshold):
    """Keep, for every query, the snippets whose score is >= ``threshold``.

    ``results`` maps a query id to its ranked documents, each carrying a
    ``"snippets"`` list. The selected snippets keep the document order first
    and the snippet order within each document second; they are not
    re-sorted by score.
    """
    selected = {}
    for query_id, ranked_docs in results.items():
        per_doc_snippets = [doc["snippets"] for doc in ranked_docs]
        selected[query_id] = [snippet
                              for snippet in flat_list(per_doc_snippets)
                              if snippet["score"] >= threshold]

    return selected

def snippetRank_byThreshold_and_TopK(results, threshold, topK):
    """Select snippets scoring >= ``threshold`` from the first ``topK`` documents.

    ``results`` maps a query id to its ranked documents, each carrying a
    ``"snippets"`` list. Only ``results[q][:topK]`` is inspected; the output
    follows the document order first and is not re-sorted by snippet score.
    """
    selected = {}
    for query_id, ranked_docs in results.items():
        per_doc_snippets = [doc["snippets"] for doc in ranked_docs[:topK]]
        selected[query_id] = [snippet
                              for snippet in flat_list(per_doc_snippets)
                              if snippet["score"] >= threshold]

    return selected

def rerank_run(baseline_file, top_k, t=0.08, snippet_topK=None):
    """Re-rank a baseline run with the currently loaded model.

    Reads the notebook globals ``queries``, ``test_input_generator`` and
    ``ranking_model``, which must be set before calling.

    Parameters
    ----------
    baseline_file : path handed to ``load_document_run``.
    top_k : batch size given to the test collection.
    t : snippet score threshold.
    snippet_topK : when None, snippets of every ranked document are
        considered; otherwise only those of the first ``snippet_topK``
        documents.

    Returns the value of ``create_document_run`` for the re-ranked results.
    """
    baseline_run = load_document_run(baseline_file, dict_format=True)

    collection = TestCollectionV2(queries, baseline_run)
    collection = collection.batch_size(top_k)
    collection = collection.set_transform_inputs_fn(test_input_generator)

    ranked = rank(ranking_model, collection)

    snippets = (snippetRank_byThreshold(ranked, t)
                if snippet_topK is None
                else snippetRank_byThreshold_and_TopK(ranked, t, snippet_topK))

    # update the run results
    return create_document_run(queries, ranked, snippets=snippets)




# Round 4

In [4]:
# Directory with the round-4 baseline run, pickled runs and generated output files.
rnd_path = "runs/rnd4"

In [5]:
# Round 4, BIT.UA-02: re-rank the BM25 baseline with every checkpoint,
# then fuse the resulting runs with RRF and export them in BioASQ format.
queries = load_queries("BioASQ-task9bPhaseA-testset4", maps=[("body","query")])

checkpoints_name = ["happy-wave-21_val_collection0_map@10",
                    "logical-river-23_val_collection0_map@10",
                    "distinctive-oath-24_val_collection0_map@10",
                    "honest-frog-22_val_collection0_map@10",
                    "happy-wave-21_val_collection1_recall@10",
                    "logical-river-23_val_collection1_recall@10",
                    "distinctive-oath-24_val_collection1_recall@10",
                    "honest-frog-22_val_collection1_recall@10",]

# snippet_id -> snippet dict, handed to the final BioASQ conversion
chached_snippets = {}

for checkpoint_name in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join(rnd_path, trec_name)
    # Sidecar holding the snippets of this run. Without it, a re-run that
    # skips an already written .trec file left chached_snippets without the
    # snippets of that checkpoint.
    snippets_cache_path = trec_path+".snippets.p"

    if os.path.exists(trec_path) and os.path.exists(snippets_cache_path):
        # Reuse the previous run; the sidecar is a file written by this cell.
        with open(snippets_cache_path, "rb") as f:
            chached_snippets.update(pickle.load(f))
        continue

    # Either the run or its snippet sidecar is missing: (re)compute both.
    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    test_input_generator = get_test_generator_for_model(ranking_model, True)

    rerank = rerank_run(os.path.join(rnd_path, "bm25-baseline.run"), 100)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))

    run_snippets = {}
    for q in rerank:
        for s in q["snippets"]:
            run_snippets[s["snippet_id"]] = s
    chached_snippets.update(run_snippets)

    write_as_trec_snippets(rerank, os.path.join(rnd_path, trec_name+".snippets"))
    write_as_trec(rerank, trec_path)

    # Written last, so its presence implies the .trec files were written too.
    with open(snippets_cache_path, "wb") as f:
        pickle.dump(run_snippets, f)

    del ranking_model

# fusion
print("RRF Fusion")
fusion_rrf([os.path.join(rnd_path, p+".trec") for p in checkpoints_name], os.path.join(rnd_path, "BIT.UA-02.trec"))
fusion_rrf([os.path.join(rnd_path, p+".trec.snippets") for p in checkpoints_name], os.path.join(rnd_path, "BIT.UA-02.trec.snippets"))

convert_trec_run_to_bioasq_wSnippets(os.path.join(rnd_path, "BIT.UA-02.trec"),
                                     os.path.join(rnd_path, "BIT.UA-02.trec.snippets"),
                                     chached_snippets,
                                     queries,
                                     os.path.join(rnd_path, "BIT.UA-02.json"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

KeyError: 'snippet_id'

In [20]:
# batch 04: rebuild the TREC runs from pickled re-ranks and fuse them with RRF

queries = load_queries("BioASQ-task9bPhaseA-testset4", maps=[("body","query")])

pickle_run_names = ["earnest-lion-31_val_collection0_map@10",
                    "earnest-lion-31_val_collection0_recall@10"]

# snippet_id -> snippet dict, handed to the final BioASQ conversion
chached_snippets = {}

for p_name in pickle_run_names:
    pickle_path = os.path.join(rnd_path,p_name+".p")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(pickle_path,"rb") as f:
        rerank = pickle.load(f)

    for q in rerank:
        for s in q["snippets"]:
            chached_snippets[s["snippet_id"]] = s

    snippets_trec_path = os.path.join(rnd_path, p_name+".trec.snippets")
    docs_trec_path = os.path.join(rnd_path, p_name+".trec")
    write_as_trec_snippets(rerank, snippets_trec_path)
    write_as_trec(rerank, docs_trec_path)

print("RRF Fusion")
fused_docs_path = os.path.join(rnd_path, "BIT.UA-04.trec")
fused_snippets_path = os.path.join(rnd_path, "BIT.UA-04.trec.snippets")

fusion_rrf([os.path.join(rnd_path, p_name+".trec") for p_name in pickle_run_names],
           fused_docs_path)
fusion_rrf([os.path.join(rnd_path, p_name+".trec.snippets") for p_name in pickle_run_names],
           fused_snippets_path)

convert_trec_run_to_bioasq_wSnippets(fused_docs_path,
                                     fused_snippets_path,
                                     chached_snippets,
                                     queries,
                                     os.path.join(rnd_path, "BIT.UA-04.json"))

RRF Fusion
File runs/rnd4/BIT.UA-04.trec writen.
File runs/rnd4/BIT.UA-04.trec.snippets writen.


In [11]:
# Inspect the first entry of `snippet_run`.
# NOTE(review): `snippet_run` is not assigned in any cell visible in this
# notebook, so this cell presumably relies on leftover kernel state - confirm.
snippet_run[0]

{'id': '6027fcd31cb411341a0000ef',
 'type': 'list',
 'query': 'Which methods exist for efficient calculation of Elementary flux modes (EFMs) in genome-scale metabolic networks (GSMNs)?',
 'snippets': [{'id': '25380956_2', 'score': '0.13114754098360656'},
  {'id': '32348455_1', 'score': '0.12852022529441884'},
  {'id': '24497502_3', 'score': '0.12749615975422426'},
  {'id': '24728852_0', 'score': '0.125'},
  {'id': '24141488_1', 'score': '0.1204193486712128'},
  {'id': '19793869_0', 'score': '0.1184560537501714'},
  {'id': '21685054_0', 'score': '0.1175109121896922'},
  {'id': '25754258_1', 'score': '0.11262865734656398'},
  {'id': '23664840_1', 'score': '0.11237818100563199'},
  {'id': '26940826_1', 'score': '0.11007421990110185'},
  {'id': '26091045_1', 'score': '0.10996340021381398'},
  {'id': '32059585_1', 'score': '0.10932827919169291'},
  {'id': '25701571_1', 'score': '0.10825432463363498'},
  {'id': '28406903_1', 'score': '0.10778414577985232'},
  {'id': '26748294_1', 'score': '0

In [22]:
#batch 05: export a single pickled run directly in BioASQ format

queries = load_queries("BioASQ-task9bPhaseA-testset4", maps=[("body","query")])

pickle_run_name = "earnest-lion-31_val_collection0_map@10-nodate"

chached_snippets = {}

pickle_path = os.path.join(rnd_path,pickle_run_name+".p")
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(pickle_path,"rb") as f:
    rerank = pickle.load(f)

bioasq_path = os.path.join(rnd_path, "BIT.UA-05.json")
write_as_bioasq(rerank, bioasq_path)

{'id': '602747921cb411341a0000e0',
 'type': 'summary',
 'query': 'What is the mechanism of action of vosoritide?',
 'documents': [{'id': '31269546',
   'score': 14.455413818359375,
   'snippets': [{'start': 0,
     'end': 75,
     'text': 'C-Type Natriuretic Peptide Analogue Therapy in Children with Achondroplasia.',
     'is_title': True,
     'snippet_id': '31269546_0',
     'doc_id': '31269546',
     'score': 0.07988996803760529},
    {'start': 0,
     'end': 181,
     'text': 'BACKGROUND\nAchondroplasia is a genetic disorder that inhibits endochondral ossification, resulting in disproportionate short stature and clinically significant medical complications.',
     'is_title': False,
     'snippet_id': '31269546_1',
     'doc_id': '31269546',
     'score': 0.07993191480636597},
    {'start': 183,
     'end': 296,
     'text': 'Vosoritide is a biologic analogue of C-type natriuretic peptide, a potent stimulator of endochondral ossification.',
     'is_title': False,
     'snippet_id'

# Round 5

In [4]:
# Directory with the round-5 baseline run, pickled runs and generated output files.
rnd_path = "runs/rnd5"

In [7]:
# Round 5, BIT.UA-02: re-rank the BM25 baseline with every checkpoint,
# then fuse the resulting runs with RRF and export them in BioASQ format.
queries = load_queries("BioASQ-task9bPhaseA-testset5", maps=[("body","query")])

# (checkpoint name, snippet score threshold, number of top documents whose
# snippets are considered)
checkpoints_name = [("sleek-rain-68_val_collection0_doc_map@10", 0.7 ,1),
                    ("firm-cherry-61_val_collection0_doc_map@10", 0.7 ,1),
                    ("honest-morning-60_val_collection0_doc_map@10",0.08,1),
                    ("daily-night-30_val_collection0_doc_map@10",0.15,1),
                    ("celestial-wildflower-39_val_collection0_doc_map@10",0.15,1),
                    ("silvery-breeze-32_val_collection0_doc_map@10",0.15,1),
                    ("sleek-rain-68_val_collection0_doc_r@10", 0.7 ,1),
                    ("honest-morning-60_val_collection0_doc_r@10",0.08,1),
                    ("silvery-breeze-32_val_collection0_doc_r@10",0.15,1),
                   ]

# snippet_id -> snippet dict, handed to the final BioASQ conversion
chached_snippets = {}

for checkpoint_name, t, snippet_topk in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join(rnd_path, trec_name)
    # Sidecar holding the snippets of this run. Without it, a re-run that
    # skips an already written .trec file left chached_snippets without the
    # snippets of that checkpoint.
    snippets_cache_path = trec_path+".snippets.p"

    if os.path.exists(trec_path) and os.path.exists(snippets_cache_path):
        # Reuse the previous run; the sidecar is a file written by this cell.
        with open(snippets_cache_path, "rb") as f:
            chached_snippets.update(pickle.load(f))
        continue

    # Either the run or its snippet sidecar is missing: (re)compute both.
    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    test_input_generator = get_test_generator_for_model(ranking_model, True)

    rerank = rerank_run(os.path.join(rnd_path, "bm25-baseline.run"), 100, t, snippet_topk)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))

    run_snippets = {}
    for q in rerank:
        for s in q["snippets"]:
            run_snippets[s["snippet_id"]] = s
    chached_snippets.update(run_snippets)

    write_as_trec_snippets(rerank, os.path.join(rnd_path, trec_name+".snippets"))
    write_as_trec(rerank, trec_path)

    # Written last, so its presence implies the .trec files were written too.
    with open(snippets_cache_path, "wb") as f:
        pickle.dump(run_snippets, f)

    del ranking_model

# fusion
print("RRF Fusion")
fusion_rrf([os.path.join(rnd_path, p+".trec") for p, _, _ in checkpoints_name], os.path.join(rnd_path, "BIT.UA-02.trec"))
fusion_rrf([os.path.join(rnd_path, p+".trec.snippets") for p, _, _ in checkpoints_name], os.path.join(rnd_path, "BIT.UA-02.trec.snippets"))

convert_trec_run_to_bioasq_wSnippets(os.path.join(rnd_path, "BIT.UA-02.trec"),
                                     os.path.join(rnd_path, "BIT.UA-02.trec.snippets"),
                                     chached_snippets,
                                     queries,
                                     os.path.join(rnd_path, "BIT.UA-02.json"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

In [12]:
# Round 5, BIT.UA-04: select snippets for pickled rankings, write the TREC
# runs and fuse them with RRF.
queries = load_queries("BioASQ-task9bPhaseA-testset5", maps=[("body","query")])

# (pickle name, snippet score threshold, number of top documents whose
# snippets are considered)
pickle_run_names = [("warm-donkey-58_val_collection0_doc_map@10",0.15,1),
                    ("zesty-frog-66_val_collection0_doc_map@10",-3.5,1),
                    ("warm-donkey-58_val_collection0_doc_r@10",0.15,1),]

# snippet_id -> snippet dict, handed to the final BioASQ conversion
chached_snippets = {}

for p_name, t, snippet_topK in pickle_run_names:
    pickle_path = os.path.join(rnd_path,p_name+".p")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(pickle_path,"rb") as f:
        rerank = pickle.load(f)

    # `rerank` first holds the pickled ranking and is then replaced by the
    # document run built from it.
    snippets = snippetRank_byThreshold_and_TopK(rerank, t, snippet_topK)
    rerank = create_document_run(queries, rerank, snippets=snippets)

    for q in rerank:
        for s in q["snippets"]:
            chached_snippets[s["snippet_id"]] = s

    snippets_trec_path = os.path.join(rnd_path, p_name+".trec.snippets")
    docs_trec_path = os.path.join(rnd_path, p_name+".trec")
    write_as_trec_snippets(rerank, snippets_trec_path)
    write_as_trec(rerank, docs_trec_path)

print("RRF Fusion")
fused_docs_path = os.path.join(rnd_path, "BIT.UA-04.trec")
fused_snippets_path = os.path.join(rnd_path, "BIT.UA-04.trec.snippets")

fusion_rrf([os.path.join(rnd_path, p_name+".trec") for p_name,_,_ in pickle_run_names],
           fused_docs_path)
fusion_rrf([os.path.join(rnd_path, p_name+".trec.snippets") for p_name,_,_ in pickle_run_names],
           fused_snippets_path)

convert_trec_run_to_bioasq_wSnippets(fused_docs_path,
                                     fused_snippets_path,
                                     chached_snippets,
                                     queries,
                                     os.path.join(rnd_path, "BIT.UA-04.json"))

RRF Fusion
File runs/rnd5/BIT.UA-04.trec writen.
File runs/rnd5/BIT.UA-04.trec.snippets writen.


In [9]:
# run 5, top map: export one pickled ranking in BioASQ format


queries = load_queries("BioASQ-task9bPhaseA-testset5", maps=[("body","query")])

pickle_run_name = "warm-donkey-58_val_collection0_doc_map@10"
t = 0.15          # snippet score threshold
snippet_topK = 1  # number of top documents whose snippets are considered

chached_snippets = {}

pickle_path = os.path.join(rnd_path,pickle_run_name+".p")
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(pickle_path,"rb") as f:
    rerank = pickle.load(f)

# `rerank` first holds the pickled ranking and is then replaced by the
# document run built from it.
snippets = snippetRank_byThreshold_and_TopK(rerank, t, snippet_topK)
rerank = create_document_run(queries, rerank, snippets=snippets)

bioasq_path = os.path.join(rnd_path, "BIT.UA-05.json")
write_as_bioasq(rerank, bioasq_path)