In [1]:
from mmnrm.utils import set_random_seed, load_neural_model, load_model, flat_list
from mmnrm.dataset import TestCollectionV2, sentence_splitter_builderV2
from mmnrm.evaluation import BioASQ_Evaluator

from collections import defaultdict
import os
import pickle
import numpy as np
import sys
import math
import time
import tensorflow as tf
from nltk.tokenize.punkt import PunktSentenceTokenizer

from utils import *

In [2]:
import nltk



def build_data_generators(tokenizer, use_joint=True, queries_sw=None, docs_sw=None):
    """Build the tokenizing wrappers used around the raw train/test generators.

    Args:
        tokenizer: object exposing `texts_to_sequences(list_of_str)`, returning
            one token sequence per input string.
        use_joint: when True, documents are split into sentences with the Punkt
            span tokenizer; otherwise `nltk.sent_tokenize` is used.
        queries_sw: optional container of query tokens to drop (stopwords).
        docs_sw: optional container of document tokens to drop (stopwords).

    Returns:
        `(train_generator, test_generator)`, two functions that each wrap a raw
        data generator and yield its batches with queries and documents tokenized.
        Documents are dicts; their sentence token lists are cached in-place under
        the "tokens" key, so a document is tokenized at most once.
    """
    # The Punkt tokenizer is only needed on the `use_joint` path.
    punkt_span_tokenize = PunktSentenceTokenizer().span_tokenize if use_joint else None

    def sent_tokenize(document):
        return [document[start:end] for start, end in punkt_span_tokenize(document)]

    def drop_stopwords(sequences, stopwords):
        # Returns a new list of filtered sequences; rebinding a loop variable
        # (as the previous implementation did) has no effect on the container.
        if stopwords is None:
            return sequences
        return [[token for token in sequence if token not in stopwords]
                for sequence in sequences]

    def maybe_tokenize(document):
        # "tokens" acts as a cache: an already tokenized document is left untouched.
        if "tokens" in document:
            return

        if use_joint:
            sentences = sent_tokenize(document["text"])
        else:
            sentences = nltk.sent_tokenize(document["text"])

        document["tokens"] = drop_stopwords(tokenizer.texts_to_sequences(sentences), docs_sw)

    def train_generator(data_generator):
        # Iterating with `for` (instead of `while True: next(...)`) lets a finite
        # source end this generator cleanly rather than raising RuntimeError.
        for query, pos_docs, neg_docs in data_generator:

            # tokenization, this could be cached for efficiency purposes
            tokenized_query = drop_stopwords(tokenizer.texts_to_sequences(query), queries_sw)

            batch_is_valid = True

            for batch_index in range(len(pos_docs)):

                # tokenizer with cache in [batch_index]["tokens"]
                maybe_tokenize(pos_docs[batch_index])

                # A positive document without any token makes the whole batch
                # unusable: drop it and move on to the next sampled batch.
                # NOTE: this is a quick workaround and should be revisited.
                if all(len(sentence) == 0 for sentence in pos_docs[batch_index]["tokens"]):
                    batch_is_valid = False
                    break

                maybe_tokenize(neg_docs[batch_index])

            if batch_is_valid:
                yield tokenized_query, pos_docs, neg_docs

    def test_generator(data_generator):
        for _id, query, docs in data_generator:
            tokenized_queries = []
            for i in range(len(_id)):
                # tokenization (one query at a time)
                tokenized_query = drop_stopwords(tokenizer.texts_to_sequences([query[i]]),
                                                 queries_sw)[0]
                tokenized_queries.append(tokenized_query)

                for doc in docs[i]:
                    maybe_tokenize(doc)

            yield _id, tokenized_queries, docs

    return train_generator, test_generator

def model_train_generator_for_model(model):
    """Build the padded train/test input generators for a loaded ranking model.

    Args:
        model: object exposing `savable_config` (a mapping with a "model" entry
            that provides "max_q_terms", "max_passages" and "max_p_terms") and
            `tokenizer` (passed to `build_data_generators`).

    Returns:
        `(train_generator, test_generator)`. Each wraps a raw data generator:
        - train yields `[query, pos_docs, pos_mask], [query, neg_docs, neg_mask]`;
        - test yields `flat_query_ids, [queries, docs, docs_mask], docs_ids, None`.

    Raises:
        ValueError: if `model.savable_config` has no "model" entry. Previously this
            only surfaced later, as a NameError raised inside the generators.
    """
    if "model" not in model.savable_config:
        raise ValueError('model.savable_config has no "model" entry; '
                         'cannot read the padding limits')
    cfg = model.savable_config["model"]

    train_gen, test_gen = build_data_generators(model.tokenizer)

    def pad_tokens(sequences, max_len, dtype='int32'):
        # Right-pad / right-truncate every sequence to exactly `max_len` tokens.
        return tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                             maxlen=max_len,
                                                             dtype=dtype,
                                                             padding='post',
                                                             truncating='post',
                                                             value=0)

    def pad_sentences(sentences, max_lim):
        # Keep at most `max_lim` sentences and fill the remainder with empty ones.
        return sentences[:max_lim] + [[]] * (max_lim - len(sentences))

    def maybe_padding(document):
        # Tokens are still a Python list only before the first padding; afterwards
        # the padded result is cached in-place and this becomes a no-op.
        if isinstance(document["tokens"], list):
            # overflow prevention
            bounded_doc_passage = min(cfg["max_passages"], len(document["tokens"]))
            document["sentences_mask"] = ([True] * bounded_doc_passage
                                          + [False] * (cfg["max_passages"] - bounded_doc_passage))
            document["tokens"] = pad_tokens(pad_sentences(document["tokens"], cfg["max_passages"]),
                                            cfg["max_p_terms"])

    def train_generator(data_generator):

        for query, pos_docs, neg_docs in train_gen(data_generator):

            query = pad_tokens(query, cfg["max_q_terms"])

            pos_docs_array = []
            pos_docs_mask_array = []
            neg_docs_array = []
            neg_docs_mask_array = []

            # pad docs, use cache here
            for batch_index in range(len(pos_docs)):
                maybe_padding(pos_docs[batch_index])
                pos_docs_array.append(pos_docs[batch_index]["tokens"])
                pos_docs_mask_array.append(pos_docs[batch_index]["sentences_mask"])
                maybe_padding(neg_docs[batch_index])
                neg_docs_array.append(neg_docs[batch_index]["tokens"])
                neg_docs_mask_array.append(neg_docs[batch_index]["sentences_mask"])

            yield ([query, np.array(pos_docs_array), np.array(pos_docs_mask_array)],
                   [query, np.array(neg_docs_array), np.array(neg_docs_mask_array)])

    def test_generator(data_generator):

        for ids, query, docs in test_gen(data_generator):

            docs_ids = []
            docs_array = []
            docs_mask_array = []
            query_array = []
            query_ids = []

            for i in range(len(ids)):

                for doc in docs[i]:
                    # pad docs, use cache here
                    maybe_padding(doc)
                    docs_array.append(doc["tokens"])
                    docs_mask_array.append(doc["sentences_mask"])
                    docs_ids.append(doc["id"])

                # Repeat the padded query (and its id) once per candidate document
                # so the flattened arrays line up row by row.
                query_tokens = pad_tokens([query[i]], cfg["max_q_terms"])[0]
                query_array.append([query_tokens] * len(docs[i]))
                query_ids.append([ids[i]] * len(docs[i]))

            yield (flat_list(query_ids),
                   [np.array(flat_list(query_array)), np.array(docs_array), np.array(docs_mask_array)],
                   docs_ids,
                   None)

    return train_generator, test_generator

In [3]:


def rank(model, t_collection):
    """Score every batch produced by `t_collection` and group the results per query.

    Args:
        model: object exposing `predict(batch)`; the result is indexed as
            `scores[:, 0]` and converted with `.tolist()`.
        t_collection: object whose `generator()` yields
            `(query_ids, model_inputs, docs_ids, offsets)` tuples, where
            `query_ids` and `docs_ids` are aligned with the predicted scores.

    Returns:
        A `defaultdict(list)` mapping each query id to a list of
        `{"id": doc_id, "score": score}` dicts sorted by descending score.
    """
    rankings = defaultdict(list)

    for batch_query_ids, batch_inputs, batch_doc_ids, _offsets in t_collection.generator():
        batch_start = time.time()

        batch_scores = model.predict(batch_inputs)[:, 0].tolist()

        for position, score in enumerate(batch_scores):
            rankings[batch_query_ids[position]].append({"id": batch_doc_ids[position],
                                                        "score": score})

        # Progress line: number of queries seen so far and seconds spent on this batch.
        print("\rEvaluation {} | time {}".format(len(rankings), time.time()-batch_start), end="\r")

    # Highest score first for every query.
    for per_query_results in rankings.values():
        per_query_results.sort(key=lambda entry: -entry["score"])

    return rankings

def rerank_run(baseline_file, top_k):
    """Re-rank the documents of a baseline run with the currently loaded model.

    Reads the notebook-level globals `queries`, `test_input_generator` and
    `ranking_model`, so those must be set before calling.

    Args:
        baseline_file: path handed to `load_document_run(..., dict_format=True)`.
        top_k: value passed to `TestCollectionV2.batch_size`.

    Returns:
        Whatever `create_document_run(queries, results)` builds from the
        ranking computed by `rank`.
    """
    baseline_run = load_document_run(baseline_file, dict_format=True)

    collection = TestCollectionV2(queries, baseline_run)
    collection = collection.batch_size(top_k)
    collection = collection.set_transform_inputs_fn(test_input_generator)

    reranked = rank(ranking_model, collection)

    # update the run results
    return create_document_run(queries, reranked)

# Round 1

In [4]:
# Run 1 (long baseline): re-rank with a single checkpoint and write both output formats.
queries = load_queries("BioASQ-task9bPhaseA-testset1", maps=[("body","query")])
ranking_model = load_model("trained_models/comic-morning-47_val_collection0_map@10")
_, test_input_generator = model_train_generator_for_model(ranking_model)

rerank = rerank_run("runs/rnd1/bm25-baseline-long.run", top_k=100)

# Sanity check: documents per query, the smallest such count, and the number of queries.
docs_per_query = [len(q["documents"]) for q in rerank]
print(docs_per_query, min(docs_per_query), len(rerank))

write_as_bioasq(rerank, "runs/rnd1/BIT.UA-01-long.json")
write_as_trec(rerank, "runs/rnd1/BIT.UA-01-long.trec")

    

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100


In [4]:
# Checkpoints (loaded from trained_models/) whose individual runs are fused into run 2.
checkpoints_name = ["comic-morning-47_val_collection0_map@10",
                   "feasible-glade-44_val_collection0_map@10",
                   "cosmic-darkness-43_val_collection0_map@10",
                   "bright-dawn-45_val_collection0_map@10",
                   "breezy-leaf-46_val_collection0_map@10"
                  ]

for checkpoint_name in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join("runs","rnd1",trec_name)

    # A run file already on disk is reused instead of being recomputed.
    if os.path.exists(trec_path):
        continue

    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    _, test_input_generator = model_train_generator_for_model(ranking_model)

    rerank = rerank_run("runs/rnd1/bm25-baseline.run", top_k=100)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))
    write_as_trec(rerank, trec_path)

    del ranking_model

# fusion
print("RRF Fusion")
fused_trec = os.path.join("runs","rnd1","BIT.UA-02.trec")
fusion_rrf([os.path.join("runs","rnd1",name+".trec") for name in checkpoints_name], fused_trec)
convert_trec_run_to_bioasq(fused_trec,
                           queries,
                           os.path.join("runs","rnd1","BIT.UA-02.json"))

RRF Fusion
File runs/rnd1/BIT.UA-02.trec writen.


In [9]:
# run 3: convert one existing single-checkpoint TREC run to the BioASQ JSON file.
run3_trec = os.path.join("runs","rnd1","graceful-donkey-42_val_collection0_map@10.trec")
run3_json = os.path.join("runs","rnd1","BIT.UA-03.json")
convert_trec_run_to_bioasq(run3_trec, queries, run3_json)



In [10]:
# run 4: fuse the map@10 and recall@10 runs of the same training run.
trec_run_names = ["graceful-donkey-42_val_collection0_map@10.trec",
                  "graceful-donkey-42_val_collection0_recall@10.trec"]

fused_trec = os.path.join("runs","rnd1","BIT.UA-04.trec")
fused_json = os.path.join("runs","rnd1","BIT.UA-04.json")

fusion_rrf([os.path.join("runs","rnd1",name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd1/BIT.UA-04.trec writen.


In [11]:
# run 5: fuse four neural runs together with the BM25 baseline run.
trec_run_names = ["graceful-donkey-42_val_collection0_map@10.trec",
                  "graceful-donkey-42_val_collection0_recall@10.trec",
                  "comic-morning-47_val_collection0_map@10.trec",
                  "feasible-glade-44_val_collection0_map@10.trec",
                  "bm25-baseline.trec"
                  ]

fused_trec = os.path.join("runs","rnd1","BIT.UA-05.trec")
fused_json = os.path.join("runs","rnd1","BIT.UA-05.json")

fusion_rrf([os.path.join("runs","rnd1",name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd1/BIT.UA-05.trec writen.


# Round 2

In [4]:
# Base directory joined with every run file name by the Round 2 cells below.
rnd_path = "runs/rnd2"

In [5]:
# Run 1: re-rank the BM25 baseline with a single checkpoint and write both output formats.
queries = load_queries("BioASQ-task9bPhaseA-testset2", maps=[("body","query")])

ranking_model = load_model("trained_models/earthy-glade-11_val_collection0_map@10")
_, test_input_generator = model_train_generator_for_model(ranking_model)

baseline_run_path = os.path.join(rnd_path, "bm25-baseline.run")
rerank = rerank_run(baseline_run_path, top_k=100)

# Sanity check: documents per query, the smallest such count, and the number of queries.
docs_per_query = [len(q["documents"]) for q in rerank]
print(docs_per_query, min(docs_per_query), len(rerank))

write_as_bioasq(rerank, os.path.join(rnd_path, "BIT.UA-01.json"))
write_as_trec(rerank, os.path.join(rnd_path, "BIT.UA-01.trec"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100


In [None]:
queries = load_queries("BioASQ-task9bPhaseA-testset2", maps=[("body","query")])

# Checkpoints whose individual runs are fused into run 2.
checkpoints_name = ["earthy-glade-11_val_collection0_map@10",
                    "easy-pyramid-4_val_collection0_map@10",
                    "peach-feather-6_val_collection0_map@10",
                    "glorious-dragon-3_val_collection0_map@10",
                    "tough-resonance-7_val_collection0_map@10",
                    "earthy-glade-11_val_collection1_map@10",
                    "easy-pyramid-4_val_collection1_map@10",
                    "glorious-dragon-3_val_collection1_map@10",
                    "tough-resonance-7_val_collection1_map@10",
                    "peach-feather-6_val_collection1_map@10",
                  ]

for checkpoint_name in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join(rnd_path, trec_name)

    # A run file already on disk is reused instead of being recomputed.
    if os.path.exists(trec_path):
        continue

    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    _, test_input_generator = model_train_generator_for_model(ranking_model)

    rerank = rerank_run(os.path.join(rnd_path, "bm25-baseline.run"), top_k=100)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))
    write_as_trec(rerank, trec_path)

    del ranking_model

# fusion
print("RRF Fusion")
fused_trec = os.path.join(rnd_path, "BIT.UA-02.trec")
fusion_rrf([os.path.join(rnd_path, name+".trec") for name in checkpoints_name], fused_trec)
convert_trec_run_to_bioasq(fused_trec,
                           queries,
                           os.path.join(rnd_path, "BIT.UA-02.json"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

In [6]:
queries = load_queries("BioASQ-task9bPhaseA-testset2", maps=[("body","query")])

# run 3
# Disabled alternative, kept for reference: convert the wandering-donkey-2 run instead.
#convert_trec_run_to_bioasq(os.path.join(rnd_path,"wandering-donkey-2_val_collection0_map@10.trec"), 
#                           queries,
#                           os.path.join(rnd_path,"BIT.UA-03.json"))

# Active conversion: the graceful-donkey-42 run is written to the "-long" JSON file.
run3_trec = os.path.join(rnd_path,"graceful-donkey-42_val_collection0_map@10.trec")
run3_json = os.path.join(rnd_path,"BIT.UA-03-long.json")
convert_trec_run_to_bioasq(run3_trec, queries, run3_json)

In [9]:
# run 4: fuse six runs coming from two training runs.

queries = load_queries("BioASQ-task9bPhaseA-testset2", maps=[("body","query")])

trec_run_names = ["wandering-donkey-2_val_collection0_map@10.trec",
                  "wandering-donkey-2_val_collection0_recall@10.trec",
                  "wandering-donkey-2_val_collection1_recall@10.trec",
                  "northern-spaceship-17_val_collection0_map@10.trec",
                  "northern-spaceship-17_val_collection1_map@10.trec",
                  "northern-spaceship-17_val_collection0_recall@10.trec",]

fused_trec = os.path.join(rnd_path,"BIT.UA-04.trec")
fused_json = os.path.join(rnd_path,"BIT.UA-04.json")

fusion_rrf([os.path.join(rnd_path,name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd2/BIT.UA-04.trec writen.


In [10]:
# run 5: fuse six neural runs together with the BM25 baseline run.

queries = load_queries("BioASQ-task9bPhaseA-testset2", maps=[("body","query")])

trec_run_names = ["earthy-glade-11_val_collection0_map@10.trec",
                  "easy-pyramid-4_val_collection0_map@10.trec",
                  "peach-feather-6_val_collection0_map@10.trec",
                  "wandering-donkey-2_val_collection0_map@10.trec",
                  "wandering-donkey-2_val_collection0_recall@10.trec",
                  "wandering-donkey-2_val_collection1_recall@10.trec",
                  "bm25-baseline.trec"]

fused_trec = os.path.join(rnd_path,"BIT.UA-05.trec")
fused_json = os.path.join(rnd_path,"BIT.UA-05.json")

fusion_rrf([os.path.join(rnd_path,name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd2/BIT.UA-05.trec writen.


# Round 3

In [5]:
# Base directory joined with every run file name by the Round 3 cells below.
rnd_path = "runs/rnd3"


In [12]:
# Run 1: re-rank the BM25 baseline with a single checkpoint and write both output formats.
queries = load_queries("BioASQ-task9bPhaseA-testset3", maps=[("body","query")])

ranking_model = load_model("trained_models/absurd-snow-7_val_collection0_map@10")
_, test_input_generator = model_train_generator_for_model(ranking_model)

baseline_run_path = os.path.join(rnd_path, "bm25-baseline.run")
rerank = rerank_run(baseline_run_path, top_k=100)

# Sanity check: documents per query, the smallest such count, and the number of queries.
docs_per_query = [len(q["documents"]) for q in rerank]
print(docs_per_query, min(docs_per_query), len(rerank))

write_as_bioasq(rerank, os.path.join(rnd_path, "BIT.UA-01.json"))
write_as_trec(rerank, os.path.join(rnd_path, "BIT.UA-01.trec"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100


In [13]:
queries = load_queries("BioASQ-task9bPhaseA-testset3", maps=[("body","query")])

# Checkpoints whose individual runs are fused into run 2.
checkpoints_name = ["absurd-snow-7_val_collection0_map@10",
                    "eternal-rain-6_val_collection0_map@10",
                    "kind-frog-5_val_collection0_map@10",
                    "grateful-wind-8_val_collection0_recall@10",
                    "clean-puddle-3_val_collection0_recall@10",
                    "absurd-snow-7_val_collection0_recall@10"
                  ]

for checkpoint_name in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join(rnd_path, trec_name)

    # A run file already on disk is reused instead of being recomputed.
    if os.path.exists(trec_path):
        continue

    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    _, test_input_generator = model_train_generator_for_model(ranking_model)

    rerank = rerank_run(os.path.join(rnd_path, "bm25-baseline.run"), top_k=100)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))
    write_as_trec(rerank, trec_path)

    del ranking_model

# fusion
print("RRF Fusion")
fused_trec = os.path.join(rnd_path, "BIT.UA-02.trec")
fusion_rrf([os.path.join(rnd_path, name+".trec") for name in checkpoints_name], fused_trec)
convert_trec_run_to_bioasq(fused_trec,
                           queries,
                           os.path.join(rnd_path, "BIT.UA-02.json"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

In [6]:
queries = load_queries("BioASQ-task9bPhaseA-testset3", maps=[("body","query")])

# run 3
# Disabled alternative, kept for reference: convert the swift-dawn-1 run instead.
#convert_trec_run_to_bioasq(os.path.join(rnd_path,"swift-dawn-1_val_collection0_map@10.trec"), 
#                           queries,
#                           os.path.join(rnd_path,"BIT.UA-03.json"))

# Active conversion: the graceful-donkey-42 run is written to the "-long" JSON file.
run3_trec = os.path.join(rnd_path,"graceful-donkey-42_val_collection0_map@10.trec")
run3_json = os.path.join(rnd_path,"BIT.UA-03-long.json")
convert_trec_run_to_bioasq(run3_trec, queries, run3_json)

In [16]:
# run 4: fuse three runs of the same training run.

queries = load_queries("BioASQ-task9bPhaseA-testset3", maps=[("body","query")])

trec_run_names = ["swift-dawn-1_val_collection0_map@10.trec",
                  "swift-dawn-1_val_collection0_recall@10.trec",
                  "swift-dawn-1_val_collection1_recall@10.trec"]

fused_trec = os.path.join(rnd_path,"BIT.UA-04.trec")
fused_json = os.path.join(rnd_path,"BIT.UA-04.json")

fusion_rrf([os.path.join(rnd_path,name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd3/BIT.UA-04.trec writen.


In [17]:
# run 5: fuse six neural runs together with the BM25 baseline run.

queries = load_queries("BioASQ-task9bPhaseA-testset3", maps=[("body","query")])

trec_run_names = ["absurd-snow-7_val_collection0_map@10.trec",
                  "eternal-rain-6_val_collection0_map@10.trec",
                  "kind-frog-5_val_collection0_map@10.trec",
                  "swift-dawn-1_val_collection0_map@10.trec",
                  "swift-dawn-1_val_collection0_recall@10.trec",
                  "swift-dawn-1_val_collection1_recall@10.trec",
                  "bm25-baseline.trec"]

fused_trec = os.path.join(rnd_path,"BIT.UA-05.trec")
fused_json = os.path.join(rnd_path,"BIT.UA-05.json")

fusion_rrf([os.path.join(rnd_path,name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd3/BIT.UA-05.trec writen.


# Round 4

In [5]:
# Base directory joined with every run file name by the Round 4 cells below.
rnd_path = "runs/rnd4"

In [8]:
queries = load_queries("BioASQ-task9bPhaseA-testset4", maps=[("body","query")])

# Checkpoints whose individual runs are fused into run 1.
checkpoints_name = ["stilted-night-3_val_collection0_map@10",
                    "trim-energy-7_val_collection0_map@10",
                    "dashing-snowflake-6_val_collection0_map@10",
                    "sage-serenity-2_val_collection0_map@10",
                    "copper-yogurt-5_val_collection0_map@10",
                    "quiet-thunder-4_val_collection0_map@10",
                    "stilted-night-3_val_collection1_recall@10",
                    "trim-energy-7_val_collection1_recall@10",
                    "dashing-snowflake-6_val_collection1_recall@10",
                    "sage-serenity-2_val_collection1_recall@10",
                    "copper-yogurt-5_val_collection1_recall@10",
                    "quiet-thunder-4_val_collection1_recall@10"
                  ]

for checkpoint_name in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join(rnd_path, trec_name)

    # A run file already on disk is reused instead of being recomputed.
    if os.path.exists(trec_path):
        continue

    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    _, test_input_generator = model_train_generator_for_model(ranking_model)

    rerank = rerank_run(os.path.join(rnd_path, "bm25-baseline.run"), top_k=100)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))
    write_as_trec(rerank, trec_path)

    del ranking_model

# fusion
print("RRF Fusion")
fused_trec = os.path.join(rnd_path, "BIT.UA-01.trec")
fusion_rrf([os.path.join(rnd_path, name+".trec") for name in checkpoints_name], fused_trec)
convert_trec_run_to_bioasq(fused_trec,
                           queries,
                           os.path.join(rnd_path, "BIT.UA-01.json"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

In [9]:
# run 3: fuse the map@10 and recall@10 runs of two training runs.

queries = load_queries("BioASQ-task9bPhaseA-testset4", maps=[("body","query")])

trec_run_names = ["brisk-butterfly-1_val_collection0_map@10.trec",
                  "brisk-butterfly-1_val_collection0_recall@10.trec",
                  "dandy-elevator-14_val_collection0_map@10.trec",
                  "dandy-elevator-14_val_collection0_recall@10.trec",]

fused_trec = os.path.join(rnd_path,"BIT.UA-03.trec")
fused_json = os.path.join(rnd_path,"BIT.UA-03.json")

fusion_rrf([os.path.join(rnd_path,name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd4/BIT.UA-03.trec writen.


# Round 5

In [4]:
# Base directory joined with every run file name by the Round 5 cells below.
rnd_path = "runs/rnd5"

In [5]:
queries = load_queries("BioASQ-task9bPhaseA-testset5", maps=[("body","query")])

# Checkpoints whose individual runs are fused into run 1.
checkpoints_name = ["misty-plant-8_val_collection1_map@10",
                    "gentle-monkey-6_val_collection1_map@10",
                    "hopeful-oath-3_val_collection1_map@10",
                    "stellar-silence-5_val_collection1_map@10",
                    "distinctive-lake-7_val_collection1_map@10",
                    "misty-plant-8_val_collection1_recall@10",
                    "gentle-monkey-6_val_collection1_recall@10",
                    "hopeful-oath-3_val_collection1_recall@10",
                    "stellar-silence-5_val_collection1_recall@10",
                    "distinctive-lake-7_val_collection1_recall@10",
                  ]

for checkpoint_name in checkpoints_name:
    trec_name = checkpoint_name+".trec"
    trec_path = os.path.join(rnd_path, trec_name)

    # A run file already on disk is reused instead of being recomputed.
    if os.path.exists(trec_path):
        continue

    ranking_model = load_model(os.path.join("trained_models", checkpoint_name))
    _, test_input_generator = model_train_generator_for_model(ranking_model)

    rerank = rerank_run(os.path.join(rnd_path, "bm25-baseline.run"), top_k=100)
    docs_per_query = [len(q["documents"]) for q in rerank]
    print(docs_per_query, min(docs_per_query), len(rerank))
    write_as_trec(rerank, trec_path)

    del ranking_model

# fusion
print("RRF Fusion")
fused_trec = os.path.join(rnd_path, "BIT.UA-01.trec")
fusion_rrf([os.path.join(rnd_path, name+".trec") for name in checkpoints_name], fused_trec)
convert_trec_run_to_bioasq(fused_trec,
                           queries,
                           os.path.join(rnd_path, "BIT.UA-01.json"))

DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embedding_bioasq_9b_gensim_iter_15_freq0_200_Regex_word2vec_bioasq_9b_RegexTokenizer
Using einsum for mask bq,bps->bpqs and with embedding dim bqe,bpse->bpqs
[EMBEDDING MATRIX SHAPE] (5322623, 200)
[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100] 100 100
DEBUG created tokenizer bioasq_9b_RegexTokenizer
False False
[LOAD FROM CACHE] Load embedding matrix from /backup/BioASQ-9b/embeddings/WORD2VEC_embed

File runs/rnd5/BIT.UA-01.trec writen.


In [6]:
# run 3: fuse the map@10 and recall@10 runs of the same training run.

queries = load_queries("BioASQ-task9bPhaseA-testset5", maps=[("body","query")])

trec_run_names = ["iconic-wave-1_val_collection0_map@10.trec",
                  "iconic-wave-1_val_collection0_recall@10.trec"]

fused_trec = os.path.join(rnd_path,"BIT.UA-03.trec")
fused_json = os.path.join(rnd_path,"BIT.UA-03.json")

fusion_rrf([os.path.join(rnd_path,name) for name in trec_run_names], fused_trec)
convert_trec_run_to_bioasq(fused_trec, queries, fused_json)

File runs/rnd5/BIT.UA-03.trec writen.
