In [1]:
import os
os.chdir("/home/tiagoalmeida/bioASQ-taskb/")

import json
import sys
import pickle
import numpy as np

module_path = os.path.abspath(os.path.join('pubmed_data'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pubmed_data import pubmed_helper as ph

### Load test data results and dataset

In [2]:
# Directory holding the pickled BM25 retrieval results.
path = "/backup/results/bm25"

# Select the result shards of the 5b batch-01 test run; sorting keeps the
# shards (and therefore the queries) in file-name order.
files = sorted(name for name in os.listdir(path) if "test_phaseA_5b_01" in name)
print("Test files:", files)

# Concatenate the per-shard result lists into a single list.
bm25_test_results = []
for file in files:
    print("\nLoad:", file, end="\r")
    with open(os.path.join(path, file), "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        bm25_test_results.extend(pickle.load(f))

print("Number of tested queries in test set:", len(bm25_test_results))

Test files: ['results_test_phaseA_5b_01_00.p', 'results_test_phaseA_5b_01_01.p']

Load: results_test_phaseA_5b_01_00.p
Number of tested queries in test set: 100


In [8]:
# Directory containing the processed BioASQ test-set JSON files.
bioASQ_data_path = "/backup/bioASQ_test_set/process/"


def load_prepare_data(path, bm25_results):
    """Load the BioASQ questions and align them with the BM25 results.

    The questions are read from the JSON file at ``path`` and truncated to the
    number of available BM25 results. Every question whose ``"documents"``
    list is empty is then dropped, together with the BM25 result stored at the
    same position.

    Args:
        path: Path of a JSON file holding a list of question dicts, each with
            a ``"documents"`` list.
        bm25_results: List with one entry per question, in the same order.
            It is modified in place: entries belonging to dropped questions
            are deleted.

    Returns:
        The list of questions that have at least one gold-standard document.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path) as json_file:
        bioASQ_data = json.load(json_file)
    print("len bioasq", len(bioASQ_data), "len bm25", len(bm25_results))
    bioASQ_data = bioASQ_data[:len(bm25_results)]

    # Positions of the questions without any gold-standard document, in
    # descending order so that deleting one does not shift the positions that
    # are still to be deleted.
    empty_index = [i for i, question in enumerate(bioASQ_data) if len(question["documents"]) == 0]
    empty_index.reverse()
    print("Empty goldstandard indexes:", empty_index)
    for i in empty_index:
        del bioASQ_data[i]
        del bm25_results[i]

    return bioASQ_data

# Load the test questions and drop, from both the questions and the BM25
# results, the queries that have no gold-standard document.
print("TEST_SET")
path_test = os.path.join(bioASQ_data_path, "phaseB_5b_01.json")
bioASQ_data_test = load_prepare_data(path=path_test, bm25_results=bm25_test_results)



TEST_SET
len bioasq 100 len bm25 100
Empty goldstandard indexes: []


In [9]:
#analysis
from collections import Counter

# Distribution of the BM25 result-list sizes as (list length, number of
# queries) pairs, ordered by list length.
result_length_counts = Counter(len(results) for results in bm25_test_results)
len_doc_distribution = sorted(result_length_counts.items())
print("bm25_test_results distribution", len_doc_distribution)

bm25_test_results distribution [(1, 1), (2, 2), (11, 1), (28, 1), (231, 1), (814, 1), (13253, 1), (15521, 1), (23377, 1), (30233, 1), (100000, 89)]


In [25]:
from models.generic_model import f_recall, f_map

# Cut-off depths at which the recall of the BM25 ranking is reported.
recall_at_ranges = [100,1000,2500,5000]
# Number of top BM25 results kept per query in the cells below.
CHOOSEN_RECALL = 1000

def check_recall(bioasq_data, bm25_results, recall_at_ranges):
    """Print the recall of ``bm25_results`` at several cut-offs, then its MAP.

    Args:
        bioasq_data: List of question dicts; the ``"documents"`` entry of each
            one is used as the expected result for that question.
        bm25_results: Ranked results, one entry per question.
        recall_at_ranges: Iterable of cut-off values passed to ``f_recall`` as
            its ``at`` argument.
    """
    expectations = [question["documents"] for question in bioasq_data]

    for cutoff in recall_at_ranges:
        recall = f_recall(bm25_results, expectations, at=cutoff)
        print("Data recall at", cutoff, ":", recall)

    # The printed "10" is a fixed label; no cut-off is passed to f_map here.
    map_score = f_map(bm25_results, expectations, bioASQ=True)
    print("Data map at", 10, ":", map_score)
    
def clip_at_recall(bm25_results, max_recall):
    """Collect the identifiers found in the top ``max_recall`` results of every query.

    Args:
        bm25_results: Iterable of ranked result lists; the first element of
            each result entry is taken as its identifier.
        max_recall: Number of leading results considered per query.

    Returns:
        The set of distinct identifiers across all queries.
    """
    return {
        entry[0]
        for fast_results in bm25_results
        for entry in fast_results[:max_recall]
    }



# Report the retrieval quality on the test split, then gather every PMID that
# appears in the top-CHOOSEN_RECALL results of at least one query.
print("TEST_SET")
check_recall(
    bioasq_data=bioASQ_data_test,
    bm25_results=bm25_test_results,
    recall_at_ranges=recall_at_ranges,
)

test_data_set_unique_pmid = clip_at_recall(bm25_results=bm25_test_results, max_recall=CHOOSEN_RECALL)
    

TEST_SET
Data recall at 100 : 0.5800245636716224
Data recall at 1000 : 0.7680816454787041
Data recall at 2500 : 0.8652040508511094
Data recall at 5000 : 0.8959687028657615
Data map at 10 : 0.0645845238095238


In [17]:
#check map
# Gold-standard documents of every test question, in question order.
expectations = [question["documents"] for question in bioASQ_data_test]

# Left as the last expression so the notebook displays the score.
f_map(bm25_test_results, expectations, bioASQ=True)



0.0645845238095238

In [18]:
# PMID <-> collection-index mapping from the project's pubmed helper; it is
# indexed by PMID and, through `.inverse`, by index in the cells below.
pmid_index_map = ph.pmid_index_mapping()

# Callable that iterates over the PubMed collection when invoked
# (presumably in batches of articles; confirm in pubmed_helper).
articles_generator = ph.create_pubmed_collection_generator()

Load /backup/saved_models/pmid_index_mapping.p
Open /backup/pubmed_archive_json/pubmed_ready.tar.gz
Creating generator


In [19]:
# Translate every selected PMID into its index through the PMID/index mapping.
test_data_set_unique_index = list(map(lambda pmid: pmid_index_map[pmid], test_data_set_unique_pmid))

# The translation must be one-to-one: same number of entries, no shared index.
assert len(test_data_set_unique_index) == len(test_data_set_unique_pmid)
assert len(set(test_data_set_unique_index)) == len(test_data_set_unique_pmid)

In [None]:
# Materialise the whole collection in memory: every item produced by the
# generator is an iterable of articles, appended in the order it is yielded.
articles = []

for article_batch in articles_generator():
    articles += article_batch

Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
Force garbage collector 0
Open the file: pubmed_ready_02776363_to_05519968
Returning: 2743606 articles
Force garbage collector 0
Open the file: pubmed_ready_05519969_to_08241071
Returning: 2721103 articles
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815


In [None]:
# Keep only the articles selected for the test queries, keyed by PMID
# (`pmid_index_map.inverse` turns a position in `articles` back into its PMID).
test_relevant_articles = { pmid_index_map.inverse[i]:articles[i] for i in test_data_set_unique_index }

In [None]:
# Build, for every test query, the record consumed by the deep models: the
# query text, its top-CHOOSEN_RECALL BM25 candidates, the gold-standard
# documents and the gold-standard documents that BM25 actually retrieved.
test_queries = []

for i, fast_results in enumerate(bm25_test_results):
    question = bioASQ_data_test[i]
    goldstandard = question["documents"]

    # PMIDs of the top-CHOOSEN_RECALL BM25 results, in rank order.
    top_results = [result[0] for result in fast_results[:CHOOSEN_RECALL]]
    # Build the membership set once per query rather than once per
    # gold-standard document.
    top_results_set = set(top_results)

    # Gold-standard documents present among the BM25 candidates.
    positive = [pos_doc_pmid for pos_doc_pmid in goldstandard if pos_doc_pmid in top_results_set]

    test_queries.append({
        "id": question["id"],
        "query": question["body"],
        "documents": top_results,
        "positive_pmid": positive,
        "goldstandard": goldstandard,
    })

# Final structure: the per-query records plus the articles they refer to.
test_data_deep_models = {"bioasq_data": test_queries, "collection": test_relevant_articles}

In [23]:
# Text of an article handed to the tokenizer: its title and abstract joined by a space.
article_map = lambda x:x["title"]+" "+x["abstract"]

# Load the project's tokenizer for the selected mode.
MODE = "regex_full_tokens"
tk = ph.load_tokenizer(mode=MODE)
# Stop words that are removed from the queries before they reach the models.
biomedical_stop_words = ["a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at", "be", "because", "been", "before", "being", "between", "both", "but", "by", "can", "could", "did", "do", "does", "done", "due", "during", "each", "either", "enough", "especially", "etc", "for", "found", "from", "further", "had", "has", "have", "having", "here", "how", "however", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "kg", "km", "made", "mainly", "make", "may", "mg", "might", "ml", "mm", "most", "mostly", "must", "nearly", "neither", "no", "nor", "obtained", "of", "often", "on", "our", "overall", "perhaps", "pmid", "quite", "rather", "really", "regarding", "seem", "seen", "several", "should", "show", "showed", "shown", "shows", "significantly", "since", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "then", "there", "therefore", "these", "they", "this", "those", "through", "thus", "to", "upon", "use", "used", "using", "various", "very", "was", "we", "were", "what", "when", "which", "while", "with", "within", "without", "would"]
# The same stop words as a set of tokenizer ids, for fast membership tests.
biomedical_stop_words_tokens = set(tk.texts_to_sequences([biomedical_stop_words])[0])

# Length given to pad_sequences (maxlen) for every tokenized query.
MAX_Q_TERM = 13

from tensorflow.keras.preprocessing.sequence import pad_sequences

def pre_process_data(data_deep_models):
    """Tokenize the queries and the article collection of ``data_deep_models`` in place.

    Every entry of ``data_deep_models["bioasq_data"]`` gets its ``"query"``
    text replaced by its token ids, with the stop-word tokens removed and the
    result passed through ``pad_sequences`` (``maxlen=MAX_Q_TERM``,
    ``padding="post"``). Every value of ``data_deep_models["collection"]`` is
    replaced by the token ids of the text built by ``article_map``.

    The raw texts are overwritten and nothing is returned.
    """
    for query_data in data_deep_models["bioasq_data"]:
        query_tokens = tk.texts_to_sequences([query_data["query"]])[0]
        kept_tokens = [tok for tok in query_tokens if tok not in biomedical_stop_words_tokens]
        # TODO (note kept from the original code): this padding step is in the
        # wrong place and is meant to be removed from here.
        padded_batch = pad_sequences([kept_tokens], maxlen=MAX_Q_TERM, padding="post")
        query_data["query"] = padded_batch[0]

    print("tokenize collection")

    # Only existing keys are reassigned, so the dict size is stable while iterating.
    collection = data_deep_models["collection"]
    for pmid, article in collection.items():
        collection[pmid] = tk.texts_to_sequences([article_map(article)])[0]

# Tokenize the test queries and the article collection in place.
pre_process_data(test_data_deep_models)

tokenize collection


In [24]:

#Save
# Destination file for the pickled test data (query records plus collection).
path_save = os.path.join("/backup/results/fast_method_relevant_results", "test_phaseA_5b_01.p")

with open(path_save, "wb") as output_file:
    pickle.dump(test_data_deep_models, output_file)