In [1]:
import os
os.chdir("/home/tiagoalmeida/bioASQ-taskb/")

import json
import sys
import pickle
import numpy as np

module_path = os.path.abspath(os.path.join('pubmed_data'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from pubmed_data import pubmed_helper as ph

import


### Load train data results and Dataset

In [2]:
path="/backup/results/bm25"


GROUP_TRAIN_SET = False
GROUP_TEST_SET = True

if GROUP_TRAIN_SET:
    files = sorted(filter(lambda x: "train" in x,os.listdir(path)))
    print("Train files:",files)
    bm25_train_results = []
    for file in files:
        print("\nLoad:",file,end="\r")
        with open(os.path.join(path,file),"rb") as f:
            bm25_train_results.extend(pickle.load(f))

    print("Number of tested queries in train set:",len(bm25_train_results))

if GROUP_TEST_SET:
    files = sorted(filter(lambda x: "test" in x,os.listdir(path)))
    files=files[:2]
    print("Test files:",files)
    bm25_test_results = []
    for file in files:
        print("\nLoad:",file,end="\r")
        with open(os.path.join(path,file),"rb") as f:
            bm25_test_results.extend(pickle.load(f))

    print("Number of tested queries in test set:",len(bm25_test_results))

Test files: ['results_test_0.p', 'results_test_1.p']

Load: results_test_0.p
Number of tested queries in test set: 549


In [3]:
bioASQ_data_path = "/backup/BioASQ-training7b/"


def load_prepare_data(path, bm25_results):
    bioASQ_data = json.load(open(path))
    print("len bioasq", len(bioASQ_data), "len bm25", len(bm25_results))
    bioASQ_data = bioASQ_data[:len(bm25_results)]
    
    #verify the training data
    n_doc_per_query = map(lambda x:len(x["documents"]), bioASQ_data)
    
    empty_index = sorted([i for i, x in enumerate(n_doc_per_query) if x == 0], key=lambda x : -x)
    print("Empty goldstandard indexes:", empty_index)
    for i in empty_index:
        del bioASQ_data[i]
        del bm25_results[i]
    
    return bioASQ_data

if GROUP_TRAIN_SET: 
    print("TRAIN_SET")
    path_train = os.path.join(bioASQ_data_path,"7b_train_split.json")
    bioASQ_data_train = load_prepare_data(path_train, bm25_train_results)

if GROUP_TEST_SET:
    print("TEST_SET")
    path_test = os.path.join(bioASQ_data_path,"7b_test_split.json")
    bioASQ_data_test = load_prepare_data(path_test, bm25_test_results)



TEST_SET
len bioasq 549 len bm25 549
Empty goldstandard indexes: []


In [4]:
#analysis
from collections import Counter

if GROUP_TRAIN_SET: 
    len_doc_distribution = sorted(Counter(map(lambda x : len(x), bm25_train_results)).items(), key=lambda x:x[0])
    print("bm25_train_results distribution",len_doc_distribution)

if GROUP_TEST_SET:  
    len_doc_distribution = sorted(Counter(map(lambda x : len(x), bm25_test_results)).items(), key=lambda x:x[0])
    print("bm25_test_results distribution",len_doc_distribution)

bm25_test_results distribution [(1, 5), (4, 2), (7, 1), (8, 1), (15, 1), (20, 1), (24, 1), (34, 1), (42, 1), (75, 1), (126, 1), (1254, 1), (4114, 1), (7902, 1), (7944, 1), (15244, 1), (18614, 1), (26376, 1), (43843, 1), (50960, 1), (89781, 1), (100000, 523)]


In [5]:
from models.generic_model import f_recall, f_map

recall_at_ranges = [10,50,100,1000,2500]
CHOOSEN_RECALL = 2500

def check_recall(bioasq_data, bm25_results, recall_at_ranges):
    expectations = list(map(lambda x:x["documents"],bioasq_data))

    for i in recall_at_ranges:
        print("Data recall at",i,":",f_recall(bm25_results,expectations,at=i))
        
    print("Data map at",10,":",f_map(bm25_results,expectations,bioASQ=True))
    
def clip_at_recall(bm25_results, max_recall):
    data_set_unique_pmid = set()

    for fast_results in bm25_results:
        data_set_unique_pmid.update(set(map(lambda x:x[0],fast_results[:max_recall])))
    
    return data_set_unique_pmid


if GROUP_TRAIN_SET:
    print("TRAIN_SET")
    check_recall(bioASQ_data_train, bm25_train_results, recall_at_ranges)
    
    train_data_set_unique_pmid = clip_at_recall(bm25_train_results, CHOOSEN_RECALL)

if GROUP_TEST_SET: 
    print("TEST_SET")
    check_recall(bioASQ_data_test, bm25_test_results, recall_at_ranges)
    
    test_data_set_unique_pmid = clip_at_recall(bm25_test_results, CHOOSEN_RECALL)
    

TEST_SET
Data recall at 10 : 0.33122603972484665
Data recall at 50 : 0.5504357446726348
Data recall at 100 : 0.639780257070587
Data recall at 1000 : 0.8323096260300467
Data recall at 2500 : 0.8746784560651255
Data map at 10 : 0.15353167375025284


In [6]:
pmid_index_map = ph.pmid_index_mapping()

articles_generator = ph.create_pubmed_collection_generator()

Load /backup/saved_models/pmid_index_mapping.p
Open /backup/pubmed_archive_json/pubmed_ready.tar.gz
Creating generator


In [7]:
#convert pmid to articles index
if GROUP_TRAIN_SET: 
    train_data_set_unique_index = [ pmid_index_map[pmid] for pmid in train_data_set_unique_pmid]
    
    assert(len(train_data_set_unique_index) == len(train_data_set_unique_pmid))
    assert(len(set(train_data_set_unique_index)) == len(train_data_set_unique_pmid))
    
if GROUP_TEST_SET: 
    test_data_set_unique_index = [ pmid_index_map[pmid] for pmid in test_data_set_unique_pmid]
    
    assert(len(test_data_set_unique_index) == len(test_data_set_unique_pmid))
    assert(len(set(test_data_set_unique_index)) == len(test_data_set_unique_pmid))

In [8]:
articles = []

for article in articles_generator():
    articles.extend(article)

Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
Force garbage collector 0
Open the file: pubmed_ready_02776363_to_05519968
Returning: 2743606 articles
Force garbage collector 0
Open the file: pubmed_ready_05519969_to_08241071
Returning: 2721103 articles
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815
Returning: 2872502 articles
Force garbage collector 0
Open the file: pubmed_ready_13996816_to_18824354
Returning: 4827539 articles
Force garbage collector 0


In [9]:
if GROUP_TRAIN_SET: 
    
    #Irrelevant documents
    num_train_irrelevant_docs = len(train_data_set_unique_index)*2
    set_collection_index = set(range(len(articles)))

    irrelevant_index = set_collection_index-set(train_data_set_unique_index)


    #Random selection 
    from random import sample 
    selected_irrelevant_index = sample(list(irrelevant_index), num_train_irrelevant_docs)
    selected_irrelevant_pmid = list(map(lambda x:pmid_index_map.inverse[x], selected_irrelevant_index))

    #verification
    assert len(set(selected_irrelevant_index)-set(train_data_set_unique_index)) == len(selected_irrelevant_index)
    
    train_relevant_articles = { pmid_index_map.inverse[i]:articles[i] for i in train_data_set_unique_index+selected_irrelevant_index }

if GROUP_TEST_SET:
    test_relevant_articles = { pmid_index_map.inverse[i]:articles[i] for i in test_data_set_unique_index }

In [10]:
if GROUP_TRAIN_SET: 
    train_data_deep_models = []
        
    for i,fast_results in enumerate(bm25_train_results):
        
        positive = [ pos_doc_pmid for pos_doc_pmid in bioASQ_data_train[i]["documents"] if pos_doc_pmid in set(map(lambda x:x[0],fast_results[:CHOOSEN_RECALL])) ]
        
        #CHECK THIS CONDITION
        if len(positive)==0:
            continue
            
        top_results = fast_results[:CHOOSEN_RECALL]
    
        partially_positive = [x for x in top_results if x[0] not in set(positive)]

        #TODO change dont use the JUMP!
        if len(partially_positive)==0:
            continue

        partially_positive_bm25_score = list(map(lambda x:x[1], partially_positive))
        partially_positive = list(map(lambda x:x[0], partially_positive))
        #calculate negatives probabilities
        partially_positive_logits = np.power(1.1, np.array(partially_positive_bm25_score))
        partially_positive_prob = (partially_positive_logits/sum(partially_positive_logits)).tolist()

        #cumulative prob
        partially_positive_logits_cdf = [partially_positive_prob[0]]
        for k in range(1, len(partially_positive_prob)):
            partially_positive_logits_cdf.append(partially_positive_logits_cdf[-1] + partially_positive_prob[k])
                    
        top_results = list(map(lambda x:x[0],top_results))

        train_data_deep_models.append({"id":bioASQ_data_train[i]["id"],"query":bioASQ_data_train[i]["body"], "documents":top_results, "positive_pmid":positive ,"partilly_positive_pmid":partially_positive,"partially_positive_cumulative_prob":partially_positive_logits_cdf})
    
    train_data_deep_models = {"bioasq_data":train_data_deep_models,"irrelevant_pmid":selected_irrelevant_pmid, "collection":train_relevant_articles}
    
if GROUP_TEST_SET:
    test_data_deep_models = []

    for i,fast_results in enumerate(bm25_test_results):
        goldstandard = bioASQ_data_test[i]["documents"]
        
        positive = [ pos_doc_pmid for pos_doc_pmid in bioASQ_data_test[i]["documents"] if pos_doc_pmid in set(map(lambda x:x[0],fast_results[:CHOOSEN_RECALL])) ]
        
        top_results = list(map(lambda x:x[0],fast_results[:CHOOSEN_RECALL]))

        test_data_deep_models.append({"id":bioASQ_data_test[i]["id"],"query":bioASQ_data_test[i]["body"], "documents":top_results,"positive_pmid":positive,"goldstandard":goldstandard})
    
    test_data_deep_models = {"bioasq_data":test_data_deep_models, "collection":test_relevant_articles}

In [11]:
#import pad_sequences
article_map = lambda x:x["title"]+" "+x["abstract"]
#load tokenizer
MODE = "regex_full_tokens"
tk=ph.load_tokenizer(mode=MODE)
biomedical_stop_words = ["a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at", "be", "because", "been", "before", "being", "between", "both", "but", "by", "can", "could", "did", "do", "does", "done", "due", "during", "each", "either", "enough", "especially", "etc", "for", "found", "from", "further", "had", "has", "have", "having", "here", "how", "however", "i", "if", "in", "into", "is", "it", "its", "itself", "just", "kg", "km", "made", "mainly", "make", "may", "mg", "might", "ml", "mm", "most", "mostly", "must", "nearly", "neither", "no", "nor", "obtained", "of", "often", "on", "our", "overall", "perhaps", "pmid", "quite", "rather", "really", "regarding", "seem", "seen", "several", "should", "show", "showed", "shown", "shows", "significantly", "since", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "then", "there", "therefore", "these", "they", "this", "those", "through", "thus", "to", "upon", "use", "used", "using", "various", "very", "was", "we", "were", "what", "when", "which", "while", "with", "within", "without", "would"]
biomedical_stop_words_tokens = set(tk.texts_to_sequences([biomedical_stop_words])[0])

MAX_Q_TERM = 13

from tensorflow.keras.preprocessing.sequence import pad_sequences

def pre_process_data(data_deep_models):

    for query_data in data_deep_models["bioasq_data"]:
        tokenized_query = tk.texts_to_sequences([query_data["query"]])[0]
        tokenized_query = [ token for token in tokenized_query if token not in biomedical_stop_words_tokens]
        tokenized_query = pad_sequences([tokenized_query], maxlen = MAX_Q_TERM, padding="post")[0] #REMOVE THIS IS IN THE WRONG PLACE
        query_data["query"] = tokenized_query

    print("tokenize collection")    

    for key,doc in data_deep_models["collection"].items():

        data_deep_models["collection"][key] = tk.texts_to_sequences([article_map(doc)])[0]

if GROUP_TRAIN_SET:
    pre_process_data(train_data_deep_models)
    
if GROUP_TEST_SET:
    pre_process_data(test_data_deep_models)

Load regex_full_tokens_tokenizer.p
tokenize collection


In [12]:
if GROUP_TRAIN_SET: 

    #Save
    path_save = "/backup/results/fast_method_relevant_results"
    path_save = os.path.join(path_save, "train_data_deep_models_v3.p")

    with open(path_save, "wb") as f:
        pickle.dump(train_data_deep_models,f)
        
if GROUP_TEST_SET:
    
    #Save
    path_save = "/backup/results/fast_method_relevant_results"
    path_save = os.path.join(path_save, "test_data_deep_models_v3.p")

    with open(path_save, "wb") as f:
        pickle.dump(test_data_deep_models,f)

In [13]:
num_train_irrelevant_docs

NameError: name 'num_train_irrelevant_docs' is not defined