## README

PERFORM PREDICTION ON THE TEST SET
 

In [1]:
import os 
import sys
import numpy as np
import types
import pickle
import json
import gc
import heapq

# Make the project-local packages importable: the `pubmed_data` directory two
# levels up and the parent directory are appended to sys.path when missing.
for module_path in (
    os.path.abspath(os.path.join('..', '..', 'pubmed_data')),
    os.path.abspath(os.path.join('..',)),
):
    if module_path not in sys.path:
        sys.path.append(module_path)

from generic_model import ModelAPI
import pubmed_helper as ph

from bm25_inverted_index import InvertedIndex, DocumentLengthTable
from bm25_score import score_BM25

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Root directory of the persisted models.
# NOTE(review): this module-level name is not referenced by the visible cells
# (BM25.load repeats the same literal as its default) - confirm it is still needed.
saved_models_path = '/backup/saved_models/'

In [2]:
from multiprocessing import Process, Pool

class BM25(ModelAPI):
    """BM25 ranking model backed by a sharded, pickled inverted index.

    "Training" builds the inverted index and the document-length table from a
    generator of tokenized document batches. The index is split into
    ``num_index_files`` shards (``inverted_index_<i>.p``) stored under
    ``<saved_models_path>/bm25_data``; prediction loads one shard at a time so
    that only a single shard is held in memory.

    Attributes:
        document_len_table: ``DocumentLengthTable`` with the length of every
            indexed document.
        num_threads: number of worker processes used per batch while indexing.
        num_index_files: number of shards the inverted index is split into.
        full_tokens: when True every token known to the tokenizer is indexed,
            otherwise only tokens with a word count of at least 20.
    """

    def __init__(self, num_threads=4, num_index_files=5, full_tokens = False, saved_models_path=None):
        """Create an untrained model.

        Args:
            num_threads: worker processes launched per document batch.
            num_index_files: number of inverted-index shards.
            full_tokens: index every token instead of only the frequent ones.
            saved_models_path: forwarded to ``ModelAPI.__init__`` when given;
                otherwise the parent default is used.
        """
        if saved_models_path is None:
            super().__init__()
        else:
            super().__init__(saved_models_path=saved_models_path)

        self.document_len_table = DocumentLengthTable()
        self.num_threads = num_threads
        self.num_index_files = num_index_files
        self.full_tokens = full_tokens

    @staticmethod
    def _split_batch(num_documents, num_workers):
        """Return ``num_workers + 1`` slice boundaries covering ``num_documents``.

        Every worker gets ``num_documents // num_workers`` documents and the
        last one additionally takes the remainder.
        """
        per_worker = num_documents // num_workers
        return [k * per_worker for k in range(num_workers)] + [num_documents]

    @staticmethod
    def _divide_tokens(token_freq, num_files):
        """Distribute tokens over ``num_files`` shards with similar total counts.

        Args:
            token_freq: list of ``(token, count)`` sorted by decreasing count.
            num_files: number of shards.

        Returns:
            A list with one list of tokens per shard.
        """
        total_tokens = sum(freq for _, freq in token_freq)
        num_tokens_per_index = total_tokens // num_files

        file_division = [[] for _ in range(num_files)]
        cursor = 0
        for shard in file_division:
            count = 0
            while count < num_tokens_per_index and cursor < len(token_freq):
                token, freq = token_freq[cursor]
                shard.append(token)
                count += freq
                cursor += 1

        # The quota is floor-divided, so a few trailing low-frequency tokens
        # may be left unassigned once every shard met its quota; keep them in
        # the last shard instead of silently dropping them from the index.
        if file_division:
            file_division[-1].extend(token for token, _ in token_freq[cursor:])

        return file_division

    def _training_process(self, data):
        """Build the document-length table and the sharded inverted index.

        Each batch yielded by ``data`` is split between ``num_threads`` worker
        processes that pickle a partial index and a partial length table into
        the merge directory. Afterwards the partial length tables are merged
        into ``self.document_len_table`` and the partial indexes are merged
        into ``num_index_files`` shard files.

        Args:
            data: generator yielding batches (sliceable sequences) of
                tokenized documents; a document is an iterable of tokens.

        Raises:
            RuntimeError: if ``data`` is not a generator.
        """
        print("Note: The training process for the BM25, will consist in the build of the inverted index and doc table")

        if not isinstance(data, types.GeneratorType):
            raise RuntimeError("Only work with data generators")

        # Directory where the workers drop their partial results.
        path_root = "/backup/saved_models/bm25_to_merge/"

        def _index_batch(start_doc_id, documents):
            """Worker body: index one slice of documents and pickle the result."""
            print("Thread strat",os.getpid())
            inverted_index = InvertedIndex()
            document_len_table = DocumentLengthTable()

            for offset, document in enumerate(documents):
                doc_id = start_doc_id + offset
                for token in document:
                    inverted_index.add(token, doc_id)
                document_len_table.add(len(document), doc_id)

            # The zero-padded start id keeps the partial files sortable by name.
            file_name_inverted_index = os.path.join(path_root, "inverted_index_{0:08}".format(start_doc_id))
            file_name_document_len = os.path.join(path_root, "document_len_{0:08}".format(start_doc_id))

            print("save:",file_name_inverted_index)
            with open(file_name_inverted_index,"wb") as file:
                pickle.dump(inverted_index,file)

            print("save:",file_name_document_len)
            with open(file_name_document_len,"wb") as file:
                pickle.dump(document_len_table,file)

            print("Thread End",os.getpid())

        # Global id of the first document of the current batch.
        first_doc_id = 0
        for documents in data:
            bounds = self._split_batch(len(documents), self.num_threads)

            workers = []
            for begin, end in zip(bounds, bounds[1:]):
                # Skip empty slices (batch smaller than the worker count):
                # they would share a start id and overwrite each other's files.
                if begin == end:
                    continue
                workers.append(Process(target=_index_batch,
                                       args=(first_doc_id + begin, documents[begin:end],)))

            first_doc_id += len(documents)

            for worker in workers:
                worker.start()

            print("Wait for the threads")
            for worker in workers:
                worker.join()

        print("Document table merge")
        for f_name in sorted(name for name in os.listdir(path_root) if "document" in name):
            print("open file",f_name)
            with open(os.path.join(path_root,f_name),"rb") as f:
                self.document_len_table.table.update(pickle.load(f).table)

        print("Start the inverted_index merge")
        # The tokenizer word counts decide which tokens are indexed and how
        # they are distributed between the shards.
        tk = ph.load_tokenizer(mode="bllip_stem")
        if self.full_tokens:
            tk.num_words = None
            counted_words = tk.word_counts.items()
        else:
            # Only words that occur at least 20 times are kept.
            counted_words = ((word, freq) for word, freq in tk.word_counts.items() if freq >= 20)
        valid_tokens = ((tk.word_index[word], freq) for word, freq in counted_words)

        print("Sorting token_word_count")
        # The most frequent token is discarded (the original note says it is
        # the "." token - TODO confirm for other tokenizers).
        token_freq = sorted(valid_tokens, key=lambda x: -x[1])[1:]
        print("End Sort")

        # The tokenizer is large and no longer needed.
        del tk

        file_division = self._divide_tokens(token_freq, self.num_index_files)

        print("Invert index division:",[len(shard) for shard in file_division])

        files_to_merge = sorted(name for name in os.listdir(path_root) if "index" in name)

        # Every partial index is re-read once per shard so that only one
        # partial index and one shard are in memory at the same time.
        for i in range(self.num_index_files):
            print("BUILD: index file",i)
            inverted_index = {}

            for f_name in files_to_merge:

                print("open file",f_name)
                with open(os.path.join(path_root,f_name),"rb") as f:
                    loadded_inv_index = pickle.load(f).index

                for token in file_division[i]:
                    if token in loadded_inv_index:
                        if token not in inverted_index:
                            inverted_index[token] = loadded_inv_index[token]
                        else:
                            inverted_index[token].update(loadded_inv_index[token])

                del loadded_inv_index

            file_name = "inverted_index_"+str(i)+".p"
            print("Saving:",file_name)
            with open(os.path.join(self.saved_models_path, "bm25_data", file_name),"wb") as f:
                pickle.dump(inverted_index,f)

    def _predict_process(self, queries, **kwargs):
        """Score every query against all index shards and return the top hits.

        Args:
            queries: sequence of queries, each a list of token ids. The
                queries are not modified.
            **kwargs:
                use_precomputed_score (bool, default False): read the
                    ``inverted_index_bm25_<i>.p`` shards, whose values are
                    already BM25 scores, instead of scoring on the fly.
                mapping_f (default None): accepted for compatibility.
                    NOTE(review): it is only reported in the log and never
                    applied to the results - confirm the intended behavior.
                top_k (int, default 100000): maximum number of hits kept per
                    query.

        Returns:
            One list per query with up to ``top_k`` ``(docid, score)`` tuples,
            ordered by decreasing score.

        Raises:
            TypeError: if an unknown keyword argument is given.
        """
        use_precomputed_score = kwargs.pop('use_precomputed_score', False)
        mapping_f = kwargs.pop('mapping_f', None)
        top_k = kwargs.pop('top_k', 100000)

        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

        print("Precomputed score:","Enable" if use_precomputed_score else "Disable")
        print("Using map function:","Disable" if mapping_f is None else "Enable")

        # One docid -> accumulated score dictionary per query.
        results = [ {} for _ in queries]

        doc_N = len(self.document_len_table)
        avgdl = self.document_len_table.get_average_length()

        shard_prefix = "inverted_index_bm25_" if use_precomputed_score else "inverted_index_"

        for i in range(self.num_index_files):
            index_file_name = shard_prefix+str(i)+".p"
            print("Load",index_file_name)
            with open(os.path.join(self.saved_models_path,"bm25_data",index_file_name),"rb") as f:
                inverted_index = InvertedIndex(index=pickle.load(f))

            for j,query in enumerate(queries):
                if j%10==0:
                    print("query",j,end="\r")
                self.__single_query(query, results[j], inverted_index,doc_N, avgdl, use_precomputed_score)

            if results:
                print("Number of matching documents to query 0",len(results[0]))

            # Release the shard before the next one is loaded.
            del inverted_index
            gc.collect() #force garbage collector

        print("Sorting the results")
        final_result = [heapq.nlargest(top_k, q_result.items(), key=lambda x:x[1]) for q_result in results]

        del results
        gc.collect() #force garbage collector

        return final_result

    def __single_query(self, query, query_result, inverted_index, doc_N, avgdl, use_precomputed_score):
        """Accumulate into ``query_result`` the scores of one query on one shard.

        Args:
            query: list of token ids; left untouched.
            query_result: dict docid -> score, updated in place.
            inverted_index: index shard supporting ``in`` and ``[]`` by term,
                where ``[]`` yields a dict docid -> value.
            doc_N: number of documents in the collection.
            avgdl: average document length.
            use_precomputed_score: True when the shard values already are
                BM25 scores, False when they are term frequencies.
        """
        if not query:
            return

        # Work on a copy: the caller's query must survive for the next shards
        # and for later predict calls.
        terms = list(query)

        # Shortcut: while the result is still empty the postings of one term
        # can be bulk-copied instead of being added one by one. This is only
        # valid when the stored values are final scores.
        if use_precomputed_score and len(query_result)==0:
            min_index_query_term = min(terms)
            if min_index_query_term in inverted_index:
                query_result.update(inverted_index[min_index_query_term])
                terms.remove(min_index_query_term)

        for term in terms:
            if term not in inverted_index:
                continue

            doc_dict = inverted_index[term] # retrieve index entry
            n_docs_with_term = len(doc_dict)

            for docid, freq in doc_dict.items(): #for each document and its stored value
                if use_precomputed_score:
                    score = freq
                else:
                    score = score_BM25(n=n_docs_with_term,
                                       f=freq,
                                       qf=1,
                                       r=0,
                                       N=doc_N,
                                       dl=self.document_len_table.get_length(docid),
                                       avdl=avgdl) # calculate score

                if docid in query_result: #this document has already been scored once
                    query_result[docid] += score
                else:
                    query_result[docid] = score

    def pre_compute_index_bm25(self, qf=1,r=0):
        """Write shards whose values are BM25 scores instead of frequencies.

        Each ``inverted_index_<i>.p`` shard is loaded, every stored term
        frequency is replaced by its BM25 score and the result is saved as
        ``inverted_index_bm25_<i>.p`` in the same directory.

        Args:
            qf: ``qf`` argument forwarded to ``score_BM25``.
            r: ``r`` argument forwarded to ``score_BM25``.
        """
        doc_N = len(self.document_len_table)
        avgdl = self.document_len_table.get_average_length()

        for i in range(self.num_index_files):
            index_file_name = "inverted_index_"+str(i)+".p"
            print("Load",index_file_name)
            with open(os.path.join(self.saved_models_path,"bm25_data",index_file_name),"rb") as f:
                inverted_index = InvertedIndex(index=pickle.load(f))

            for term, doc_dict in inverted_index.index.items():
                n_docs_with_term = len(doc_dict)
                postings = inverted_index[term]

                # Only existing keys are reassigned, so the dictionary size
                # does not change while it is being iterated.
                for docid, freq in doc_dict.items():
                    postings[docid] = score_BM25(n=n_docs_with_term,
                                                 f=freq,
                                                 qf=qf,
                                                 r=r,
                                                 N=doc_N,
                                                 dl=self.document_len_table.get_length(docid),
                                                 avdl=avgdl) # calculate score

            index_file_name = "inverted_index_bm25_"+str(i)+".p"
            print("Save",index_file_name)

            with open(os.path.join(self.saved_models_path,"bm25_data",index_file_name),"wb") as f:
                pickle.dump(inverted_index.index,f)

            del inverted_index

    @staticmethod
    def load(path = '/backup/saved_models/',full_tokens = False):
        """Return a BM25 model using the document-length table stored in ``path``.

        The table is read from ``<path>/bm25_data/document_len.p``. When it
        cannot be read a message is printed and an untrained model is
        returned (best effort).

        NOTE: the file is unpickled, so ``path`` must point to trusted data.

        Args:
            path: root directory of the saved models.
            full_tokens: value stored in the ``full_tokens`` attribute.

        Returns:
            A ``BM25`` instance; ``trained`` is set to True only when a
            non-empty table was loaded.
        """
        table = {}
        try:
            with open(os.path.join(path, "bm25_data", "document_len.p"),"rb") as file:
                table = pickle.load(file)
        except Exception as err:
            # Still best effort, but KeyboardInterrupt/SystemExit are no
            # longer swallowed and the cause is reported.
            print("Error when loading the model, a non trained model will be returned")
            print("Cause:", repr(err))

        bm25 = BM25()
        bm25.full_tokens = full_tokens
        bm25.saved_models_path = path

        if len(table)!=0:
            bm25.document_len_table.table = table
            bm25.trained = True

        return bm25

    def save(self):
        """Pickle the document-length table to ``<saved_models_path>/bm25_data/document_len.p``."""
        with open(os.path.join(self.saved_models_path, "bm25_data", "document_len.p"),"wb") as file:
            pickle.dump(self.document_len_table.table,file)
        
        


In [3]:

# Run configuration: FULL_TOKENS is passed to BM25.load and later decides
# whether the tokenizer's num_words limit is removed; MODE names the
# tokenizer that is loaded for the queries.
FULL_TOKENS = True
MODE = "bllip_stem_full_tokens"

# Load the model from the default saved-models path of BM25.load.
bm25_model = BM25.load(full_tokens=FULL_TOKENS)



True

## Testing on the dataset

In [7]:
# Read the questions of the BioASQ test batch.
with open("/backup/bioASQ_test_set/BioASQ-TaskB-testData/phaseB_5b_01.json","r") as f:
    bioASQ_data = json.load(f)["questions"]

# Keep id, body and the document ids (last path component of each document
# URL), and store this reduced version on disk.
bioASQ_data_documents = [
    {
        "id": question["id"],
        "documents": [url.split("/")[-1] for url in question["documents"]],
        "body": question["body"],
    }
    for question in bioASQ_data
]

with open("/backup/bioASQ_test_set/process/phaseB_5b_01.json","w") as f:
    json.dump(bioASQ_data_documents,f)

tk = ph.load_tokenizer(mode=MODE)

if FULL_TOKENS:
    tk.num_words = None

# Replace every question body by its token-id sequence; the "documents" entry
# is carried over unchanged.
bioASQ_data = [
    {
        "body": tk.texts_to_sequences([question["body"]])[0],
        "documents": question["documents"],
    }
    for question in bioASQ_data
]

#Load mapping
pmid_index_map = ph.pmid_index_mapping()


Load bllip_stem_full_tokens_tokenizer.p
Load /backup/saved_models/pmid_index_mapping.p


In [8]:

def index_pmid_f(x):
    """Turn a ``(document index, score)`` pair into ``(pmid, score)``."""
    return (pmid_index_map.inverse[x[0]], x[1])


# Tokenized question bodies, in the same order as bioASQ_data.
queries = [question["body"] for question in bioASQ_data]

#bm25_model.num_threads_predict = 1
#result_0 = bm25_model.predict(queries[:200] ,use_precomputed_score = True)
    

In [9]:
batch_size = 50

# Slice boundaries: 0, batch_size, 2*batch_size, ... followed by len(queries).
batch = list(range(0,len(queries),batch_size))
batch.append(len(queries))

# Predict one slice at a time and pickle each slice's results to its own
# file, so that only one slice of results is held in memory.
for i, (start, stop) in enumerate(zip(batch, batch[1:])):
    result = bm25_model.predict(queries[start:stop], use_precomputed_score=True)
    # Translate the document indexes of every hit into PMIDs.
    result = [[index_pmid_f(hit) for hit in q_result] for q_result in result]
    file_name = "/backup/results/bm25/results_test_phaseA_5b_01_{0:02}.p".format(i)
    print("Save:",file_name)
    with open(file_name,"wb") as f:
        pickle.dump(result,f)
    del result
    gc.collect()

Precomputed score: Enable
Using map function: Disable
Load inverted_index_bm25_0.p
Number of matching documents to query 0 6558577
Load inverted_index_bm25_1.p
Number of matching documents to query 0 7340729
Load inverted_index_bm25_2.p
Number of matching documents to query 0 7340729
Load inverted_index_bm25_4.p
Number of matching documents to query 0 7346723
Sorting the results
Save: /backup/results/bm25/results_test_phaseA_5b_01_00.p
Precomputed score: Enable
Using map function: Disable
Load inverted_index_bm25_0.p
Number of matching documents to query 0 0
Load inverted_index_bm25_1.p
Number of matching documents to query 0 1461638
Load inverted_index_bm25_2.p
Number of matching documents to query 0 1461638
Load inverted_index_bm25_3.p
Number of matching documents to query 0 1574556
Load inverted_index_bm25_4.p
Number of matching documents to query 0 1618752
Sorting the results
Save: /backup/results/bm25/results_test_phaseA_5b_01_01.p


In [15]:
# Apply index_pmid_f to every hit of every query in `result`
# (prepares the results for saving).
result=[list(map(index_pmid_f,q_result)) for q_result in result]


In [16]:
# Pickle the current `result` object to disk.
with open("/backup/results/bm25/results_test_1.p","wb") as f:
    pickle.dump(result,f)

In [14]:
# Predict the queries from position 300 onwards using the precomputed BM25 shards.
result = bm25_model.predict(queries[300:] ,use_precomputed_score = True)

Precomputed score: Enable
Using map function: Disable
Load inverted_index_bm25_0.p
Number of matching documents to query 0 4051524
Load inverted_index_bm25_1.p
Number of matching documents to query 0 4303315
Load inverted_index_bm25_2.p
Number of matching documents to query 0 4813689
Load inverted_index_bm25_3.p
Number of matching documents to query 0 4813689
Load inverted_index_bm25_4.p
Number of matching documents to query 0 4813982
Sorting the results


In [7]:
# Apply index_pmid_f to every hit of every query in `result`.
result=[list(map(index_pmid_f,q_result)) for q_result in result]

In [7]:
result_0[0]

[(12253817, 24.180470358903328),
 (15799559, 23.181407856563723),
 (13213392, 22.88588695111627),
 (15831037, 22.636492624939955),
 (15517695, 22.35078579429482),
 (6634123, 21.60846913279239),
 (8423884, 21.562244538483643),
 (17320845, 21.562244538483643),
 (17644776, 21.52609190855462),
 (1714917, 21.45738243785513),
 (990098, 21.261316175223257),
 (9992858, 21.16576602510993),
 (15757637, 21.1147419849472),
 (632881, 21.11208526474826),
 (4864862, 21.038335650005166),
 (6822191, 20.9544573478477),
 (9992855, 20.898420610739247),
 (2521643, 20.728871437757974),
 (5523678, 20.57017542758215),
 (597293, 20.555718026482445),
 (15538514, 20.500907354783486),
 (12664357, 20.4703301151217),
 (12129754, 20.288474024957058),
 (10235141, 20.16537333978694),
 (4113901, 19.93919970831876),
 (13699711, 19.933835865913803),
 (10793014, 19.928413646691435),
 (970274, 19.797672388126035),
 (1561031, 19.742337415290542),
 (14065170, 19.725397986480225),
 (17681295, 19.70821363388878),
 (2150487, 19

In [37]:
s1

[(16010539, 26.81821950573897),
 (735441, 26.21749008023477),
 (3452125, 25.72648463050294),
 (14188754, 24.539478831249703),
 (18161169, 23.355325498417002),
 (11606014, 23.329962638732184),
 (15869967, 23.020901573749303),
 (13859640, 22.991830258304024),
 (7216053, 22.906480939776696),
 (3640784, 22.90309173769834)]

In [8]:
pmid_index_map.inverse[12253817]

'28796422'

In [29]:
len(bm25_model.document_len_table)

18824355

In [23]:
#744568
tk.word_counts["path2ppi"]

5

In [9]:
bioASQ_test[13]["body"]

'What is Path2PPI?'

In [75]:
# Count the articles of the tokenized PubMed collection by summing the size
# of every batch produced by the generator.

g = ph.create_tokenized_pubmed_collection_generator(mode=MODE)
total_docs = 0
# `g` is a callable that returns the batch iterator.
for docs in g():
    total_docs += len(docs)

Open the pubmed tokenized tar.gz
Creating generator
Open the file: bllip_stem_small_file_00_title_abs_pubmed.p
Returning: 3690895 articles
Force garbage collector 0
Open the file: bllip_stem_small_file_01_title_abs_pubmed.p
Returning: 3643138 articles
Force garbage collector 0
Open the file: bllip_stem_small_file_02_title_abs_pubmed.p
Returning: 3790281 articles
Force garbage collector 0
Open the file: bllip_stem_small_file_03_title_abs_pubmed.p
Returning: 3838006 articles
Force garbage collector 0
Open the file: bllip_stem_small_file_04_title_abs_pubmed.p
Returning: 3862035 articles
Force garbage collector 0


In [77]:
docs = next(g())

Open the file: bllip_stem_small_file_00_title_abs_pubmed.p
Returning: 3690895 articles


In [80]:
docs[853213]

[1145,
 13133,
 95,
 527,
 1007,
 5270,
 4247,
 150,
 4247,
 20,
 4247,
 4247,
 18,
 1395,
 849,
 34418,
 20,
 20,
 5270,
 4247,
 159,
 209,
 163,
 1007,
 4247,
 1007,
 527,
 194,
 18924,
 4247,
 168,
 183,
 20,
 276,
 327,
 725,
 5006,
 725,
 101,
 475,
 881,
 64,
 3028,
 4247,
 20,
 580,
 320,
 168,
 183,
 170,
 104,
 13133,
 95,
 527,
 163,
 2579,
 1658,
 1167,
 3028,
 4247,
 18,
 20,
 759,
 34418,
 20,
 158,
 18924,
 4247,
 102,
 987,
 95,
 170,
 20,
 697,
 18,
 34418,
 20,
 1087,
 170,
 61,
 150,
 20,
 1575,
 262,
 1334,
 686,
 1145,
 13133,
 95,
 170,
 3028,
 4247,
 20,
 18924,
 4247,
 87,
 245,
 209,
 163,
 1007,
 1070,
 3671,
 1372,
 209,
 2526,
 95,
 1145,
 13133,
 95,
 29,
 102,
 101,
 1959,
 1011,
 875,
 163,
 13376,
 159,
 10514,
 1245]

In [81]:
[tk.index_word[w] for w in docs[853213]]

['collagen',
 'computeris',
 'reduc',
 'accord',
 'rather',
 'bacteremia',
 'pco',
 'within',
 'pco',
 'associ',
 'pco',
 'pco',
 'level',
 'pretreat',
 'intracellular',
 'a-a',
 'associ',
 'associ',
 'bacteremia',
 'pco',
 'imag',
 'cultur',
 'experi',
 'rather',
 'pco',
 'rather',
 'accord',
 'area',
 'radiochemotherapi',
 'pco',
 'without',
 'signal',
 'associ',
 'lesion',
 '50',
 'undergo',
 '3-year',
 'undergo',
 'gener',
 'profil',
 'clone',
 'perform',
 'adhd',
 'pco',
 'associ',
 'articl',
 'oxid',
 'without',
 'signal',
 'regul',
 'import',
 'computeris',
 'reduc',
 'accord',
 'experi',
 'wors',
 'deep',
 'capabl',
 'adhd',
 'pco',
 'level',
 'associ',
 'reliabl',
 'a-a',
 'associ',
 'requir',
 'radiochemotherapi',
 'pco',
 'health',
 'allel',
 'reduc',
 'regul',
 'associ',
 'fractur',
 'level',
 'a-a',
 'associ',
 'longer',
 'regul',
 'one',
 'within',
 'associ',
 'cdna',
 'anim',
 'oil',
 'stem',
 'collagen',
 'computeris',
 'reduc',
 'regul',
 'adhd',
 'pco',
 'associ',
 'r

In [35]:
## Verify empty queries with the bllip_stem tokenization

# Positions of the questions whose tokenized body has no tokens.
empty = [
    position
    for position, question in enumerate(bioASQ_fulldata_tokenized)
    if len(question["body"]) == 0
]

print(len(empty))

0


In [36]:
#find max index

max(map(lambda x: max(x["body"]), bioASQ_fulldata_tokenized))

10655700

In [31]:
bioASQ_fulldata_tokenized[2745]

{'body': [], 'documents': ['26551787']}

In [32]:
bioASQ_fulldata[2745]

{'body': 'What is PNPPP?',
 'documents': ['26551787'],
 'exact_answer': ['personally normalized plasma protein profiles'],
 'id': '5a9da59c4e03427e73000005',
 'ideal_answer': ['personally normalized plasma protein profiles (PNPPP)'],
 'snippets': [{'beginSection': 'abstract',
   'document': 'http://www.ncbi.nlm.nih.gov/pubmed/26551787',
   'endSection': 'abstract',
   'offsetInBeginSection': 10,
   'offsetInEndSection': 197,
   'text': ' To study the impact of genetic and lifestyle factors on protein biomarkers and develop personally normalized plasma protein profiles (PNPPP) controlling for non-disease-related variance.'}],
 'type': 'factoid'}

In [38]:
tk.word_counts['mowat–wilson']

1

In [37]:
tk.index_word[10655700]

'mowat–wilson'

In [40]:
# Pickle the tokenizer object `tk` in its current state to the tokenizers directory.
with open("/backup/pubmed_tokenizers/bllip_stem_Nnone_tokenizer_full_pubmed.p","wb") as f:
    pickle.dump(tk,f)