In [1]:
import pickle
import os
import sys
import gc
import numpy as np
from scipy import sparse


from multiprocessing import Process

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pubmed_data.pubmed_helper as ph

In [2]:
# Build the generator over the raw PubMed collections. Every article record is
# reduced to the text that will be tokenized: the title alone, or the title
# followed by the abstract.
ONLY_TITLE = False


def _title_text(x):
    """Return only the title of article record ``x``."""
    return x["title"]


def _title_abstract_text(x):
    """Return the title and the abstract of article record ``x`` joined by a space."""
    return x["title"] + " " + x["abstract"]


# The mapping is chosen once, here; changing ONLY_TITLE afterwards has no effect
# until this cell is run again.
map_function = _title_text if ONLY_TITLE else _title_abstract_text

pubmed_generator = ph.create_pubmed_collection_generator(map_function)

Open /backup/pubmed_archive_json/pubmed_ready.tar.gz
Creating generator


## Choose the tokenizer

In [3]:
# True: load a previously saved tokenizer; False: fit a new one in the cells below.
LOAD_TOKENIZER = True
# True: keep the full vocabulary (num_words is set to None on load and the merged
# tokenizer file name gets no "_N<count>" suffix).
FULL_TOKENS = False
# Tokenizer variant. It selects the tokenizer to load/create and is the prefix of
# every tokenizer and tokenized-output file name.
MODE = "regex_less_700k_freq"

In [4]:


# Obtain the tokenizer for MODE: either load a saved one into `tk`, or define the
# `create_tokenizer` factory that the fitting cells use to build fresh tokenizers.
if LOAD_TOKENIZER:

    if MODE == "bag_of_trigrams":
        # regex_alfanum_tokenizer is used by the bag-of-trigrams tokenizeJob
        from keras_new_text import regex_alfanum_tokenizer
        tk = ph.load_tokenizer("hashtrick_full_tokens")

    else:
        tk = ph.load_tokenizer(mode = MODE)
        if FULL_TOKENS:
            # clear the vocabulary limit
            tk.num_words=None

    print("Num words:",tk.num_words)
else:
    if MODE == "bllip":
        from keras_new_text import Tokenizer, bllip_tokenizer
        create_tokenizer = lambda : Tokenizer(tokenizer = bllip_tokenizer)

    elif MODE == "bllip_stem":
        from keras_new_text import Tokenizer,  bllip_stopW_stem_tokenizer
        create_tokenizer = lambda : Tokenizer(tokenizer = bllip_stopW_stem_tokenizer)

    elif MODE == "keras":
        from tensorflow.keras.preprocessing.text import Tokenizer
        create_tokenizer = lambda : Tokenizer()

    elif MODE == "regex_full_tokens":
        from keras_new_text import Tokenizer, regex_alfanum_tokenizer
        create_tokenizer = lambda : Tokenizer(tokenizer = regex_alfanum_tokenizer)

    elif MODE == "hashtrick_full_tokens":
        base_tk = ph.load_tokenizer("regex_full_tokens")
        # Save some memory: drop the tables of the base tokenizer that are not
        # read afterwards (fitTokenizeJob only reads base_tokenizer.index_word).
        del base_tk.word_index
        del base_tk.word_counts
        del base_tk.word_docs
        del base_tk.index_docs

        # The trigram tokenizer is fitted on the already tokenized collection,
        # so the raw-text generator is replaced here.
        pubmed_generator = ph.create_tokenized_pubmed_collection_generator(mode="regex_full_tokens")
        from keras_new_text import Tokenizer ,trigram_tokenizer, tri_gram
        create_tokenizer = lambda : Tokenizer(tokenizer = trigram_tokenizer)

        gc.collect()

    else:
        # Fail here instead of with a NameError on `create_tokenizer` once the
        # worker processes have already been launched.
        raise ValueError(
            "Unsupported MODE {!r} for fitting a tokenizer; expected one of: "
            "bllip, bllip_stem, keras, regex_full_tokens, hashtrick_full_tokens".format(MODE)
        )




Load regex_less_700k_freq_tokenizer.p
Num words: None


In [5]:
#Multi process functions
if not LOAD_TOKENIZER:

    # Minimum word count used to size the vocabulary when FULL_TOKENS is False;
    # also part of the merged tokenizer file name.
    N = 0
    # "{0:03}" is replaced by the id of the worker that wrote the partial tokenizer
    output_tokenizer_file_name = MODE+'_file_{0:03}_tokenizer.p'
    output_merged_tokenizer_file_name = MODE+("_N"+str(N) if not FULL_TOKENS else "")+"_tokenizer.p"

    def _save_partial_tokenizer(tokenizer, proc_id):
        """Pickle the tokenizer fitted by worker ``proc_id`` into the mp_to_merge directory."""
        file_name = output_tokenizer_file_name.format(proc_id)
        print("save:",file_name)

        path = os.path.join('/','backup','pubmed_tokenizers','mp_to_merge',file_name)
        # context manager: the file is flushed and closed before the worker goes on
        with open(path,"wb") as f:
            pickle.dump(tokenizer,f)

    if MODE == "hashtrick_full_tokens":

        def fitTokenizeJob(proc_id, articles, base_tokenizer):
            """Worker body: fit a new tokenizer on ``articles`` given as token ids.

            Each article is a sequence of ids of ``base_tokenizer``; the ids are
            mapped back to words through ``base_tokenizer.index_word`` before
            being fed to the new tokenizer. The fitted tokenizer is pickled to
            the mp_to_merge directory under a name derived from ``proc_id``.
            """
            print("Process",proc_id,"Started")

            index_word = base_tokenizer.index_word

            def to_words(article):
                # lazy id -> word mapping for one article
                return map(index_word.__getitem__, article)

            #ALL THREADS RUN THIS
            tk = create_tokenizer()

            #slow! but only need to run 1 time so...
            tk.fit_on_texts(map(to_words, articles))
            del articles

            _save_partial_tokenizer(tk, proc_id)
            del tk
            print("Process",proc_id,"ENDED")

        #initialization of the process
        def fitTokenizer_process_init(proc_id, articles, **kwargs):
            """Build (without starting) the worker process for one slice of articles.

            Raises:
                TypeError: if the ``base_tokenizer`` keyword argument is missing.
            """
            if "base_tokenizer" not in kwargs:
                raise TypeError("base_tokenizer must be used for the hashtrick tokenization")
            base_tokenizer = kwargs.pop("base_tokenizer")

            return Process(target=fitTokenizeJob, args=(proc_id, articles, base_tokenizer))

    else:

        #process main function
        def fitTokenizeJob(proc_id, articles):
            """Worker body: fit a new tokenizer on ``articles`` and pickle it.

            The output file name is derived from ``proc_id`` and written to the
            mp_to_merge directory.
            """
            print("Process",proc_id,"Started")

            #ALL THREADS RUN THIS
            tk = create_tokenizer()
            tk.fit_on_texts(articles)
            del articles

            _save_partial_tokenizer(tk, proc_id)
            del tk
            print("Process",proc_id,"ENDED")


        #initialization of the process
        def fitTokenizer_process_init(proc_id, articles, **kwargs):
            """Build (without starting) the worker process for one slice of articles.

            Extra keyword arguments are accepted and ignored.
            """
            return Process(target=fitTokenizeJob, args=(proc_id, articles,))
        
        

In [6]:
def multithread_loop(process_init, n_thread_per_file = [12,12,12,12,12,20],**kwargs):
    """Process every collection of ``pubmed_generator`` with a pool of workers.

    For the i-th collection yielded by the module-level ``pubmed_generator``,
    the texts are cut into ``n_thread_per_file[i]`` consecutive slices: the
    first slices all have ``len(texts) // n`` items and the last one also takes
    the remainder. One worker is created per slice, all workers are started,
    and the function waits for all of them before moving to the next collection.

    Args:
        process_init: callable ``(proc_id, texts_slice, **kwargs)`` returning an
            object with ``start()`` and ``join()`` (a ``multiprocessing.Process``
            in this notebook). ``proc_id`` is unique across collections.
        n_thread_per_file: number of workers for each collection, by position.
        **kwargs: forwarded unchanged to ``process_init``.

    Raises:
        IndexError: if the generator yields more collections than there are
            entries in ``n_thread_per_file``.
    """
    for i,texts in enumerate(pubmed_generator()):
        n_workers = n_thread_per_file[i]

        t_len = len(texts)
        chunk_size = t_len//n_workers

        # Slice boundaries: 0, s, 2s, ..., (n-1)s, t_len. Computing them directly
        # (instead of range(0, t_len, s)) also works when there are fewer texts
        # than workers (s == 0): the leading slices are then empty and the last
        # one receives every text.
        bounds = [j*chunk_size for j in range(n_workers)]
        bounds.append(t_len)

        # ids continue after the ones used for the previous collections
        first_id = sum(n_thread_per_file[:i])
        process = [
            process_init(first_id+j, texts[bounds[j]:bounds[j+1]], **kwargs)
            for j in range(n_workers)
        ]

        print("Start",n_workers,"working threads")
        for p in process:
            p.start()

        print("Wait",n_workers,"working threads")
        for p in process:
            p.join()
        gc.collect()


## Fit the tokenizer using multiple worker processes

In [7]:

#FIT TOKENIZER MULTITHREAD
# Fit one partial tokenizer per worker, then merge the partial statistics into a
# single tokenizer and save it.
if not LOAD_TOKENIZER:

    n_thread_per_file = [20]*6
    if MODE == "hashtrick_full_tokens":
        multithread_loop(fitTokenizer_process_init,n_thread_per_file,base_tokenizer=base_tk)
    else:
        multithread_loop(fitTokenizer_process_init,n_thread_per_file)

    gc.collect() ##some clean up

    ## Merge the files
    print("Start the MERGE!")
    merge_dir = "/backup/pubmed_tokenizers/mp_to_merge/"
    # Select exactly the partial files written by fitTokenizeJob for this MODE.
    # A plain substring test (MODE in name) would also pick up other modes whose
    # name contains MODE, e.g. "bllip" matches "bllip_stem_file_000_tokenizer.p".
    partial_prefix, partial_suffix = output_tokenizer_file_name.split("{0:03}")
    files = sorted(
        name for name in os.listdir(merge_dir)
        if name.startswith(partial_prefix) and name.endswith(partial_suffix)
    )
    print(files)

    tk = create_tokenizer()

    for file in files:
        print("load:",file,end="\r")
        # NOTE: pickle.load executes arbitrary code; only load files written by this notebook
        with open(merge_dir+file,"rb") as f:
            loaded_tk = pickle.load(f)

        #manual merge: add up the per-word statistics of every partial tokenizer
        for w,c in loaded_tk.word_counts.items():
            tk.word_counts[w] = tk.word_counts.get(w, 0) + c

        for w,c in loaded_tk.word_docs.items():
            tk.word_docs[w] = tk.word_docs.get(w, 0) + c

        tk.document_count += loaded_tk.document_count

    #CODE FROM KERAS TOKENIZER
    # Rebuild the index tables from the merged counts, most frequent word first.
    wcounts = sorted(tk.word_counts.items(), key=lambda x: x[1], reverse=True)
    # forcing the oov_token to index 1 if it exists
    sorted_voc = [] if tk.oov_token is None else [tk.oov_token]
    sorted_voc.extend(word for word, _ in wcounts)

    # note that index 0 is reserved, never assigned to an existing word
    tk.word_index = dict(zip(sorted_voc, range(1, len(sorted_voc) + 1)))

    tk.index_word = {c: w for w, c in tk.word_index.items()}

    for w, c in list(tk.word_docs.items()):
        tk.index_docs[tk.word_index[w]] = c


    print("Save:",output_merged_tokenizer_file_name)
    if not FULL_TOKENS:
        # vocabulary = words seen at least N times (+1 for the reserved index 0)
        max_vocabulary = sum(1 for c in tk.word_counts.values() if c >= N)
        tk.num_words = max_vocabulary + 1
    print("Voc size",tk.num_words)

    with open("/backup/pubmed_tokenizers/"+output_merged_tokenizer_file_name,"wb") as f:
        pickle.dump(tk,f)

## Tokenize using multiple worker processes

In [8]:
# Name template of the tokenized output files; "{0:03}" is later filled in with
# the worker id (partial files) or the group index (merged files). The suffix
# records whether only titles or titles plus abstracts were tokenized.
output_file_name = MODE + (
    '_file_{0:03}_title_pubmed.p' if ONLY_TITLE else '_file_{0:03}_title_abs_pubmed.p'
)

In [9]:
# True: the partial files written by the workers are grouped into a few larger
# files afterwards. The bag-of-trigrams job writes .npz matrices and disables it.
MERGE_FLAG = True

if MODE == "bag_of_trigrams":
    MERGE_FLAG = False
    def tokenizeJob(proc_id, tokenizer, texts):
        """Worker body: build a document x trigram matrix for ``texts`` and save it.

        Each text is split with ``regex_alfanum_tokenizer``; every resulting
        word is turned into trigram indices by ``tokenizer`` and the matching
        columns of the document's row are incremented. The matrix is stored as
        a sparse ``.npz`` file named after ``proc_id`` in mp_to_merge.
        """
        print("Process",proc_id,"Started")

        TRIGRAM_VOC = len(tokenizer.word_index) + 1

        #ALL THREADS RUN THIS
        # NOTE(review): dense int8 matrix - memory grows with len(texts) * vocabulary,
        # and any count above 127 overflows int8.
        _matrix = np.zeros((len(texts),TRIGRAM_VOC), dtype=np.int8)

        for i,text in enumerate(texts):
            bag_of_word = regex_alfanum_tokenizer(text)
            for j in tokenizer.texts_to_sequences(bag_of_word):
                # NOTE(review): assuming j is a list of indices, `+=` with a list
                # index adds 1 only once per distinct column, so a trigram repeated
                # inside one word counts once. Use np.add.at(_matrix[i], j, 1) if
                # true counts are wanted - confirm intent.
                _matrix[i][j] += 1

        file_name = output_file_name.format(proc_id)+".npz"
        print("save:",file_name)
        sparse.save_npz(os.path.join('/','backup','pubmed_archive_tokenized','mp_to_merge',file_name), sparse.coo_matrix(_matrix))
        del tokenizer
        del texts
        print("Process",proc_id,"ENDED")

else:
    def tokenizeJob(proc_id, tokenizer, articles):
        """Worker body: convert ``articles`` to token-id sequences and pickle them.

        The result of ``tokenizer.texts_to_sequences(articles)`` is written to
        the mp_to_merge directory under a name derived from ``proc_id``.
        """
        print("Process",proc_id,"Started")

        #ALL THREADS RUN THIS
        tokenized = tokenizer.texts_to_sequences(articles)

        file_name = output_file_name.format(proc_id)
        print("save:",file_name)

        path = os.path.join('/','backup','pubmed_archive_tokenized','mp_to_merge',file_name)
        # context manager: the file is flushed and closed before the worker goes on
        with open(path,"wb") as f:
            pickle.dump(tokenized,f)
        del tokenized
        del articles
        print("Process",proc_id,"ENDED")

#save some memory
# Drop the fitting statistics and the reverse index from the tokenizer before it
# is handed to the worker processes.
for _attr in ("index_docs", "index_word", "word_counts", "word_docs"):
    try:
        delattr(tk, _attr)
    except AttributeError:
        # already removed (the cell was run before): nothing left to free
        pass

print(gc.collect())

def tokenize_process_init(proc_id, articles):
    """Build (without starting) the worker process that tokenizes ``articles``.

    The worker runs the module-level ``tokenizeJob`` with ``proc_id``, the
    module-level tokenizer ``tk`` and the given slice of articles.
    """
    job_args = (proc_id, tk, articles)
    worker = Process(args=job_args, target=tokenizeJob)
    return worker


# Tokenize every collection with the worker processes (one partial file per worker).
multithread_loop(tokenize_process_init)

# Grouping the resulting files
gc.collect() ##some clean up
if MERGE_FLAG:
    merge_dir = "/backup/pubmed_archive_tokenized/mp_to_merge/"
    # Select exactly the partial files written by tokenizeJob for this MODE and
    # text selection. A plain substring test (MODE in name) would also pick up
    # files of other modes or of the other title/abstract variant.
    name_prefix, name_suffix = output_file_name.split("{0:03}")
    files = sorted(
        name for name in os.listdir(merge_dir)
        if name.startswith(name_prefix) and name.endswith(name_suffix)
    )
    print(files)
    len_files = len(files)
    n_output_files = 5
    iter_files = len_files//n_output_files

    # Group boundaries: 0, s, 2s, ..., with the last group absorbing the
    # remainder. Computed directly so that fewer than n_output_files partial
    # files (s == 0) no longer fails with a zero range step.
    batch = [k*iter_files for k in range(n_output_files)]
    batch.append(len_files)

    for i in range(n_output_files):
        tokenized = []
        for file in files[batch[i]:batch[i+1]]:
            print("load:",file,end="\r")
            # NOTE: pickle.load executes arbitrary code; only load files written by this notebook
            with open(merge_dir+file,"rb") as f:
                tokenized.extend(pickle.load(f))

        file_name = output_file_name.format(i)
        print("save:",file_name)

        with open(os.path.join('/','backup','pubmed_archive_tokenized',file_name),"wb") as f:
            pickle.dump(tokenized,f)

0
Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
Start 12 working threads
Process 0 Started
Process 1 Started
Process 2 Started
Process 3 Started
Process 4 Started
Process 5 Started
Process 6 Started
Process 7 Started
Process 8 Started
Process 9 Started
Process 10 Started
Process 11 Started
Wait 12 working threads
save: regex_less_700k_freq_file_000_title_abs_pubmed.p
Process 0 ENDED
save: regex_less_700k_freq_file_008_title_abs_pubmed.p
save: regex_less_700k_freq_file_002_title_abs_pubmed.p
save: regex_less_700k_freq_file_010_title_abs_pubmed.p
save: regex_less_700k_freq_file_007_title_abs_pubmed.p
Process 8 ENDED
save: regex_less_700k_freq_file_011_title_abs_pubmed.p
save: regex_less_700k_freq_file_009_title_abs_pubmed.p
Process 2 ENDED
save: regex_less_700k_freq_file_001_title_abs_pubmed.p
Process 10 ENDED
Process 7 ENDED
save: regex_less_700k_freq_file_006_title_abs_pubmed.p
save: regex_less_700k_freq_file_004_title_abs_pubmed.p
Process 11 ENDED
save: 

save: regex_less_700k_freq_file_000_title_abs_pubmed.p
save: regex_less_700k_freq_file_001_title_abs_pubmed.p
save: regex_less_700k_freq_file_002_title_abs_pubmed.p
save: regex_less_700k_freq_file_003_title_abs_pubmed.p
save: regex_less_700k_freq_file_004_title_abs_pubmed.p


In [10]:
%%javascript
// Delete this notebook's session through the front-end `Jupyter` object
// (presumably to release the kernel's memory after the long run - confirm).
Jupyter.notebook.session.delete();

<IPython.core.display.Javascript object>

## Verify the tokenized articles
    
    


In [2]:
# Verify the tokenized articles: generator over the collections stored in the
# bllip_N10 tokenized archive (selected by explicit path; an earlier variant of
# this call passed mode="bllip" instead).
tokenized_pubmed_generator = ph.create_tokenized_pubmed_collection_generator(
    path="/backup/pubmed_archive_tokenized/bllip_N10_title_abs.tar.gz",
)

Open the pubmed tokenized tar.gz
Creating generator


In [3]:
# Walk every tokenized document, numbering the documents globally, and remember
# the positions of the documents that contain no tokens at all.
index = 0
index_zero_len = []
for collection in tokenized_pubmed_generator():
    base = index  # global position of the first document of this collection
    for offset, doc in enumerate(collection):
        if len(doc) == 0:
            index_zero_len.append(base + offset)
        index = base + offset + 1
        

Open the file: 0
Returning: 3690895 articles
Force garbage collector 0
Open the file: 1
Returning: 3643138 articles
Force garbage collector 0
Open the file: 2
Returning: 3790281 articles
Force garbage collector 0
Open the file: 3
Returning: 3838006 articles
Force garbage collector 0
Open the file: 4
Returning: 3862035 articles
Force garbage collector 0


In [4]:
# Display the document counter left by the counting loop above.
index

18824355

In [12]:
# List the partial tokenized files of the "bllip" family, in name order.
# The test is a substring match, so names such as "bllip_N10_file_..." qualify too.
MODE = "bllip"
files = sorted(
    name
    for name in os.listdir("/backup/pubmed_archive_tokenized/mp_to_merge/")
    if MODE in name
)

In [16]:
# Boundaries that split `files` into n_output_files consecutive groups:
# group k is files[batch[k]:batch[k+1]].
len_files = len(files)
n_output_files = 5
iter_files = len_files//n_output_files

# 0, s, 2s, ..., (n-1)s, len_files - the last group absorbs the remainder.
# Computed directly: range(0, len_files, s) fails when s == 0 (fewer files than
# groups) and leaves extra, unused boundaries when len_files is not a multiple of s.
batch = [k*iter_files for k in range(n_output_files)]
batch.append(len_files)

batch

[0, 4, 8, 12, 16, 19]

In [10]:
# Total number of tokenized documents stored across all the partial files.
count = 0
for name in files:
    print(name)
    partial_path = os.path.join("/backup/pubmed_archive_tokenized/mp_to_merge/", name)
    with open(partial_path, "rb") as f:
        partial = pickle.load(f)
    count = count + len(partial)

bllip_N10_file_00_title_abs_pubmed.p
bllip_N10_file_01_title_abs_pubmed.p
bllip_N10_file_02_title_abs_pubmed.p
bllip_N10_file_03_title_abs_pubmed.p
bllip_N10_file_04_title_abs_pubmed.p
bllip_N10_file_05_title_abs_pubmed.p
bllip_N10_file_06_title_abs_pubmed.p
bllip_N10_file_07_title_abs_pubmed.p
bllip_N10_file_08_title_abs_pubmed.p
bllip_N10_file_09_title_abs_pubmed.p
bllip_N10_file_10_title_abs_pubmed.p
bllip_N10_file_11_title_abs_pubmed.p
bllip_N10_file_12_title_abs_pubmed.p
bllip_N10_file_13_title_abs_pubmed.p
bllip_N10_file_14_title_abs_pubmed.p
bllip_N10_file_15_title_abs_pubmed.p
bllip_N10_file_16_title_abs_pubmed.p
bllip_N10_file_17_title_abs_pubmed.p
bllip_N10_file_18_title_abs_pubmed.p
bllip_N10_file_19_title_abs_pubmed.p


In [11]:
# Display the number of documents found in the partial files.
count

18824349

### Create an index:pmid document mapping

In [2]:
# Build a bidirectional PMID <-> global document position mapping.
# NOTE: this cell rebinds `map_function` and `pubmed_generator` defined earlier.
from bidict import bidict


def map_function(x):
    """Keep only the PMID of article record ``x``."""
    return x["pmid"]


pubmed_generator = ph.create_pubmed_collection_generator(
    map_function,
    path="/backup/pubmed_archive_json/pubmed_ready.tar.gz",
)

# bidirectional dictionary: doc_pmid_index[pmid] -> position, .inverse[position] -> pmid
doc_pmid_index = bidict()
index = 0
for collection in pubmed_generator():
    for pmid in collection:
        # (a disabled variant collected (previous, new) pairs of repeated PMIDs
        # in an `equal_pmid` list before storing the position)
        doc_pmid_index[pmid] = index
        index = index + 1



Open the pubmed tar.gz
Creating generator
Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
Force garbage collector 0
Open the file: pubmed_ready_02776363_to_05519968
Returning: 2743606 articles
Force garbage collector 0
Open the file: pubmed_ready_05519969_to_08241071
Returning: 2721103 articles
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815
Returning: 2872502 articles
Force garbage collector 0
Open the file: pubmed_ready_13996816_to_18824354
Returning: 4827539 articles
Force garbage collector 0


In [3]:
# Reverse lookup: PMID stored for global document position 18821518.
doc_pmid_index.inverse[18821518]

'20978152'

18824355

In [None]:
## REMOVE DUPLICATED ARTICLES

In [5]:
import json

# Re-read the full article records and rewrite each collection without the
# duplicated PMIDs, one JSON file per collection.
pubmed_generator = ph.create_pubmed_collection_generator()

index = 0      # global position of the current article in the original archive
last_len = 0   # number of articles written so far (first index of the next file)
articles = []
for collection in pubmed_generator():
    for doc in collection:
        # keep the article only if its position is the one recorded for its PMID
        if doc_pmid_index[doc['pmid']] == index :
            articles.append(doc)
        index+=1

    # file name carries the first and last global index of the kept articles
    file_name = "/backup/pubmed_archive_json/pubmed_ready_{0:08}_to_{1:08}".format(last_len,(last_len+len(articles))-1)
    print("Save file",file_name,":",end="")
    # context manager: the JSON file is flushed and closed before the next collection is read
    with open(file_name,"w") as f:
        json.dump(articles,f)
    last_len = last_len+len(articles)
    del articles
    articles = []
    

Open the pubmed tar.gz
Creating generator
Open the file: 0
Returning: 3000000 articles
Save file /backup/pubmed_archive_json/pubmed_ready_00000000_to_02776362 :Force garbage collector 32
Open the file: 1
Returning: 3000000 articles
Save file /backup/pubmed_archive_json/pubmed_ready_02776363_to_05519968 :Force garbage collector 32
Open the file: 2
Returning: 3000000 articles
Save file /backup/pubmed_archive_json/pubmed_ready_05519969_to_08241071 :Force garbage collector 32
Open the file: 3
Returning: 3000000 articles
Save file /backup/pubmed_archive_json/pubmed_ready_08241072_to_11124313 :Force garbage collector 32
Open the file: 4
Returning: 3000000 articles
Save file /backup/pubmed_archive_json/pubmed_ready_11124314_to_13996815 :Force garbage collector 32
Open the file: 5
Returning: 4885278 articles
Save file /backup/pubmed_archive_json/pubmed_ready_13996816_to_18824354 :Force garbage collector 32


In [4]:
## count articles
# Sum the sizes of all the collections yielded by a fresh generator.
g = ph.create_pubmed_collection_generator()
total_docs = sum(map(len, g()))

Open the pubmed tar.gz
Creating generator
Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
Force garbage collector 0
Open the file: pubmed_ready_02776363_to_05519968
Returning: 2743606 articles
Force garbage collector 0
Open the file: pubmed_ready_05519969_to_08241071
Returning: 2721103 articles
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815
Returning: 2872502 articles
Force garbage collector 0
Open the file: pubmed_ready_13996816_to_18824354
Returning: 4827539 articles
Force garbage collector 0


In [6]:
# g() starts a new pass over the archive, so this loads only the first collection.
docs= next(g())

Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles


In [9]:
# Inspect one raw article record of the first collection.
docs[853213]

{'abstract': 'The O protein of bacteriophage lambda is required for initiation of DNA replication at the lambda replicative origin designated ori lambda. The binding sites for O protein are four direct repeats, each of which is an inverted repeat. By means of electron microscopy, we have found that phage lambda O protein utilizes these multiple binding sites to form a specific nucleoprotein structure in which the origin DNA is inferred to be folded or wound. The phage lambda O and P proteins and host DnaB protein interact at ori lambda to generate a larger structure than that formed by O protein alone; P and DnaB proteins fail to form any observable complex when O protein is excluded from the reaction mixture. We conclude that the specialized nucleoprotein structure formed by phage lambda O protein and ori lambda provides for localized initiation of DNA replication by serving as the foundation for the assembly of the initial priming structure. Specialized nucleoprotein structures may b

In [None]:
# Load one merged file of tokenized articles (bllip_stem variant, file 04).
# NOTE: pickle.load executes arbitrary code; only load files written by this notebook.
tokenized_path = "/backup/pubmed_archive_tokenized/bllip_stem_file_04_title_abs_pubmed.p"
with open(tokenized_path, "rb") as f:
    tok_articles = pickle.load(f)

In [None]:
# Map the token ids of one tokenized article back to words.
# NOTE(review): needs a tokenizer `tk` that still has `index_word` (the
# tokenization cell deletes it) and that matches the loaded file - confirm.
list(map(tk.index_word.__getitem__, tok_articles[853213]))

In [None]:
%%javascript
// Delete this notebook's session through the front-end `Jupyter` object
// (presumably to release the kernel's memory - confirm).
Jupyter.notebook.session.delete();