In [1]:
import os
# NOTE(review): the working directory is changed *before* the project import
# below, presumably so that `pubmed_data` resolves from the repository root.
# Confirm before reordering these lines or replacing the absolute path.
os.chdir("/home/tiagoalmeida/bioASQ-taskb/")
import sentencepiece as spm
import pickle
from pubmed_data import pubmed_helper as ph

In [2]:
# Vocabulary size passed to the SentencePiece trainer.
VOCAB_SIZE = 50000

# Name of the BPE model; also used as the prefix of the tokenized output files.
MODEL_NAME = "model_bpe_50k"


def article_map(article):
    """Return the text of one article as a single lower-cased string.

    Args:
        article: mapping with string values under the keys "title" and
            "abstract".

    Returns:
        The title and the abstract joined by one space, lower-cased.
    """
    return (article["title"] + " " + article["abstract"]).lower()

## Byte Pair Encoding - Pubmed

#### Index
 - [Concatenate all PubMed archives into a single txt file](#txt)
 - [Create BPE encoder](#bpe)
 - [Load BPE encoder](#bpe_encoder)
 - [Encode pubmed](#pubmed_encode)

<a id='txt'></a>
### Write all PubMed titles and abstracts to a single txt file

In [3]:
# Dump the PubMed collection to one text file that is later used as the
# training input of the BPE model.
# The generator yields the collection batch by batch; every article is written
# straight to disk (followed by a newline) instead of being kept in memory.
articles_generator = ph.create_pubmed_collection_generator()
articles = []  # never filled in this cell; the articles are streamed to the file

# `with` guarantees the file is flushed and closed even when reading or
# mapping an article raises half-way through the collection.
with open("/backup/pubmed_archive/txt/archive.txt","w",encoding = 'utf-8') as txt_file:
    for docs in articles_generator():
        for doc in docs:
            txt_file.write(article_map(doc)+"\n")

Open /backup/pubmed_archive_json/pubmed_ready.tar.gz
Creating generator
Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
Force garbage collector 0
Open the file: pubmed_ready_02776363_to_05519968
Returning: 2743606 articles
Force garbage collector 0
Open the file: pubmed_ready_05519969_to_08241071
Returning: 2721103 articles
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815
Returning: 2872502 articles
Force garbage collector 0
Open the file: pubmed_ready_13996816_to_18824354
Returning: 4827539 articles
Force garbage collector 0


<a id='bpe'></a>
### Create BPE encoder

In [4]:


def create_bpe_model(input_txt, model_prefix, vocab_size = 50000, model_type="bpe", user_symbols=None):
    """Train a SentencePiece model on a text file.

    The model files are written under /backup/bpe_model using `model_prefix`
    as the file-name prefix.

    Args:
        input_txt: path of the training text file.
        model_prefix: file-name prefix of the model, relative to
            /backup/bpe_model.
        vocab_size: value forwarded as --vocab_size.
        model_type: value forwarded as --model_type.
        user_symbols: optional user-defined symbols, either as one
            comma-separated string (e.g. "<pad>,<$>") or as a list/tuple of
            strings. When None the option is not passed to the trainer.
    """
    output_dir = "/backup/bpe_model"
    # Make sure the destination exists before the (long) training run starts,
    # so a missing directory is not what makes the run fail.
    os.makedirs(output_dir, exist_ok=True)
    model_prefix = os.path.join(output_dir, model_prefix)

    # Accept a sequence of symbols as well as the original comma-separated string.
    if user_symbols is not None and not isinstance(user_symbols, str):
        user_symbols = ",".join(user_symbols)

    command_str = '--input={} --model_prefix={} --vocab_size={} --model_type={}'.format(input_txt, model_prefix, vocab_size, model_type)
    if user_symbols is not None:
        command_str = command_str+" --user_defined_symbols="+user_symbols
    print("command:",command_str)

    spm.SentencePieceTrainer.train(command_str)

# Train the PubMed BPE model, passing "<pad>" and "<$>" as user-defined symbols.
create_bpe_model(
    input_txt="/backup/pubmed_archive/txt/archive.txt",
    model_prefix=MODEL_NAME,
    vocab_size=VOCAB_SIZE,
    user_symbols="<pad>,<$>",
)

command: --input=/backup/pubmed_archive/txt/archive.txt --model_prefix=/backup/bpe_model/model_bpe_50k --vocab_size=50000 --model_type=bpe


<a id='bpe_encoder'></a>
### Load and Test BPE encoder

In [3]:
# Lower-cased sample question used by the encode/decode checks below.
str_test = "How could iPSCs be used for the treatment of diabetes?".lower()

# Load the model named MODEL_NAME through the project helper.
bpe_model = ph.load_bpe_model(MODEL_NAME)

In [4]:
# Show how the sample question is split into subword pieces.
print(bpe_model.encode_as_pieces(str_test))

['▁how', '▁could', '▁ipscs', '▁be', '▁used', '▁for', '▁the', '▁treatment', '▁of', '▁diabetes', '?']


In [9]:
# Same sample question, encoded as vocabulary ids instead of pieces.
bpe_model.encode_as_ids(str_test)

[529, 933, 31168, 121, 491, 78, 14, 362, 23, 1841, 0]

In [6]:
# Round-trip check: decode the pieces of the sample question back to text.
# The pieces are recomputed from `str_test` rather than pasted from an earlier
# output, so the check stays valid when the model is retrained.
bpe_model.decode_pieces(bpe_model.encode_as_pieces(str_test))

'how could ipscs be used for the treatment of diabetes?'

In [8]:
# Round-trip check on ids: recomputed from `str_test` so the ids always belong
# to the currently loaded model instead of a previously pasted output.
bpe_model.decode_ids(bpe_model.encode_as_ids(str_test))

'how could ipscs be used for the treatment of diabetes ⁇ '

In [8]:
# Lookup table from every vocabulary id to its subword piece.
id_subword = dict(
    (piece_id, bpe_model.id_to_piece(piece_id))
    for piece_id in range(len(bpe_model))
)


In [9]:
# Preview only the first entries: displaying the whole vocabulary would put
# one output line per id into the notebook.
dict(list(id_subword.items())[:100])

{0: '<unk>',
 1: '<s>',
 2: '</s>',
 3: '<tit_sep>',
 4: '▁t',
 5: 'in',
 6: '▁a',
 7: 're',
 8: 'on',
 9: 'ti',
 10: 'he',
 11: 'en',
 12: '▁o',
 13: '▁c',
 14: '▁s',
 15: '▁the',
 16: 'er',
 17: '▁p',
 18: 'al',
 19: 'ed',
 20: '▁in',
 21: '▁an',
 22: 'at',
 23: '▁of',
 24: '▁w',
 25: 'es',
 26: 'or',
 27: '▁m',
 28: 'ro',
 29: '▁d',
 30: 'an',
 31: 'is',
 32: 'tion',
 33: 'it',
 34: '▁f',
 35: '▁and',
 36: 'as',
 37: 'ic',
 38: '▁b',
 39: 'ar',
 40: '▁re',
 41: 'ing',
 42: 'ent',
 43: '▁e',
 44: 'le',
 45: 'ation',
 46: '▁to',
 47: 've',
 48: 'ec',
 49: '▁h',
 50: 'ul',
 51: 'us',
 52: '▁n',
 53: 'om',
 54: 'ac',
 55: 'ly',
 56: '▁(',
 57: '▁th',
 58: 'os',
 59: 'et',
 60: 'ol',
 61: '▁l',
 62: 'id',
 63: 'ur',
 64: '▁g',
 65: 'im',
 66: 'ith',
 67: 'un',
 68: '▁with',
 69: '▁we',
 70: 'ter',
 71: 'el',
 72: '▁st',
 73: 'ati',
 74: '▁con',
 75: 'ig',
 76: 'ce',
 77: '▁pro',
 78: 'ra',
 79: '▁for',
 80: 'il',
 81: 'res',
 82: 'ot',
 83: 'uc',
 84: 'od',
 85: '▁v',
 86: 'if',
 87: 'ts

<a id='pubmed_encode'></a>
### Run the BPE encoder on the PubMed data

In [10]:
# Encode the whole PubMed collection with the BPE model and pickle the result,
# one output file per batch yielded by the generator.
if 'articles_generator' not in locals(): #create the var
    articles_generator = ph.create_pubmed_collection_generator()

# Create the destination once, up front: otherwise a missing directory is only
# noticed at open() time, after a complete batch has already been encoded.
tokenized_dir = os.path.join('/','backup','pubmed_archive_tokenized')
os.makedirs(tokenized_dir, exist_ok=True)

for i,articles in enumerate(articles_generator()):
    encoded_articles = []
    for j,article in enumerate(map(article_map, articles)):
        encoded_articles.append(bpe_model.encode_as_ids(article))
        if j%10000==0:
            # "\r" keeps the progress counter on a single output line
            print("Article:",j,end="\r")

    ##save
    # file name is the model name plus the zero-padded batch index
    file_name = MODEL_NAME+"_file_{0:03}_pubmed.p".format(i)
    print("save:",file_name)
    print(len(encoded_articles))
    with open(os.path.join(tokenized_dir,file_name),"wb") as f:
        pickle.dump(encoded_articles,f)
        
    

Open /backup/pubmed_archive_json/pubmed_ready.tar.gz
Creating generator
Open the file: pubmed_ready_00000000_to_02776362
Returning: 2776363 articles
save: model_bpe_50k_file_000_pubmed.p
2776363
Force garbage collector 0
Open the file: pubmed_ready_02776363_to_05519968
Returning: 2743606 articles
save: model_bpe_50k_file_001_pubmed.p
2743606
Force garbage collector 0
Open the file: pubmed_ready_05519969_to_08241071
Returning: 2721103 articles
save: model_bpe_50k_file_002_pubmed.p
2721103
Force garbage collector 0
Open the file: pubmed_ready_08241072_to_11124313
Returning: 2883242 articles
save: model_bpe_50k_file_003_pubmed.p
2883242
Force garbage collector 0
Open the file: pubmed_ready_11124314_to_13996815
Returning: 2872502 articles
save: model_bpe_50k_file_004_pubmed.p
2872502
Force garbage collector 0
Open the file: pubmed_ready_13996816_to_18824354
Returning: 4827539 articles
save: model_bpe_50k_file_005_pubmed.p
4827539
Force garbage collector 0


In [30]:
# Inspect the subword pieces of one article.
# NOTE(review): `articles` is the loop variable left over from the encoding
# loop, so when the cells are run in order this is the first article of the
# last batch the generator yielded. The cell fails on a fresh kernel if that
# loop has not been run.
bpe_model.encode_as_pieces(article_map(articles[0]))

['▁loss',
 '▁of',
 '▁density',
 '-',
 'dependent',
 '▁growth',
 '▁inhibition',
 '▁and',
 '▁dissociation',
 '▁of',
 '▁alpha',
 '-',
 'catenin',
 '▁from',
 '▁e',
 '-',
 'cadherin',
 '.<',
 'tit',
 '_',
 'sep',
 '>',
 'normal',
 '▁human',
 '▁breast',
 '▁epithelial',
 '▁(',
 'hbe',
 ')',
 '▁cells',
 '▁at',
 '▁early',
 '▁(9',
 'th',
 ')',
 '▁passage',
 '▁ceased',
 '▁growth',
 '▁and',
 '▁formed',
 '▁a',
 '▁monolayer',
 '▁when',
 '▁they',
 '▁reached',
 '▁confluence',
 '.',
 '▁immunostaining',
 '▁and',
 '▁western',
 '▁blotting',
 '▁revealed',
 '▁that',
 '▁alpha',
 '-',
 '▁and',
 '▁beta',
 '-',
 'caten',
 'ins',
 '▁colocalized',
 '▁and',
 '▁coprecipitated',
 '▁with',
 '▁e',
 '-',
 'cadherin',
 ',',
 '▁suggesting',
 '▁a',
 '▁complex',
 '▁formation',
 '▁of',
 '▁e',
 '-',
 'cadherin',
 '▁with',
 '▁alpha',
 '-',
 '▁and',
 '▁beta',
 '-',
 'caten',
 'ins',
 '▁in',
 '▁early',
 '▁passage',
 '▁cells',
 '.',
 '▁in',
 '▁contrast',
 ',',
 '▁hbe',
 '▁cells',
 '▁at',
 '▁late',
 '▁(12',
 '-13',
 'th',
 ')',
 