In [1]:
import pubmed_parser as pp
import os
import gc
import sys
import pickle


## Pubmed Rep

In [2]:
pubmed_path = os.path.join("/","backup","pubmed_archive","baseline_2018")
pubmed_store_path = os.path.join("/","backup","pubmed_archive_json")

In [3]:
from lxml import etree

def dir_tree_run(action, base_dir=pubmed_path):
    """
    Apply funcition "action" to the individual files from tree directory
    """
    _temp_f_name = ""
    for f_name in os.listdir(base_dir):
        _temp_f_name = os.path.join(base_dir,f_name)
        if os.path.isdir(_temp_f_name):
            dir_tree_run(action,_temp_f_name)
        else:
            action(_temp_f_name)


def open_pubmed_file(file_name, articles, error_files):
    try:
        articles.extend(pp.parse_medline_xml(file_name, year_info_only=False, nlm_category=False))
    except etree.XMLSyntaxError:
        error_files.append(file_name)
        print("Error on File " + file_name)
    


In [4]:
#example /backup/pubmed_archive/2011/11/pubs_2011-11_43.xml.gz
articles_tar_names = []
dir_tree_run(lambda x:articles_tar_names.append(x))
articles_tar_names = sorted(articles_tar_names)

In [5]:
## Multi process read of all the files!
from multiprocessing import Process

def open_xml(proc_id, xml_files):
    import pubmed_parser as pp
    
    print("[Process-{}] Started".format(proc_id))
    articles = []
    for file_name in xml_files:
        print(proc_id, file_name)
        try:
            articles.extend(pp.parse_medline_xml(file_name, year_info_only=False, nlm_category=False))
        except etree.XMLSyntaxError:
            error_files.append(file_name)
            print("Error on File " + file_name)
            
    articles_filter = filter(lambda x: (x["abstract"] is not None and len(x["abstract"])>0), articles)

    articles_mapped = list(map(lambda x:{"id":x["pmid"],"title":x["title"],"abstract":x["abstract"]},articles_filter))

    file_name = "/backup/pubmed_archive_json/pubmed_2018_{0:03}.p".format(proc_id)
    print("[Process-{}]: Store {}".format(proc_id, file_name))

    with open(file_name, "wb") as f:
        pickle.dump(articles, f)

    del articles
    print("[Process-{}] Ended".format(proc_id))

process = []
size = len(articles_tar_names)
itter = 50
for e,i in enumerate(range(0, size, itter)):
    process.append(Process(target=open_xml,args=(e, articles_tar_names[i:i+itter])))

print("[MULTIPROCESS LOOP] Starting", len(process), "process")
for p in process:
    p.start()

print("[MULTIPROCESS LOOP] Wait", len(process), "process")
for p in process:
    p.join()

[MULTIPROCESS LOOP] Starting 19 process
[Process-0] Started
0 /backup/pubmed_archive/baseline_2018/pubmed18n0001.xml.gz
[Process-1] Started
1 /backup/pubmed_archive/baseline_2018/pubmed18n0051.xml.gz
[Process-2] Started
2 /backup/pubmed_archive/baseline_2018/pubmed18n0101.xml.gz
[Process-3] Started
3 /backup/pubmed_archive/baseline_2018/pubmed18n0151.xml.gz
[Process-4] Started
4 /backup/pubmed_archive/baseline_2018/pubmed18n0201.xml.gz
[Process-5] Started
5 /backup/pubmed_archive/baseline_2018/pubmed18n0251.xml.gz
[Process-6] Started
6 /backup/pubmed_archive/baseline_2018/pubmed18n0301.xml.gz
[Process-7] Started
7 /backup/pubmed_archive/baseline_2018/pubmed18n0351.xml.gz
[Process-8] Started
8 /backup/pubmed_archive/baseline_2018/pubmed18n0401.xml.gz
[Process-9] Started
9 /backup/pubmed_archive/baseline_2018/pubmed18n0451.xml.gz
10 /backup/pubmed_archive/baseline_2018/pubmed18n0501.xml.gz
11 /backup/pubmed_archive/baseline_2018/pubmed18n0551.xml.gz
[Process-10] Started
[Process-11] Star

7 /backup/pubmed_archive/baseline_2018/pubmed18n0357.xml.gz
9 /backup/pubmed_archive/baseline_2018/pubmed18n0459.xml.gz
6 /backup/pubmed_archive/baseline_2018/pubmed18n0308.xml.gz
5 /backup/pubmed_archive/baseline_2018/pubmed18n0257.xml.gz
0 /backup/pubmed_archive/baseline_2018/pubmed18n0008.xml.gz
12 /backup/pubmed_archive/baseline_2018/pubmed18n0607.xml.gz
16 /backup/pubmed_archive/baseline_2018/pubmed18n0806.xml.gz
1 /backup/pubmed_archive/baseline_2018/pubmed18n0058.xml.gz
4 /backup/pubmed_archive/baseline_2018/pubmed18n0209.xml.gz
11 /backup/pubmed_archive/baseline_2018/pubmed18n0558.xml.gz
15 /backup/pubmed_archive/baseline_2018/pubmed18n0757.xml.gz
3 /backup/pubmed_archive/baseline_2018/pubmed18n0159.xml.gz
10 /backup/pubmed_archive/baseline_2018/pubmed18n0508.xml.gz
9 /backup/pubmed_archive/baseline_2018/pubmed18n0460.xml.gz
8 /backup/pubmed_archive/baseline_2018/pubmed18n0408.xml.gz
2 /backup/pubmed_archive/baseline_2018/pubmed18n0109.xml.gz
13 /backup/pubmed_archive/baseline_

0 /backup/pubmed_archive/baseline_2018/pubmed18n0017.xml.gz
17 /backup/pubmed_archive/baseline_2018/pubmed18n0862.xml.gz
1 /backup/pubmed_archive/baseline_2018/pubmed18n0065.xml.gz
10 /backup/pubmed_archive/baseline_2018/pubmed18n0514.xml.gz
5 /backup/pubmed_archive/baseline_2018/pubmed18n0265.xml.gz
4 /backup/pubmed_archive/baseline_2018/pubmed18n0217.xml.gz
3 /backup/pubmed_archive/baseline_2018/pubmed18n0167.xml.gz
18 /backup/pubmed_archive/baseline_2018/pubmed18n0914.xml.gz
8 /backup/pubmed_archive/baseline_2018/pubmed18n0414.xml.gz
16 /backup/pubmed_archive/baseline_2018/pubmed18n0812.xml.gz
0 /backup/pubmed_archive/baseline_2018/pubmed18n0018.xml.gz
13 /backup/pubmed_archive/baseline_2018/pubmed18n0664.xml.gz
2 /backup/pubmed_archive/baseline_2018/pubmed18n0117.xml.gz
6 /backup/pubmed_archive/baseline_2018/pubmed18n0316.xml.gz
7 /backup/pubmed_archive/baseline_2018/pubmed18n0365.xml.gz
15 /backup/pubmed_archive/baseline_2018/pubmed18n0763.xml.gz
14 /backup/pubmed_archive/baseline

7 /backup/pubmed_archive/baseline_2018/pubmed18n0371.xml.gz
5 /backup/pubmed_archive/baseline_2018/pubmed18n0272.xml.gz
9 /backup/pubmed_archive/baseline_2018/pubmed18n0476.xml.gz
11 /backup/pubmed_archive/baseline_2018/pubmed18n0571.xml.gz
13 /backup/pubmed_archive/baseline_2018/pubmed18n0671.xml.gz
8 /backup/pubmed_archive/baseline_2018/pubmed18n0421.xml.gz
15 /backup/pubmed_archive/baseline_2018/pubmed18n0768.xml.gz
2 /backup/pubmed_archive/baseline_2018/pubmed18n0125.xml.gz
10 /backup/pubmed_archive/baseline_2018/pubmed18n0521.xml.gz
3 /backup/pubmed_archive/baseline_2018/pubmed18n0176.xml.gz
0 /backup/pubmed_archive/baseline_2018/pubmed18n0027.xml.gz
17 /backup/pubmed_archive/baseline_2018/pubmed18n0868.xml.gz
14 /backup/pubmed_archive/baseline_2018/pubmed18n0720.xml.gz
16 /backup/pubmed_archive/baseline_2018/pubmed18n0818.xml.gz
11 /backup/pubmed_archive/baseline_2018/pubmed18n0572.xml.gz
8 /backup/pubmed_archive/baseline_2018/pubmed18n0422.xml.gz
12 /backup/pubmed_archive/baseli

13 /backup/pubmed_archive/baseline_2018/pubmed18n0678.xml.gz
10 /backup/pubmed_archive/baseline_2018/pubmed18n0527.xml.gz
1 /backup/pubmed_archive/baseline_2018/pubmed18n0080.xml.gz
0 /backup/pubmed_archive/baseline_2018/pubmed18n0035.xml.gz
8 /backup/pubmed_archive/baseline_2018/pubmed18n0430.xml.gz
4 /backup/pubmed_archive/baseline_2018/pubmed18n0233.xml.gz
5 /backup/pubmed_archive/baseline_2018/pubmed18n0280.xml.gz
11 /backup/pubmed_archive/baseline_2018/pubmed18n0579.xml.gz
3 /backup/pubmed_archive/baseline_2018/pubmed18n0184.xml.gz
9 /backup/pubmed_archive/baseline_2018/pubmed18n0483.xml.gz
15 /backup/pubmed_archive/baseline_2018/pubmed18n0774.xml.gz
2 /backup/pubmed_archive/baseline_2018/pubmed18n0133.xml.gz
14 /backup/pubmed_archive/baseline_2018/pubmed18n0726.xml.gz
7 /backup/pubmed_archive/baseline_2018/pubmed18n0378.xml.gz
17 /backup/pubmed_archive/baseline_2018/pubmed18n0874.xml.gz
6 /backup/pubmed_archive/baseline_2018/pubmed18n0331.xml.gz
18 /backup/pubmed_archive/baseline

2 /backup/pubmed_archive/baseline_2018/pubmed18n0140.xml.gz
6 /backup/pubmed_archive/baseline_2018/pubmed18n0338.xml.gz
3 /backup/pubmed_archive/baseline_2018/pubmed18n0192.xml.gz
0 /backup/pubmed_archive/baseline_2018/pubmed18n0044.xml.gz
16 /backup/pubmed_archive/baseline_2018/pubmed18n0829.xml.gz
5 /backup/pubmed_archive/baseline_2018/pubmed18n0288.xml.gz
1 /backup/pubmed_archive/baseline_2018/pubmed18n0089.xml.gz
10 /backup/pubmed_archive/baseline_2018/pubmed18n0534.xml.gz
13 /backup/pubmed_archive/baseline_2018/pubmed18n0684.xml.gz
9 /backup/pubmed_archive/baseline_2018/pubmed18n0492.xml.gz
17 /backup/pubmed_archive/baseline_2018/pubmed18n0880.xml.gz
8 /backup/pubmed_archive/baseline_2018/pubmed18n0441.xml.gz
7 /backup/pubmed_archive/baseline_2018/pubmed18n0385.xml.gz
14 /backup/pubmed_archive/baseline_2018/pubmed18n0732.xml.gz
4 /backup/pubmed_archive/baseline_2018/pubmed18n0241.xml.gz
6 /backup/pubmed_archive/baseline_2018/pubmed18n0339.xml.gz
15 /backup/pubmed_archive/baseline_

10 /backup/pubmed_archive/baseline_2018/pubmed18n0541.xml.gz
[Process-0]: Store /backup/pubmed_archive_json/pubmed_2018_000.p
6 /backup/pubmed_archive/baseline_2018/pubmed18n0347.xml.gz
4 /backup/pubmed_archive/baseline_2018/pubmed18n0249.xml.gz
9 /backup/pubmed_archive/baseline_2018/pubmed18n0498.xml.gz
12 /backup/pubmed_archive/baseline_2018/pubmed18n0640.xml.gz
[Process-8] Ended
7 /backup/pubmed_archive/baseline_2018/pubmed18n0391.xml.gz
[Process-3]: Store /backup/pubmed_archive_json/pubmed_2018_003.p
[Process-0] Ended
2 /backup/pubmed_archive/baseline_2018/pubmed18n0148.xml.gz
5 /backup/pubmed_archive/baseline_2018/pubmed18n0298.xml.gz
1 /backup/pubmed_archive/baseline_2018/pubmed18n0097.xml.gz
13 /backup/pubmed_archive/baseline_2018/pubmed18n0691.xml.gz
17 /backup/pubmed_archive/baseline_2018/pubmed18n0887.xml.gz
11 /backup/pubmed_archive/baseline_2018/pubmed18n0594.xml.gz
[Process-3] Ended
4 /backup/pubmed_archive/baseline_2018/pubmed18n0250.xml.gz
14 /backup/pubmed_archive/basel

15 /backup/pubmed_archive/baseline_2018/pubmed18n0799.xml.gz
16 /backup/pubmed_archive/baseline_2018/pubmed18n0847.xml.gz
[Process-17]: Store /backup/pubmed_archive_json/pubmed_2018_017.p
15 /backup/pubmed_archive/baseline_2018/pubmed18n0800.xml.gz
[Process-17] Ended
16 /backup/pubmed_archive/baseline_2018/pubmed18n0848.xml.gz
[Process-15]: Store /backup/pubmed_archive_json/pubmed_2018_015.p
[Process-15] Ended
16 /backup/pubmed_archive/baseline_2018/pubmed18n0849.xml.gz
16 /backup/pubmed_archive/baseline_2018/pubmed18n0850.xml.gz
[Process-16]: Store /backup/pubmed_archive_json/pubmed_2018_016.p
[Process-16] Ended


In [11]:
import json

json_format = sorted(list(filter(lambda x: "pubmed_2018_" in x, os.listdir("/backup/pubmed_archive_json"))))

articles = []

for f_n in json_format:
    with open(os.path.join("/backup/pubmed_archive_json",f_n), "rb") as f:
        articles.extend(pickle.load(f))




27837540
Save file /backup/pubmed_archive_json/pubmedMedline_2018_00000000_to_03000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_03000000_to_06000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_06000000_to_09000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_09000000_to_12000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_15000000_to_18000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_18000000_to_21000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_21000000_to_24000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_24000000_to_27000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_27000000_to_27837540 :Done


In [None]:
size = len(articles)
print(size)
itter = int(3e6)

for i in range(0, size, itter):
    file_name = "/backup/pubmed_archive_json/pubmedMedline_2018_{0:08}_to_{1:08}".format(i, min(size, i+itter))
    print("Save file",file_name,":",end="")
    json.dump(articles[i:i+itter], open(file_name,"w"))
    print("Done")


In [16]:
articles_new

size = len(articles_new)
print(size)
itter = int(3e6)

for i in range(0, size, itter):
    file_name = "/backup/pubmed_archive_json/pubmedMedline_2018_{0:08}_to_{1:08}".format(i, min(size, i+itter))
    print("Save file",file_name,":",end="")
    json.dump(articles_new[i:i+itter], open(file_name,"w"))
    print("Done")


17731043
Save file /backup/pubmed_archive_json/pubmedMedline_2018_00000000_to_03000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_03000000_to_06000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_06000000_to_09000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_09000000_to_12000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_12000000_to_15000000 :Done
Save file /backup/pubmed_archive_json/pubmedMedline_2018_15000000_to_17731043 :Done


In [None]:
articles_filter = filter(lambda x: (x["abstract"] is not None and len(x["abstract"])>0), articles)

articles_mapped = list(map(lambda x:{"id":x["pmid"],"title":x["title"],"abstract":x["abstract"]},articles_filter))

size = len(articles_mapped)
itter = int(3e6)
for i in range(0, size, itter):
    file_name = "/backup/pubmed_archive_json/pubmed_2018_{0:08}_to_{1:08}".format(i, min(size, i+itter))
    print("Save file",file_name,":",end="")
    json.dump(articles_mapped[i:i+itter], open(file_name,"w"))
    print("Done")

Save file /backup/pubmed_archive_json/pubmed_tit_abs_00000000_to_03000000 :

In [2]:
fast_reduce = []    

for i,article in enumerate(articles):
    if i%100000==0:
        print("Articles filter:",i,end="\r")
    
    if len(article["abstract"])==0:
        continue
    fast_reduce.append(articles[i])
    

print("Total articles:",len(articles))
print("Articles with empty abstract",len(zero_index))
print("Total of articles without empty abstract",len(fast_reduce))

#Clean up some vars
del articles
articles = fast_reduce
del fast_reduce
del zero_index
gc.collect()

['pubmed_dump_20470668_to_23293992.json',
 'pubmed_dump_08922282_to_11867708.json',
 'Pubmed Collection analysis.ipynb',
 'BAD_XML_FORMAT',
 'pubmed_dump_14837500_to_17728469.json',
 'pubmed_dump_23293993_to_25615338.json',
 'pubmed_dump_00000000_to_02974738.json',
 '__init__.py',
 '__pycache__',
 'pubmed_dump_02974739_to_05951961.json',
 'pubmed_dump_05951962_to_08922281.json',
 '.ipynb_checkpoints',
 'pubmed_dump_11867709_to_14837499.json',
 'Pubmed data XML READER.ipynb',
 'pubmed_dump_17728470_to_20470667.json',
 'pubmed_helper.py']

In [10]:
import json
#Save in disk
FULL_SAVE = True #save all the xml parametrs, if FALS only  title, abstract and pmid will be saved

if FULL_SAVE == False:
    small_articles=list(map(lambda x:{"pmid":x["pmid"],"title":x["title"],"abstract":x["abstract"]},articles))
    _backup_articles = articles
    articles = small_articles

batch = list(range(0,len(articles),int(3e6)))
batch[-1] = len(articles)-1

for i in range(0,len(batch)-1):
    file_name = "/backup/pubmed_archive_json/pubmed_full_{0:08}_to_{1:08}".format(batch[i],batch[i+1])
    print("Save file",file_name,":",end="")
    json.dump(articles[batch[i]:batch[i+1]],open(file_name,"w"))
    print("Done")
    
#articles = _backup_articles

gc.collect()

Save file /backup/pubmed_archive_json/pubmed_full_00000000_to_03000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_03000000_to_06000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_06000000_to_09000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_09000000_to_12000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_12000000_to_15000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_15000000_to_18000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_18000000_to_21000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_21000000_to_24000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_24000000_to_27000000 :Done
Save file /backup/pubmed_archive_json/pubmed_full_27000000_to_30862349 :Done


557

32