In [2]:
!pip install --user xmltodict

Collecting xmltodict
  Downloading https://files.pythonhosted.org/packages/42/a9/7e99652c6bc619d19d58cdd8c47560730eb5825d43a7e25db2e1d776ceb7/xmltodict-0.11.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.11.0
[33mYou are using pip version 9.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [30]:
import xmltodict
import string
import subprocess
import os

In [31]:
# Characters treated as printable (per string.printable); text is filtered against this set.
printable = {ch for ch in string.printable}
def get_data(input_file):
    """Parse a PubMed XML file and return its ``PubmedArticle`` records.

    Parameters
    ----------
    input_file : str
        Path to an XML file whose root element is ``PubmedArticleSet``.

    Returns
    -------
    list
        One parsed mapping per ``PubmedArticle`` element. A file that
        contains a single article still yields a one-element list, so
        callers can always iterate over and ``len()`` the result.

    Raises
    ------
    KeyError
        If the document has no ``PubmedArticleSet``/``PubmedArticle`` entry.
    """
    # Binary mode + passing the file object lets the XML parser read the
    # stream itself (honouring the declared encoding) instead of first
    # materialising the whole file as one string.
    with open(input_file, "rb") as fp:
        data = xmltodict.parse(fp)
    articles = data["PubmedArticleSet"]["PubmedArticle"]
    # xmltodict collapses a lone child element into a mapping rather than a
    # list; normalise so the return type does not depend on the file size.
    if not isinstance(articles, list):
        articles = [articles]
    return articles

# Kept next to get_text so the function does not depend on notebook state
# defined in another statement.
_PRINTABLE_CHARS = frozenset(string.printable)

try:
    _TEXT_TYPES = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _TEXT_TYPES = (str,)  # Python 3


def get_text(input_data):
    """Flatten a nested parsed-XML structure into one printable string.

    Dicts and lists are walked recursively (dict values in iteration
    order); every text leaf contributes its printable characters followed
    by a single space.

    Parameters
    ----------
    input_data : dict, list, str, or None
        A node of the structure returned by the XML parser. Any other
        truthy scalar is converted with ``str()`` before filtering.

    Returns
    -------
    str
        The concatenated leaf texts, each terminated by one space. Falsy
        leaves (``None``, ``""``, ``0``) and empty containers yield ``""``.
    """
    if isinstance(input_data, dict):
        # join() instead of repeated += keeps the concatenation linear.
        return "".join(get_text(value) for value in input_data.values())
    if isinstance(input_data, list):
        return "".join(get_text(item) for item in input_data)
    if not input_data:
        return ""
    if not isinstance(input_data, _TEXT_TYPES):
        input_data = str(input_data)
    # A generator + join works on both Python 2 and 3; filter() returns an
    # iterator on Python 3 and cannot be concatenated with a string.
    return "".join(ch for ch in input_data if ch in _PRINTABLE_CHARS) + " "
    

In [32]:
from dask.distributed import Client

In [33]:
# check_output returns bytes on Python 3, so decode before appending the
# port; on Python 2 the decode is a harmless str -> unicode conversion.
hostname = subprocess.check_output('hostname').decode().strip()
# Connect to the dask scheduler listening on port 8786 of this host.
client = Client(hostname + ':8786')

In [34]:
# Display the client summary (scheduler address, workers, cores, memory).
client

0,1
Client  Scheduler: tcp://ip-172-31-18-54:8786  Dashboard: http://ip-172-31-18-54:8787/status,Cluster  Workers: 4  Cores: 64  Memory: 130.65 GB


In [36]:
%%time
# Two PubMed baseline files, parsed in parallel: one get_data task per file.
filename1 = "/shared/data/pubmed18n0929.xml"
filename2 = "/shared/data/pubmed18n0930.xml"
# Submit the tasks and pull the parsed results back to the notebook process;
# data_list[i] holds the result for the i-th filename.
data_list = client.gather(client.map(get_data, [filename1, filename2]))

CPU times: user 2min 43s, sys: 8.25 s, total: 2min 51s
Wall time: 8min 12s


In [37]:
# Number of article records in the second element of data_list.
len(data_list[1])

30000

In [38]:
# Flatten every article of data_list[1] to plain text, one task per article.
# NOTE(review): data_list[0] is never processed — confirm this is intentional.
text_data = client.map(get_text,data_list[1])
# Collect the resulting strings back into the notebook process.
final_text_data = client.gather(text_data)

In [39]:
# Sanity check: one flattened text per input article.
len(final_text_data)

30000

In [40]:
# Preview the flattened text of the first article.
final_text_data[0]

u'MEDLINE NLM 1 27406060 2017 11 16 2017 11 28 Print Electronic 1937-3392 Internet 22 8 2016 Aug Tissue engineering. Part C, Methods Tissue Eng Part C Methods Antioxidant N-Acetylcysteine and Glutathione Increase the Viability and Proliferation of MG63 Cells Encapsulated in the Gelatin Methacrylate/VA-086/Blue Light Hydrogel System. 792-800 doi Y 10.1089/ten.TEC.2016.0025 Photoencapsulation of cells inside a hydrogel system can provide a suitable path to establish a gel in situ for soft tissue regeneration applications. However, the presence of photoinitiators and blue or UV light irradiation can result in cell damage and an increase of reactive oxygen species. We here evaluate the benefits of an antioxidant pretreatment on the photoencapsulated cells. We study this by evaluating proliferation and viability of MG63 cells, which we combined with a gelatin methacrylate (GelMA) hydrogel system, using the photoinitiator, VA-086, cured with 440nm blue light. We found that blue light irradia

In [42]:
# Output directory for the per-article text files. The trailing slash is
# kept because other cells build paths by concatenating onto dirname.
dirname = "/shared/data/pubmed/abstracts/"
if not os.path.exists(dirname):
    os.makedirs(dirname)

# Write one file per article, named by its position in final_text_data.
# os.path.join avoids the doubled separator that plain concatenation onto
# a slash-terminated dirname produces.
# NOTE(review): files left over from an earlier, larger run are not removed
# and would be picked up by any later listing of dirname.
for _id,article in enumerate(final_text_data):
    filename = os.path.join(dirname, "abstract_" + str(_id) + ".txt")
    with open(filename,"w") as fp:
        fp.write(article)

In [43]:
from rocketml.io import DocumentSet
from rocketml.decomposition import TruncatedSVD
from rocketml.preprocessing import TextFilter
from rocketml import Pipeline

In [44]:
%%time
# os.listdir returns entries in arbitrary order; sort (lexicographically) so
# the document order fed to the pipeline is the same on every run.
abstract_filenames = [os.path.join(dirname, ff) for ff in sorted(os.listdir(dirname))]
docset = DocumentSet(filelist=abstract_filenames)
# Text clean-up stage followed by a 20-component truncated SVD.
tf = TextFilter(stop_word_filter=True,stemming_filter=True,length_filter=True)
svd = TruncatedSVD(n_components=20)
pipeline = Pipeline([tf,svd])
# Variable name spelling kept as-is so any cell referring to it still works.
docset_transofrm = pipeline.fit_transform(docset)
# Show both values: a bare expression that is not the last line of a cell
# is evaluated and discarded, so the document count was never displayed.
(len(docset.documents), svd.components_.shape)

<rocketml.preprocessing.text.TextFilter object at 0x7fbbdb7be110>
text
------------------
Cluster Resources
------------------
Nodes        = 4
Sockets/node = 1
Cores/socket = 8
--------------------------
Total Cores    = 32
Total Memory(GB)   = 121.68
--------------------------
/shared/jobs_folder/81a42b2b/output.txt
CPU times: user 560 ms, sys: 748 ms, total: 1.31 s
Wall time: 31.8 s


In [45]:
import pandas as pd
# One row per term with its document frequency. .items() is used instead of
# the Python-2-only .iteritems() so the cell runs on either interpreter.
df = pd.DataFrame(list(docset.terms_document_frequencies.items()),columns=["Term","Frequency"])
# Keep only terms whose frequency exceeds 10000, most frequent first.
df_high = df[df["Frequency"] > 10000].sort_values(by=["Frequency"],ascending=False)

In [46]:
from bokeh.plotting import figure,show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models.ranges import Range1d
import xmltodict
import string
%matplotlib inline

In [47]:
# Configure bokeh to render plots inline in the notebook output.
output_notebook()

In [48]:
# Bar chart of the high-frequency terms; the data source lets the glyphs and
# the hover tooltips refer to the frame's columns by name.
source = ColumnDataSource(df_high)
p = figure(
    title="Term Frequencies",
    plot_width=1000,
    plot_height=500,
    x_range=df_high["Term"].values,
    y_range=Range1d(0, df_high["Frequency"].max()),
)
p.vbar(source=source, x="Term", top="Frequency", width=0.2)
p.add_tools(
    HoverTool(
        tooltips=[
            ("Term", "@Term"),
            ("Frequency", "@Frequency"),
        ]
    )
)
show(p)

In [69]:
# Print the terms with the largest weights in one SVD component.
# NOTE(review): indexing the alphabetically sorted term list with feature
# indices assumes the SVD feature order is alphabetical — confirm.
terms = sorted(df["Term"])
top_n = 10
# Despite the name, this is the component at index 19 (the last of the 20).
comp0 = svd.components_[19]
# Descending order of weight, truncated to the first top_n positions.
top_indices = comp0.argsort()[::-1][:top_n]
for index in top_indices:
    print(terms[index])

korea
seoul
republ
pediatr
israel
south
tel
imag
aviv
sleep


180875