# Search PubMed

## Define Search Criteria

In [1]:
from Bio import Entrez
import numpy as np

def search(query, retmax=5000, reldate=7):
    """Run a PubMed ESearch for ``query`` and return the parsed response.

    Parameters
    ----------
    query : str
        PubMed search string (can be tested with PubMed Advanced Search).
    retmax : int or str, optional
        Maximum number of record IDs to request. Defaults to 5000.
    reldate : int, optional
        Only return records dated within this many days from now.
        Defaults to 7.

    Returns
    -------
    The object produced by ``Entrez.read``; the matching record IDs are
    read from its ``'IdList'`` entry.
    """
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='most recent',
                            retmax=str(retmax),
                            retmode='xml',
                            reldate=reldate,
                            term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()
    return results

# Query string for this literature update; it can be tried out first in
# PubMed Advanced Search. Matches either stem in the title or abstract.
search_results = search(
    '(Biomech*[Title/Abstract] '
    'OR locomot*[Title/Abstract])'
)

## Perform Search and Save Paper Titles

In [2]:


def fetch_details(id_list):
    """Fetch the full PubMed records for the given record IDs.

    Parameters
    ----------
    id_list : list of str
        PubMed IDs, e.g. the ``'IdList'`` entry of a ``search`` result.

    Returns
    -------
    The object produced by ``Entrez.read``; article records are read from
    its ``'PubmedArticle'`` entry. When ``id_list`` is empty no request is
    made and a plain dict with empty record lists is returned instead.
    """
    if not id_list:
        # Nothing to fetch: skip the request rather than sending an empty
        # ``id`` parameter, and hand back the same keys callers index into.
        return {'PubmedArticle': [], 'PubmedBookArticle': []}
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=','.join(id_list))
    try:
        results = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()
    return results

# Pull the full records for every ID returned by the search.
id_list = search_results['IdList']
papers = fetch_details(id_list)
print("")

# One placeholder slot per article; later cells fill these in by position.
titles = [0 for _ in papers['PubmedArticle']]
keywords = ['' for _ in papers['PubmedArticle']]
authors = ['' for _ in papers['PubmedArticle']]
links = ['' for _ in papers['PubmedArticle']]
journals = ['' for _ in papers['PubmedArticle']]

# Store each article's title at the same position as its record.
for i, paper in enumerate(papers['PubmedArticle']):
    titles[i] = paper['MedlineCitation']['Article']['ArticleTitle']








## Print Paper Info in GitHub Markdown Format (also stores the links, authors, journals, and keywords used by later cells)

In [3]:
# For every article: print a GitHub-markdown entry (link, authors, journal)
# and store the link, author string, journal and keywords at position i so
# later cells can use them.
for i, paper in enumerate(papers['PubmedArticle']):
    citation = paper['MedlineCitation']
    article = citation['Article']

    # Markdown bullet linking the title to its PubMed page (built once, reused).
    link = "* [%s](https://www.ncbi.nlm.nih.gov/pubmed/%s)" % (article['ArticleTitle'], citation['PMID'])
    print(link)
    links[i] = link

    auths = []
    # AuthorList may be absent from a record; treat that as "no authors"
    # instead of aborting the whole loop with a KeyError.
    for auth in article.get('AuthorList', []):
        try:
            auth_name = ' '.join([auth['LastName'], auth['Initials'] + ','])
        except KeyError:
            # Entry has no LastName and/or Initials: report it and skip it.
            print('NAME ERROR')
            continue
        print(auth_name)
        auths.append(auth_name)
    authors[i] = ' '.join(auths)

    journal = '*%s*' % (article['Journal']['Title'])
    print(journal)
    journals[i] = journal

    #store keywords (only the first keyword list of the record is used)
    # print(" - ") #uncomment to print keywords (1 of 3)
    if citation['KeywordList'] != []:
        kwds = []
        for kw in citation['KeywordList'][0]:
            # print(kw,'/') #uncomment to print keywords (2 of 3)
            kwds.append(kw[:])
        keywords[i] = ' '.join(kwds)
    # else:
    #     print("NO_KEYWORDS") #uncomment to print keywords (3 of 3)
    print("<br>  ") #linebreak for github md
    #end keywords test
    print("")

* [Validation of a freehand technique for cortical bone trajectory screws in the lumbar spine.](https://www.ncbi.nlm.nih.gov/pubmed/31003218)
Tan Z,
McLachlin S,
Whyne C,
Finkelstein J,
*Journal of neurosurgery. Spine*
<br>  

* [Short and long-term effects of bisphenol S (BPS) exposure during pregnancy and lactation on plasma lipids, hormones, and behavior in rats.](https://www.ncbi.nlm.nih.gov/pubmed/31003143)
da Silva BS,
Pietrobon CB,
Bertasso IM,
Lopes BP,
Carvalho JC,
Peixoto-Silva N,
Santos TR,
Claudio-Neto S,
Manhães AC,
Oliveira E,
de Moura EG,
Lisboa PC,
*Environmental pollution (Barking, Essex : 1987)*
<br>  

* [Cryopreservation of tendon tissue using dimethyl sulfoxide combines conserved cell vitality with maintained biomechanical features.](https://www.ncbi.nlm.nih.gov/pubmed/31002728)
Hochstrat E,
Müller M,
Frank A,
Michel P,
Hansen U,
Raschke MJ,
Kronenberg D,
Stange R,
*PloS one*
<br>  

* [Fluid supplementation accelerates epithelial repair during chemical colitis.](h

### Example:

* [Skeletal muscles of hibernating black bears show minimal atrophy and phenotype shifting despite prolonged physical inactivity and starvation.](https://www.ncbi.nlm.nih.gov/pubmed/30998788)
Miyazaki M,
Shimozuru M,
Tsubota T,
*PloS one*
<br>  

* [Phase space methods for non-linear analysis of pedalling forces in cycling.](https://www.ncbi.nlm.nih.gov/pubmed/30998746)
Kunert A,
Ott M,
Reuter T,
Koska D,
Maiwald C,
*PloS one*
<br>  

# Clean up title and keyword strings

In [4]:
print('Number of Papers: ', len(titles)) #number of papers (the search caps this via retmax)
import re

# Normalise every title in one pass: lower-case it, replace the opening
# <sub>/<i> tags with a space, drop the closing tags, and strip the square
# brackets left behind by the html parser.
titles = [
    t.lower()
     .replace('<sub>', ' ').replace('</sub>', '')
     .replace('<i>', ' ').replace('</i>', '')
     .replace('[', '').replace(']', '')
    for t in titles
]

# Keywords only need to be put in the same case as the titles.
keywords = list(map(str.lower, keywords))


Number of Papers:  155


# Load Top-performing Model

In [5]:
from keras.models import model_from_json

# Read the JSON description of the network and rebuild the model from it.
with open('Model_Files/keras_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)

# Load the saved weights into the rebuilt model, then compile it.
model.load_weights("Model_Files/keras_model.h5")
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
print("\nLoaded model from disk")
 

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Loaded model from disk


# Load Associated Vectorizer

In [6]:
import pickle
# numpy (np) is already imported in the first cell
from sklearn.preprocessing import LabelEncoder

#load vectorizer and label encoder
# NOTE: unpickling can execute arbitrary code; only load a vectorizer file
# that comes from a trusted source.
with open('Model_Files/keras_Vectorizer.pkl', 'rb') as vect_file:
    # The context manager closes the file once the vectorizer is loaded.
    vect = pickle.load(vect_file)

# Rebuild the label encoder from its saved class array.
# NOTE(review): if that array was saved with object dtype, recent NumPy
# versions need np.load(..., allow_pickle=True) — confirm against the file.
le = LabelEncoder()
le.classes_ = np.load('Model_Files/keras_LabelEncoder.npy')
print('\nLoaded Vectorizer')



Loaded Vectorizer


# Vectorize Strings

In [7]:
import pandas as pd

# One row per paper from this week's literature update. The 'everything'
# column is the title followed by the keywords, separated by one space.
papers_df = pd.DataFrame({'title': titles, 'keywords': keywords}).assign(
    everything=lambda frame: frame['title'] + ' ' + frame['keywords']
)

# Vectorize the combined title + keyword text.
titles_vec = vect.transform(papers_df['everything'])
# OR, to use just the title:
# titles_vec = vect.transform(papers_df['title'])

# Predict Topics For Each Paper

In [8]:
# Run the model once and reuse the result; the original called
# model.predict a second time and left prediction_vec unused.
prediction_vec = model.predict(titles_vec)

# For each paper, take the index of the highest-scoring output and map it
# back to its topic label with the label encoder.
topics = [le.inverse_transform([np.argmax(top_val)])[0] for top_val in prediction_vec]
papers_df['topic'] = topics
papers_df[['title','topic']].head()

Unnamed: 0,title,topic
0,validation of a freehand technique for cortica...,ORTHOPAEDICS/SPINE
1,short and long-term effects of bisphenol s (bp...,TISSUE/BIOMATERIAL
2,cryopreservation of tendon tissue using dimeth...,CELLULAR/SUBCELLULAR
3,fluid supplementation accelerates epithelial r...,CELLULAR/SUBCELLULAR
4,computational modeling of bone cells and their...,BONE


# Save Paper Titles and Topics

### Store everything in DataFrame and sort by Topic

In [9]:
import datetime

# Attach the markdown-formatted fields gathered while printing the papers.
for column, values in (('authors', authors), ('journal', journals), ('links', links)):
    papers_df[column] = values

# Output filenames share a year_month_day prefix built from today's date.
now = datetime.datetime.now()
date_prefix = '_'.join(str(part) for part in (now.year, now.month, now.day))
fname = 'Literature_Updates/' + date_prefix + '_lit_update.csv'
mdname = 'Literature_Updates/' + date_prefix + '_lit_update.md'

print('Filename: ',fname)

# Optional: sort the frame by topic here instead of at export time.
# papers_df = papers_df.sort_values(by = ['topic'])
# papers_df = papers_df.reset_index(drop = True)
papers_df.head()

Filename:  Literature_Updates/2019_4_20_lit_update.csv


Unnamed: 0,title,keywords,everything,topic,authors,journal,links
0,validation of a freehand technique for cortica...,cbt = cortical bone trajectory lis = less inva...,validation of a freehand technique for cortica...,ORTHOPAEDICS/SPINE,"Tan Z, McLachlin S, Whyne C, Finkelstein J,",*Journal of neurosurgery. Spine*,* [Validation of a freehand technique for cort...
1,short and long-term effects of bisphenol s (bp...,breastfeeding developmental plasticity gestati...,short and long-term effects of bisphenol s (bp...,TISSUE/BIOMATERIAL,"da Silva BS, Pietrobon CB, Bertasso IM, Lopes ...","*Environmental pollution (Barking, Essex : 1987)*",* [Short and long-term effects of bisphenol S ...
2,cryopreservation of tendon tissue using dimeth...,,cryopreservation of tendon tissue using dimeth...,CELLULAR/SUBCELLULAR,"Hochstrat E, Müller M, Frank A, Michel P, Hans...",*PloS one*,* [Cryopreservation of tendon tissue using dim...
3,fluid supplementation accelerates epithelial r...,,fluid supplementation accelerates epithelial r...,CELLULAR/SUBCELLULAR,"Burgueño JF, Lang JK, Santander AM, Fernández ...",*PloS one*,* [Fluid supplementation accelerates epithelia...
4,computational modeling of bone cells and their...,,computational modeling of bone cells and their...,BONE,"Wang L, Dong J, Xian CJ,",*Critical reviews in eukaryotic gene expression*,* [Computational Modeling of Bone Cells and Th...


### Save as .csv 

In [10]:
# Only the title and predicted topic go into the CSV, ordered by topic.
header = ['title', 'topic']
(
    papers_df
    .sort_values('topic')
    .to_csv(fname, columns=header, index=False)
)
print('\nLiterature Update Exported')


Literature Update Exported


# Compile papers grouped by topic

In [27]:
# Write the markdown literature update: a title, a table of contents with
# one link per topic, then one section per topic listing its papers.

# np.unique returns the distinct topics already sorted, so no pre-sort is needed.
topic_list = np.unique(papers_df['topic'])

# Open the report once and close it when done; the original opened a new
# handle for every print and never closed any of them. UTF-8 is set
# explicitly because author names contain non-ASCII characters.
with open(mdname, 'w', encoding='utf-8') as md_file:
    print('# Literature Update: ',str(now.year)+'-'+str(now.month)+'-'+str(now.day),'  ', file=md_file)
    print('### Created by: [Ryan Alcantara](https://alcantarar.github.io) & [Gary Bruening](https://github.com/GBruening) - University of Colorado Boulder', file=md_file)
    print('#### Table Of Contents: ', file=md_file)
    for topic in topic_list:
        # The link target is the lower-cased topic with '/' removed.
        print('['+topic+']'+'(#'+str.lower(topic).replace('/','')+')  ', file=md_file)
    print('', file=md_file)

    for topic in topic_list:
        # Topic name between two '----' lines, then a link back to the contents.
        print('----', file=md_file)
        print(topic, file=md_file)
        print('----', file=md_file)
        print('', file=md_file)
        print('[Back to top](#table-of-contents)', file=md_file)
        print('', file=md_file)

        papers_subset = papers_df[papers_df.topic == topic]
        # Walk the three columns together instead of indexing by position.
        for link, author_line, journal in zip(papers_subset['links'],
                                              papers_subset['authors'],
                                              papers_subset['journal']):
            print(link, file=md_file)
            print(author_line, file=md_file)
            print(journal+'.  ', file=md_file)
            print('', file=md_file)

print('Markdown File Saved')

Markdown File Saved
