In [101]:
##################
# Aldreen Venzon
# 3.18.19
# Create a Bibliography from User Input
##################

##################
# Part 1: JSON and XML Webpage
##################
# Step 0: Import libraries
import json # To read and parse JSON
import requests # To read and parse URL
import xml.etree.ElementTree as ET # To read and parse XML

# Step 1: Get user's input (Ask user for num of publication ID and term to search for. Strip all white spaces. Then add to full URL name) 
while True:
    try:
        pub_num = int(input("Enter max publication ID to search for: ").strip())
        if pub_num > 0:
            pub_num
            break
        else:
            print("Please enter a number above 0") # Must be a number > 0
    except (ValueError, EOFError):
        print("Please input something") # cannot be blank
        continue
    
while True:
    try:
        term = input("Enter search term: ").strip()
        if not term:
            print("Please input something") # cannot be blank
            continue
        else:
            break
    except (EOFError):
        print("Please input something") # cannot be blank
        continue

json_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=" + str(pub_num) + "&sort=relevance&term=" + str(term)

# Step 2: Define a function that outputs list of pulication ID's from JSON
def get_idlist(json_url):
    # Read full URL and JSON in it and load it as a dictionary
    content = requests.get(json_url)
    jdict = json.loads(content.content)

    # Find the list of ID's inside "eserachresult" and "idlist" key and store them
    pub_idlist = jdict["esearchresult"]["idlist"]

    # Store publication ID list separated by comma
    return ','.join(pub_idlist)

# Step 3: Combine user's ID list with new URL to get XML webpage 
idlist = get_idlist(json_url)
xml_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=" + idlist

# Step 4: Read XML webpage
XML = requests.get(xml_url, stream=True)
root = ET.fromstring(XML.content)


##################
# Part 2: Buildng the Bibliography
##################
# Step 5: Get all Bibliography info through each function

# Function(s) to get each text inside a Tag 
def get_text(root, header, sub):
    for x in root.findall(header): # find top header
        tag = x.findall(sub) # find the tag
        for y in tag: # get the text inside
            text = y.text
        if len(tag) <= 0: # if there is no tag/text
            return ("") # return blank
        else:
            return (text) # return text inside
        
def get_text_attrib(root, header, sub, att):
    for x in root.findall(header): # find top header
        tag = x.findall(sub) # find the tag
        if len(tag) <= 0: # if there is no tag/text
            return ("") # return a blank
        else:
            for i in tag:
                if i.attrib["IdType"] == att:
                    text = i.text
                    return (text) # return the text inside 

# Function to get each author and concatenate properly
def get_author(root):
    a_list = []
    for x in root.findall('.//Author'):
        exist_first = x.findall("ForeName") # find first name tag exists
        exist_last = x.findall("LastName") # find last name tag exists
        exist_org = x.findall("CollectiveName") # find org name tag exists
        if len(exist_first) and len(exist_last) > 0: # if first and name exists
            first = x.find("ForeName").text # get first name text
            last = x.find("LastName").text # get last name text
            a = (first[0] + ". " + last) # combine first letter of first name and last name
            a_list.append(a)
        elif len(exist_org) > 0:
            org = x.find("CollectiveName").text # if collective name exists           
            a_list.append(org) # get collective name
        else:
            a_list.append("")
    return (', '.join(a_list)) # return list of authors of paper (F1. Last1, F2. Last2)

# Function to get each abstract text and concatenate properly
def get_abstract(root):
    a_list = []
    for node in root.findall('.//AbstractText'): # find all abstract text
        text = node.text
        a_list.append(text)
    return ('. '.join(a_list)) # return all abstract text in one paragraph



##################
# Part 3: Final Product
##################
# Step 6: Get all bibliography info and concatenate in proper citation format
for node in root:
    for paper in node:
        try:              
            author = get_author(paper)
            title = get_text(paper, './/Article', './/ArticleTitle')
            journal = get_text(paper, './/Journal', './/Title')
            volume = get_text(paper, './/JournalIssue', './/Volume')
            issue = get_text(paper, './/JournalIssue', './/Issue')
            page = get_text(paper, './/Pagination', './/MedlinePgn')
            month = get_text(paper, './/ArticleDate', './/Month') 
            day = get_text(paper, './/ArticleDate', './/Day')
            year = get_text(paper, './/ArticleDate', './/Year')
            pubmed = get_text_attrib(node, './/ArticleIdList', './/ArticleId', 'pubmed') 
            doi = get_text_attrib(node, './/ArticleIdList', './/ArticleId', 'doi')
            pii = get_text_attrib(node, './/ArticleIdList', './/ArticleId', 'pii')            
#             abstract = get_abstract(paper)
            
            # Month names as strings
            m = ["Jan.", "Feb.", "Mar.", "Apr.", "May.", "Jun.",\
                 "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec."] # List of month abbreviations
            date = m[int(month)-1]
        
#         print('{a}, "{t}," {j} {v}({i}) pp.{pp} {d}. PUBMED: {pub}; DOI: {doi} {pii}'.format(a = author, t = title, j = journal, v = volume, i = issue, pp = page, d = date, pub = pubmed, doi = doi, pii = pii))
            print(author + ', "' + title + '," ' + journal + ' ' +
                  volume + '(' + issue + ') ' + 'pp. ' + page + ' (' +
                  date + ' ' + day + ' ' + year + ') PUBMED:' + pubmed + '; DOI' + doi + " PII" + pii)
#             print("\t a:", abstract)
            print()

        except TypeError:
            continue
        except AttributeError:
            continue
            
##################
# TEST
##################
# Max: 6
# Term: fever
# JSON URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=6&sort=relevance&term=fever
# idlist: 30414522,30594188,30029861,29861186,29557255,30047499
# XML URL: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=30414522,30594188,30029861,29861186,29557255,30047499

Enter max publication ID to search for: 10
Enter search term: fever
P. Lai, A. See, M. Silva, W. Gormley, K. Frerichs, M. Aziz-Sultan, R. Du, "Noninfectious Fever in Aneurysmal Subarachnoid Hemorrhage: Association with Cerebral Vasospasm and Clinical Outcome.," World neurosurgery 122() pp. e1014-e1019 (Nov. 07 2018) PUBMED:30414522; DOI10.1016/j.wneu.2018.10.203 PIIS1878-8750(18)32510-5

J. Baek, J. Yu, M. Kim, J. You, H. Jun, Y. Kim, J. Ko, "Coronary artery status of patients with transient fever 24-36 h after first IVIG infusion did not differ from that seen in responsive patients.," Pediatric rheumatology online journal 16(1) pp. 83 (Dec. 29 2018) PUBMED:30594188; DOI10.1186/s12969-018-0301-6 PII10.1186/s12969-018-0301-6

A. Lekshminarayanan, P. Bhatt, V. Linga, R. Chaudhari, B. Zhu, M. Dave, K. Donda, S. Savani, S. Patel, Z. Billimoria, S. Bhaskaran, S. Zaid-Kaylani, F. Dapaah-Siakwan, N. Bhatt, "National Trends in Hospitalization for Fever and Neutropenia in Children with Cancer, 

In [67]:
# Function(s) to get each text inside a Tag 
def get_text(root, header, sub):
    for x in root.findall(header): # find top header
        tag = x.findall(sub) # find the tag
        for y in tag: # get the text inside
            text = y.text
        if tag == []: # if there is no tag/text
            return ("") # return a blank
        else:
            return (text) # return the text inside
        
def get_text_attrib(root, header, sub, att):
    for x in root.findall(header): # find top header
        tag = x.findall(sub) # find the tag
        if tag == []: # if there is no tag/text
            return ("") # return a blank
        else:
            for i in tag:
                if i.attrib["IdType"] == att:
                    text = i.text
                    return (text) # return the text inside 

# Function to get each author and concatenate properly
def get_author(root):
    a_list = []
    for x in root.findall('.//Author'):
        exist_first = x.findall("ForeName") # find first name tag exists
        exist_last = x.findall("LastName") # find last name tag exists
        exist_org = x.findall("CollectiveName") # find org name tag exists
        if len(exist_first) and len(exist_last) > 0: # if first and name exists
            first = x.find("ForeName").text # get first name text
            last = x.find("LastName").text # get last name text
            a = (first[0] + ". " + last) # combine first letter of first name and last name
            a_list.append(a)
        else:
            org = x.find("CollectiveName").text # if collective name exists           
            a_list.append(org) # get collective name
    return (', '.join(a_list)) # return list of authors of paper (F1. Last1, F2. Last2)

# Function to get each abstract text and concatenate properly
def get_abstract(root):
    a_list = []
    for node in root.findall('.//AbstractText'): # find all abstract text
        text = node.text
        a_list.append(text)
    return ('. '.join(a_list)) # return all abstract text in one paragraph

# Month names as strings
m = ["Jan.", "Feb.", "Mar.", "Apr.", "May.", "Jun.",\
     "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec."] # List of month abbreviations


# Step 7: Get all bibliography info and concatenate in proper citation format
for node in root:
    for paper in node:
        try:              
            author = get_author(paper)
            title = get_text(paper, './/Article', './/ArticleTitle')
            journal = get_text(paper, './/Journal', './/Title')
            volume = get_text(paper, './/JournalIssue', './/Volume')
            issue = get_text(paper, './/JournalIssue', './/Issue')
            page = get_text(paper, './/Pagination', './/MedlinePgn')
            month = get_text(paper, './/ArticleDate', './/Month') 
            day = get_text(paper, './/ArticleDate', './/Day')
            year = get_text(paper, './/ArticleDate', './/Year')
            pubmed = get_text_attrib(node, './/ArticleIdList', './/ArticleId', 'pubmed') 
            doi = get_text_attrib(node, './/ArticleIdList', './/ArticleId', 'doi')
            pii = get_text_attrib(node, './/ArticleIdList', './/ArticleId', 'pii')            
#             abstract = get_abstract(paper)
            
            print("a:", author, "t:", title, 
                  "j:", journal, "v:", volume, "i:", issue, "p:", page,
                  "m:", m[int(month)-1], "d:", day, "y:", year,
                  "pub:", pubmed, "doi:", doi, "pii:", pii)
            print("\t a:", abstract)
            print()

        except TypeError:
            continue
        except AttributeError:
            continue

a: P. Sampaio Rocha-Filho, R. Torres, U. Ramos Montarroyos t: HIV and Headache: A Cross-Sectional Study. j: Headache v: 57 i: 10 p: 1545-1550 m: Sep. d: 14 y: 2017 pub: 28905376 doi: 10.1111/head.13183 pii: None
	 a: The head and neck are the second most common locations for pain among HIV-positive individuals. Most studies were conducted among HIV patients at an advanced stage of the disease.. This was a cross-sectional study. Patients with HIV and CD4+ T lymphocyte counts >500 were included. Semi-structured interview, the Headache Impact Test (HIT-6), and the Hospital Anxiety and Depression Scale were used.. Of the 119 cases included, 63% were men. The mean age was 35.5 ± 10.4 years. Among the patients, 103 (87%) had headaches, 53 (45%) had migraines, 50 (42%) had tension-type headaches, and 53 (45%) had substantial and severe impact of headaches. Eleven patients had headaches that started after they had been diagnosed with HIV. These patients had more migraines (72% vs 43%; P < 0.05

In [14]:
def get_text(root, header, sub):
    for x in root.findall(header): # find top header
        tag = x.findall(sub) # find the tag
        for y in tag: # get the text inside
            text = y.text
        if tag == []: # if there is no tag/text
            return ("") # return a blank
        else:
            return (text) # return the text inside

def get_author(root):
    a_list = []
    for x in root.findall('.//Author'):        
        for y in x:        
            first = y.find("ForeName").text # find first name
            last = y.find("LastName").text # find last name
            a = (first[0] + ". " + last) # combine first letter of first name and last name
            a_list.append(a)
    return (', '.join(a_list)) # return list of authors of paper (F1. Last1, F2. Last2)

P. Sampaio Rocha-Filho
P. Sampaio Rocha-Filho
Y. Ran
Y. Ran
H. Yuan
H. Yuan
T. Takizawa
T. Takizawa
Y. Zhang
Y. Zhang
D. Friedman
D. Friedman
M. Bezerra
M. Bezerra
E. Sousa Melo
E. Sousa Melo
J. Ong
J. Ong
T. Seifert
T. Seifert


In [33]:
a_list = []
for x in root.findall('.//Author'):        
    for y in x:
        if y != None:
            first = y.find("ForeName").text # find first name
            last = y.find("LastName").text # find last name
            a = (first[0] + ". " + last) # combine first letter of first name and last name
            a_list.append(a)
        else:
            continue
print (', '.join(a_list)) # return list of authors of paper (F1. Last1, F2. Last2)

AttributeError: 'NoneType' object has no attribute 'text'

In [21]:
# for node in root.findall('.//ArticleId'): # find pubmed text
#     if node.attrib["IdType"] == "pubmed":
#         t = node.text
#         print ("PUBMED: " + t + "; ") # return pubmed text

# for x in root.findall('.//ArticleIdList'): # find top header
#     tag = x.findall('.//ArticleId') # find the tag
#     for i in tag:
#         if i.attrib["IdType"] == "pubmed":
#             text = i.text
#     if tag == []: # if there is no tag/text
#         print ("") # return a blank
#     else:
#         print (text) # return the text inside

28905376
28925506
26473407
28670690
28742215
28752894
24512574
24756514
25600719
1992368
2289234
22875880
25070715
25228684
15481717
24739993
12003693
23695067
12034799
4434179
25995055
24631586
23771276
1998880
27910093
27349210
28488294
28480575
