## PubMed Downloader

This code downloads data from pubmed and stores it in a csv file based on search terms.  PubMed serves as a centralized database to identify most medical studies that exist and can help in identifying relevant research on different topics.

Some information and relevant links include:
- https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
- https://www.nlm.nih.gov/databases/download/data_distrib_main.html
- https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch 

Author:  Natalie Chun (17 November 2018)

In [3]:
import pandas as pd
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import time
import random
import re
import requests
import unicodedata
import csv
import os
import xml.etree.ElementTree as ET

# PubMedDownloader

This class downloads pubmed documents based on key search terms.

In [4]:
class PubMedDownloader(object):
    """Base class for downloading PubMed Abstracts and Article Names based on key search terms.  
    Currently initialized with key words related to school violence.
    Note:  The downloader stores files in csv with key search term appended.
    """
    
    # pmid is a PubMed ID
    # url is the url of the PubMed web page
    # search_term is the string used in the search box on the PubMed website
    def __init__(self, pmid=None):
        self.url = "http://www.ncbi.nlm.nih.gov/pubmed/"
        self.entrezurl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.date = datetime.datetime.now().strftime('%Y-%m-%d')
        self.kws = ['school violence','school safety','school security','school bullying','high-risk behavior','at-risk',
                   'school shooting','student suicide']
        self.outdir = './data/pubmed/'

    def search_page(self, searchterm):
        """Get search results from pubmed for particular search term."""
        
        url = self.url + '?term=%s&cmd=DetailsSearch' % (searchterm)
        print(url)
        page = urllib.request.urlopen(url)
        parser = BeautifulSoup(page,'html.parser')
        randnum = random.randint(3,10) 
        time.sleep(randnum)
        self.parse_search_page(parser)
        
    def parse_search_page(self, parser):
        """Parses general page of pubmed and insert into database.
        TODO:  Search multiple pages (right now only gets first page)
        need to return
        """
        
        data = []
        
        # parse one page of results
        results = parser.find_all('div', {'class':'rslt'})
        for res in results:
            #print(res)
            tempdata = {}
            temp = res.find('p', {'class':'title'})
            print(temp)
            tempdata['title'] = temp.text
            tempdata['href'] = temp.find('a',{'href':True})['href']
            tempdata['desc'] = res.find('p',{'class':'desc'}).text
            temp = res.find('p',{'class':'details'})
            tempdata['jrnl'] = temp.text
            if tempdata['title'] != '':
                cols = []
                for key, value in tempdata.items():
                    cols.append(value.strip('\n'))
                data.append(cols)
                print(data)
        
        with open(self.outdir + 'pubmed-%s.csv' % (self.date), 'w', newline='') as f:
            w = csv.writer(f, delimiter=',')
            w.writerow(['title','href','desc','details'])
            for d in data:
                w.writerow(d)
                
        # get the next page
    
    def search_entrez(self, searchterm, retmax=20):
        """Entrez is the API call that makes it easier to retrieve information from PubMed. 
        Fair use policy entails waiting at least 3 seconds to make a new call.
        """
        print("Downloading: %s" % (searchterm))
        # use keyword term that requires both words to show up in article for it to be relevant
        srch = '{}%5BKYWD%5D'.format(searchterm.replace(' ','%2B'))
        
        url = self.entrezurl + 'esearch.fcgi?db=pubmed&term=%s&retmax=%d' % (srch, retmax)
        page = urllib.request.urlopen(url)
        #print(page)
        parser = BeautifulSoup(page,'html.parser')
        #print(parser)
        ids = parser.find_all('id')
        #print(ids)
        data = []
        cols = ['PubmedArticleSet', 'PubmedArticle', 'MedlineCitation', 'PMID', 'DateRevised', 
                'Year', 'Month', 'Day', 'Article', 'Journal', 'ISSN', 'JournalIssue', 'PubDate', 
                'Title', 'ISOAbbreviation', 'ArticleTitle', 'ELocationID', 'Abstract', 'AbstractText',
                'AuthorList', 'Author', 'LastName', 'ForeName', 'Initials', 'AffiliationInfo', 'Affiliation', 
                'Language', 'PublicationTypeList', 'PublicationType', 'ArticleDate', 'MedlineJournalInfo',
                'Country', 'MedlineTA', 'NlmUniqueID', 'ISSNLinking', 'PubmedData', 'History', 'PubMedPubDate',
                'Hour', 'Minute', 'PublicationStatus', 'ArticleIdList', 'ArticleId']
  
        f = open(self.outdir + 'entrezpubmed-%s.csv' % (searchterm.replace(' ','_')), 'w', newline='')
        w = csv.writer(f, delimiter=',')
        w.writerow(cols)
        for cnt, uid in enumerate(ids):
            if cnt % 10 == 0:
                print("Number of summaries downloaded: %d" % (cnt))
            uid = uid.text
            tempdata = self.clean_xml_abstract_page(uid)
            coldata = [tempdata[col] if col in tempdata else '' for col in cols]
            try:
                w.writerow(coldata)
            except:
                print("Error: could not enter column data")
                print(coldata)
            time.sleep(3)
        f.close()
        
    def clean_xml_abstract_page(self, uid):
        """Clean XML node for Entrez abstract page.  These pages provide a fairly complete list of variables
        on each type of publication."""
        
        url = self.entrezurl + 'efetch.fcgi?db=pubmed&id=%s&rettype=abstract&retmode=XML' % (uid)
        page = urllib.request.urlopen(url)
        tree = ET.parse(page)
        root = tree.getroot()
        data = {}
        for elem in root.iter():
            if elem.tag != '':
                if elem.text is not None:
                    if elem.tag in ['ArticleTitle','AbstractText']:
                        data[elem.tag] = elem.text.strip('\n ').encode('ascii','ignore')
                    else:
                        data[elem.tag] = elem.text.strip('\n ')
                    
        #print(data)
        return(data)
        
    def download_entrez_summary(self, uid):
        """This downloads individual entrez summaries based on the UID.
        This page contains easily parsable information, but lacks detail of the abstract page.  
        Therefore this function may not be very useful for collecting data, but could collect certain type 
        of information faster than for the abstract which uses xml parsing."""
        
        url = self.entrezurl + 'esummary.fcgi?db=pubmed&id=%s' % (uid)
        page = urllib.request.urlopen(url)
        #print(page)
        parser = BeautifulSoup(page,'html.parser')
        #print(parser)
        
        # parse the entrez summary
        items = parser.find_all('item')
        data = {}
        data['id'] = uid
        for item in items:
             data[item['name']] = item.text
        #print(data)
        time.sleep(3)
        headers = data.keys()
        return(list(headers), list(data.values()))
    
    def run_all_kws(self):
        """Run and download articles for all kws.  Max out at 1000 article per term."""
        
        for kw in self.kws:
            self.search_entrez(kw, 1000)
        

### Example PubMed Code

The code below searches on particular term in the pubmed databases.  The pubmed has restrictions on the number of publications that are returned for any query with the default being 20.  For each search term the results are entered into its own database.

In [5]:
# Term to be searched in pubmed
searchterm = 'school+violence'
# Number of return items
numitems = 10 
debug = True
pm = PubMedDownloader()
if debug:
    pm.search_page(searchterm)
    pm.download_entrez_summary(30439779)
    pm.get_abstract(30439779)
    pm.clean_xml_abstract_page(30439779)
else:
    pm.run_all_kws()
    #pm.search_entrez(searchterm, numitems)

http://www.ncbi.nlm.nih.gov/pubmed/?term=school+violence&cmd=DetailsSearch
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30634136" ref="ordinalpos=1&amp;ncbi_uid=30634136&amp;link_uid=30634136&amp;linksrc=docsum_title">The parent/caregiver involvement scale - Short form is a valid measure of parenting quality in high-risk families.</a></p>
[['The parent/caregiver involvement scale - Short form is a valid measure of parenting quality in high-risk families.', '/pubmed/30634136', 'Taylor JM, Bergin CA.', 'Infant Behav Dev. 2019 Jan 7;54:66-79. doi: 10.1016/j.infbeh.2018.11.002. [Epub ahead of print]']]
<p class="title" xmlns:mml="http://www.w3.org/1998/Math/MathML"><a href="/pubmed/30633062" ref="ordinalpos=2&amp;ncbi_uid=30633062&amp;link_uid=30633062&amp;linksrc=docsum_title">Impact of Behavior Management Training on Nurses' Confidence in Managing Patient Aggression.</a></p>
[['The parent/caregiver involvement scale - Short form is a valid measure of p

FileNotFoundError: [Errno 2] No such file or directory: './data/pubmed/pubmed-2019-01-12.csv'

# BaseDownloader

This class downloads a general pdf file from the following url.  Still need to implement pdf parsing.

In [109]:
class BaseDownloader(object):
    
    def __init__(self):
        # load the websites to scrape
        self.df = pd.read_csv('./data/privacy_url_companies-clean-original.csv')
    
    def download_pdf(self, url):
        """Download pdf and save it to a file."""
        
        #page = urllib.request.urlopen('https://journals.sagepub.com/doi/pdf/10.1177/2158244017700460')
        page = requests.get(url)
        with open('./data/pdf/temp.pdf', 'wb') as f:
            f.write(page.content)
            
    def download_page(self, url):
        """Download specific page for parsing."""

        page = urllib.request.urlopen(url)
        # parse the page
        parser = BeautifulSoup(page,'html.parser')
        print(parser)
        #print(parser.find_all('meta'))
        randnum = random.randint(1,5) 
        time.sleep(randnum)
            
    def _download_urls(self, url):
        """Download urls based on specific url"""
        
        for url in self.df['url']:
            self.download_page(url)
        

        

In [110]:
bd = BaseDownloader()
bd.download_pdf('https://journals.sagepub.com/doi/pdf/10.1177/2158244017700460')

## General Resources that Might Be of Relevance

### NCES crime indicators at schools
https://nces.ed.gov/programs/crimeindicators/ind_06.asp 