# Elsevier API

In [None]:
from lxml import etree
import gzip
import os
import requests

### Links

* see also https://codebeautify.org/xmlviewer
* `@xmlns:prism:http://prismstandard.org/namespaces/basic/2.0/` – the PRISM namespace (Publishing Requirements for Industry Standard Metadata).
* http://purl.org/ PURLs are persistent URLs, they provide permanent addresses for resources on the web.
* http://dublincore.org/documents/dcmi-terms/
* https://www.elsevier.com/authors/author-schemas/elsevier-xml-dtds-and-transport-schemas
* http://xmlgrid.net/ displays XML (handy for grabbing XPath expressions)
* Elsevier API docs https://dev.elsevier.com/api_docs.html

In [None]:
from pygments import highlight
from IPython.display import HTML
import json

from pygments.lexers import XmlLexer,  JsonLexer
from pygments.formatters import HtmlFormatter

lexer = XmlLexer()
jsonlexer = JsonLexer()
htmlfmt = HtmlFormatter(style='colorful', cssclass='highlight')


def tostring(e):
    """Serialize element ``e`` to a pretty-printed unicode string."""
    serialized = etree.tostring(e, pretty_print=True, encoding='unicode')
    return serialized


def showxml(e):
    """Return an ``HTML`` display object with ``e`` rendered as highlighted XML.

    ``e`` may be a single element or a list of elements; list entries are
    separated by ``<hr/>``.
    """
    nodes = e if isinstance(e, list) else [e]
    fragments = [highlight(tostring(node), lexer, htmlfmt) for node in nodes]
    return HTML('<hr/>'.join(fragments))

def showjson(e):
    """Return an ``HTML`` display object with ``e`` rendered as highlighted JSON.

    ``e`` may be a single JSON-serializable object or a list of them; a list
    is rendered entry by entry, separated by ``<hr/>``.
    """
    items = e if isinstance(e, list) else [e]
    fragments = []
    for item in items:
        text = json.dumps(item, indent=2, separators=(',', ': '))
        fragments.append(highlight(text, jsonlexer, htmlfmt))
    return HTML('<hr/>'.join(fragments))

HTML('<style type="text/css">' +  htmlfmt.get_style_defs() + '</style>')

# Elsevier API

see https://dev.elsevier.com/index.html
and https://github.com/ElsevierDev/elsapy

The JSON response doesn't seem to include all of the data.

In [None]:
import requests
from io import BytesIO
from bs4 import BeautifulSoup

# HTTPS, so the API key header below is never sent in clear text.
PMID_ELSEVIER = 'https://api.elsevier.com/content/article/pubmed_id/{}'
# Prefer a key supplied through the ELSEVIER_API_KEY environment variable.
# The literal is kept only as a backward-compatible fallback.
# NOTE(review): a key committed to a notebook should be treated as leaked.
EKEY = os.environ.get('ELSEVIER_API_KEY', '305ac4275ea475891668f6a71234efbc')
Headers = {'X-ELS-APIKey': EKEY}


def elsevier(pmid, url=PMID_ELSEVIER):
    """Fetch the article for ``pmid`` and return it as pretty-printed XML text.

    ``url`` is a format string that receives ``pmid``.  Returns ``None``
    (after printing a message on stderr) when the response contains a
    ``service-error`` element.
    """
    # Local import: the error branch below needs sys, which this notebook
    # never imports at the top level.
    import sys

    resp = requests.get(url.format(pmid),
                        headers=Headers,
                        params={'view': 'full'}  # Do we need this?
                        )

    soup = BeautifulSoup(resp.content, "xml")
    if soup.find('service-error'):
        print('no such document:', pmid, file=sys.stderr)
        return None
    return soup.prettify()


def elsevier_xml(pmid, url=PMID_ELSEVIER):
    """Fetch the ``view=full`` response for ``pmid`` and return its parsed root.

    ``url`` is a format string that receives ``pmid``.
    """
    query = {'view': 'full', }
    response = requests.get(url.format(pmid), headers=Headers, params=query)
    payload = BytesIO(response.content)
    document = etree.parse(payload, etree.XMLParser(ns_clean=True))
    return document.getroot()


def get_object(url):
    """GET ``url`` with the API key headers and return the raw response bytes."""
    accept_anything = {'httpAccept': '*/*'}
    response = requests.get(url, headers=Headers, params=accept_anything)
    return response.content

In [None]:
def getxml(pmid):
    """Parse ``elsevier/xml/<pmid>.xml`` and return its root element."""
    path = 'elsevier/xml/{}.xml'.format(pmid)
    xml_parser = etree.XMLParser(ns_clean=True)
    with open(path, 'rb') as handle:
        document = etree.parse(handle, xml_parser)
    return document.getroot()


def getxmlepmc(pmid):
    """Parse ``elsevier/xml_epmc/<pmid>.xml`` and return its root element."""
    path = 'elsevier/xml_epmc/{}.xml'.format(pmid)
    xml_parser = etree.XMLParser(ns_clean=True)
    with open(path, 'rb') as handle:
        document = etree.parse(handle, xml_parser)
    return document.getroot()

In [None]:
epmc = getxmlepmc('PMC1087855')

In [None]:
resp = requests.get(PMID_ELSEVIER.format('17586462'), 
                    headers={ 'X-ELS-APIKey': EKEY, 'Accept': 'application/json'},
                   params={'view': 'documents'})

``application/pdf`` seems to return only the first page!

In [None]:
resp = requests.get(PMID_ELSEVIER.format('17586462'), 
                    headers={ 'X-ELS-APIKey': EKEY, 'Accept': 'application/pdf'},
                   params={'view': 'full'})

In [None]:
resp.content

In [None]:
with open('xxxx.pdf', 'wb') as fp:
    fp.write(resp.content)

In [None]:
r['full-text-retrieval-response']['originalText']['xocs:doc'].keys()

In [None]:
resp = requests.get('https://api.elsevier.com/content/abstract/citation-count', 
                    headers={ 'X-ELS-APIKey': EKEY, 'Accept': 'application/json'},
                    params= {'pubmed_id':'17586462'})

In [None]:
resp.json()

In [None]:
# Open in binary mode (as getxml() does) so the XML parser decodes the bytes
# itself instead of depending on the platform's default text encoding.
with open(os.path.expanduser('~/17586462.xml'), 'rb') as fp:
    parser = etree.XMLParser(ns_clean=True)
    tree = etree.parse(fp, parser)

root = tree.getroot()


In [None]:
showxml(root)

In [None]:
NS = root.nsmap.copy()
NS['e'] = NS[None]
del NS[None]
NS

In [None]:
[(r.tag,r.text) for r in root.xpath('//prism:*', namespaces=NS)] 

In [None]:
from IPython.display import Image

Image(get_object('http://api.elsevier.com/content/object/eid/1-s2.0-S0006291X07012508-fx2.jpg'))

In [None]:
[r.text for r in root.xpath('//e:objects/e:object', namespaces=NS)]

In [None]:
showxml([r for r in root.xpath('//ce:bib-reference', namespaces=NS)])

In [None]:
[(r.xpath('./ce:surname/text()', namespaces=NS),
 r.xpath('./ce:given-name/text()', namespaces=NS))
 for r in root.xpath('//ce:bib-reference/sb:reference/sb:contribution/sb:authors/sb:author', namespaces=NS)]

In [None]:
root.xpath('//ce:bib-reference/ce:reference', namespaces=NS)

In [None]:
[r for r in root.xpath('/e:full-text-retrieval-response/e:pubmed-id[1]/text()', namespaces=NS)]

In [None]:
e= root.xpath('/e:full-text-retrieval-response', namespaces=NS)[0]
e.getchildren()

In [None]:
[r for r in root.xpath('/e:full-text-retrieval-response/e:originalText/xocs:doc//ja:article', namespaces=NS)]

In [None]:
''.join(r for r in root.xpath('//ja:article/ja:head//ce:abstract-sec//text()', namespaces=NS)).strip()

In [None]:
# './/text()' stays inside each paragraph; a leading '//' is an absolute path
# and would return the text of the whole document for every paragraph.
[''.join(r.xpath('.//text()')) for r in root.xpath('//ja:article/ja:body//ce:sections//ce:para', namespaces=NS)]

In [None]:
from itertools import chain
# chain.from_iterable flattens the per-<ce:sections> result lists into one
# list of elements; chain(<generator>) alone would just re-yield each list.
list(chain.from_iterable(r.xpath('./ce:para|./ce:section', namespaces=NS)
                         for r in root.xpath('//ja:article/ja:body//ce:sections', namespaces=NS)))

In [None]:
list(chain(*[r.xpath('./ce:para|./ce:section', namespaces=NS) 
           for r in root.xpath('//ja:article/ja:body//ce:appendices', namespaces=NS)]))

In [None]:
CE = NS['ce']
TAG = '{%s}%%s' % CE
XREFS = {TAG % 'cross-ref', TAG % 'cross-refs'}
ITALIC = {TAG % 'italic'}


def para2txt(e):
    """Yield the text of ``e`` piece by piece, with light markup.

    Text inside a cross-reference element is replaced by ``[<refid>]`` and
    text inside an italic element is wrapped in ``*...*``.  Tail text (the
    text that follows an element's closing tag) is always yielded unchanged.

    Raises ``KeyError`` if a cross-reference element has no ``refid``.
    """
    for t in e.xpath('.//text()'):
        # For a tail string getparent() is the element the text FOLLOWS, so
        # the element's markup must not be applied to it.  ``is_tail`` is
        # reliable even when an element's text and tail are equal strings,
        # which a value comparison (``p.tail == t``) cannot distinguish.
        if t.is_tail:
            yield str(t)
            continue
        p = t.getparent()
        if p.tag in XREFS:
            yield '[%s]' % p.attrib['refid']
        elif p.tag in ITALIC:
            yield '*%s*' % t
        else:
            yield str(t)


In [None]:
paras = list(chain(*[r.xpath('./ce:para', namespaces=NS) 
           for r in root.xpath('//ja:article/ja:body//ce:sections', namespaces=NS)]))
paras

In [None]:

showxml(paras)

In [None]:
''.join(para2txt(paras[1]))

In [None]:
showxml(paras[1])

In [None]:
sections = [(r.attrib['id'], r.xpath('.//ce:para', namespaces=NS))
               for r in root.xpath('//ja:article/ja:body//ce:section', namespaces=NS)]
sections

In [None]:
# CE = NS['ce']
CE = 'http://www.elsevier.com/xml/common/dtd'
TAG = '{%s}%%s' % CE
XREFS = {TAG % 'cross-ref', TAG % 'cross-refs'}
ITALIC = {TAG % 'italic'}


def para2txt(e):
    """Yield the text of ``e`` piece by piece, with light HTML markup.

    Text inside a cross-reference element is replaced by ``[<refid>]`` and
    text inside an italic element is wrapped in ``<i>...</i>``.  Tail text
    (the text that follows an element's closing tag) is always yielded
    unchanged.

    Raises ``KeyError`` if a cross-reference element has no ``refid``.
    """
    for t in e.xpath('.//text()'):
        # For a tail string getparent() is the element the text FOLLOWS, so
        # the element's markup must not be applied to it.  ``is_tail`` is
        # reliable even when an element's text and tail are equal strings,
        # which a value comparison (``p.tail == t``) cannot distinguish.
        if t.is_tail:
            yield str(t)
            continue
        p = t.getparent()
        if p.tag in XREFS:
            yield '[%s]' % p.attrib['refid']
        elif p.tag in ITALIC:
            yield '<i>%s</i>' % t
        else:
            yield str(t)

E = '/e:full-text-retrieval-response'
ART = '*[self::ja:converted-article or self::ja:article]'


class Elsevier(object):
    """Accessors for a parsed Elsevier ``full-text-retrieval-response``.

    ``root`` must expose ``nsmap`` and ``xpath()`` the way an lxml element
    does.  The document's default namespace (``nsmap[None]``) is re-registered
    under the prefix ``e`` so the XPath expressions below can name it.

    The ID/title/abstract properties raise ``IndexError`` when the element
    they look for is absent.
    """

    # Clark-notation tags of the two kinds of children sections() handles.
    _PARA = '{http://www.elsevier.com/xml/common/dtd}para'
    _SECTION = '{http://www.elsevier.com/xml/common/dtd}section'

    def __init__(self, root):
        self.root = root
        ns = root.nsmap.copy()
        ns['e'] = ns.pop(None)
        self.ns = ns

    def xpath(self, path, e=None):
        """Evaluate ``path`` on ``e`` (default: the root) with the document's namespaces."""
        target = self.root if e is None else e
        return target.xpath(path, namespaces=self.ns)

    def _article(self, tail):
        """Evaluate ``tail`` relative to the (converted-)article element."""
        return self.xpath(E + '/e:originalText/xocs:doc/xocs:serial-item/' + ART + tail)

    @property
    def scopus_id(self):
        return self.xpath(E + '/e:scopus-id[1]/text()')[0]

    @property
    def doi(self):
        return self.xpath(E + '/e:coredata/prism:doi[1]/text()')[0]

    @property
    def doi2(self):
        """The DOI as recorded in the ``xocs:meta`` block."""
        return self.xpath(E + '/e:originalText/xocs:doc/xocs:meta/xocs:doi//text()')[0]

    @property
    def pubmid(self):
        return self.xpath(E + '/e:pubmed-id[1]/text()')[0]

    @property
    def title(self):
        """Article title flattened to text via ``para2txt``."""
        t = self._article('/ja:head/ce:title')[0]
        return ''.join(para2txt(t)).strip()

    @property
    def abstract(self):
        """First abstract section flattened to text via ``para2txt``."""
        t = self._article('/ja:head/ce:abstract/ce:abstract-sec')[0]
        return ''.join(para2txt(t)).strip()

    def sections(self, e):
        """Walk the direct children of ``e``.

        Yields a string for every ``ce:para`` child and a
        ``(title, id, children)`` tuple for every ``ce:section`` child, where
        ``children`` is the same structure built recursively and ``title`` is
        ``None`` when the section has no title text.  Other children are
        skipped.
        """
        # Iterating the element visits its children in document order; this
        # replaces the deprecated getchildren().
        for c in e:
            if c.tag == self._PARA:
                yield ''.join(para2txt(c)).strip()
            elif c.tag == self._SECTION:
                titles = self.xpath('./ce:section-title//text()', c)
                title = ''.join(titles) if titles else None
                yield (title, c.get('id'), list(self.sections(c)))

    def body(self):
        """Yield the ``sections()`` structure of every ``ce:sections`` in the body."""
        for s in self._article('/ja:body/ce:sections'):
            yield from self.sections(s)

    def bib_refs(self):
        """Yield ``(id, title, authors)`` for each bibliography reference.

        ``authors`` is a list with one space-joined string per author.
        """
        # TODO: the journal title appears to live under
        # ./sb:reference/sb:host/sb:issue/sb:series/sb:title -- not extracted.
        refs = self._article('/ja:tail/ce:bibliography/ce:bibliography-sec/ce:bib-reference')
        for ref in refs:
            rid = ref.get('id')
            authors = [
                ' '.join(s.strip() for s in self.xpath('.//text()', a)).strip()
                for a in self.xpath('./sb:reference/sb:contribution/sb:authors/sb:author', ref)
            ]
            title = ''.join(
                self.xpath('./sb:reference/sb:contribution/sb:title//text()', ref)).strip()
            yield (rid, title, authors)

    def _results(self, text):
        """Return the first top-level section with a title containing ``text``.

        The comparison lower-cases each title, so ``text`` should be lower
        case.  Titles of nested sections count too.  Returns ``None`` when
        nothing matches.
        """
        for sec in self._article('/ja:body/ce:sections'):
            for s in self.xpath('./ce:section', sec):
                for t in self.xpath('.//ce:section-title/text()', s):
                    if text in t.lower():
                        return s
        return None

    def results(self):
        return self._results('results')

    def methods(self):
        return self._results('methods')

In [None]:
el = Elsevier(root)
list(el.body())

In [None]:
def tohtml(el):
    """Render ``el.body()`` as an HTML string.

    Plain items become ``<p>`` paragraphs.  Tuple items are treated as
    ``(title, id, children)`` sections: they produce a heading (the id is
    used when the title is empty) followed by their children, with the
    heading level starting at 2 and growing by one per nesting level.
    """
    def render(items, depth):
        for item in items:
            if not isinstance(item, tuple):
                yield '<p>%s</p>' % item
                continue
            yield '<h%d>%s</h%d>' % (depth, item[0] or item[1], depth)
            for chunk in render(item[2], depth + 1):
                yield chunk

    return ''.join(render(el.body(), 2))


In [None]:
HTML(('<h1>Abstract</h1><p>%s</p>' % el.abstract) + tohtml(el))

In [None]:

DOI_ELSEVIER='http://api.elsevier.com/content/article/doi/{}'
x = elsevier_xml('17586462')


## Notes

* extract chemicals e.g. H_2_O_2
* extract IDs
* extract Peptides, Nucleotides

In [None]:
import re

from collections import Counter

# Splits a Clark-notation tag '{namespace}local' into its two parts.
NSRE = re.compile('^{([^}]+)}(.*)$')

# Character class of the prime-like marks accepted after 5 / 3 in a primer.
PRIME = u'[\'\u2019\u2032]'
# (label, regex) pairs; they are tried in this order at every position.
Q = [
    ('gfp', '(?:GFP|YFP)'),
    ('primer', '5' + PRIME + r'[\s-]+[CTGA ]+[\s-]+3' + PRIME),
    ('agi', r'[Aa][Tt][1-5MCmc][Gg][0-9]{5}(?:\.[0-9]{1,2})?'),
    ('vector', '[Vv]ector'),
    ('construct', '[Cc]onstruct'),
    ('dna', '[CTGA][CTGA ]{3,}[CTGA]')
]
# One alternation with a named group per label, wrapped in an outer group.
MATCH = re.compile('(%s)' % '|'.join('(?P<%s>%s)' % pair for pair in Q))


def cvt(txt):
    """Split ``txt`` into ``(label, fragment)`` pairs covering the whole string.

    Fragments matched by one of the ``Q`` patterns carry that pattern's
    label; the text between matches is yielded with label ``None``.
    """
    cursor = 0
    for hit in MATCH.finditer(txt):
        begin, finish = hit.span()
        if begin > cursor:
            yield None, txt[cursor:begin]
        label = next(
            (key for key, val in hit.groupdict().items() if val is not None),
            'unknown')
        yield label, hit.group(0)
        cursor = finish
    if cursor < len(txt):
        yield None, txt[cursor:]


class Events(object):
    """Stream an XML document as a sequence of HTML fragments.

    For every element, ``parse`` looks for a method named ``start_<tag>`` /
    ``end_<tag>`` (local tag name, ``-`` replaced by ``_``), calls it with
    the element and yields what it returns.  Without such a method a generic
    opening/closing tag is yielded, using ``TAGS`` to pick the HTML element
    (default ``span``).  Text and tail strings are passed through
    ``findmatches`` and matched fragments are wrapped in ``<b class=...>``.
    """

    # Maps an XML local name to the HTML element name used for it.
    TAGS = {}

    def findmatches(self, text):
        """Yield ``(name, fragment)`` pairs for ``text``; override to change matching."""
        yield from cvt(text)

    @staticmethod
    def _localname(tag):
        """Return ``tag`` without its ``{namespace}`` prefix, if it has one."""
        m = NSRE.match(tag)
        return m.group(2) if m else tag

    def _annotate(self, text, counts):
        """Yield ``text`` with matches wrapped in ``<b>``, tallying them in ``counts``."""
        for name, match in self.findmatches(text):
            if name:
                counts[name] += 1
                yield '<b class="%s">%s</b>' % (name, match)
            else:
                yield match

    def parse(self, fp):
        """Yield HTML fragments for the XML read from ``fp``.

        The stream ends with a ``<ul class="counts">`` list giving the number
        of matches per name.

        Raises ``RuntimeError`` on an event other than ``start``/``end``.
        """
        counts = Counter()

        # NOTE(review): iterparse does not guarantee that elem.text is
        # populated at 'start' (or elem.tail at 'end'); confirm that no text
        # is dropped on large inputs.
        for e, elem in etree.iterparse(fp, events=('start', 'end')):
            if e not in ('start', 'end'):
                raise RuntimeError('what event %s?' % e)

            tag = self._localname(elem.tag)
            handler = getattr(self, '%s_%s' % (e, tag.replace('-', '_')), None)
            etag = self.TAGS.get(tag, 'span')

            if e == 'start':
                if handler is not None:
                    yield handler(elem)
                else:
                    yield '<%s class="%s">' % (etag, tag)
                text = elem.text
            else:
                if handler is not None:
                    yield handler(elem)
                else:
                    yield '</%s> ' % etag  # [sic!] add space
                text = elem.tail

            if text:
                # Text and tail both go through findmatches so that an
                # override in a subclass applies to both.
                yield from self._annotate(text, counts)

        yield '<ul class="counts">'
        for name in counts:
            yield '<li>%s:%d</li>' % (name, counts[name])
        yield '</ul>'


In [None]:
secs = root.xpath('/e:full-text-retrieval-response/e:originalText/xocs:doc/xocs:serial-item/ja:article/ja:body/ce:sections', namespaces=NS)

In [None]:
secs

In [None]:
def parse(elem, level=0):
    """Walk ``elem`` depth-first, yielding start/end records.

    Each element produces ``('s', level, tag, text)`` before its children
    and ``('e', level, tag, tail)`` after them; children are one level
    deeper than their parent.
    """
    yield 's', level, elem.tag, elem.text
    deeper = level + 1
    for child in elem.iterchildren():
        for record in parse(child, level=deeper):
            yield record
    yield 'e', level, elem.tag, elem.tail


In [None]:
from collections import Counter
# Dump the body sections as an indented event trace while collecting the set
# of local tag names (tags) and a lower-cased word frequency table (wc).
tags = set()
wc = Counter()
for sec in secs:
    for s, l, t, txt in parse(sec):
        # Drop the '{namespace}' prefix from the tag.
        t = t[t.index('}')+1:]
        if txt:
            wc.update(word.lower() for word in txt.split())

        print(' '*l, s,t,repr(txt))
        tags.add(t)

In [None]:
wc.most_common()

In [None]:
tags = set()
refs = root.xpath( '/e:full-text-retrieval-response/e:originalText/xocs:doc/xocs:serial-item/ja:article/ja:head', namespaces=NS)

for sec in refs:
    for s, l, t, txt in parse(sec):
        t = t[t.index('}')+1:]
        print(' '*l, s,t,repr(txt))
        tags.add(t)

In [None]:
tags

In [None]:
# Pass the Accept header with ``headers=``: the second positional argument of
# requests.get() is ``params``, which would put it in the query string.
headers = {'Accept': 'application/vnd.crossref.unixsd+xml'}
r = requests.get('http://dx.doi.org/10.5555/515151', headers=headers)


In [None]:
r

In [None]:
r.content

In [None]:
r.headers

In [None]:
root

In [None]:
root = getxml('10734224')
ee = Elsevier(root)

In [None]:
ee.results()

In [None]:
root.nsmap


In [None]:
z = '/e:originalText/xocs:doc/xocs:serial-item/*[self::ja:converted-article or self::ja:article]/ja:head/ce:title'
e = '/e:full-text-retrieval-response'
ee = e + z
ee

In [None]:
NS = root.nsmap.copy()
NS['e'] = NS.pop(None)
NS

In [None]:
E = '/e:full-text-retrieval-response'
ART = '*[self::ja:converted-article or self::ja:article]'
import re
C = re.compile(r'\s+', re.I)
CE = 'http://www.elsevier.com/xml/common/dtd'
TAG = '{%s}%%s' % CE
XREFS = {TAG % 'cross-ref', TAG % 'cross-refs'}
ITALIC = {TAG % 'italic'}


def para2txt2(e):
    """Yield the text of ``e`` piece by piece as plain text.

    Text inside a cross-reference element is replaced by ``[<refid>]``;
    everything else, including italic text and all tail text, is yielded
    unchanged.

    Raises ``KeyError`` if a cross-reference element has no ``refid``.
    """
    for t in e.xpath('.//text()'):
        # ``is_tail`` separates an element's own text from the text that
        # follows it, even when both are equal strings (a value comparison
        # with ``p.tail`` cannot tell them apart).
        if not t.is_tail and t.getparent().tag in XREFS:
            yield '[%s]' % t.getparent().attrib['refid']
        else:
            yield str(t)


class Elsevier2(object):
    """Section finder for a parsed Elsevier ``full-text-retrieval-response``.

    ``root`` must expose ``nsmap`` and ``xpath()`` the way an lxml element
    does.  The document's default namespace (``nsmap[None]``) is re-registered
    under the prefix ``e`` so the XPath expressions can name it.
    """

    def __init__(self, root):
        self.root = root
        ns = root.nsmap.copy()
        ns['e'] = ns.pop(None)
        self.ns = ns

    def _article(self, tail):
        """Evaluate ``tail`` relative to the (converted-)article element."""
        return self.root.xpath(
            E + '/e:originalText/xocs:doc/xocs:serial-item/' + ART + tail,
            namespaces=self.ns)

    def _find_section(self, keyword):
        """Return the first top-level section with a title containing ``keyword``.

        Titles are lower-cased before the comparison, and titles of nested
        sections count too.  Returns ``None`` when nothing matches.
        """
        for sec in self._article('/ja:body/ce:sections'):
            for s in sec.xpath('./ce:section', namespaces=self.ns):
                for t in s.xpath('.//ce:section-title/text()', namespaces=self.ns):
                    if keyword in t.lower():
                        return s
        return None

    def results(self):
        """Return the "results" section element, or ``None``."""
        return self._find_section('results')

    def methods(self):
        """Return the "methods" section element, or ``None``."""
        return self._find_section('methods')

    def abstract(self):
        """Return the first ``ce:abstract-sec`` element, or ``None``."""
        secs = self._article('/ja:head/ce:abstract/ce:abstract-sec')
        if not secs:
            return None
        return secs[0]

    def tostr(self, r):
        """Yield each ``ce:para``/``ce:simple-para`` under ``r`` as one line of text.

        Runs of whitespace are collapsed to a single space and the result is
        stripped.
        """
        for p in r.xpath('.//*[self::ce:para or self::ce:simple-para]', namespaces=self.ns):
            txt = ''.join(para2txt2(p))
            yield C.sub(' ', txt).strip()

In [None]:
ee = Elsevier2(root)
a = ee.abstract()
a

In [None]:
list(ee.tostr(a))

In [None]:
showxml(a)

In [None]:
 a.xpath('.//ce:simple-para|.//ce:para', namespaces=ee.ns)

In [None]:
 a.xpath('.//*[self::ce:simple-para or self::ce:para]', namespaces=ee.ns)

In [None]:
epmc.nsmap

In [None]:
epmc.xpath('/article/front/article-meta/abstract')

In [None]:
mm = epmc.xpath('/article/body/sec[@sec-type="methods"]')

In [None]:
showxml(mm)

In [None]:
mm = epmc.xpath('/article/body/sec/title[contains(translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"methods")]/..')
mm

In [None]:
showxml(mm)

In [None]:
res = epmc.xpath('/article/body/sec/title[contains(text(),"Results")]/..')
res

In [None]:
TRANS = 'translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")'


import re


EXREFS = {'xref'}

EITALIC = {'i'}


def para2txt3(e):
    """Yield the text of ``e`` piece by piece as plain text.

    Text inside an ``xref`` element is replaced by ``[<rid>]``; everything
    else, including all tail text, is yielded unchanged.

    Raises ``KeyError`` if an ``xref`` element has no ``rid`` attribute.
    """
    for t in e.xpath('.//text()'):
        # ``is_tail`` separates an element's own text from the text that
        # follows it, even when both are equal strings (a value comparison
        # with ``p.tail`` cannot tell them apart).
        if not t.is_tail and t.getparent().tag in EXREFS:
            yield '[%s]' % t.getparent().attrib['rid']
        else:
            yield str(t)


class EPMC(object):
    """Section finder for a parsed Europe PMC ``article`` document.

    ``root`` must expose ``xpath()`` the way an lxml element does.
    """

    SPACE = re.compile(r'\s+', re.I)
    # XPath 1.0 has no lower-case(); translate() is the usual substitute.
    # Kept on the class so that no method depends on notebook globals.
    LOWER_TEXT = ('translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
                  '"abcdefghijklmnopqrstuvwxyz")')

    def __init__(self, root):
        self.root = root

    def _first(self, path):
        """Return the first node matching ``path`` on the root, or ``None``."""
        hits = self.root.xpath(path)
        if not hits:
            return None
        return hits[0]

    def abstract(self):
        """Return the first ``abstract`` element, or ``None``."""
        return self._first('/article/front/article-meta/abstract')

    def methods(self):
        """Return the first body section with ``sec-type="methods"``, or ``None``."""
        return self._first('/article/body/sec[@sec-type="methods"]')

    def results(self):
        """Return the first body section whose title contains "results", or ``None``.

        The title is compared case-insensitively.
        """
        # Query this instance's document, not the notebook-level ``epmc``
        # variable, which silently ignored ``self.root``.
        return self._first(
            '/article/body/sec/title[contains(' + self.LOWER_TEXT + ',"results")]/..')

    def tostr(self, r):
        """Yield each ``p`` descendant of ``r`` as one line of text.

        Runs of whitespace are collapsed to a single space and the result is
        stripped.
        """
        for p in r.xpath('.//p'):
            txt = ''.join(para2txt3(p))
            yield self.SPACE.sub(' ', txt).strip()

In [None]:
e = EPMC(epmc)

In [None]:
list(e.tostr(e.methods()))

In [None]:
showxml(e.methods())

Journal Title
ISSN
Electronic ISSN
Publication Year
Volume
Issue
Page
DOI (if available)
PMCID
PubMed ID (if available)
Manuscript ID (if available)
Release Date (Mmm DD YYYY or live)

In [None]:
!wget ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz

In [None]:
import gzip
import csv
from collections import Counter
# Build a lookup from the second column of jnlactive.csv to its first column.
# presumably row[1] is an ISSN without hyphens and row[0] the journal
# title -- TODO confirm against the CSV header.
issn = {}
with open(os.path.expanduser('~/Downloads/jnlactive.csv'), encoding='latin1') as fp:
    R = csv.reader(fp)
    h = next(R)  # first row is consumed here (presumably the header)
    for row in R:
        issn[row[1]] = row[0]

# Count the rows of PMC-ids.csv.gz whose second column, with hyphens removed,
# is a key of the lookup built above.
summary = Counter()
with gzip.open(os.path.expanduser('~/Downloads/PMC-ids.csv.gz'), 'rt') as fp:
    R = csv.reader(fp)
    h = next(R)  # first row is consumed here (presumably the header)
    # print(h)
    n = 0  # number of data rows read so far
    for row in R:
        n +=1
        i = row[1]
        i = i.replace('-', '')
        if i in issn:
            summary[i] +=1
        # print(i,row)
        # if n > 20:
        #    break
# One line per matched key: the key, its looked-up value, and its row count.
for i in summary:
    print(i, issn[i], summary[i])

In [None]:
sum(summary.values())

In [None]:
from bs4 import BeautifulSoup
from io import BytesIO
doi = 'https://doi.org/10.1105/tpc.12.2.279' # Plant Cell
# doi = 'https://doi.org/10.1104/pp.16.01539' # Plant Phys
resp = requests.get(doi)
soup = BeautifulSoup(BytesIO(resp.content), "html.parser")

In [None]:
resp.url

In [None]:
a = soup.select('div.article.fulltext-view')[0]
str(a)

In [None]:
[s.attrs for s in a.select('div.section')]

In [None]:
[p.text for p in a.select('div.section.results p')]

In [None]:
for sec in a.select('div.section'):
    txt = sec.find('h2').string
    print(sec.attrs, txt)

In [None]:
from bs4 import BeautifulSoup
from io import BytesIO

# NOTE(review): searchRowCriteria is a list holding a dict; verify how
# requests form-encodes it (the next cell inspects the prepared body).
s = [dict(queryString='0021-9258',fieldName='issn')]
data=dict(searchRowCriteria=s,
dateRange='allDates',
inTheLastList=6)
resp = requests.post('http://onlinelibrary.wiley.com/advanced/search', data=data)

# Name the parser explicitly, as the other cells do: without it bs4 picks
# whichever parser is installed and emits a warning.
soup = BeautifulSoup(BytesIO(resp.content), 'html.parser')
print(soup.prettify())

In [None]:
req = requests.Request('POST','http://onlinelibrary.wiley.com/advanced/search',data=data)
p = req.prepare()
p.body


In [None]:
url = 'http://onlinelibrary.wiley.com/doi/10.1111/j.1365-313X.2004.02057.x/full'
resp = requests.get(url)
soup = BeautifulSoup(BytesIO(resp.content), 'html.parser')

In [None]:
[p.text for p in soup.select('article section#abstract p')]

In [None]:
[p.text for p in soup.select('article section.article-body-section p')]

In [None]:
for p in soup.select('article section.article-body-section p a[title="Link to bibliographic citation"]'):
    p.replace_with('CITATION')
for p in soup.select('article section.article-body-section p'):
    print(p.text)

In [None]:
a = soup.select('article')[0]



In [None]:
a.

In [None]:
# Locate the first body section whose <h2> heading reads "Results".
# The original loop had an ``if`` without a body, which is a syntax error.
results_sec = None
for sec in a.select('section.article-body-section'):
    h2 = sec.find('h2')
    # get_text() also copes with headings that wrap their text in child
    # tags, where .string would be None.
    if h2 is not None and h2.get_text(strip=True).lower() == 'results':
        results_sec = sec
        break
results_sec

In [None]:
doi = '10.1074/jbc.M115.683656'
resp = requests.get('http://doi.org/{}'.format(doi))
if not resp.url.endswith('.full'):
    resp = requests.get(resp.url + '.full')
soup = BeautifulSoup(BytesIO(resp.content), 'html.parser')

In [None]:
[p.text for p in soup.select('div.section.abstract p')]

In [None]:
soup.select('div.article.fulltext-view')

In [None]:
doi = '10.1093/pcp/pcx070'
resp = requests.get('http://doi.org/{}'.format(doi))
soup = BeautifulSoup(BytesIO(resp.content), 'html.parser')

In [None]:
doi = '10.1093/pcp/pcx070'
resp = requests.get('http://doi.org/{}'.format(doi))

In [None]:
soup = BeautifulSoup(BytesIO(resp.content), 'html.parser')

In [None]:
soup.find('div', attrs={'data-widgetname': 'ArticleFulltext'})

In [None]:
soup

In [None]:
from IPython.display import IFrame
IFrame(resp.url, width='100%',height=600)

In [None]:
with open('nlpready/dump_18507772.html', 'rb') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [None]:
soup.select('article div.article__body')

In [None]:
doi = '10.1093/pcp/pcx070'
resp = requests.get('http://doi.org/{}'.format(doi))
from IPython.display import IFrame
IFrame(resp.url, width='100%',height=600)