In [None]:
from nlpready._mlabc import readx_suba_papers_csv
# Materialise everything yielded by the CSV reader into a list: later cells
# iterate `papers` more than once and call reversed() on it, which needs a
# real sequence rather than a one-shot iterator.
papers=list(readx_suba_papers_csv('suba_j.csv'))

In [None]:

import logging
from io import StringIO
from pathlib import Path
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup
from html_to_markdown import convert_to_markdown
from selenium import webdriver
from selenium.common.exceptions import InvalidSessionIdException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


from nlpready._issn import Location
from bs4 import Tag

# Logger named "nlpready"; used below to report page-load timeouts.
logger = logging.getLogger("nlpready")


class Soup:
    """Pull the article text out of an HTML page using a ``Location``'s selectors.

    ``format`` selects what :meth:`get_text` returns for each matched node:

    * ``'markdown'``  - Markdown converted from ``str(node)``
    * ``'pmarkdown'`` - Markdown converted from ``node.prettify()``
    * ``'html'``      - ``str(node)``
    * ``'phtml'``     - ``node.prettify()``
    * anything else   - plain text, ``node.get_text(" ")``
    """

    # Passed to ``convert_to_markdown`` for both Markdown formats.
    heading_style = 'atx'

    def __init__(self, format: str = 'markdown'):
        self.format = format

    def soupify(self, html: str) -> BeautifulSoup:
        """Parse ``html`` into a ``BeautifulSoup`` tree.

        No parser is named, so bs4 chooses one itself.
        NOTE(review): the chosen parser depends on what is installed; pin one
        here if results must be identical across environments.
        """
        return BeautifulSoup(StringIO(html))

    def soup(self, html: str, css: Location) -> str:
        """Return the text of every node matching ``css.article_css``, newline-joined."""
        t = self.soupify(html)
        return "\n".join(self.get_text(a, css) for a in t.select(css.article_css))

    def get_text(self, article: Tag, css: Location) -> str:
        """Render one article node according to ``self.format``.

        Side effect: every descendant matching ``css.remove_css`` is removed
        from ``article`` in place (``decompose``) before rendering.
        """
        if css.remove_css:
            for ref in article.select(css.remove_css):
                ref.decompose()
        if self.format == 'markdown':
            return convert_to_markdown(str(article), heading_style=self.heading_style)
        if self.format == 'pmarkdown':
            return convert_to_markdown(article.prettify(), heading_style=self.heading_style)
        if self.format == 'html':
            return str(article)
        if self.format == 'phtml':
            return article.prettify()
        return article.get_text(" ")

    @classmethod
    def save_html(cls, html: str, path: Path) -> None:
        """Write ``html`` to ``path`` as UTF-8, creating parent directories as needed."""
        # exist_ok avoids the check-then-create race of testing .exists() first.
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("wt", encoding="utf8") as fp:
            fp.write(html)


class Selenium(Soup):
    """Fetch article pages with a Chrome WebDriver and extract their text.

    Args:
        headless: add the ``headless`` argument to the Chrome options.
        timeout: seconds given to ``WebDriverWait`` when waiting for a selector.
        format: output format, see :class:`Soup`.
        path: if not ``None``, the raw HTML of every successful :meth:`run`
            is written to this file.
    """

    def __init__(
        self,
        headless: bool = True,
        timeout: int = 10,
        format: str = 'markdown',
        path: str | Path | None = None,
    ):
        super().__init__(format)
        # Bind these before starting Chrome so close()/__del__ are safe even
        # when webdriver.Chrome() itself raises.
        self.driver = None
        self.wait_ = None
        self.timeout = timeout
        self.path = path
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument("headless")
        self.driver = webdriver.Chrome(options=options)

    # @classmethod
    # def has_driver(self) -> bool:
    #     return which('chromedriver') is not None

    @property
    def wait(self) -> WebDriverWait:
        """Lazily created ``WebDriverWait`` bound to the driver and ``timeout``."""
        if self.wait_ is not None:
            return self.wait_
        self.wait_ = WebDriverWait(self.driver, self.timeout)
        return self.wait_

    def wait_for_css(self, css: str) -> None:
        """Block until an element matching ``css`` is visible (raises ``TimeoutException``)."""
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, css)))

    def doi(self, url: str, css: Location | None = None) -> str:
        """Load ``url`` and return the page's outer HTML, or ``""`` on timeout.

        A value that does not start with ``http://``/``https://`` is treated
        as a DOI and resolved through ``https://doi.org/``. The selector waited
        for is ``css.wait_css``, falling back to ``css.article_css``, or
        ``"html"`` when no ``css`` is given.
        """
        if not url.startswith(('https://', 'http://')):
            url = f"https://doi.org/{url}"
        self.driver.get(url)
        try:
            if css is not None:
                wait = css.wait_css if css.wait_css else css.article_css
            else:
                wait = "html"
            self.wait_for_css(wait)

            html = self.find_html()
        except TimeoutException:
            logger.warning("timeout for: %s", url)
            html = ""
        return html

    @property
    def current_url(self) -> str | None:
        """URL currently shown by the browser, or ``None`` if there is no live session."""
        if self.driver is None:
            return None
        try:
            return self.driver.current_url
        except InvalidSessionIdException:
            return None

    def run(self, doi: str, css: Location) -> str | None:
        """Fetch ``doi`` and return the extracted article text.

        Returns ``None`` when the page could not be read and a Cloudflare
        challenge script is present, ``""`` when it could not be read for any
        other reason, and the formatted text otherwise.
        """
        html = self.doi(doi, css)
        if not html:
            if self.is_cloudflare_challenge():
                return None
            return html
        if self.path is not None:
            self.save_html(html, Path(self.path))
        return self.soup(html, css)

    def rerun(self, css: Location) -> str:
        """Re-extract from whatever page the browser currently shows (no reload)."""
        return self.soup(self.find_html(), css)

    def close(self):
        """Shut the browser session down; safe to call more than once."""
        driver, self.driver = self.driver, None
        self.wait_ = None  # the cached wait is bound to the old driver
        if driver is not None:
            # quit() ends the whole WebDriver session (all windows and the
            # driver process); close() would only close the current window.
            driver.quit()

    def __del__(self):
        # getattr: __init__ may have failed before ``driver`` was assigned.
        if getattr(self, "driver", None) is None:
            return
        try:
            self.close()
        except InvalidSessionIdException:
            pass

    def is_cloudflare_challenge(self) -> bool:
        """True if any ``<script src>`` on the page points at challenges.cloudflare.com."""
        scripts = [
            a.get_attribute("src")
            for a in self.driver.find_elements(by=By.TAG_NAME, value="script")
        ]
        return any(
            "https://challenges.cloudflare.com" in src
            for src in scripts
            if src is not None
        )

    def find_html(self) -> str:
        """Return the ``outerHTML`` of the current page's ``<html>`` element (``""`` if absent)."""
        h = self.driver.find_element(by=By.TAG_NAME, value="html")
        return h.get_attribute("outerHTML") or ""

    def resoupify(self) -> BeautifulSoup:
        """Parse the page currently shown by the browser."""
        return self.soupify(self.find_html())

In [None]:
from dataclasses import dataclass


@dataclass
class Location:
    """CSS selectors describing where a publisher's page keeps the article text.

    NOTE(review): this definition shadows the ``Location`` imported from
    ``nlpready._issn`` earlier in the notebook - confirm the two are meant to
    be the same shape.
    """

    # Selector handed to ``select`` to find the article node(s) to extract.
    article_css: str
    # Selector for descendants removed from each article node before
    # rendering; an empty string means nothing is removed.
    remove_css: str = ""
    # Selector the browser waits on before reading the page; when empty,
    # ``article_css`` is waited on instead.
    wait_css: str = ""

# Generic, per-publisher Locations. Positional arguments are
# (article_css, remove_css, wait_css); the journal names further down are
# aliases for, or variations on, these.

WILEY = Location(
    ".article__body",
    "section.article-section__references,section.article-section__citedBy",
)

OUP = Location(
    ".widget-ArticleFulltext.widget-instance-OUP_Article_FullText_Widget",
    ".ref-list",
)
BMC = Location("main > article", 'section[data-title="References"]')
# NOTE(review): CELL and SCIENCEDIRECT carry identical selectors.
CELL = Location("article .Abstracts,article .Body", "", "article section.bibliography")
# NOTE(review): SPRINGER removes data-title="Reference" (singular) while
# SPRINGER2 removes "References" - confirm both spellings occur on real pages.
SPRINGER = Location("article main .c-article-body", 'section[data-title="Reference"]')
SPRINGER2 = Location("article main .c-article-body", 'section[data-title="References"]')
PLOSONE = Location(".article-content", "ol.references")
TAYLORFRANCIS = Location("article.article", 'div[id="references-Section"]')
SCIENCEDIRECT = Location(
    "article .Abstracts,article .Body",
    "",
    "article section.bibliography",
)
NATURE = Location("article div.c-article-body", 'section[data-title="References"]')

####
# Per-journal names. Plain assignments alias the *same* Location object, so
# mutating one alias would change every journal sharing it.

JBioChem = SCIENCEDIRECT  # Location("article .Abstracts,article .Body","", "article section.bibliography")
JBiolChem = SCIENCEDIRECT
Virology = SCIENCEDIRECT
MolCellProteomics = SCIENCEDIRECT
Mitochondrion = SCIENCEDIRECT
JPlantPhysiol = SCIENCEDIRECT
JStructBiol = SCIENCEDIRECT
MolCells = SCIENCEDIRECT
Gene = SCIENCEDIRECT
Cell = SCIENCEDIRECT


PlantJ = WILEY
FEBSLet = WILEY
PhysiolPlant = WILEY
PlantCellEnviron = WILEY
JIntegrPlantBiol = WILEY
Proteomics = WILEY
NewPhytol = WILEY
MolPlantPathol = WILEY
ScientificWorldJournal = WILEY
FEBSJ = WILEY
Traffic = WILEY

JProteomics = CELL
PlantSci = CELL
MolPlant = CELL
BiochemBiophysResCommun = CELL
PlantPhysiolBiochem = CELL
Phytochemistry = CELL
CurrBiol = CELL
BiochimBiophysActa = CELL

JExpBot = OUP
PlantCellPhysiol = OUP
BiosciBiotechnolBiochem = OUP
Genetics = OUP
AnnBot = OUP
NucleicAcidsRes = OUP

PlantReprod = SPRINGER2
PhotosynRes = SPRINGER2
PlantCellRep = SPRINGER2
MolBiolRep = SPRINGER2
Protoplasma = SPRINGER
PlantMolBiol = SPRINGER
JPlantRes = SPRINGER
Springerplus = Location('main article', 'section[data-title="References"],.c-article-header')

PLoSONE = PLOSONE
PLoSGenet = PLOSONE
PLoSBiol = PLOSONE
PLoSPathog = PLOSONE


BMCPlantBiol = BMC
BMCResNotes = BMC

PlantSignalBehav = TAYLORFRANCIS
RNABiol = TAYLORFRANCIS


Nature = NATURE
NatCommun = NATURE
CellRes = NATURE
NatCellBiol = NATURE

ProcNatlAcadSciUSA = Location(
    'main article section[data-extent="frontmatter"], main article section[data-extent="bodymatter"]',
    'section[data-extent="backmatter"],.citations',
)
# NOTE(review): ProcNatlAcadSciUSA2 and Science (below) have identical
# selectors and no remove_css.
ProcNatlAcadSciUSA2 = Location(
    'main article section[id="abstract"],main article section[id="bodymatter"]',
)
JVirol = ProcNatlAcadSciUSA2

Science = Location(
    'main article section[id="abstract"],main article section[id="bodymatter"]',
)
MolSystBiol = Science

# 1460-2075
EMBOJ = Location('div[id="abstract"],main article section[data-extent="bodymatter"]')
EMBORep = EMBOJ

# EMBOJ = Location("main article", 'section[data-extent="backmatter"],.citations')

PlantPhysiol = Location(".widget-instance-OUP_Article_FullText_Widget", ".ref-list")
PlantCell = PlantPhysiol

JProteomeRes = Location(
    "main article .article_abstract,main article .article_content",
    "ol#references,.articleCitedByDropzone .cited-by",
)
Biochemistry = JProteomeRes

FrontPlantSci = Location(".JournalFullText .JournalFullText", ".References")
FrontCellDevBiol = FrontPlantSci

SciRep = Location(".main-content")
GeneDev = Location("div.article.fulltext-view")
JCellSci = Location(
    ".widget-ArticleFulltext.widget-instance-ArticleFulltext",
    'h2[data-section-title="References"] ~ div',
)
JCellBiol = Location(
    ".widget-ArticleFulltext.widget-instance-ArticleFulltext_SplitView",
    'h2[data-section-title="References"] ~ div',
)
IntJMolSci = Location("article .html-body section")


MolPlantMicrobeInteract = Location("main article .article__body")
Planta = Location('article section[data-title="Abstract"],article .main-content')
RNA = Location(
    ".article",
    ".section.ref-list",
)  # full-text is on different page with .full attached to url
BiochemJ = Location(
    ".widget-ArticleFulltext.widget-instance-ArticleFulltext",
    ".ref-list",
)
Elife = Location("main .main-content-grid", 'section[id="references"]')
MolBiolCell = Location("main article .article__body", "ul.references")  # Blocks?
Development = Location(
    ".widget-ArticleMainView.widget-instance-ArticleMainView_Article",
    # SIC!!!!
    'h2[data-section-title="<strong>References</strong>"]',
)
MolBiosyst = Location(
    "article.article-control",
    ".ref-list,.article__authors,.drawer-control.fixpadv--m",
)


# Lookup table from a paper's ``issn`` value to the Location used to scrape
# that journal. Several ISSNs may map to the same journal (e.g. PlantJ,
# PlantPhysiol, EMBOJ and Cell each appear under two keys).
DATA = {
    "1532-2548": PlantPhysiol,
    "1365-313X": PlantJ,
    "1873-3468": FEBSLet,
    "1664-462X": FrontPlantSci,
    "2045-2322": SciRep,
    "1532-298X": PlantCell,
    "1932-6203": PLoSONE,
    "1399-3054": PhysiolPlant,
    "1365-3040": PlantCellEnviron,
    "0021-9258": JBioChem,
    "0960-7412": PlantJ,
    "0032-0889": PlantPhysiol,
    "1040-4651": PlantCell,
    "0092-8674": Cell,
    "0261-4189": EMBOJ,
    "1460-2075": EMBOJ,
    "1469-3178": EMBORep,
    "0014-5793": FEBSLet,
    "0890-9369": GeneDev,
    "0027-8424": ProcNatlAcadSciUSA,
    "1477-9137": JCellSci,
    "1422-0067": IntJMolSci,
    "1460-2431": JExpBot,
    "1744-7909": JIntegrPlantBiol,
    "1471-9053": PlantCellPhysiol,
    "1469-8137": NewPhytol,
    "1476-4687": Nature,
    "1471-2229": BMCPlantBiol,
    "1752-9867": MolPlant,
    "1559-2324": PlantSignalBehav,
    "1615-9861": Proteomics,
    "1090-2104": BiochemBiophysResCommun,
    "1873-2690": PlantPhysiolBiochem,
    "1873-3700": Phytochemistry,
    "1553-7404": PLoSGenet,
    "1879-0445": CurrBiol,
    "2041-1723": NatCommun,
    "1615-6102": Protoplasma,
    "1545-7885": PLoSBiol,
    "1555-8584": RNABiol,
    "1573-5028": PlantMolBiol,
    "0006-3002": BiochimBiophysActa,
    "1618-0860": JPlantRes,
    "1347-6947": BiosciBiotechnolBiochem,
    "2296-634X": FrontCellDevBiol,
    "1535-3907": JProteomeRes,
    "1083-351X": JBiolChem,
    "0894-0282": MolPlantMicrobeInteract,
    "1432-2048": Planta,
    "1091-6490": ProcNatlAcadSciUSA2,
    "1096-0341": Virology,
    "1756-0500": BMCResNotes,
    "1876-7737": JProteomics,
    "2194-7961": PlantReprod,
    "1535-9484": MolCellProteomics,
    "1873-2259": PlantSci,
    "1573-5079": PhotosynRes,
    "1943-2631": Genetics,
    "1432-203X": PlantCellRep,
    "1469-9001": RNA,
    "1470-8728": BiochemJ,
    "2050-084X": Elife,
    "1540-8140": JCellBiol,
    "1097-4172": Cell,
    "1872-8278": Mitochondrion,
    "1095-8290": AnnBot,
    "1618-1328": JPlantPhysiol,
    "1553-7374": PLoSPathog,
    "1537-744X": ScientificWorldJournal,
    "1095-8657": JStructBiol,
    "1879-0038": Gene,
    "1520-4995": Biochemistry,
    "1098-5514": JVirol,
    "1939-4586": MolBiolCell,  # driver.get BLOCKS!
    "1477-9129": Development,
    "1364-3703": MolPlantPathol,
    "1742-2051": MolBiosyst,
    "1095-9203": Science,
    "1742-4658": FEBSJ,
    "1362-4962": NucleicAcidsRes,
    "1748-7838": CellRes,
    "1600-0854": Traffic,
    "1573-4978": MolBiolRep,
    "1744-4292": MolSystBiol,
    "0219-1032": MolCells,
    "2193-1801": Springerplus,
    "1476-4679": NatCellBiol,
}


In [None]:
from collections import Counter
# Find the most recent-in-file paper (scanning from the end) published after
# 2010 whose ISSN has no entry in DATA yet, i.e. the next journal that needs
# a Location written for it.

# ISSNs deliberately passed over by the search.
skip_issns = {"0253-9772", '1756-2651'}

# Number of papers per ISSN; papers without an ISSN are not counted.
n = Counter(paper.issn for paper in papers if paper.issn)

p = None
idx = -1  # stays -1 when `papers` is empty, so the print below cannot fail
for idx, p1 in enumerate(reversed(papers)):
    if p1.issn in skip_issns:
        continue
    # n[...] > 0 is false for papers without an ISSN, which filters them out.
    if (p1.issn not in DATA) and p1.year > 2010 and n[p1.issn] > 0:
        p = p1
        break
print(idx, len(papers), len(n), len(DATA))
if p is None:
    print("every post-2010 ISSN already has an entry in DATA")
# False for a freshly found paper; guarded so an empty result does not raise.
p is not None and p.issn in DATA

In [None]:
# Display the doi.org URL of the selected paper.
f"https://doi.org/{p.doi}"

In [None]:
# Display the selected paper record.
p

In [None]:
from IPython.display import Markdown
# Start a visible (non-headless) Chrome session with a 10 second wait timeout.
# The browser is left open afterwards; call web.close() when finished.
web = Selenium(timeout=10,headless=False)
# NOTE(review): the search cell earlier in the notebook selects `p` with an
# ISSN that is *not* in DATA, so this lookup raises KeyError until an entry
# for that journal has been added to DATA and the DATA cell re-run.
# `run` may return None (Cloudflare challenge detected) or "" (timeout).
markdown = web.run(p.doi, DATA[p.issn])
Markdown(markdown)

----

## EPMC: full text from the Europe PMC REST API

In [None]:
from bs4 import BeautifulSoup
from io import StringIO, BytesIO
from html_to_markdown import convert_to_markdown
from IPython.display import HTML

In [None]:
import re
from typing import Iterator,IO

from lxml import etree
from lxml.etree import Element

# XML namespace prefixes mapped to their URIs; "xlink" is the one used below
# to read href attributes.
NS = {
    "xlink": "http://www.w3.org/1999/xlink",
    "mml": "http://www.w3.org/1998/Math/MathML",
    "ali": "http://www.niso.org/schemas/ali/1.0/",
}
# Splits a namespace-qualified tag of the form "{uri}local" into
# group 1 = uri and group 2 = local name.
NSRE = re.compile("^{([^}]+)}(.*)$")

# Element names that are emitted unchanged when a source tag has no explicit
# mapping; any tag outside this set (and outside the mapping) becomes <span>.
HTMLTAGS = set(
    """
    a abbr address area article aside audio
    b base bdi bdo blockquote body br button
    canvas caption cite code col colgroup
    data datalist dd del details dfn dialog div dl dt
    em embed
    fieldset figcaption figure footer form
    h1 h2 h3 h4 h5 h6 head header hgroup hr html
    i iframe img input ins
    kbd
    label legend li link
    main map mark math menu meta meter
    nav noscript
    object ol optgroup option output
    p picture pre progress
    q
    rp rt ruby
    s samp script search section select slot small source span strong style
    sub summary sup svg
    table tbody td template textarea tfoot th thead time title tr track
    u ul
    var video
    wbr
    """.split()
)

# Mapping from a source XML tag name to the HTML element emitted for it.
# Tags that also have a start_*/end_* handler on the event class use the
# handler for that side; this table then only decides the other side
# (typically the closing tag).
PMCTAGS = {
    "italic": "i",
    "bold": "strong",
    "ref-list": "ol",
    "ref": "li",
    "sec": "section",
    "title": "h2",
    "ext-link": "a",
    "uri": "a",
    "body": "main",
    "abstract": "div",
    "caption": "caption",
    "xref": "a",
    "fig": "figure",
    "pub-id": "a",
    "sup": "sup",
    "sub": "sub",
    "label": "label",
    "article-title": "i",
    "graphic": "a",
    "article-id": "a",
    "license": "div",
    "journal-meta": "div",
    "list": "ul",
    "list-item": "li",
    "media": "a",
    "object-id": "a",
    "def-list": "dl",
    "term": "dt",
    "def": "dd",
    "table": "table",
    "colgroup": "colgroup",
    "tbody": "tbody",
    "tr": "tr",
    "th": "th",
    "td": "td",
    "col": "col",
    "email": "span",
    "title-group": "h1",
    "article": "article",
    "article-meta": "div",
    "front": "div",
    "table-wrap": "div",
    "table-wrap-foot": "div",
    "break": "br",
}


class Events:
    TAGS: dict[str, str] = {}

    def __init__(self):
        self.missing = set()

    def findmatches(self, text: str) -> Iterator[tuple[str|None, str]]:
        yield (None, text)

    def parse(self, fp:IO[bytes]) -> Iterator[str]:
        for e, elem in etree.iterparse(fp, events=("start", "end")):
            if e == "start":
                tag = elem.tag
                m = NSRE.match(tag)
                if m:
                    _, tag = m.group(1, 2)
                stag = "start_" + tag.replace("-", "_")

                if hasattr(self, stag):
                    yield getattr(self, stag)(elem)

                else:
                    if tag not in self.TAGS:
                        if tag in HTMLTAGS:
                            etag = tag
                        else:
                            etag = "span"
                            self.missing.add(tag)
                    else:
                        etag = self.TAGS[tag]
                    yield f'<{etag} class="{tag}">'

                if elem.text:
                    for name, match in self.findmatches(elem.text):
                        if name:
                            yield f'<strong class="{name}">{match}</strong>'
                        else:
                            yield match

            elif e == "end":
                tag = elem.tag
                m = NSRE.match(tag)
                if m:
                    tag = m.group(2)
                stag = "end_" + tag.replace("-", "_")
                if hasattr(self, stag):
                    yield getattr(self, stag)(elem)
                else:
                    if tag not in self.TAGS:
                        if tag in HTMLTAGS:
                            etag = tag
                        else:
                            etag = "span"
                    else:
                        etag = self.TAGS[tag]
                    if etag not in {
                        "hr",
                        "br",
                        "img",
                        "col",
                        "base",
                        "area",
                        "embed",
                        "input",
                    }:

                        yield f"</{etag}> "  # [sic!] add space
                
                if elem.tail:
                    if elem.tail == '\n':
                        yield ' '
                    else:
                        yield elem.tail
            else:
                RuntimeError(f"what event {e}?")


def gethref(elem: Element) -> str | None:
    """Return the element's ``xlink:href`` attribute, or ``None`` if it has none."""
    # lxml keys namespaced attributes as "{namespace-uri}name". The inner
    # quotes differ from the outer ones on purpose: reusing the same quote
    # inside an f-string replacement field is a SyntaxError before Python 3.12.
    return elem.attrib.get(f"{{{NS['xlink']}}}href")


class PMCEvents(Events):
    """Event handlers that render PMC article XML as HTML.

    Args:
        url: optional base URL; when set it is prefixed (with ``/``) to the
            ``xlink:href`` of ``graphic`` and ``media`` elements.
    """

    TAGS = PMCTAGS

    def __init__(self, url: str | None = None):
        super().__init__()
        self.url = url

    def start_xref(self, elem: Element) -> str:
        """Open an in-page link to the element named by ``rid``."""
        rid = elem.attrib.get("rid")
        if not rid:
            # No target to point at: still open an <a> so the closing tag pairs up.
            return '<a class="xref">'
        return f'<a class="xref" href="#{rid}">'

    def start_ref(self, elem: Element) -> str:
        """Open a reference list item, carrying over ``id`` when present."""
        rid = elem.attrib.get("id")
        return f'<li class="ref" id="{rid}">' if rid else '<li class="ref">'

    def start_fig(self, elem: Element) -> str:
        """Open a figure, carrying over ``id`` when present."""
        rid = elem.attrib.get("id")
        return f'<figure class="fig" id="{rid}">' if rid else '<figure class="fig">'

    def start_article_id(self, elem: Element) -> str:
        return "<b>article id:</b> " + self._pub_id(elem, "article-id")

    def start_object_id(self, elem: Element) -> str:
        """Open a link for an object id; non-URL text is treated as a DOI."""
        # elem.text can be None (empty element, or text not yet parsed at the
        # start event), so never call str methods on it directly.
        text = elem.text or ""
        if not text:
            href = "#"
        elif not text.startswith(("https://", "http://")):
            href = "https://dx.doi.org/" + text
        else:
            href = text
        return f'<a target="xref" class="object-id" href="{href}">'

    def start_pub_id(self, elem: Element) -> str:
        return self._pub_id(elem, "pub-id")

    def _pub_id(self, elem: Element, name: str) -> str:
        """Open a link for a publication id, chosen by its ``pub-id-type``.

        NOTE(review): ``elem.text`` is interpolated as-is, so a missing text
        produces the literal "None" in the URL. The pmid link targets
        eutils.ncbi.nlm.nih.gov and the pmcid link always prepends "PMC" -
        confirm both against real records.
        """
        pid = elem.attrib.get("pub-id-type")
        if pid == "doi":
            return (
                f'{pid}: <a target="xref" class="{name} {pid}"'
                f' href="https://dx.doi.org/{elem.text}">'
            )

        if pid == "pmid":
            return (
                f'{pid}: <a target="xref" class="{name} {pid}"'
                f' href="https://eutils.ncbi.nlm.nih.gov/pubmed/{elem.text}">'
            )

        if pid == "pmcid":
            return (
                f'{pid}: <a target="xref" class="{name} {pid}"'
                f' href="https://ncbi.nlm.nih.gov/pmc/articles/PMC{elem.text}">'
            )

        # Unknown id type: fall back to an in-page anchor.
        return f'{pid}: <a target="xref" class="{name} {pid}" href="#{elem.text}">'

    def start_sec(self, elem: Element) -> str:
        """Open a section preceded by a rule; ``sec-type`` becomes an extra CSS class."""
        mm = elem.attrib.get("sec-type")
        # "|" is replaced by "-" in the value before it is used as a class name.
        mm = " " + mm.replace("|", "-") if mm else ""
        return f'<hr/><section class="sec{mm}">'

    def end_sec(self, elem: Element) -> str:
        return "</section>"

    def start_aff(self, elem: Element) -> str:
        iid = elem.attrib.get("id")
        return f'<span class="aff" id="{iid}">' if iid else '<span class="aff" >'

    # def start_abstract(self, elem: Element) -> str:
    #     return '<hr/><div class="abstract">'

    def start_ext_link(self, elem: Element) -> str:
        type = elem.attrib.get("ext-link-type", "")
        return self._uri(elem, "ext-link", type=type)

    def start_uri(self, elem: Element) -> str:
        return self._uri(elem, "uri")

    def _uri(self, elem: Element, tag: str, type: str = "") -> str:
        """Open a link to the element's ``xlink:href``, or an error link if it has none."""
        href = gethref(elem)
        if not href:
            return f'<a class="{tag} error" href="#">{type} No Link for {tag}! {elem.attrib}'

        return f'<a target="{tag}" class="{tag}" href="{href}">{type} '

    def start_graphic(self, elem: Element) -> str:
        """Open a link to a graphic, resolved against ``self.url`` when set."""
        href = gethref(elem)
        if not href:
            return (
                f'<a class="graphic error" href="#">No Link for graphic! {elem.attrib}'
            )
        if self.url:
            href = self.url + "/" + href
        return f'graphic: <a target="ext-link" class="graphic" href="{href}">{href}'

    def start_license(self, elem: Element) -> str:
        """Open the license block containing a self-closed link to the license."""
        href = gethref(elem)
        if not href:
            return f'<div class="license error"><a href="#">No Link for license! {elem.attrib}</a>'
        return f'<div class="license"><a target="ext-link" class="license" href="{href}">license</a>'

    def start_media(self, elem: Element) -> str:
        """Open a link to a media file, resolved against ``self.url`` when set."""
        href = gethref(elem)
        if not href:
            # Class is "error" (was misspelt "errror"), matching the other handlers.
            return f'media: <a target="media" class="media error" href="#">no link for media {elem.attrib}'
        if self.url:
            href = self.url + "/" + href
        return f'media: <a target="media" class="media" href="{href}">{href}'


In [None]:

from typing import Self
from io import BytesIO
from requests import Session
from bs4 import BeautifulSoup, Tag
from html_to_markdown import convert_to_markdown



# URL template for an article's full-text XML; `{pmcid}` is filled in by epmc().
XML = (
    "https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"  # noqa: E221
)
import re
# Matches "<?...?>" spans in raw bytes (XML declaration / processing
# instructions); epmc() strips these from the response body.
PE = re.compile(b'<[?][^?]+[?]>')
def epmc(
    pmcid: str,
    session: Session | None = None,
    timeout: float | None = None,
) -> bytes | None:
    """Download the Europe PMC full-text XML for a PMC id.

    Args:
        pmcid: value substituted into the ``XML`` URL template.
        session: session to reuse. When ``None`` a temporary one is created
            and closed again before returning.
        timeout: seconds passed to ``session.get``; ``None`` (the default)
            waits indefinitely, as before. Setting a value is recommended for
            unattended runs.

    Returns:
        The response body with ``<?...?>`` spans removed, or ``None`` when the
        service answers 404.

    Raises:
        requests.HTTPError: for any other unsuccessful status code.
    """
    url = XML.format(pmcid=pmcid)
    owns_session = session is None
    if owns_session:
        session = Session()
    try:
        resp = session.get(url, timeout=timeout)
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
        return PE.sub(b'', resp.content)
    finally:
        # Only close what was created here; a caller's session stays usable.
        if owns_session:
            session.close()



class EPMC:
    """A Europe PMC full-text XML document and its conversion to HTML/Markdown."""

    # Children of ``front > article-meta`` dropped by :meth:`cull`.
    REMOVE_META = (
        "contrib-group",
        "author-notes",
        "history",
        "kwd-group",
        "article-id",
        "article-categories",
        "aff",
        "pub-date",
        "elocation-id",
        "permissions",
        "issue",
        "volume",
    )
    # Selectors dropped by :meth:`cull` wherever they occur in the article.
    REMOVE = (
        "ref-list",
        "floats-group",
        "front > journal-meta",
        "back",
        "named-content",
        "funding-source",
        "fn",
        "supplementary-material",
        "fpage",
        "lpage",
        "funding-group",
        "award-id",
        "self-uri",
        "counts",
    )

    def __init__(self, content: bytes):
        """Parse ``content`` as XML; ``missing`` is filled in by :meth:`html`."""
        self.soup = BeautifulSoup(BytesIO(content), "lxml-xml")
        self.missing = set()

    @classmethod
    def from_pmcid(
        cls, pmcid: str, session: Session | None = None
    ) -> Self | None:
        """Download the XML for ``pmcid``; ``None`` if nothing was returned."""
        payload = epmc(pmcid, session)
        if not payload:
            return None
        return cls(payload)

    def get_article(self) -> Tag | None:
        """Return the first ``<article>`` element, or ``None`` if there is none."""
        found = self.soup.select("article")
        return found[0] if found else None

    def save_content(self, save: str) -> None:
        """Write the pretty-printed document (in its current state) to ``save``."""
        pretty = self.soup.prettify()
        with open(save, "wt", encoding="utf-8") as fp:
            fp.write(pretty)

    def cull(self, article: Tag) -> Tag:
        """Remove, in place, every node matched by REMOVE / REMOVE_META; return ``article``."""
        selectors = [
            *self.REMOVE,
            *(f"front > article-meta > {name}" for name in self.REMOVE_META),
        ]
        for node in article.select(",".join(selectors)):
            node.decompose()
        return article

    def extract(self, article: Tag) -> tuple[Tag, Tag, Tag]:
        """Return ``(title, abstract, body)``; IndexError if any of them is absent."""
        paths = (
            "> front > article-meta > title-group > article-title",
            "> front > article-meta > abstract",
            "> body",
        )
        # Evaluated lazily and in order, so the first missing part raises.
        title, abstract, body = (article.select(path)[0] for path in paths)
        return title, abstract, body

    def html(self) -> str | None:
        """Render the culled article as an HTML string (``None`` without an article).

        Side effects: the parsed document is culled in place, and
        ``self.missing`` is replaced by the converter's set of unmapped tags.
        """
        article = self.get_article()
        if article is None:
            return None
        culled = self.cull(article)
        converter = PMCEvents()
        xml_bytes = str(culled).encode("utf-8")
        fragments = converter.parse(BytesIO(xml_bytes))
        rendered = "".join(fragments)
        self.missing = converter.missing
        return rendered

    def html_to_markdown(self, html: str) -> str:
        """Convert an HTML string to Markdown."""
        return convert_to_markdown(html)

In [None]:
from nlpready._mlabc import readx_suba_papers_csv
# Same load as the first cell of the notebook, repeated so that the EPMC
# section can be run on its own.
papers=list(readx_suba_papers_csv('suba_j.csv'))

In [None]:
# PMC ids skipped by the search cell below.
# NOTE(review): presumably papers already checked by hand - confirm.
DONE= {'PMC5486124', 'PMC5447075', 'PMC5853234', 'PMC5490910', 'PMC5490927', 'PMC5823009'}

In [None]:

# Pick the last paper in the file that has a PMC id not listed in DONE and
# was published after 2010. next(..., None) makes "nothing matched" explicit:
# a plain for/break would fall through and leave `p` bound to whichever
# paper was iterated last, even though it does not satisfy the filter.
p = next(
    (
        paper
        for paper in reversed(papers)
        if paper.pmcid and paper.pmcid not in DONE and paper.year > 2010
    ),
    None,
)
if p is None:
    print("no post-2010 paper with an unprocessed PMC id was found")
p

In [None]:
# Display the doi.org URL of the selected paper.
f"https://doi.org/{p.doi}"

In [None]:
# Inline stylesheet prepended to the generated HTML when it is displayed; the
# selectors target the class names the converter writes (the source tag name).
CSS="""
<style>
.email { color:orange; }
.title-group { color: blue; font-size: 14pt; }
figure { border: solid orange 1px; padding: 0.5em; }
</style>
"""

In [None]:
# Download and render the selected paper. Both steps can come back empty:
# from_pmcid returns None when no XML is available and html() returns None
# when the document has no <article>; `html` is kept a string either way so
# the cells below still run.
x = EPMC.from_pmcid(p.pmcid)
html = (x.html() if x is not None else None) or ""
if not html:
    print(f"no full text available for {p.pmcid}")
elif x.missing:
    # Source tags that had no HTML mapping and were rendered as <span>.
    print(x.missing)
HTML(CSS + html)

In [None]:
# x.save_content('saved.xml')

In [None]:
from html_to_markdown import convert_to_markdown
from IPython.display import display, Markdown
# Convert the HTML produced above to Markdown and render it.
Markdown(convert_to_markdown(html))