In [None]:
# -*- coding: utf-8 -*- 
import html
import shutil
import tarfile
import urllib
import urllib.request
import xml.etree.ElementTree as ET
from io import BytesIO

import requests
import spacy
from spacy.matcher import Matcher

In [None]:
# PubMed ID of the article to process; later cells resolve it to a PMCID.
pmid = "23193287"

In [None]:
def get_pmc_from_pmid(pmid):
    """ Using the NCBI 'PMC ID Converter' API, Get a PMCID

        @docs https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
        @param pmid  PubMed ID to convert (interpolated into the query string as-is).
        @returns  The value of the first <record>'s "pmcid" attribute, or None when
                  the response has no <record> or the record has no "pmcid".
        @raises requests.HTTPError  when the service answers with a 4xx/5xx status.
        @raises requests.Timeout  when the service does not answer within 30 seconds.
        @notes
            To search for multiple pmids you can do this in one call:
            https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=1,2,3

            But you need to join the values.  Thankfully Python makes it easy:
            e.g.
                pmid_array = ["1","2","3"]
                pmid_string = ",".join(pmid_array) # results in "1,2,3"
                url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={pmid_string}"
    """
    response = requests.get(
        f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={pmid}",
        timeout=30,  # never hang forever on an unresponsive server
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the raw bytes so the XML parser honours the document's own encoding
    # declaration rather than the charset `requests` guesses for `.text`.
    root = ET.fromstring(response.content)

    record = root.find('record')
    if record is None:
        return None
    return record.attrib.get("pmcid", None)

def get_ftp_from_pmcid(pmcid):
    """ Using the NCBI Open Access (OA) API, Get the ftp location for the PMC data

        @docs https://www.ncbi.nlm.nih.gov/pmc/tools/oa-service/
        @param pmcid  PubMed Central ID (interpolated into the query string as-is).
        @returns  The "href" of the first <link> whose "format" is "tgz"
                  (case-insensitive) in the first <records>/<record>, or None when
                  the response has no such record or link.
        @raises requests.HTTPError  when the service answers with a 4xx/5xx status.
        @raises requests.Timeout  when the service does not answer within 30 seconds.
    """
    response = requests.get(
        f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmcid}",
        timeout=30,  # never hang forever on an unresponsive server
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the raw bytes so the XML parser honours the document's own encoding
    # declaration rather than the charset `requests` guesses for `.text`.
    root = ET.fromstring(response.content)

    # The response may carry no <records>/<record> at all (e.g. an error reply);
    # report that as "no link" instead of crashing on a None element.
    record = root.find("records/record")
    if record is None:
        return None

    for link in record.findall('link'):
        if fmt := link.attrib.get('format', None):
            if fmt.lower() == 'tgz':
                return link.attrib.get('href')
    return None

def get_tar_gz_from_ftpid(ftpid, local_file_path):
    """ Using the NCBI FTP, Download .tar.gz data from the FTP

        @param ftpid  URL of the archive (anything `urllib.request.urlopen` accepts).
        @param local_file_path  Path of the file to (over)write with the download.
        @returns  None; the downloaded bytes are written to `local_file_path`.
        @notes
            * This function simply downloads the data to your computer, opening the file
            using python would be the next step.
    """
    # `requests` can't handle FTP requests but Python has a standard library to help us out.
    # Both handles are context-managed so the connection and the file are always
    # closed, even if the copy fails half-way through.
    with urllib.request.urlopen(ftpid) as response, open(local_file_path, "wb") as fw:
        shutil.copyfileobj(response, fw)
        
    
def get_xml_from_ftpid(ftpid):
    """ Using the NCBI FTP, Get the XML data (As a byte-string) from the FTP

        @param ftpid  URL of the .tar.gz archive (anything `urllib.request.urlopen` accepts).
        @returns  The raw bytes of the first regular file in the archive whose name
                  ends with ".nxml", or None when the archive has no such file.
        @notes
            * NCBI's FTP returns .tar.gz files
            * NCBI's .tar.gz files contain X files, the one we want ends with .nxml
            * This snippet uses some lesser known features of Python to avoid downloading
            .tar.gz locally.  Everything happens in-memory.
    """

    # `requests` can't handle FTP requests but Python has a standard library to help us out.
    # Instead of downloading the .tar.gz file locally we "download" it in to memory.
    # The `with` makes sure the connection is closed as soon as the bytes are read.
    with urllib.request.urlopen(ftpid) as response:
        memfile = BytesIO(response.read())

    # Next we need to use Python's tar library to open the file (still in memory) and extract
    # the xml content.
    with tarfile.open(fileobj=memfile, mode="r:gz", encoding='utf-8') as tar_file:
        for member in tar_file:

            # Uncomment this line to print the file names in the .tar.gz file
            #print(member.name)

            # Only regular files can be extracted: `extractfile` returns None for
            # directories and other special members, so skip those up front.
            if not (member.isfile() and member.name.endswith(".nxml")):
                continue

            with tar_file.extractfile(member) as f:
                return f.read()
    return None

In [None]:
# Resolve the PubMed ID to its PMCID (performs an HTTP request to NCBI).
pmcid = get_pmc_from_pmid(pmid)
pmcid  # bare expression so the notebook displays the value

In [None]:
# Ask the Open Access service for the URL of the article's .tar.gz package.
ftpid = get_ftp_from_pmcid(pmcid)
ftpid  # bare expression so the notebook displays the value

In [None]:
# Download the package in memory and pull out the .nxml file as a byte-string.
xml_content = get_xml_from_ftpid(ftpid)
xml_content[:500] # only show the first 500 bytes (not yet decoded) to avoid making a mess.

In [None]:
# Load spaCy's "en_core_web_sm" pipeline once; the cells below reuse `nlp`.
nlp = spacy.load("en_core_web_sm")


In [None]:
def print_spacy_matches(doc,nlp,matches,padding_start=5,padding_end=5):
    """ Print every match together with a few tokens of surrounding context

        @param doc  The document the matches refer to; sliced by token index.
        @param nlp  The pipeline whose `vocab.strings` maps a match id back to its name.
        @param matches  Iterable of `(match_id, start, end)` tuples.
        @param padding_start  Number of extra tokens to show before each match.
        @param padding_end  Number of extra tokens to show after each match.
        @returns  None; one line per match is written to stdout as
                  `match_id string_id start end context-text`.
    """
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation

        # Clamp at 0: a negative slice start would be counted from the END of the
        # document, which turns the context of a match near the beginning into an
        # empty span.  Overshooting the end of the document is harmless.
        context_start = max(start - padding_start, 0)
        span = doc[context_start:end + padding_end]  # The matched span plus padding
        print(match_id, string_id, start, end, span.text)

In [None]:
# To use the raw XML data we do a few things first.

# 1:  Convert from a byte-string to a unicode string.  
xml_content_as_string = xml_content.decode('utf-8')

# 2:  Replace character references like &#x02018; with the character they stand
# for (here U+2018, the left single quotation mark).
# (https://www.quackit.com/character_sets/unicode/general_punctuation_unicode_character_codes.cfm)
xml_content_as_string = html.unescape(xml_content_as_string)

# To use spacy we convert our string to a spacy document using the
# "en_core_web_sm" English pipeline loaded earlier as `nlp`.
# NOTE(review): the XML tags are not stripped, so markup is tokenized along with
# the article text - confirm this is intended.
doc = nlp(xml_content_as_string)

In [None]:
# Rule-based matching, see https://spacy.io/usage/rule-based-matching
# Each inner list is one pattern; each dict in it describes one token.
# "LOWER" compares against the lowercase form of the token text.
patterns = [
    [{"LOWER": "accession"}],
]

# Find any instance of the word 'accession' (case-insensitive) and print each
# hit with the default 5 tokens of context on either side.
matcher = Matcher(nlp.vocab)
matcher.add("Accession", patterns)
matches = matcher(doc)
print_spacy_matches(doc,nlp,matches)

In [None]:

# One three-token pattern; the commented-out entries are further single-token
# patterns that could be enabled.
patterns = [
    [{"LOWER": "wgs"},{"LOWER": "accession"},{"LOWER": "number"}],
    # [{"LOWER": "pdb"}],
    # [{"LOWER": "genbank"}],
]

# Find any instance of the words 'wgs accession number' (case-insensitive).
matcher = Matcher(nlp.vocab)
matcher.add("Accessions", patterns)
matches = matcher(doc)

# No leading context here; show the 5 tokens that follow each match.
print_spacy_matches(doc,nlp,matches,padding_start=0,padding_end=5)

In [None]:
# Match single tokens whose text has one of the letters a-d (either case)
# followed by "/", at least one character, another "/" and at least one more
# character.
# The pattern is a raw string: "\/" is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape warning.  The regex itself is unchanged.
patterns = [
    [{"TEXT": {"REGEX": r"[abcdABCD]{1}\/.+\/.+"}}],
]


matcher = Matcher(nlp.vocab)
matcher.add("Accession", patterns)
matches = matcher(doc)
matches  # bare expression so the notebook displays the raw (match_id, start, end) tuples

In [None]:
# Print the regex matches found above with the default 5 tokens of context on each side.
print_spacy_matches(doc,nlp,matches)