In [160]:
from pyarrow import parquet
from pathlib import Path
import random
from urllib.request import urlopen
import json
import time

In [169]:

# A function to return the XML catalog record for an ISSN by two subsequence API calls.
def nlm_xml(issn):
  with urlopen(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nlmcatalog&term={issn}&format=json") as response:
    body = response.read()
  id = json.loads(body)['esearchresult']['idlist'][0]
  # Timeout under the limit right now.
  time.sleep(.25)
  with urlopen(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nlmcatalog&id={id}&format=xml") as response:
    body = response.read()
  return body.decode("utf-8")

class Catalog_Cache():
  """
  A dictionary-like object that handles the caching of XML catalog records 
  and the parsing of subject headings from them.
  """
  def __init__(self):
    self.cache = {}
    self.p = Path("data", "issn.txt")
    self.__enter__()
    if self.p.exists():
      self.prepopulate_cache()
    
  def __getitem__(self, issn):
    if self.cache.get(issn):
      return self.cache[issn]
    else:
      print("Getting over HTTP", end = "\r")
      self.cache[issn] = nlm_xml(issn)
      # Limit of 3 queries a second.
      time.sleep(0.5)
      self.writer.write(f"{issn}\x1e{self.cache[issn]}\x1d")
      return self.cache[issn]

  def __enter__(self):
    self.writer = self.p.open(mode="a")
    return self

  def __exit__(self, type, value, traceback):
    self.writer.close()

  def close(self):
    self.writer.close()

  def prepopulate_cache(self):
    lines = self.p.open('r').read().split('\x1d')
    for line in lines:
      if len(line) > 1:
        id, xml = line.split('\x1e')
        self.cache[id] = xml
      
  def subject_headings(self, issn):
    body = self[issn]
    f = BytesIO()
    f.write(body.encode("utf-8"))
    f.seek(0)
    xtree = et.parse(f)
    xroot = xtree.getroot()
    mesh_headings = [m.text for m in xroot.findall('.//MeshHeading//DescriptorName')]
    return mesh_headings



In [174]:

m = Catalog_Cache()

In [175]:
len(m.cache)

3690

In [176]:
m.close()

In [173]:

fs = [*Path("parquet").glob("*.parquet")]
random.shuffle(fs)
tb = parquet.read_table(fs[0])
with Catalog_Cache() as m:
  for issn in [*set(tb.flatten().flatten().flatten().flatten()['MedlineCitation.Article.Journal.ISSN'].to_pylist())]:
    subjects = m.subject_headings(issn)
    print(issn, subjects, end = "\r")


1090-3801 ['Pain Management', 'Pain']ilitation'], Ocular'] 'Primary Health Care']ealth', 'Sports Medicine', 'Travel']d States']physiology', 'Neuropsychology']]urveillance, Postmarketing']

In [95]:
from io import BytesIO
import xml.etree.ElementTree as et



In [157]:
m.close()

In [96]:
def nlmid_for_issn(issns):
  https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nlmcatalog&term=0025-8172&format=json
def entries_for_nlmid:
  https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nlmcatalog&id=412004&format=xml

SyntaxError: invalid syntax (2208174421.py, line 2)

In [None]:
import pyarrow as pa
pa.RecordBatch.from_struct_array(tb['MedlineCitation'].combine_chunks())

pyarrow.RecordBatch
PMID: string
DateCompleted: struct<Year: string, Month: string, Day: string>
  child 0, Year: string
  child 1, Month: string
  child 2, Day: string
DateRevised: struct<Year: string, Month: string, Day: string>
  child 0, Year: string
  child 1, Month: string
  child 2, Day: string
Article: struct<Journal: struct<ISSN: string, JournalIssue: struct<Volume: string, Issue: string, PubDate: struct<Year: string, Month: string, MedlineDate: string, Season: string, Day: string>>, Title: list<item: string>, ISOAbbreviation: string>, ArticleTitle: string, Pagination: struct<MedlinePgn: string>, ELocationID: list<item: string>, Abstract: struct<AbstractText: string, CopyrightInformation: string>, Author: list<item: struct<LastName: string, ForeName: string, Initials: string, AffiliationInfo: list<item: struct<Affiliation: string>>, CollectiveName: string, Suffix: string, Identifier: list<item: string>>>, Language: list<item: string>, PublicationTypeList: struct<PublicationTyp

In [50]:
tb.flatten().flatten().flatten().flatten()['MedlineCitation.Article.Journal.ISSN']

<pyarrow.lib.ChunkedArray object at 0x12ab0c360>
[
  [
    "1434-5293",
    "1434-5293",
    "1434-5293",
    "1434-5293",
    "1434-5293",
    "1434-5293",
    "1434-5293",
    "0941-293X",
    "1438-2199",
    "0009-4722",
    ...
    "1556-4029",
    "1556-4029",
    "1365-2796",
    "1600-0765",
    "1423-0410",
    "1399-3054",
    "1423-0410",
    "1478-3231",
    "1478-3231",
    "1478-3231"
  ]
]

In [52]:
from pyarrow import compute as pc
r = pc.list_flatten(tb.flatten().flatten().flatten().flatten()['MedlineCitation.Article.Author'])

pa.RecordBatch.from_struct_array(r.combine_chunks())['Identifier']

<pyarrow.lib.ListArray object at 0x12aa37700>
[
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  ...
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  null,
  null
]