Skip to content

Commit

Permalink
Merge pull request #234 from golnazads/master
Browse files Browse the repository at this point in the history
added JATS export, first pass
  • Loading branch information
golnazads committed Apr 21, 2023
2 parents 8acbba1 + 62de892 commit 9e1f97c
Show file tree
Hide file tree
Showing 6 changed files with 290 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python_actions.yml
Expand Up @@ -16,7 +16,7 @@ jobs:
- name: Install dependencies
run: |
# python -m pip install --upgrade setuptools pip
python -m pip install --upgrade pip
python -m pip install --upgrade wheel pip
pip install -U -r requirements.txt
pip install -U -r dev-requirements.txt
Expand Down
255 changes: 254 additions & 1 deletion exportsrv/formatter/xmlFormat.py
Expand Up @@ -5,25 +5,31 @@
from datetime import datetime
from flask import current_app
from textwrap import fill
import re
from geotext import GeoText
from csv import reader

from exportsrv.formatter.format import Format
from exportsrv.utils import get_eprint
from exportsrv.formatter.strftime import strftime

# This class accepts JSON object created by Solr and can reformats it
# This class accepts JSON object created by Solr and can reformat it
# for the XML Export formats we are supporting.
# 1- To get Dublin Core XML use
# dublinXML = XMLFormat(jsonFromSolr).get_dublincore_xml()
# 2- To get Reference XML without Abstract use
# referenceXML = XMLFormat(jsonFromSolr).get_reference_xml()
# 3- To get Reference XML with Abstract use
# referenceXML = XMLFormat(jsonFromSolr).get_reference_xml(True)
# 4- To get JATS XML use
# jatsXML = XMLFormat(jsonFromSolr).get_jats_xml()

class XMLFormat(Format):

EXPORT_FORMAT_REF_XML = 'ReferenceXML'
EXPORT_FORMAT_REF_ABS_XML = 'ReferenceAbsXML'
EXPORT_FORMAT_DUBLIN_XML = 'DublinXML'
EXPORT_FORMAT_JATS_XML = 'JATSXML'

EXPORT_SERVICE_RECORDS_SET_XML_REF = [('xmlns', 'http://ads.harvard.edu/schema/abs/1.1/references'),
('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
Expand All @@ -38,6 +44,18 @@ class XMLFormat(Format):
('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
('xsi:schemaLocation', 'http://ads.harvard.edu/schema/abs/1.1/dc http://ads.harvard.edu/schema/abs/1.1/dc.xsd')]

EXPORT_SERVICE_RECORDS_SET_XML_JATS = [('xmlns', 'http://ads.harvard.edu/schema/abs/1.1/dc'),
('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
('xsi:schemaLocation', 'http://ads.harvard.edu/schema/abs/1.0/jats http://ads.harvard.edu/schema/abs/1.0/jats.xsd')]

re_year = re.compile(r'([12]+[09]\d\d)')

# partial list of known publishers,
# once this field is populated in solr, rely on solr,
# for now this is only used for JATS format
re_publisher_names = re.compile(r"([A-Z]+[A-Za-z\s\-:]+ University Press|[A-Z]+[A-Za-z\s\-:,']+ Press|Springer- .*|Elsevier|University of [A-Z]+[A-Za-z\s-]+ Press|University of [A-Z]+[A-Za-z\s-]+|[\w-]+,\s*\w+\s*:[\s\w]+|Springer\s+([A-Z]+[A-Za-z\s-]+)+|Springer Nature|Springer, Cham|Springer Fachmedien Wiesbaden GmbH, DE|Springer-Verlag GmbH Deutschland|Springer-Verlag Berlin Heidelberg|Springer-Verlag|Springer|Cambridge, J. Wilson and son, University press|Cambridge, The University press|Oxford university press|Loyola university press|Cambridge, University press|Harvard university press|Cambridge, Eng., The University press|Louisiana state university press|Des Moines, Iowa, University press|Cambridge [Eng.] The University press|Edinburgh University Press|Cambridge University Press|Yale University Press|Brigham Young University Press|University Press of Virginia|Erevan University Press|Artemis Press|Laval University Press|Columbia University Press|Rutgers University Press|University Press of America|Johns Hopkins University Press|Sydney University Press|Yerevan University press|McGill-Queen's University Press|Innsbruck University Press|University of Arizona Press|Atlantis Press|Ilia State University Press|Ziti press|The University of Chicago Press|Princeton University Press|eXamen.press|T rculo Press|Duke University Press|Kyoto University Press|Imperial College Press|Heron Press Ltd|Kyiv University Press|Sole Logistics Press|BrownWalker Press|Joseph Henry Press|National Radio Astronomy Press|SPIE Press|Kyriakidis Press|St. Martin's Press|Huntington Library and University of Washington Press|Microcosm Press|Free Press \(Simon and Schuster\)|Cambridge Univ. Press|Templeton Foundation Press|IEEE Press|Heron Press|AIP Press|Pergamon Press|Boydell Press|Baltic Astronomy 6 and L. 
Davis Press|West Virginia University Press|ACM Press Books|State University of New York \(SUNY\) Press|Clarendon Press|Universal Academic Press Inc|SPIE Optical Engineering Press|Presses universitaires de France|Ginn Press|ABELexpress|SPC Press|CRC Press|Plenum Press|Pedagogical Univ. Press|The MIT \(Massachusetts Institute of Technology\) Press|Yourdon Press Computing Series|L. Davis Press|Ivy Press Books|Moscow Univ. Press|Cambridge Univ. Press|Presses du CNRS|Presses de l'Ecole nationale des ponts et chaussees|Academic Press Inc|Cambridge UniversityPress|Vantage Press Inc|Massachusettes Institute of Technology \(MIT\) Press|IAP Press|Academic Press and OHM|IEEE Comput. Soc. Press|The Weizmann Science Press of Israel|MIT Press|Process Press|Blandford Press|Science Press|University of Tasmania Press|Vantage Press|Arno Press|Academic Press|University of Massachusetts Press|Delacorte Press/E. Friede|The Macmillan Press Ltd|Peebles Press|Anchor Press/Doubleday|Smithsonian Institution Press|Anchor Press / Doubleday|FAN Press|Univ. Calif. Press|Presses de la Cite|Ballena Press|Pica Press|University of Texas Press|Optosonic Press|University of Missouri Press|University of Alabama Press|Nauka Press|Exposition Press|Presses universitaire de France|Viking Press|Priory Press|Chemical Rubber Co. Press|Books for Libraries Press|Pragopress|Fundamental Research Press|University of California Press|University of Michigan Press|University of New Mexico Press|University of London Press|Natural History Press|Lenin Belorussian State University Press|Crowell-Collier Press|Greenwood Press|NEO Press|M.I.T. Press|Univ. Wisconsin Press|University of Chicago Press|University of Colorado Press|Brockhampton Press|Golden Press|Lutterworth Press|Trident Press Book|Beacon Press|St Martin's Press|Pageant Press|M. I. T. 
Press|Orion Press|The Univesrity of Chicago Press|Museum Press|Citadel Press|Pegasus Press|Childrens Press|Ronald Press Co|Majestic Press|Westernlore Press|The Science press printing company|The Technical press ltd|The Florida Bible institute press|The Sheldon press|Pacific Science Press|The Theosophical press|The Clarendon press|The Pilgrim press|The Hispanic Society of America and The De Vinne Press|Press of E. W. Stephen|The Nichols press|Press of T. P. Nichols|Press of J. Wilson and son|Roy. Acad. press)")

def __format_date(self, solr_date, export_format):
"""
Expand Down Expand Up @@ -350,6 +368,10 @@ def __get_fields(self, export_format):
('pub_raw', 'dc:source'), ('pubdate', 'dc:date'), ('keyword', 'dc:subject'),
('copyright', 'dc:rights'), ('url', 'dc:relation'), ('num_citations', 'dc:relation'),
('abstract', 'dc:description'), ('doi', 'dc:identifier')]
elif (export_format == self.EXPORT_FORMAT_JATS_XML):
fields = [('doctype', ''), ('author', ''), ('year', ''), ('title', ''),
('pub', ''), ('pub_raw', ''), ('volume', 'volume'), ('issue', 'issue'),
('editor', ''), ('publisher', ''), ('page', ''), ('page_range', ''), ('doi', '')]
else:
fields = []
return OrderedDict(fields)
Expand Down Expand Up @@ -523,6 +545,226 @@ def __get_doc_reference_xml(self, index, parent, export_format):
self.__add_in(record, fields[field], get_eprint(a_doc))


def __add_person_group_jats_xml(self, person_list, record, person_group_type):
    """
    Add a JATS <person-group> with one <string-name> per person to `record`.

    Each name is split at its first comma: the text before the comma becomes
    <surname>, and the first character of the text after it becomes the
    <given-names> initial (followed by a dot). Names without a given part get
    only a <surname>. For the 'editor' group type a <role>Eds.</role> tag is
    appended to `record` after the group.

    :param person_list: list of name strings; nothing is added when it is empty
    :param record: parent element that receives the new tags
    :param person_group_type: value for the person-group-type attribute
    :return: None, `record` is modified in place
    """
    if not person_list:
        return

    # add outer tag
    person_group_record = ET.SubElement(record, 'person-group')
    person_group_record.set('person-group-type', person_group_type)

    # now add one inner tag per person
    for person in person_list:
        # split on the first comma only, so a name with an extra comma
        # (e.g. a trailing suffix) still keeps its given-name initial
        surname, _, given_names = person.partition(',')
        given_names = given_names.strip()

        person_record = ET.SubElement(person_group_record, 'string-name')
        ET.SubElement(person_record, 'surname').text = surname.strip()
        # the given part can be missing or blank ('Last' or 'Last,');
        # indexing an empty string would raise IndexError
        if given_names:
            ET.SubElement(person_record, 'given-names').text = '%s.' % given_names[0]

    # add role tag if this is editor type
    if person_group_type == 'editor':
        ET.SubElement(record, 'role').text = 'Eds.'


def __add_title_jats_xml(self, title, record, publication_type, lookahead):
    """
    Add the title to `record`, formatted according to the JATS publication type.

    journal, report:         <article-title>title</article-title>.
    confproc:                <article-title>title</article-title>,
    book:                    <source><italic>title</italic></source>
    book with editor:        <source><italic>title</italic></source>;
    thesis:                  <source>Ph.D. thesis</source> (the title itself is not used)
    software, review, other: <source>title</source>
    Any other publication type adds nothing.

    :param title: list of title strings, joined with ';' (a plain string is used as is)
    :param record: parent element that receives the title tag
    :param publication_type: JATS publication type of the record
    :param lookahead: truthy when the record has an editor, so that a book
                      title is followed by a semicolon
    :return: None, `record` is modified in place
    """
    # joining a plain string would insert ';' between its characters
    if not isinstance(title, str):
        title = ';'.join(title)

    if publication_type in ['journal', 'report']:
        title_record = ET.SubElement(record, 'article-title')
        title_record.text = title
        title_record.tail = '.\n'
    elif publication_type == 'confproc':
        title_record = ET.SubElement(record, 'article-title')
        title_record.text = title
        title_record.tail = ',\n'
    elif publication_type == 'book':
        # <title> is not part of the <mixed-citation> content model; a book
        # title is carried by <source>, same as the journal name
        title_record = ET.SubElement(record, 'source')
        ET.SubElement(title_record, 'italic').text = title
        # the editor group follows, separated by a semicolon
        if lookahead:
            title_record.tail = ';\n'
    elif publication_type == 'thesis':
        ET.SubElement(record, 'source').text = 'Ph.D. thesis'
    elif publication_type in ['software', 'review', 'other']:
        ET.SubElement(record, 'source').text = title


def __add_conf_proc_info_jats_xml(self, pub_raw, record):
    """
    Add conference information, inferred from `pub_raw`, for the confproc
    publication type of the JATS format.

    The following tags are attempted:
        <conf-name>first comma separated part of pub_raw containing the word 'conference'</conf-name>
        <conf-loc>cities, then countries, that GeoText recognizes in pub_raw</conf-loc>
        <year>first match of re_year in pub_raw</year>
    The month of the conference is not extracted. Nothing is added unless a
    conference name is found.

    :param pub_raw: raw publication string of the record
    :param record: parent element that receives the new tags
    :return: None, `record` is modified in place
    """
    # see if the year appears in pub_raw
    match = self.re_year.search(pub_raw)
    year = match.group(1) if match else None

    # see if the location is in pub_raw; GeoText returns lists, so build
    # the text by joining rather than concatenating list and str
    places = GeoText(pub_raw)
    location = ', '.join(list(places.cities) + list(places.countries))

    # now split the pub_raw and try to see if conference name can be inferred;
    # the csv reader yields no row at all for an empty string
    rows = list(reader([pub_raw]))
    segments = rows[0] if rows else []
    conference = [s for s in segments if 'conference' in s.lower()]
    if conference:
        ET.SubElement(record, 'conf-name').text = conference[0]
        if location:
            ET.SubElement(record, 'conf-loc').text = location
        if year:
            ET.SubElement(record, 'year').text = year


def __add_book_publisher_info_jats_xml(self, pub_raw, record):
    """
    Add publisher information, inferred from `pub_raw`, for the JATS format.

    The following tags are attempted:
        <publisher-loc>cities, then countries, that GeoText recognizes in pub_raw</publisher-loc>:
        <publisher-name>first match of re_publisher_names in pub_raw</publisher-name>
    Nothing is added unless a publisher name is found.

    :param pub_raw: raw publication string of the record
    :param record: parent element that receives the new tags
    :return: None, `record` is modified in place
    """
    # see if the location is in pub_raw; GeoText returns lists, so build
    # the text by joining rather than concatenating list and str
    places = GeoText(pub_raw)
    location = ', '.join(list(places.cities) + list(places.countries))

    # the second positional argument of a compiled pattern's search() is the
    # start offset, not a flags value, so only the string is passed here and
    # the search covers the whole of pub_raw
    # NOTE(review): if case-insensitive matching is wanted, the flag has to
    # be given where re_publisher_names is compiled
    match = self.re_publisher_names.search(pub_raw)
    publisher = match.group(1) if match else ''

    if publisher:
        if location:
            location_record = ET.SubElement(record, 'publisher-loc')
            location_record.text = location
            location_record.tail = ': '
        ET.SubElement(record, 'publisher-name').text = publisher

def __get_doc_jats_xml(self, index, parent):
    """
    For one document from Solr, get the fields and format them for JATS.

    A <ref id="CITnnn"> with a numbered <label> is added to `parent`, followed
    by a <mixed-citation> whose publication-type attribute is derived from the
    document's doctype. Doctypes that are missing or not in the mapping are
    treated as 'other'. The remaining fields are appended to the citation in
    the order returned by __get_fields; fields with no value are skipped.

    :param index: position of the document in the Solr response
    :param parent: element that receives the <ref> tag
    :return: None, `parent` is modified in place
    """
    ads_to_jats_doctype_mapping = {
        'book': 'book', 'inproceedings': 'book', 'inbook': 'book',
        'proceedings': 'confproc',
        'article': 'journal', 'abstract': 'journal', 'eprint': 'journal',
        'phdthesis': 'thesis', 'mastersthesis': 'thesis',
        'software': 'software',
        'techreport': 'report',
        'bookreview': 'review',
        'circular': 'other', 'editorial': 'other', 'erratum': 'other', 'misc': 'other', 'catalog': 'other',
        'newsletter': 'other', 'obituary': 'other', 'pressrelease': 'other', 'proposal': 'other', 'talk': 'other',
    }
    a_doc = self.from_solr['response'].get('docs')[index]
    fields = self.__get_fields(self.EXPORT_FORMAT_JATS_XML)

    # add outer tag and label for this reference
    ref = ET.SubElement(parent, 'ref', id='CIT%03d' % (index + 1))
    ET.SubElement(ref, 'label').text = '%d.' % (index + 1)

    # create the citation up front, so that every field handler below has a
    # container even when the document has no doctype, and fall back to
    # 'other' for doctypes that are not in the mapping
    publication_type = ads_to_jats_doctype_mapping.get(a_doc.get('doctype'), 'other')
    record = ET.SubElement(ref, 'mixed-citation')
    record.set('publication-type', publication_type)

    def set_text_after_last(text):
        # text between tags lives in the tail of the preceding sibling, or in
        # the text of the container when there is no sibling yet
        if len(record):
            record[-1].tail = text
        else:
            record.text = text

    for field in fields:
        value = a_doc.get(field, None)
        if not value:
            continue

        if field == 'doctype':
            # already handled when the citation was created
            continue
        elif field in ('author', 'editor'):
            self.__add_person_group_jats_xml(value, record, field)
        elif field == 'year':
            # year appears in parenthesis, so open it after the last element
            set_text_after_last('\n(')
            year = ET.SubElement(record, 'year')
            year.text = value
            # now add the close parenthesis
            year.tail = ')\n'
        elif field == 'title':
            self.__add_title_jats_xml(value, record, publication_type, a_doc.get('editor', None))
        elif field == 'pub':
            if publication_type == 'journal':
                source_record = ET.SubElement(record, 'source')
                ET.SubElement(source_record, 'italic').text = value
        elif field == 'pub_raw':
            if publication_type == 'confproc':
                self.__add_conf_proc_info_jats_xml(value, record)
            # TODO: once solr contains publisher info need to remove extracting publisher from pub_raw
            elif publication_type in ('book', 'report'):
                self.__add_book_publisher_info_jats_xml(value, record)
        elif field == 'volume':
            ET.SubElement(record, 'volume').text = value
        elif field == 'issue':
            # issue appears in parenthesis, so open it after the last element
            set_text_after_last('(\n')
            issue = ET.SubElement(record, 'issue')
            issue.text = value
            # now add the close parenthesis followed by colon
            issue.tail = '):'
        elif field == 'page':
            ET.SubElement(record, 'fpage').text = ''.join(value)
        elif field == 'page_range':
            pages = ''.join(value).split('-')
            if len(pages) == 2:
                # need to insert a dash before lastpage
                # NOTE(review): this literal looks like the en dash entity
                # without its leading '&'; confirm whether it is completed
                # later or should be the character itself
                set_text_after_last(' #x2013;')
                ET.SubElement(record, 'lpage').text = pages[1]
            # insert a dot after page info
            set_text_after_last('.\n')
        elif field == 'doi':
            # need to add `doi:` before tag
            # NOTE(review): this replaces the dot set after the page info
            set_text_after_last(' doi:')
            doi = ET.SubElement(record, 'pub-id')
            doi.set('pub-id-type', 'doi')
            doi.text = ''.join(value)


def __get_xml(self, export_format):
"""
setup the outer xml structure
Expand Down Expand Up @@ -550,6 +792,9 @@ def __get_xml(self, export_format):
elif (export_format == self.EXPORT_FORMAT_DUBLIN_XML):
for index in range(num_docs):
self.__get_doc_dublin_xml(index, records)
elif (export_format == self.EXPORT_FORMAT_JATS_XML):
for index in range(num_docs):
self.__get_doc_jats_xml(index, records)
format_xml = ET.tostring(records, encoding='utf8', method='xml')
format_xml = (b'>\n<'.join(format_xml.split(b'><')))
format_xml = format_xml.replace(b'</record>', b'</record>\n')
Expand Down Expand Up @@ -577,3 +822,11 @@ def get_dublincore_xml(self):
"""
return self.__get_xml(self.EXPORT_FORMAT_DUBLIN_XML)


def get_jats_xml(self):
    """
    Export the records in the JATS XML format.

    :return: output of the shared XML builder for the JATS export format
    """
    export_format = self.EXPORT_FORMAT_JATS_XML
    return self.__get_xml(export_format)

1 change: 1 addition & 0 deletions exportsrv/tests/unittests/stubdata/xmlTest.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions exportsrv/tests/unittests/test_export_service.py
Expand Up @@ -108,6 +108,12 @@ def test_refxml_with_abs(self):
# now compare it with an already formatted data that we know is correct
assert(xml_export == xmlTest.data_ref_with_abs)

def test_jatsxml(self):
    # run the stub Solr data through the JATS exporter
    formatter = XMLFormat(solrdata.data)
    jats_export = formatter.get_jats_xml()
    # the result must equal the stored, known to be correct, output
    assert jats_export == xmlTest.data_jats

def test_aastex(self):
# format the stubdata using the code
csl_export = CSL(CSLJson(solrdata.data).get(), 'aastex', adsFormatter.latex).get()
Expand Down

0 comments on commit 9e1f97c

Please sign in to comment.