Merge pull request #234 from golnazads/master

added JATS export, first pass
adsabs · Apr 21, 2023 · 9e1f97c · 9e1f97c
2 parents 8acbba1 + 62de892
commit 9e1f97c
Show file tree

Hide file tree

Showing 6 changed files with 290 additions and 2 deletions.
diff --git a/.github/workflows/python_actions.yml b/.github/workflows/python_actions.yml
@@ -16,7 +16,7 @@ jobs:
     - name: Install dependencies
       run: |
         # python -m pip install --upgrade setuptools pip
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade wheel pip
         pip install -U -r requirements.txt
         pip install -U -r dev-requirements.txt
         

diff --git a/exportsrv/formatter/xmlFormat.py b/exportsrv/formatter/xmlFormat.py
@@ -5,25 +5,31 @@
 from datetime import datetime
 from flask import current_app
 from textwrap import fill
+import re
+from geotext import GeoText
+from csv import reader
 
 from exportsrv.formatter.format import Format
 from exportsrv.utils import get_eprint
 from exportsrv.formatter.strftime import strftime
 
-# This class accepts JSON object created by Solr and can reformats it
+# This class accepts JSON object created by Solr and can reformat it
 # for the XML Export formats we are supporting.
 # 1- To get Dublin Core XML use
 #    dublinXML = XMLFormat(jsonFromSolr).get_dublincore_xml()
 # 2- To get Reference XML without Abstract use
 #    referenceXML = XMLFormat(jsonFromSolr).get_reference_xml()
 # 3- To get Reference XML with Abstract use
 #    referenceXML = XMLFormat(jsonFromSolr).get_reference_xml(True)
+# 4- To get JATS XML use
+#    jatsXML = XMLFormat(jsonFromSolr).get_jats_xml()
 
 class XMLFormat(Format):
 
     EXPORT_FORMAT_REF_XML = 'ReferenceXML'
     EXPORT_FORMAT_REF_ABS_XML = 'ReferenceAbsXML'
     EXPORT_FORMAT_DUBLIN_XML = 'DublinXML'
+    EXPORT_FORMAT_JATS_XML = 'JATSXML'
 
     EXPORT_SERVICE_RECORDS_SET_XML_REF = [('xmlns', 'http://ads.harvard.edu/schema/abs/1.1/references'),
                                           ('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
@@ -38,6 +44,18 @@ class XMLFormat(Format):
                                              ('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
                                              ('xsi:schemaLocation', 'http://ads.harvard.edu/schema/abs/1.1/dc http://ads.harvard.edu/schema/abs/1.1/dc.xsd')]
 
+    # attribute name/value pairs for the JATS export
+    # NOTE(review): the default namespace below is the Dublin Core one (.../abs/1.1/dc) while
+    # xsi:schemaLocation points to .../abs/1.0/jats -- confirm this is intended and not a copy/paste leftover
+    EXPORT_SERVICE_RECORDS_SET_XML_JATS = [('xmlns', 'http://ads.harvard.edu/schema/abs/1.1/dc'),
+                                           ('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'),
+                                           ('xmlns:dc', 'http://purl.org/dc/elements/1.1/'),
+                                           ('xsi:schemaLocation', 'http://ads.harvard.edu/schema/abs/1.0/jats http://ads.harvard.edu/schema/abs/1.0/jats.xsd')]
+
+    # year candidate, searched for in pub_raw of conference proceedings
+    # NOTE(review): `[12]+` accepts a run of 1s and 2s, and `[09]` restricts the digit that follows
+    # to 0 or 9 (i.e., 18xx is never matched) -- confirm that this is the intended range
+    re_year = re.compile(r'([12]+[09]\d\d)')
+
+    # partial list of known publishers,
+    # once this field is populated in solr, rely on solr,
+    # for now this is only used for JATS format
+    re_publisher_names = re.compile(r"([A-Z]+[A-Za-z\s\-:]+ University Press|[A-Z]+[A-Za-z\s\-:,']+ Press|Springer- .*|Elsevier|University of [A-Z]+[A-Za-z\s-]+ Press|University of [A-Z]+[A-Za-z\s-]+|[\w-]+,\s*\w+\s*:[\s\w]+|Springer\s+([A-Z]+[A-Za-z\s-]+)+|Springer Nature|Springer, Cham|Springer Fachmedien Wiesbaden GmbH, DE|Springer-Verlag GmbH Deutschland|Springer-Verlag Berlin Heidelberg|Springer-Verlag|Springer|Cambridge, J. Wilson and son, University press|Cambridge, The University press|Oxford university press|Loyola university press|Cambridge, University press|Harvard university press|Cambridge, Eng., The University press|Louisiana state university press|Des Moines, Iowa, University press|Cambridge [Eng.] The University press|Edinburgh University Press|Cambridge University Press|Yale University Press|Brigham Young University Press|University Press of Virginia|Erevan University Press|Artemis Press|Laval University Press|Columbia University Press|Rutgers University Press|University Press of America|Johns Hopkins University Press|Sydney University Press|Yerevan University press|McGill-Queen's University Press|Innsbruck University Press|University of Arizona Press|Atlantis Press|Ilia State University Press|Ziti press|The University of Chicago Press|Princeton University Press|eXamen.press|T rculo Press|Duke University Press|Kyoto University Press|Imperial College Press|Heron Press Ltd|Kyiv University Press|Sole Logistics Press|BrownWalker Press|Joseph Henry Press|National Radio Astronomy Press|SPIE Press|Kyriakidis Press|St. Martin's Press|Huntington Library and University of Washington Press|Microcosm Press|Free Press \(Simon and Schuster\)|Cambridge Univ. Press|Templeton Foundation Press|IEEE Press|Heron Press|AIP Press|Pergamon Press|Boydell Press|Baltic Astronomy 6 and L. 
Davis Press|West Virginia University Press|ACM Press Books|State University of New York \(SUNY\) Press|Clarendon Press|Universal Academic Press Inc|SPIE Optical Engineering Press|Presses universitaires de France|Ginn Press|ABELexpress|SPC Press|CRC Press|Plenum Press|Pedagogical Univ. Press|The MIT \(Massachusetts Institute of Technology\) Press|Yourdon Press Computing Series|L. Davis Press|Ivy Press Books|Moscow Univ. Press|Cambridge Univ.  Press|Presses du CNRS|Presses de l'Ecole nationale des ponts et chaussees|Academic Press Inc|Cambridge UniversityPress|Vantage Press Inc|Massachusettes Institute of Technology \(MIT\) Press|IAP Press|Academic Press and OHM|IEEE Comput. Soc. Press|The Weizmann Science Press of Israel|MIT Press|Process Press|Blandford Press|Science Press|University of Tasmania Press|Vantage Press|Arno Press|Academic Press|University of Massachusetts Press|Delacorte Press/E. Friede|The Macmillan Press Ltd|Peebles Press|Anchor Press/Doubleday|Smithsonian Institution Press|Anchor Press / Doubleday|FAN Press|Univ. Calif. Press|Presses de la Cite|Ballena Press|Pica Press|University of Texas Press|Optosonic Press|University of Missouri Press|University of Alabama Press|Nauka Press|Exposition Press|Presses universitaire de France|Viking Press|Priory Press|Chemical Rubber Co. Press|Books for Libraries Press|Pragopress|Fundamental Research Press|University of California Press|University of Michigan Press|University of New Mexico Press|University of London Press|Natural History Press|Lenin Belorussian State University Press|Crowell-Collier Press|Greenwood Press|NEO Press|M.I.T. Press|Univ. Wisconsin Press|University of Chicago Press|University of Colorado Press|Brockhampton Press|Golden Press|Lutterworth Press|Trident Press Book|Beacon Press|St Martin's Press|Pageant Press|M. I. T. 
Press|Orion Press|The Univesrity of Chicago Press|Museum Press|Citadel Press|Pegasus Press|Childrens Press|Ronald Press Co|Majestic Press|Westernlore Press|The Science press printing company|The Technical press ltd|The Florida Bible institute press|The Sheldon press|Pacific Science Press|The Theosophical press|The Clarendon press|The Pilgrim press|The Hispanic Society of America and The De Vinne Press|Press of E. W. Stephen|The Nichols press|Press of T. P. Nichols|Press of J. Wilson and son|Roy. Acad. press)")
+
     def __format_date(self, solr_date, export_format):
         """
 
@@ -350,6 +368,10 @@ def __get_fields(self, export_format):
                       ('pub_raw', 'dc:source'), ('pubdate', 'dc:date'), ('keyword', 'dc:subject'),
                       ('copyright', 'dc:rights'), ('url', 'dc:relation'), ('num_citations', 'dc:relation'),
                       ('abstract', 'dc:description'), ('doi', 'dc:identifier')]
+        elif (export_format == self.EXPORT_FORMAT_JATS_XML):
+            fields = [('doctype', ''), ('author', ''), ('year', ''), ('title', ''),
+                      ('pub', ''), ('pub_raw', ''), ('volume', 'volume'), ('issue', 'issue'),
+                      ('editor', ''), ('publisher', ''), ('page', ''), ('page_range', ''), ('doi', '')]
         else:
             fields = []
         return OrderedDict(fields)
@@ -523,6 +545,226 @@ def __get_doc_reference_xml(self, index, parent, export_format):
                 self.__add_in(record, fields[field], get_eprint(a_doc))
 
 
+    def __add_person_group_jats_xml(self, person_list, record, person_group_type):
+        """
+        add authors or editors for JATS xml format
+
+        each person becomes a <string-name> tag holding a <surname> tag and, when a given name
+        is available, a <given-names> tag holding the first initial followed by a dot
+
+        :param person_list: list of names, each expected to be in the `surname, given names` format
+        :param record: element that the <person-group> tag is appended to
+        :param person_group_type: value of the person-group-type attribute, i.e., `author` or `editor`
+        :return:
+        """
+        if not person_list:
+            return
+
+        # add outer tag
+        person_group_record = ET.SubElement(record, 'person-group')
+        person_group_record.set('person-group-type', person_group_type)
+        # now add inner tags
+        for person in person_list:
+            separate = [part.strip() for part in person.split(',')]
+            person_record = ET.SubElement(person_group_record, 'string-name')
+            ET.SubElement(person_record, 'surname').text = separate[0]
+            # author might not have first name, or it might be empty (i.e., `surname,`),
+            # in either case there is no initial to output
+            if len(separate) == 2 and separate[1]:
+                ET.SubElement(person_record, 'given-names').text = '%s.'%separate[1][0]
+        # add role tag if this is editor type
+        if (person_group_type == 'editor'):
+            ET.SubElement(record, 'role').text = 'Eds.'
+
+
+    def __add_title_jats_xml(self, title, record, publication_type, lookahead):
+        """
+        append the title to the citation, the tag that is used depends on the JATS publication type
+
+        :param title: list of title strings, they are joined with a semicolon
+        :param record: element that the title tag is appended to
+        :param publication_type: JATS publication type of the record
+        :param lookahead: truthy if the record has editor, in which case
+                          the book title is followed by a semicolon
+        :return:
+        """
+        joined_title = ';'.join(title)
+
+        # journal and report: <article-title>title</article-title>.
+        # confproc: <article-title>title</article-title>,
+        article_title_tails = {'journal': '.\n', 'report': '.\n', 'confproc': ',\n'}
+
+        if publication_type in article_title_tails:
+            article_title = ET.SubElement(record, 'article-title')
+            article_title.text = joined_title
+            article_title.tail = article_title_tails[publication_type]
+            return
+
+        if publication_type == 'book':
+            # book: <title><italic>title</italic></title>
+            # book with editor: <title><italic>title</italic></title>;
+            # NOTE(review): the original comments described this tag as <source>, confirm which one is intended
+            book_title = ET.SubElement(record, 'title')
+            ET.SubElement(book_title, 'italic').text = joined_title
+            if lookahead:
+                book_title.tail = ';\n'
+            return
+
+        if publication_type == 'thesis':
+            # the title itself is not output for thesis: <source>Ph.D. thesis</source>
+            ET.SubElement(record, 'source').text = 'Ph.D. thesis'
+            return
+
+        if publication_type in ('software', 'review', 'other'):
+            # <source>title</source>
+            ET.SubElement(record, 'source').text = joined_title
+
+
+    def __add_conf_proc_info_jats_xml(self, pub_raw, record):
+        """
+        for confproc publication type, jats format the following four tags are needed to be filled
+        <conf-name>usually the first or second substring</conf-name>,
+        <conf-loc>city/country usually appears following conference name</conf-loc>,
+        <month>most records do not have the month of conference so for now ignore</month>
+        <year>year of the conference usually appears in pub_raw</year>.
+
+        nothing is added if no conference name can be inferred from pub_raw
+
+        :param pub_raw: raw publication string of the record
+        :param record: element that the conference tags are appended to
+        :return:
+        """
+        # see if the year appears in pub_raw
+        match = self.re_year.search(pub_raw)
+        year = match.group(1) if match else None
+
+        # see if the location is in pub_raw,
+        # GeoText returns lists of names, so they have to be joined to get a string,
+        # cities first followed by countries, skipping the repeated mentions
+        places = GeoText(pub_raw)
+        names = []
+        for name in list(places.cities) + list(places.countries):
+            if name not in names:
+                names.append(name)
+        location = ', '.join(names)
+
+        # now split the pub_raw on commas and try to see if conference name can be inferred
+        substrings = next(reader([pub_raw]), [])
+        conference = [s for s in substrings if 'conference' in s.lower()]
+        if conference:
+            ET.SubElement(record, 'conf-name').text = conference[0]
+            if location:
+                ET.SubElement(record, 'conf-loc').text = location
+            if year:
+                ET.SubElement(record, 'year').text = year
+
+
+    def __add_book_publisher_info_jats_xml(self, pub_raw, record):
+        """
+        for book publication type, jats format the following two tags are needed to be filled
+        <publisher-loc>in pub_raw</publisher-loc>:
+        <publisher-name>in pub_raw</publisher-name>.
+
+        nothing is added if no known publisher name is found in pub_raw
+
+        :param pub_raw: raw publication string of the record
+        :param record: element that the publisher tags are appended to
+        :return:
+        """
+        # see if the location is in pub_raw,
+        # GeoText returns lists of names, so they have to be joined to get a string,
+        # cities first followed by countries, skipping the repeated mentions
+        places = GeoText(pub_raw)
+        names = []
+        for name in list(places.cities) + list(places.countries):
+            if name not in names:
+                names.append(name)
+        location = ', '.join(names)
+
+        # search from the start of pub_raw, the second positional argument of a compiled
+        # pattern's search is the start position, not the flags, so passing re.IGNORECASE
+        # there skipped the first two characters and had no effect on the case sensitivity
+        match = self.re_publisher_names.search(pub_raw)
+        publisher = match.group(1) if match else ''
+
+        if publisher:
+            if location:
+                location_record = ET.SubElement(record, 'publisher-loc')
+                location_record.text = location
+                location_record.tail = ': '
+            ET.SubElement(record, 'publisher-name').text = publisher
+
+    def __get_doc_jats_xml(self, index, parent):
+        """
+        for each document from Solr, get the fields, and format them accordingly for JATS format
+
+        the document becomes a <ref> tag holding a <label> and a <mixed-citation>,
+        punctuation between the tags of the citation is kept in the tail of the preceding tag
+
+        :param index: position of the document in the list of docs returned from Solr
+        :param parent: element that the <ref> tag of this document is appended to
+        :return:
+        """
+        ads_to_jats_doctype_mapping = {
+            'book': 'book', 'inproceedings': 'book', 'inbook': 'book',
+            'proceedings':'confproc',
+            'article': 'journal', 'abstract': 'journal', 'eprint': 'journal',
+            'phdthesis': 'thesis', 'mastersthesis': 'thesis',
+            'software': 'software',
+            'techreport': 'report',
+            'bookreview': 'review',
+            'circular': 'other', 'editorial': 'other', 'erratum': 'other', 'misc': 'other', 'catalog': 'other',
+            'newsletter': 'other', 'obituary': 'other', 'pressrelease': 'other', 'proposal': 'other', 'talk': 'other',
+        }
+        a_doc = self.from_solr['response'].get('docs')[index]
+        fields = self.__get_fields(self.EXPORT_FORMAT_JATS_XML)
+
+        # add outer tag and label for this reference
+        ref = ET.SubElement(parent, 'ref', id='CIT%03d'%(index+1))
+        ET.SubElement(ref, 'label').text = '%d.'%(index+1)
+
+        # create the citation up front, so that a doc with no doctype can still be formatted,
+        # the publication-type attribute is set when doctype is processed
+        record = ET.SubElement(ref, 'mixed-citation')
+        publication_type = ''
+
+        def add_lead_in(text):
+            # put text right before the next tag: in the tail of the last tag,
+            # or in the text of the citation if there is no tag yet
+            if len(record):
+                record[-1].tail = text
+            else:
+                record.text = text
+
+        for field in fields:
+            if not a_doc.get(field, None):
+                continue
+
+            if (field == 'doctype'):
+                # doctypes that are not in the mapping are treated as `other`
+                publication_type = ads_to_jats_doctype_mapping.get(a_doc.get(field, ''), 'other')
+                record.set('publication-type', publication_type)
+            elif (field == 'author') or (field == 'editor'):
+                self.__add_person_group_jats_xml(a_doc.get(field, []), record, field)
+            elif (field == 'year'):
+                # year appears in parenthesis, so need to add open parenthesis before the tag
+                add_lead_in('\n(')
+                year = ET.SubElement(record, 'year')
+                year.text = a_doc.get(field, '')
+                # now add the close parenthesis
+                year.tail = ')\n'
+            elif (field == 'title'):
+                self.__add_title_jats_xml(a_doc.get(field, ''), record, publication_type, a_doc.get('editor', None))
+            elif (field == 'pub'):
+                if (publication_type == 'journal'):
+                    source_record = ET.SubElement(record, 'source')
+                    ET.SubElement(source_record, 'italic').text = a_doc.get(field, '')
+            elif (field == 'pub_raw'):
+                if (publication_type == 'confproc'):
+                    self.__add_conf_proc_info_jats_xml(a_doc.get(field, ''), record)
+                # TODO: once solr contains publisher info need to remove extracting publisher from pub_raw
+                elif (publication_type == 'book') or (publication_type == 'report'):
+                    self.__add_book_publisher_info_jats_xml(a_doc.get(field, ''), record)
+            elif (field == 'volume'):
+                ET.SubElement(record, 'volume').text = a_doc.get(field, '')
+            elif (field == 'issue'):
+                # issue appears in parenthesis, so need to add open parenthesis before the tag
+                add_lead_in('(\n')
+                issue = ET.SubElement(record, 'issue')
+                issue.text = a_doc.get(field, '')
+                # now add the close parenthesis followed by colon
+                issue.tail = '):'
+            elif (field == 'page'):
+                ET.SubElement(record, 'fpage').text = ''.join(a_doc.get(field, ''))
+            elif (field == 'page_range'):
+                pages = ''.join(a_doc.get('page_range', '')).split('-')
+                if len(pages) == 2:
+                    # need to insert an en dash before lastpage,
+                    # the character itself is used since the serializer escapes entity references
+                    add_lead_in('\u2013')
+                    ET.SubElement(record, 'lpage').text = pages[1]
+                # insert a dot after page info
+                if len(record):
+                    record[-1].tail = '.\n'
+            elif (field == 'doi'):
+                # need to add `doi:` before tag
+                add_lead_in(' doi:')
+                doi = ET.SubElement(record, 'pub-id')
+                doi.set('pub-id-type', 'doi')
+                doi.text = ''.join(a_doc.get(field, ''))
+
+
     def __get_xml(self, export_format):
         """
         setup the outer xml structure
@@ -550,6 +792,9 @@ def __get_xml(self, export_format):
             elif (export_format == self.EXPORT_FORMAT_DUBLIN_XML):
                 for index in range(num_docs):
                     self.__get_doc_dublin_xml(index, records)
+            elif (export_format == self.EXPORT_FORMAT_JATS_XML):
+                for index in range(num_docs):
+                    self.__get_doc_jats_xml(index, records)
             format_xml = ET.tostring(records, encoding='utf8', method='xml')
             format_xml = (b'>\n<'.join(format_xml.split(b'><')))
             format_xml = format_xml.replace(b'</record>', b'</record>\n')
@@ -577,3 +822,11 @@ def get_dublincore_xml(self):
         """
         return self.__get_xml(self.EXPORT_FORMAT_DUBLIN_XML)
 
+
+    def get_jats_xml(self):
+        """
+        format the records in JATS XML, by passing EXPORT_FORMAT_JATS_XML to __get_xml
+
+        :return: jats xml format
+        """
+        return self.__get_xml(self.EXPORT_FORMAT_JATS_XML)
+
diff --git a/exportsrv/tests/unittests/stubdata/xmlTest.py b/exportsrv/tests/unittests/stubdata/xmlTest.py
diff --git a/exportsrv/tests/unittests/test_export_service.py b/exportsrv/tests/unittests/test_export_service.py
@@ -108,6 +108,12 @@ def test_refxml_with_abs(self):
         # now compare it with an already formatted data that we know is correct
         assert(xml_export == xmlTest.data_ref_with_abs)
 
+    def test_jatsxml(self):
+        """
+        JATS XML export of the solr stubdata has to be identical to the stored expected output
+        """
+        # format the stubdata using the code
+        xml_export = XMLFormat(solrdata.data).get_jats_xml()
+        # now compare it with an already formatted data that we know is correct
+        assert(xml_export == xmlTest.data_jats)
+
     def test_aastex(self):
         # format the stubdata using the code
         csl_export = CSL(CSLJson(solrdata.data).get(), 'aastex', adsFormatter.latex).get()