Merge pull request #30 from golnazads/master

small adjustment
adsabs · Dec 19, 2023 · 9e1590f · 9e1590f
2 parents f7c7978 + 9238173
commit 9e1590f
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 53 deletions.
diff --git a/adsdocmatch/match_w_metadata.py b/adsdocmatch/match_w_metadata.py
@@ -83,47 +83,6 @@ def process_pub_metadata(self, metadata):
             self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1)
         return self.process_pub_bibstem[bibstem]
 
-    def parse_arXiv_comments(self, metadata):
-        """
-
-        :param metadata:
-        :return:
-        """
-        if metadata.get("origin", "") == 'ARXIV':
-            comments = metadata.get('comments', '')
-            # extract doi out of comments if there are any
-            match = self.re_doi.search(comments)
-            if match:
-                metadata['doi'] = match.group(1)
-            else:
-                doi = metadata.get('properties', {}).get('DOI', None)
-                if doi:
-                    metadata['doi'] = doi.replace('doi:', '')
-            match_doctype = None
-            title = metadata.get('title')
-            # check title for erratum
-            match = self.re_doctype_errata.search(title)
-            if match:
-                match_doctype = ['erratum']
-            else:
-                match = self.re_doctype_bookreview.search(title)
-                if match:
-                    match_doctype = ['bookreview']
-                else:
-                    # check both comments and title for thesis
-                    match = self.re_doctype_thesis.search("%s %s" % (comments, title))
-                    if match:
-                        match_doctype = ['phdthesis', 'mastersthesis']
-            must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
-        else:
-            # in this matching, doi is the doi to match
-            # hence remove it since this is the record's doi
-            metadata.pop("doi", None)
-            match_doctype = None
-            must_match = False
-            comments = ''
-        return metadata, comments, must_match, match_doctype
-
     def write_results(self, result_filename, matches, metadata_filename, rerun_filename):
         """
 
@@ -231,6 +190,21 @@ def batch_match_to_arXiv(self, input_filename, result_filename, rerun_filename):
                     # wait a second before the next attempt
                     time.sleep(1)
 
+    def parse_pub_doi_from_arXiv_record(self, comments, properties):
+        """
+        Pick out the DOI to match against from the parts of an arXiv record.
+
+        The comments text is searched first with self.re_doi; only when that
+        search finds nothing is the 'DOI' entry of properties consulted.
+
+        :param comments: comments text of the record, searched with self.re_doi
+        :param properties: mapping that may carry a 'DOI' entry (read via .get)
+        :return: group 1 of the re_doi match if there is one, otherwise the 'DOI'
+            property with every 'doi:' substring removed, otherwise None
+        """
+        match = self.re_doi.search(comments)
+        if match:
+            return match.group(1)
+        else:
+            doi = properties.get('DOI', None)
+            if doi:
+                # drop the 'doi:' marker so only the identifier itself is returned
+                return doi.replace('doi:', '')
+        # neither the comments nor the properties yielded a DOI: bare return gives None
+        return
     def match_to_pub(self, filename):
         """
         read and parse arXiv metadata file
@@ -242,7 +216,36 @@ def match_to_pub(self, filename):
         try:
             with open(filename, 'rb') as arxiv_fp:
                 metadata = get_pub_metadata(arxiv_fp.read())
-                metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata)
+                if metadata.get("origin", "") == 'ARXIV':
+                    comments = metadata.get('comments', '')
+                    # extract doi to match if available
+                    doi = self.parse_pub_doi_from_arXiv_record(comments, metadata.get('properties', {}))
+                    if doi:
+                        metadata['doi'] = doi
+                    match_doctype = None
+                    title = metadata.get('title')
+                    # check title for erratum
+                    match = self.re_doctype_errata.search(title)
+                    if match:
+                        match_doctype = ['erratum']
+                    else:
+                        match = self.re_doctype_bookreview.search(title)
+                        if match:
+                            match_doctype = ['bookreview']
+                        else:
+                            # check both comments and title for thesis
+                            match = self.re_doctype_thesis.search("%s %s" % (comments, title))
+                            if match:
+                                match_doctype = ['phdthesis', 'mastersthesis']
+                    must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for
+                                     ads_archive_class in self.MUST_MATCH)
+                else:
+                    # in this matching, doi is the doi to match
+                    # hence remove it since this is the record's doi
+                    metadata.pop("doi", None)
+                    match_doctype = None
+                    must_match = False
+                    comments = ''
                 oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
                 # before proceeding see if this arXiv article's class is among the ones that ADS archives the
                 # published version if available
@@ -462,13 +465,3 @@ def process_match_to_pub(self, path):
         combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default'))
         self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename)
         return combined_output_filename
-
-if __name__ == '__main__':
-    print(MatchMetadata().match_to_pub('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
-    print(MatchMetadata().match_to_arXiv('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
-
-    '''
-/proj/ads/abstracts/gen/text/L52/L52-28159.abs
-/proj/ads/abstracts/gen/text/L48/L48-23288.abs
-/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768
-'''
diff --git a/adsdocmatch/tests/unittests/test_oracle_util.py b/adsdocmatch/tests/unittests/test_oracle_util.py
@@ -79,7 +79,9 @@ def test_extract_doi(self):
             fullpath = stubdata_dir + filename
             with open(fullpath, 'rb') as arxiv_fp:
                 metadata = get_pub_metadata(arxiv_fp.read())
-                metadata, _, _, _ = self.match_metadata.parse_arXiv_comments(metadata)
+                to_match_doi = self.match_metadata.parse_pub_doi_from_arXiv_record(metadata.get('comments', ''), metadata.get('properties', {}))
+                if to_match_doi:
+                    metadata['doi'] = to_match_doi
                 self.assertEqual(self.match_metadata.ORACLE_UTIL.extract_doi(metadata), doi)
 
     def test_read_google_sheet(self):