Skip to content

Commit

Permalink
Merge pull request #30 from golnazads/master
Browse files Browse the repository at this point in the history
small adjustment
  • Loading branch information
golnazads committed Dec 19, 2023
2 parents f7c7978 + 9238173 commit 9e1590f
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 53 deletions.
97 changes: 45 additions & 52 deletions adsdocmatch/match_w_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,47 +83,6 @@ def process_pub_metadata(self, metadata):
self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1)
return self.process_pub_bibstem[bibstem]

def parse_arXiv_comments(self, metadata):
    """
    Derive the matching hints for a parsed metadata record.

    When the record's ``origin`` is ``'ARXIV'``: a doi is looked for first in
    the comments (via ``self.re_doi``) and then in ``properties['DOI']`` and,
    if found, stored under ``metadata['doi']``; the title (and, for theses,
    the comments as well) is tested against the doctype patterns; and the
    record's classes are compared against ``self.MUST_MATCH``.

    For any other origin the record's own ``doi`` is removed and neutral
    values are returned.

    :param metadata: dict holding the parsed record; it is modified in place
    :return: tuple ``(metadata, comments, must_match, match_doctype)`` where
        ``comments`` is the comments string used ('' if none),
        ``must_match`` is a bool and ``match_doctype`` is a list of doctype
        names or None
    """
    if metadata.get("origin", "") != 'ARXIV':
        # in this matching, doi is the doi to match
        # hence remove it since this is the record's doi
        metadata.pop("doi", None)
        return metadata, '', False, None

    # a key can be present with a None value, or absent altogether (title had
    # no default); normalise to '' so the regex searches below cannot raise
    comments = metadata.get('comments') or ''
    title = metadata.get('title') or ''

    # extract doi out of comments if there are any, else fall back to properties
    match = self.re_doi.search(comments)
    if match:
        metadata['doi'] = match.group(1)
    else:
        doi = (metadata.get('properties') or {}).get('DOI', None)
        if doi:
            metadata['doi'] = doi.replace('doi:', '')

    # first pattern that hits wins: erratum, then book review, then thesis
    match_doctype = None
    if self.re_doctype_errata.search(title):
        match_doctype = ['erratum']
    elif self.re_doctype_bookreview.search(title):
        match_doctype = ['bookreview']
    elif self.re_doctype_thesis.search("%s %s" % (comments, title)):
        # thesis is the only doctype checked against both comments and title
        match_doctype = ['phdthesis', 'mastersthesis']

    # substring test: a MUST_MATCH entry only has to appear inside a class name
    must_match = any(
        ads_archive_class in arxiv_class
        for arxiv_class in (metadata.get('class') or [])
        for ads_archive_class in self.MUST_MATCH
    )
    return metadata, comments, must_match, match_doctype

def write_results(self, result_filename, matches, metadata_filename, rerun_filename):
"""
Expand Down Expand Up @@ -231,6 +190,21 @@ def batch_match_to_arXiv(self, input_filename, result_filename, rerun_filename):
# wait a second before the next attempt
time.sleep(1)

def parse_pub_doi_from_arXiv_record(self, comments, properties):
    """
    Find the doi to match against for a record.

    The comments text is searched with ``self.re_doi`` first; if that finds
    nothing, the ``DOI`` entry of ``properties`` is used with any ``doi:``
    text removed.

    :param comments: comments string of the record; None is treated as ''
    :param properties: dict that may carry a ``DOI`` entry; None is treated
        as an empty dict
    :return: the doi string, or None when neither source provides one
    """
    # callers pass metadata.get(...) results, which are None when the key is
    # present but empty; guard so the search / lookup below cannot raise
    match = self.re_doi.search(comments or '')
    if match:
        return match.group(1)

    doi = (properties or {}).get('DOI', None)
    if doi:
        return doi.replace('doi:', '')
    return None
def match_to_pub(self, filename):
"""
read and parse arXiv metadata file
Expand All @@ -242,7 +216,36 @@ def match_to_pub(self, filename):
try:
with open(filename, 'rb') as arxiv_fp:
metadata = get_pub_metadata(arxiv_fp.read())
metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata)
if metadata.get("origin", "") == 'ARXIV':
comments = metadata.get('comments', '')
# extract doi to match if available
doi = self.parse_pub_doi_from_arXiv_record(comments, metadata.get('properties', {}))
if doi:
metadata['doi'] = doi
match_doctype = None
title = metadata.get('title')
# check title for erratum
match = self.re_doctype_errata.search(title)
if match:
match_doctype = ['erratum']
else:
match = self.re_doctype_bookreview.search(title)
if match:
match_doctype = ['bookreview']
else:
# check both comments and title for thesis
match = self.re_doctype_thesis.search("%s %s" % (comments, title))
if match:
match_doctype = ['phdthesis', 'mastersthesis']
must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for
ads_archive_class in self.MUST_MATCH)
else:
# in this matching, doi is the doi to match
# hence remove it since this is the record's doi
metadata.pop("doi", None)
match_doctype = None
must_match = False
comments = ''
oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
# before proceeding see if this arXiv article's class is among the ones that ADS archives the
# published version if available
Expand Down Expand Up @@ -462,13 +465,3 @@ def process_match_to_pub(self, path):
combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default'))
self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename)
return combined_output_filename

# Ad-hoc manual check, executed only when the module is run as a script:
# builds a fresh MatchMetadata for each direction (match_to_pub, then
# match_to_arXiv), feeds both the same hard-coded abstract file and prints
# whatever each call returns.
if __name__ == '__main__':
print(MatchMetadata().match_to_pub('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
print(MatchMetadata().match_to_arXiv('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))

'''
/proj/ads/abstracts/gen/text/L52/L52-28159.abs
/proj/ads/abstracts/gen/text/L48/L48-23288.abs
/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768
'''
4 changes: 3 additions & 1 deletion adsdocmatch/tests/unittests/test_oracle_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ def test_extract_doi(self):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
metadata = get_pub_metadata(arxiv_fp.read())
metadata, _, _, _ = self.match_metadata.parse_arXiv_comments(metadata)
to_match_doi = self.match_metadata.parse_pub_doi_from_arXiv_record(metadata.get('comments', ''), metadata.get('properties', {}))
if to_match_doi:
metadata['doi'] = to_match_doi
self.assertEqual(self.match_metadata.ORACLE_UTIL.extract_doi(metadata), doi)

def test_read_google_sheet(self):
Expand Down

0 comments on commit 9e1590f

Please sign in to comment.