Skip to content

Commit

Permalink
Improve deduplication with support for URL DOI handles (#1386)
Browse files (browse the repository at this point in the history)
  • Loading branch information
laurens88 committed Nov 2, 2023
1 parent b844ffe commit ebecc93
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
4 changes: 4 additions & 0 deletions asreview/data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,10 @@ def duplicated(self, pid="doi"):
# in case of strings, strip whitespaces and replace empty strings with None
if is_string_dtype(self.df[pid]) or is_object_dtype(self.df[pid]):
s_pid = self.df[pid].str.strip().replace("", None)
if pid == "doi":
s_pid = s_pid.str.replace(
r"^https?://(www\.)?doi\.org/", "", regex=True
)
else:
s_pid = self.df[pid]

Expand Down
21 changes: 21 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,24 @@ def test_deduplication():

# test whether .drop_duplicates() drops the duplicated records correctly
pd.testing.assert_frame_equal(d_dups.drop_duplicates(), d_nodups.df)


def test_duplicated():
    """Check ``duplicated()`` on DOIs given both as bare ids and as URLs.

    The frame mixes the bare DOI ``10.1000/xyz`` with the same DOI written
    as ``https://www.doi.org/``, ``http://www.doi.org/`` and
    ``https://doi.org/`` URLs, plus a single ``10.1000/abc`` record. Only the
    first ``xyz`` record and the lone ``abc`` record are expected to be
    reported as non-duplicates.
    """
    doi_column = [
        'https://www.doi.org/10.1000/xyz',
        'https://www.doi.org/10.1000/abc',
        'https://www.doi.org/10.1000/xyz',
        '10.1000/xyz',
        '10.1000/xyz',
        'http://www.doi.org/10.1000/xyz',
        'https://doi.org/10.1000/xyz',
    ]
    records = pd.DataFrame(
        {
            'doi': doi_column,
            'title': ['T1', 'T2', 'T3', 'T4', 'T1', 'T2', 'T3'],
            'abstract': ['A1', 'A2', 'A3', 'A4', 'A1', 'A2', 'A3'],
        }
    )
    dataset = ASReviewData(records)

    # duplicated() is called with its default identifier column.
    flagged = dataset.duplicated()

    # One boolean per record, in row order; True marks a duplicate.
    expected = pd.Series([False, False, True, True, True, True, True])
    assert flagged.equals(expected)

0 comments on commit ebecc93

Please sign in to comment.