Skip to content

Commit

Permalink
Add case-insensitive DOI deduplication (#1606)
Browse files (browse the repository at this point in the history)
  • Loading branch information
laurens88 committed Dec 12, 2023
1 parent d73162c commit fb35d18
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion asreview/data/base.py
Expand Up @@ -510,7 +510,7 @@ def duplicated(self, pid="doi"):
if is_string_dtype(self.df[pid]) or is_object_dtype(self.df[pid]):
s_pid = self.df[pid].str.strip().replace("", None)
if pid == "doi":
- s_pid = s_pid.str.replace(
+ s_pid = s_pid.str.lower().str.replace(
r"^https?://(www\.)?doi\.org/", "", regex=True
)
else:
Expand Down
7 changes: 4 additions & 3 deletions tests/test_data.py
Expand Up @@ -143,16 +143,17 @@ def test_duplicated():
'doi': ['https://www.doi.org/10.1000/xyz',
'https://www.doi.org/10.1000/abc',
'https://www.doi.org/10.1000/xyz',
+ 'https://www.doi.org/10.1000/XYZ',
'10.1000/xyz',
'10.1000/xyz',
'http://www.doi.org/10.1000/xyz',
'https://doi.org/10.1000/xyz'],
- 'title': ['T1', 'T2', 'T3', 'T4', 'T1', 'T2', 'T3'],
- 'abstract': ['A1', 'A2', 'A3', 'A4', 'A1', 'A2', 'A3']
+ 'title': ['T1', 'T2', 'T3', 'T3', 'T4', 'T1', 'T2', 'T3'],
+ 'abstract': ['A1', 'A2', 'A3', 'A3', 'A4', 'A1', 'A2', 'A3']
}))

# Call the function and get the result
result = instance.duplicated()

# Check the result
- assert result.equals(pd.Series([False, False, True, True, True, True, True]))
+ assert result.equals(pd.Series([False, False, True, True, True, True, True, True]))

0 comments on commit fb35d18

Please sign in to comment.