diff --git a/asreview/data/base.py b/asreview/data/base.py index fa6e9557d..ea930d50a 100644 --- a/asreview/data/base.py +++ b/asreview/data/base.py @@ -510,7 +510,7 @@ def duplicated(self, pid="doi"): if is_string_dtype(self.df[pid]) or is_object_dtype(self.df[pid]): s_pid = self.df[pid].str.strip().replace("", None) if pid == "doi": - s_pid = s_pid.str.replace( + s_pid = s_pid.str.lower().str.replace( r"^https?://(www\.)?doi\.org/", "", regex=True ) else: diff --git a/tests/test_data.py b/tests/test_data.py index 649460f6e..7e0504585 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -143,16 +143,17 @@ def test_duplicated(): 'doi': ['https://www.doi.org/10.1000/xyz', 'https://www.doi.org/10.1000/abc', 'https://www.doi.org/10.1000/xyz', + 'https://www.doi.org/10.1000/XYZ', '10.1000/xyz', '10.1000/xyz', 'http://www.doi.org/10.1000/xyz', 'https://doi.org/10.1000/xyz'], - 'title': ['T1', 'T2', 'T3', 'T4', 'T1', 'T2', 'T3'], - 'abstract': ['A1', 'A2', 'A3', 'A4', 'A1', 'A2', 'A3'] + 'title': ['T1', 'T2', 'T3', 'T3', 'T4', 'T1', 'T2', 'T3'], + 'abstract': ['A1', 'A2', 'A3', 'A3', 'A4', 'A1', 'A2', 'A3'] })) # Call the function and get the result result = instance.duplicated() # Check the result - assert result.equals(pd.Series([False, False, True, True, True, True, True])) + assert result.equals(pd.Series([False, False, True, True, True, True, True, True]))