In [1]:
import csv
import editdistance
from utils import retrieve_paper_details
from collections import defaultdict

In [2]:
# We detect duplicates between the papers by comparing their titles.
# In order to also detect variations of the same title (e.g. uppercase, added punctuation),
# we compare the lowercased titles using the Levenshtein edit distance and flag pairs whose distance is below 20.
# Since some of the titles are shorter than 20 characters, they will naturally be
# flagged as duplicates of each other. However, this is not a problem, as we manually inspect the duplicates afterwards.
def find_duplicates(papers, threshold=20):
    """Return all pairs of papers whose titles are nearly identical.

    Titles are lowercased and compared with the Levenshtein edit distance
    (``editdistance.eval``). Two papers form a pair when that distance is
    strictly smaller than ``threshold``.

    Args:
        papers: Sequence of paper objects exposing a string ``titel`` attribute.
        threshold: Exclusive upper bound on the edit distance. The default of
            20 is the value that used to be hard-coded.

    Returns:
        A list of ``(earlier, later)`` tuples, ordered by the position of the
        first and then the second paper in ``papers``.
    """
    # Lowercase every title once instead of once per comparison.
    titles = [paper.titel.lower() for paper in papers]
    duplicates = []
    # Index-based loops avoid copying the tail of the list on every iteration.
    for first in range(len(papers)):
        for second in range(first + 1, len(papers)):
            if editdistance.eval(titles[second], titles[first]) < threshold:
                duplicates.append((papers[first], papers[second]))
    return duplicates

# An alternative approach to detecting duplicates is to compare the papers' DOIs.
# However, since not all paper entries retrieved from the databases contain a DOI,
# this approach is less accurate. It is still a nice baseline to see how well the
# title-based deduplication approach performs.
def find_duplicates_via_doi(papers):
    """Return all pairs of papers that share the same non-empty DOI.

    Papers without a DOI (empty string or ``None``) are never paired, so
    entries that merely lack a DOI are not reported as duplicates of each
    other.

    Args:
        papers: Sequence of paper objects exposing a ``doi`` attribute.
            DOIs are used as dictionary keys, so they must be hashable
            (presumably strings -- confirm against ``retrieve_paper_details``).

    Returns:
        A list of ``(earlier, later)`` tuples, ordered by the position of the
        first and then the second paper in ``papers``.
    """
    # Group paper positions by DOI in one pass instead of comparing every
    # paper against every later paper.
    positions_by_doi = {}
    for position, paper in enumerate(papers):
        if paper.doi:
            positions_by_doi.setdefault(paper.doi, []).append(position)

    duplicates = []
    for position, cur_paper in enumerate(papers):
        if not cur_paper.doi:
            continue
        # Positions are stored in ascending order; pairing only with later
        # positions emits each pair exactly once, as (earlier, later).
        for other in positions_by_doi[cur_paper.doi]:
            if other > position:
                duplicates.append((cur_paper, papers[other]))
    return duplicates

# Write all duplicates to a CSV file for manual inspection
def summarize_duplicates(duplicates, path='results/automatically_duplicated_papers.csv'):
    """Write duplicate pairs to a semicolon-delimited CSV file for manual inspection.

    The column names are taken from the instance attributes (``__dict__``) of
    the first paper of the first pair. Each pair is written as two consecutive
    rows followed by an empty separator row.

    Args:
        duplicates: List of ``(paper, paper)`` tuples.
        path: Destination file. Defaults to the location that used to be
            hard-coded; the containing directory must already exist.

    If ``duplicates`` is empty, an empty file is written so that no stale
    report from an earlier run is left behind.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows. The explicit encoding
    # keeps non-ASCII titles from failing under a non-UTF-8 default locale.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        if not duplicates:
            return
        keys = duplicates[0][0].__dict__.keys()
        csv_file = csv.DictWriter(f, keys, delimiter=";")
        csv_file.writeheader()
        for first, second in duplicates:
            csv_file.writerow(first.__dict__)
            csv_file.writerow(second.__dict__)
            # Empty row to visually separate duplicate pairs; DictWriter fills
            # every missing column with an empty string.
            csv_file.writerow({})

In [3]:
# Directory and file names of the search-result exports, one CSV per literature database.
resource_dir = "resources/"
resource_files = ['acm.csv', 'IEEE.csv', 'ScienceDirect.csv', 'scopus.csv']

# retrieve_paper_details comes from the project-local utils module; it presumably
# parses the exports into paper objects -- its exact behavior is not shown here.
papers = retrieve_paper_details(resource_dir, resource_files)
print('Total number of papers:', len(papers))

Total number of papers: 885


In [4]:
# Run both detection strategies on the same input so their results can be compared.
duplicates = find_duplicates(papers)
duplicates_via_doi = find_duplicates_via_doi(papers)
# Each list entry is one pair of papers, so these are counts of pairs, not of papers.
print('Number of duplicates via Title:', len(duplicates))
print('Number of duplicates via DOI:', len(duplicates_via_doi))

Number of duplicates via Title: 116
Number of duplicates via DOI: 90


In [5]:
# Sanity check: every pair matched by DOI should also have been matched by title.
issubset = set(duplicates_via_doi) <= set(duplicates)
print('Question: Are all duplicates found via DOI also found via Title?\nAnswer:', issubset)

Question: Are all duplicates found via DOI also found via Title?
Answer: True


In [6]:
# Export the title-based duplicate pairs to CSV for manual inspection.
summarize_duplicates(duplicates)