In [1]:
import csv
from utils import retrieve_paper_details, Paper

In [2]:
# Title pairs that retrieve_automatically_duplicated_papers() must not report
# as duplicates (per the name, these were cleared by manual inspection).
# in_non_duplicates() looks up the exact (first, second) tuple, so a pair only
# matches in the orientation(s) listed here.
# NOTE(review): the last pair is listed in one orientation only, unlike the
# pairs above it — confirm the reversed order cannot occur in the input.
manually_inspected_non_duplicates = [
    ("Continuous Compliance", "Cryptography for #MeToo"),
    ("Continuous Compliance", "Towards a privacy debt"),
    ("Cryptography for #MeToo", "Continuous Compliance"),
    ("Towards a privacy debt", "Continuous Compliance"),
    ("Lighting Control System Modeling", "Combining STPA with SysML modeling")
]

In [3]:
def in_non_duplicates(dupl1, dupl2):
    """Tell whether a candidate pair is listed as a known non-duplicate.

    The lookup is order-sensitive: the tuple ``(dupl1.titel, dupl2.titel)``
    has to appear in ``manually_inspected_non_duplicates`` as written.

    Args:
        dupl1: Object exposing a ``titel`` attribute (first of the pair).
        dupl2: Object exposing a ``titel`` attribute (second of the pair).

    Returns:
        bool: True when the title pair is in the module-level list.
    """
    title_pair = (dupl1.titel, dupl2.titel)
    is_listed = title_pair in manually_inspected_non_duplicates
    return is_listed

def retrieve_automatically_duplicated_papers(path):
    """Read the duplicate-pair CSV and return the pairs kept as real duplicates.

    The file is a ``;``-delimited CSV with a ``titel`` column. Rows form
    groups: the first row of a group becomes the paper to keep, the next row
    the paper to remove, and a row whose ``titel`` is empty closes the group.

    Args:
        path: Path of the CSV file to read.

    Returns:
        list[dict]: One ``{'keep': Paper, 'remove': Paper}`` dict per complete
        group, in file order, excluding groups for which
        ``in_non_duplicates(keep, remove)`` is true.
    """
    real_duplicates = []

    def flush(group):
        # A group is a duplicate pair only once both papers were read. Empty or
        # single-row groups (leading/repeated separator rows) are skipped
        # rather than raising KeyError.
        if 'keep' in group and 'remove' in group:
            if not in_non_duplicates(group['keep'], group['remove']):
                real_duplicates.append(group)

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handed to the parser untouched.
    with open(path, 'r', newline='') as f:
        duplicates = csv.DictReader(f, delimiter=";")
        cur_duplicate = {}
        for duplicate in duplicates:
            if duplicate['titel'] == "":
                flush(cur_duplicate)
                cur_duplicate = {}
            elif not cur_duplicate:
                cur_duplicate['keep'] = Paper(duplicate)
            else:
                # NOTE(review): in a group with more than two rows only the
                # last row survives as 'remove' (unchanged behavior) — confirm
                # such groups do not occur in the input.
                cur_duplicate['remove'] = Paper(duplicate)
        # When the file does not end with a separator row the last group is
        # still pending here; without this flush it would be silently dropped.
        flush(cur_duplicate)
    return real_duplicates

def retrieve_already_existing_paper_base(path):
    """Load the already-existing paper base from a ``;``-delimited CSV file.

    Args:
        path: Path of the CSV file; its first row is used as the header.

    Returns:
        list: One ``Paper`` built from each data row, in file order.
    """
    # newline='' is what the csv module asks for; without it, line breaks
    # inside quoted fields are translated before the parser sees them.
    with open(path, 'r', newline='') as f:
        return [Paper(row) for row in csv.DictReader(f, delimiter=";")]

def in_existing_paper_base(paper):
    """Report whether ``paper`` is contained in ``existing_paper_base``.

    ``existing_paper_base`` is a module-level name that must be defined before
    this function is called.

    Args:
        paper: The object to look up.

    Returns:
        bool: True when ``paper in existing_paper_base`` holds, else False.
    """
    return paper in existing_paper_base

def remove_duplicates(duplicates, papers, existing_base=None):
    """Filter out papers that are duplicates or already in the paper base.

    Args:
        duplicates: Iterable of dicts with a ``'remove'`` entry; every paper
            equal to one of those entries is dropped.
        papers: Iterable of papers to filter.
        existing_base: Optional container of already-known papers. When left
            as None the module-level ``in_existing_paper_base()`` decides,
            which is the previous behavior.

    Returns:
        list: The papers that are kept, in their original order.
    """
    # Built once up front; previously this list was rebuilt for every paper.
    # A list (not a set) is kept so that only equality is required of papers.
    to_remove = [d['remove'] for d in duplicates]

    if existing_base is None:
        is_known = in_existing_paper_base
    else:
        def is_known(paper):
            return paper in existing_base

    return [p for p in papers if p not in to_remove and not is_known(p)]

def save_remaining_papers(papers, path='results/deduplicated_files.csv'):
    """Write the papers to a ``;``-delimited CSV file.

    The header row is taken from the attribute names (``__dict__`` keys) of
    the first paper; each paper is then written as one row of its
    ``__dict__``.

    Args:
        papers: Sequence of objects whose ``__dict__`` holds the columns.
        path: Destination file. Defaults to the location used so far.

    Returns:
        None. With an empty ``papers`` the file is created/truncated and left
        empty, since no column names are known.
    """
    # newline='' stops the text layer from translating the writer's own
    # '\r\n' row terminator, which otherwise yields blank rows on Windows.
    with open(path, 'w', newline='') as f:
        if not papers:
            return
        keys = list(papers[0].__dict__.keys())
        csv_file = csv.DictWriter(f, keys, delimiter=";")
        csv_file.writeheader()
        csv_file.writerows(paper.__dict__ for paper in papers)

In [4]:
# Directory prefix for the input CSVs and the file names handed to the loader.
resource_dir = "resources/"
resource_files = ['acm.csv', 'IEEE.csv', 'ScienceDirect.csv', 'scopus.csv']

# retrieve_paper_details comes from utils; presumably it reads each listed
# file under resource_dir into Paper objects — confirm in utils.
papers = retrieve_paper_details(resource_dir, resource_files)
print('Total number of papers:', len(papers))

Total number of papers: 885


In [5]:
# Read the duplicate pairs from results/; pairs listed in
# manually_inspected_non_duplicates are filtered out by the reader.
results_dir = "results/"
automatically_dup_papers = 'automatically_duplicated_papers.csv'
valid_duplicates = retrieve_automatically_duplicated_papers(results_dir + automatically_dup_papers)
print('Number of valid duplicates:', len(valid_duplicates))

Number of valid duplicates: 111


In [6]:
# This cell must run before remove_duplicates(): in_existing_paper_base()
# reads the module-level existing_paper_base that is defined here.
resource_file = 'preexisting_paper_base.csv'
existing_paper_base = retrieve_already_existing_paper_base(resource_dir + resource_file)

In [7]:
# Keep the papers that are neither the 'remove' side of a valid duplicate pair
# nor contained in existing_paper_base.
keep = remove_duplicates(valid_duplicates, papers)
print('Remaining papers after deduplication:', len(keep))

Remaining papers after deduplication: 736


In [8]:
# Writes results/deduplicated_files.csv; the file is opened in 'w' mode, so a
# previous result is overwritten on every run.
save_remaining_papers(keep)