In [18]:
import csv
import dedupe
import re
from unidecode import unidecode
import os

input_file = 'readyDedupYp.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'


def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """
    try : # python 2/3 string differences
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column

def readData(filename):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID and each value is dict
    """

    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d

print('importing data ...')
data_d = readData(input_file)

# If a settings file already exists, we'll just load that and skip training
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # ## Training
    # Define the fields dedupe will pay attention to
    fields = [
        {'field': 'name', 'type': 'String'},
        {'field': 'phone', 'type': 'String'},
        {'field': 'streetAddress', 'type': 'String'},
        {'field': 'city', 'type': 'String'},
         {'field': 'state', 'type': 'String'},
        {'field': 'web', 'type': 'String', 'has missing': True},
    ]

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    ######  'Dedupe' object has no attribute 'sample' ###### 

if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    #'StaticDedupe' object has no attribute 'prepare_training'
    #with open(training_file, 'rb') as f:
        #deduper.prepare_training(data_d, f) 
else:
    deduper.prepare_training(data_d)
    
    
    print('starting active labeling...')

    dedupe.console_label(deduper) #https://github.com/dedupeio/dedupe-examples/issues/108

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.write_training(tf) #also seems to have had a name change

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        deduper.write_settings(sf) #name change here too

        
#######'Dedupe' object has no attribute 'threshold'     
print('clustering...')
###clustered_dupes = deduper.match(data_d, threshold)
clustered_dupes = deduper.partition(data_d, 0.5) 

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w', newline = '') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)

importing data ...


INFO:dedupe.api:Predicate set:
INFO:dedupe.api:(SimplePredicate: (twoGramFingerprint, name), TfidfTextCanopyPredicate: (0.6, city), SimplePredicate: (sortedAcronym, phone))
INFO:dedupe.api:(SimplePredicate: (commonThreeTokens, name), TfidfTextCanopyPredicate: (0.4, web), TfidfTextCanopyPredicate: (0.2, name))


reading from csv_example_learned_settings
reading labeled examples from  csv_example_training.json
clustering...


INFO:dedupe.canopy_index:Removing stop word com


# duplicate sets 2921
