In [1]:
import os
import time
import logging
import optparse
import locale
import itertools
import io
import csv
import dj_database_url
import psycopg2
import psycopg2.extras
import dedupe
import numpy

from psycopg2.extensions import register_adapter, AsIs

register_adapter(numpy.int32, AsIs)
register_adapter(numpy.int64, AsIs)
register_adapter(numpy.float32, AsIs)
register_adapter(numpy.float64, AsIs)

## 1. Methods/Classes

In [3]:
class Readable(object):
    """Minimal file-like adapter that serves an iterator of rows as CSV text.

    Only ``read(size)`` is provided.  Note that ``size`` is a number of
    *rows* to pull from the iterator, not a number of characters.
    """

    def __init__(self, iterator):
        """Wrap ``iterator``, which yields row sequences ``csv.writer`` accepts."""
        self.iterator = iterator
        self.output = io.StringIO()
        self.writer = csv.writer(self.output)

    def read(self, size):
        """Return the next ``size`` rows rendered as CSV text.

        An empty string is returned once the iterator is exhausted.
        """
        next_rows = itertools.islice(self.iterator, size)
        self.writer.writerows(next_rows)

        rendered = self.output.getvalue()

        # Empty the scratch buffer so the next call starts from a clean slate.
        self.output.truncate(0)
        self.output.seek(0)

        return rendered
    
def record_pairs(result_set):
    """Turn four-column rows into pairs of ``(record_id, record)`` tuples.

    Each row of ``result_set`` must unpack into
    ``(a_record_id, a_record, b_record_id, b_record)``.  After every
    10000th pair (starting with the very first one) the running row index
    is printed as a progress indicator.
    """
    for position, (left_id, left_record, right_id, right_record) in enumerate(result_set):
        yield (left_id, left_record), (right_id, right_record)

        # Progress report, emitted after the pair has been consumed.
        if not position % 10000:
            print(position)
            
def cluster_ids(clustered_dupes):
    """Flatten clusters into ``(donor_id, cluster_id, score)`` triples.

    ``clustered_dupes`` yields ``(cluster, scores)`` pairs of equal-length
    sequences.  The first member of each cluster is used as the cluster id
    for every member of that cluster.
    """
    for members, confidences in clustered_dupes:
        canonical_id = members[0]
        yield from (
            (member_id, canonical_id, confidence)
            for member_id, confidence in zip(members, confidences)
        )

## 2. Setup

In [4]:
# Control verbosity: 0 -> warnings only, 1 -> INFO, 2 or more -> DEBUG
verbose = 1

# Start from WARNING so `log_level` is always bound: without a default, any
# `verbose` value below 1 matches neither branch and setLevel() below would
# fail with a NameError.
log_level = logging.WARNING
if verbose == 1:
    log_level = logging.INFO
elif verbose >= 2:
    log_level = logging.DEBUG

logging.getLogger().setLevel(log_level)

# Preexisting settings/training file
settings_file = 'pgsql_big_dedupe_example_settings'
training_file = 'pgsql_big_dedupe_example_training.json'

In [5]:
# set environment variable DATABASE_URL
# template: %env DATABASE_URL=postgres://{user}:{password}@{host}/{db-name}
%env DATABASE_URL=postgres://test:testpassword@localhost/dedupe-example

# Connect to DB
db_conf = dj_database_url.config()

if not db_conf:
    raise Exception(
        'set DATABASE_URL environment variable with your connection, e.g. '
        'export DATABASE_URL=postgres://user:password@host/mydatabase'
    )
    
read_con = psycopg2.connect(database=db_conf['NAME'],
                            user=db_conf['USER'],
                            password=db_conf['PASSWORD'],
                            host=db_conf['HOST'],
                            cursor_factory=psycopg2.extras.RealDictCursor)
write_con = psycopg2.connect(database=db_conf['NAME'],
                             user=db_conf['USER'],
                             password=db_conf['PASSWORD'],
                             host=db_conf['HOST'])

env: DATABASE_URL=postgres://test:testpassword@localhost/dedupe-example


## 3. Deduplication

In [6]:
# Field definitions for dedupe.  `name` is the only field without the
# 'has missing' flag; every other field carries it.
fields = [{'field': 'name', 'type': 'String'}]
fields.extend(
    {'field': column, 'type': kind, 'has missing': True}
    for column, kind in (
        ('address', 'String'),
        ('city', 'ShortString'),
        ('state', 'ShortString'),
        ('zip', 'ShortString'),
    )
)

# Initialize Deduper with given fields
deduper = dedupe.Dedupe(fields, num_cores=4)

In [7]:
# SQL query: selects the record id (donor_id) together with every column
# listed in `fields` (name, address, city, state, zip) from processed_donors.
DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address FROM processed_donors"

In [11]:
# Load every row of 'processed_donors' into a dict keyed by its position
# in the result set (0, 1, 2, ...).
with read_con.cursor('donor_select') as cur:
    cur.execute(DONOR_SELECT)
    temp_d = dict(enumerate(cur))

# example element of temp_d:
# RealDictRow([('donor_id', 435),
#         ('city', None),
#         ('name', '12-19-02 cash deposit'),
#         ('zip', None),
#         ('state', 'il'),
#        ('address', None)])

# Report the table size as a sanity check.
with read_con.cursor('count') as cur:
    cur.execute('SELECT COUNT(*) FROM processed_donors')
    print('Num. rows', cur.fetchone()['count'])

Num. rows: 706030


In [12]:
# Prepare the deduper for training, seeding it with previously labeled
# examples when a training file from an earlier run is present.
if not os.path.exists(training_file):
    deduper.prepare_training(temp_d)
else:
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        deduper.prepare_training(temp_d, tf)

# The in-memory copy of the table is not used past this point; drop the
# reference so it can be garbage-collected.
del temp_d

INFO:dedupe.api:reading training from file


reading labeled examples from  pgsql_big_dedupe_example_training.json


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (hundredIntegerPredicate, address))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (twoGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, name), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, address), SimplePredicate: (sortedAcronym, name))


In [13]:
# Interactive labeling: record pairs are shown in the console and answered
# with (y)es / (n)o / (u)nsure, or (f)inished to stop.
dedupe.console_label(deduper)

name : iuoe local #399-political educ. fund
address : 2260 so. grove st.
city : chicago
state : il
zip : 60616

name : iuoe local 399
address : 2260 s grove str
city : chicago
state : il
zip : 60616

32/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


f


Finished labeling


In [14]:
# Save the labeled examples to `training_file`; the prepare_training cell
# reads this file back when it exists.
with open(training_file, 'w') as tf:
    deduper.write_training(tf)

In [15]:
# Train on the labeled pairs.  NOTE(review): recall=0.9 is presumably the
# share of labeled duplicate pairs the learned blocking rules must cover --
# confirm against the dedupe documentation.
deduper.train(recall=0.9)

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
  * (true_distinct + false_distinct)))
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.3043999749933167
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, name), SimplePredicate: (twoGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, name), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, city), SimplePredicate: (oneGramFingerprint, name))


In [16]:
# Save the trained settings to `settings_file` (written in binary mode).
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)

In [17]:
# Training is finished; let the deduper discard its training state
# (presumably to free memory -- confirm against the dedupe documentation).
deduper.cleanup_training()

In [None]:
# Show the fields for which the fingerprinter needs an index; the blocking
# cell below iterates over the same collection to build those indices.
for field in deduper.fingerprinter.index_fields:
    print(field)

In [18]:
# (Re)create the table that stores one (block_key, donor_id) row per
# blocking-key assignment.  `with write_con:` wraps the statements in a
# transaction.
with write_con:
    with write_con.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS blocking_map")
        cur.execute("CREATE TABLE blocking_map (block_key text, donor_id INTEGER)")

# If dedupe learned an index predicate, we have to take a pass
# through the data and create indices.
print('creating inverted index')

for field in deduper.fingerprinter.index_fields:
    with read_con.cursor('field_values') as cur:
        # NOTE(review): the column name is interpolated directly into the SQL
        # text.  It originates from the field definitions rather than user
        # input, but it is not quoted or escaped.
        cur.execute("SELECT DISTINCT %s FROM processed_donors" % field)
        # Lazily pull the single selected column out of each dict-like row.
        field_data = (row[field] for row in cur)
        deduper.fingerprinter.index(field_data, field)
        
# Now we are ready to write our blocking map table by creating a
# generator that yields unique `(block_key, donor_id)` tuples.
print('writing blocking map')
with read_con.cursor('donor_select') as read_cur:
    read_cur.execute(DONOR_SELECT)
    # Pair every row with its donor_id, the shape handed to the fingerprinter.
    full_data = ((row['donor_id'], row) for row in read_cur)
    b_data = deduper.fingerprinter(full_data)
    with write_con:
        with write_con.cursor() as write_cur:
            # Readable turns the generator into CSV text on demand.  Its
            # read(size) interprets `size` as a row count, so -- assuming
            # copy_expert forwards `size` to read() -- up to 10000 rows are
            # serialized per read.
            write_cur.copy_expert('COPY blocking_map FROM STDIN WITH CSV',
                                  Readable(b_data),
                                  size=10000)

creating inverted index
writing blocking map


INFO:dedupe.blocking:10000, 0.7684602 seconds
INFO:dedupe.blocking:20000, 1.5433532 seconds
INFO:dedupe.blocking:30000, 2.2997902 seconds
INFO:dedupe.blocking:40000, 3.0500072 seconds
INFO:dedupe.blocking:50000, 3.8522632 seconds
INFO:dedupe.blocking:60000, 4.6934402 seconds
INFO:dedupe.blocking:70000, 5.6468152 seconds
INFO:dedupe.blocking:80000, 6.4750682 seconds
INFO:dedupe.blocking:90000, 7.2550532 seconds
INFO:dedupe.blocking:100000, 7.9991922 seconds
INFO:dedupe.blocking:110000, 8.7456752 seconds
INFO:dedupe.blocking:120000, 9.5421292 seconds
INFO:dedupe.blocking:130000, 10.2824442 seconds
INFO:dedupe.blocking:140000, 11.0973792 seconds
INFO:dedupe.blocking:150000, 11.9101712 seconds
INFO:dedupe.blocking:160000, 12.6880932 seconds
INFO:dedupe.blocking:170000, 13.5542102 seconds
INFO:dedupe.blocking:180000, 14.3557012 seconds
INFO:dedupe.blocking:190000, 15.1254132 seconds
INFO:dedupe.blocking:200000, 15.9005712 seconds
INFO:dedupe.blocking:210000, 16.7706892 seconds
INFO:dedupe.b

## 4. Results

In [23]:
# Summary counts over entity_map: number of distinct canonical ids and
# total number of mapped records.
result_queries = (
    ("Distinct entities:",
     "SELECT COUNT(a) FROM (SELECT DISTINCT canon_id FROM entity_map) a"),
    ("All references:",
     "SELECT COUNT(*) FROM entity_map"),
)

with read_con, read_con.cursor() as cur:
    for label, query in result_queries:
        cur.execute(query)
        print(label, cur.fetchone()['count'])

Distinct entities: 115782
All references: 418995
