In [1]:
import os
import time
import logging
import optparse
import locale
import itertools
import io
import csv
import dj_database_url
import psycopg2
import psycopg2.extras
import dedupe
import numpy

from psycopg2.extensions import register_adapter, AsIs

register_adapter(numpy.int32, AsIs)
register_adapter(numpy.int64, AsIs)
register_adapter(numpy.float32, AsIs)
register_adapter(numpy.float64, AsIs)

## 1. Methods/Classes

In [2]:
class Readable(object):

    def __init__(self, iterator):

        self.output = io.StringIO()
        self.writer = csv.writer(self.output)
        self.iterator = iterator

    def read(self, size):

        self.writer.writerows(itertools.islice(self.iterator, size))

        chunk = self.output.getvalue()
        self.output.seek(0)
        self.output.truncate(0)

        return chunk

In [3]:
def record_pairs(result_set):

    for i, row in enumerate(result_set):
        a_record_id, a_record, b_record_id, b_record = row
        record_a = (a_record_id, a_record)
        record_b = (b_record_id, b_record)

        yield record_a, record_b

        if i % 10000 == 0:
            print(i)

In [4]:
def cluster_ids(clustered_dupes):

    for cluster, scores in clustered_dupes:
        cluster_id = cluster[0]
        for donor_id, score in zip(cluster, scores):
            yield donor_id, cluster_id, score

## 2. Setup

In [5]:
# Control verbosity
verbose = 1

if verbose == 1:
    log_level = logging.INFO
elif verbose >= 2:
    log_level = logging.DEBUG
    
logging.getLogger().setLevel(log_level)

# Preexisting settings/training file
settings_file = 'pgsql_big_dedupe_example_settings'
training_file = 'pgsql_big_dedupe_example_training.json'

In [6]:
# set environment variable DATABASE_URL
# template: %env DATABASE_URL=postgres://{user}:{password}@{host}/{db-name}
%env DATABASE_URL=postgres://test:testpassword@localhost/dedupe-example

# Connect to DB
db_conf = dj_database_url.config()

if not db_conf:
    raise Exception(
        'set DATABASE_URL environment variable with your connection, e.g. '
        'export DATABASE_URL=postgres://user:password@host/mydatabase'
    )
    
read_con = psycopg2.connect(database=db_conf['NAME'],
                            user=db_conf['USER'],
                            password=db_conf['PASSWORD'],
                            host=db_conf['HOST'],
                            cursor_factory=psycopg2.extras.RealDictCursor)
write_con = psycopg2.connect(database=db_conf['NAME'],
                             user=db_conf['USER'],
                             password=db_conf['PASSWORD'],
                             host=db_conf['HOST'])

env: DATABASE_URL=postgres://test:testpassword@localhost/dedupe-example


## 3. Deduplication

In [7]:
fields = [{'field': 'name', 'type': 'String'},
                  {'field': 'address', 'type': 'String', 'has missing': True},
                  {'field': 'city', 'type': 'ShortString', 'has missing': True},
                  {'field': 'state', 'type': 'ShortString', 'has missing': True},
                  {'field': 'zip', 'type': 'ShortString', 'has missing': True},
                  ]

# Initialize Deduper with given fields
deduper = dedupe.Dedupe(fields, num_cores=4)

In [8]:
# SQL query
DONOR_SELECT = "SELECT donor_id, city, name, zip, state, address FROM processed_donors"

In [9]:
# Read from 'processed_donors'
with read_con.cursor('donor_select') as cur:
    cur.execute(DONOR_SELECT)
    temp_d = {i: row for i, row in enumerate(cur)}
    
    # example element of temp_d:
    # RealDictRow([('donor_id', 435),
    #         ('city', None),
    #         ('name', '12-19-02 cash deposit'),
    #         ('zip', None),
    #         ('state', 'il'),
    #        ('address', None)])

In [10]:
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        deduper.prepare_training(temp_d, tf)
else:
    deduper.prepare_training(temp_d)
    
del temp_d

INFO:dedupe.api:reading training from file


reading labeled examples from  pgsql_big_dedupe_example_training.json


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (fingerprint, address), SimplePredicate: (wholeFieldPredicate, name))
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (sameSevenCharStartPredicate, address))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (wholeFieldPredicate, address))
INFO:dedupe.training:(SimplePredicate: (suffixArray, address), SimplePredicate: (wholeFieldPredicate, name))


In [11]:
dedupe.console_label(deduper)

name : 11-5-02 cash deposit
address : None
city : None
state : il
zip : None

name : 11-1-02 cash deposit
address : None
city : None
state : il
zip : None

24/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


name : 7-11-02 cash deposit
address : None
city : None
state : il
zip : None

name : 7-31-09 cash deposit
address : None
city : None
state : il
zip : None

25/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, name), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (sameSevenCharStartPredicate, address))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (wholeFieldPredicate, address))
INFO:dedupe.training:(SimplePredicate: (sortedAcronym, address), SimplePredicate: (sortedAcronym, name))
name : 5-31-07 cash deposits
address : None
city : None
state : il
zip : None

name : 5-7-02 cash deposit
address : None
city : None
state : il
zip : None

26/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


name : heat & frost insulators local 17 pac fund
address : 3850 racine ave.
city : chicago
state : il
zip : 60609

name : international heat & frost insulators local 17
address : None
city : None
state : il
zip : None

26/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : cash
address : n/a
city : None
state : il
zip : None

name : friends of saviano
address : n/a
city : river grove
state : il
zip : 60000

27/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, name), SimplePredicate: (doubleMetaphone, name))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (wholeFieldPredicate, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (nearIntegersPredicate, name))
INFO:dedupe.training:(SimplePredicate: (sortedAcronym, address), SimplePredicate: (sortedAcronym, name))
name : 6-30-08 cash deposits
address : None
city : None
state : il
zip : None

name : 6/11/09 raffle
address : 1900 reidfarm rd
city : rockford
state : il
zip : 61107

27/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : items under $150.00
address : None
city : None
state : il
zip : None

name : local no. 150 - i.u.o.e. local area pac
address : 6200 joliet road
city : countryside
state : il
zip : 60525

27/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : ditommaso
address : 17 w 220 22nd st.
city : oakbrook terrace
state : il
zip : 60181

name : ditommaso
address : 17w220 22nd street suite 200
city : oakbrook terrace
state : il
zip : 60181

27/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : glaxosmithkline
address : fpo825
city : philadelphia
state : pa
zip : 19101

name : glaxosmithkline
address : p.o. box 13681
city : philadelphia
state : pa
zip : 19101-3661

28/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (wholeFieldPredicate, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (nearIntegersPredicate, name))
INFO:dedupe.training:(SimplePredicate: (sortedAcronym, address), SimplePredicate: (sortedAcronym, name))
name : isra
address : po box 476
city : chatsworth
state : il
zip : 60921

name : isra political victory fund
address : p o box 476
city : chatsworth
state : il
zip : 60921

29/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : power rogers & smith
address : 35 w. wacker dr.
city : chicago
state : il
zip : 60601

name : power rogers & smith p.c.
address : 70 w. madison st ste 5500
city : chicago
state : il
zip : 60602-4212

30/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (twoGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (nearIntegersPredicate, name))
INFO:dedupe.training:(SimplePredicate: (sortedAcronym, address), SimplePredicate: (sortedAcronym, name))
name : construction & general laborer's dist. council of chgo & vicinity
address : 101 burr ridge parkway suite 300
city : burr ridge
state : il
zip : 60527

name : construction & general laborers' dist. council of chicago & vicinity
address : 999 mcclintock dr ste 300
city : burr ridge
state : il
zip : 60527

31/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (

y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonFourGram, zip), SimplePredicate: (commonThreeTokens, name))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (twoGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (sortedAcronym, address), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (nearIntegersPredicate, name))
name : conlon public strategies
address : 350 n lasalle st suite 1420
city : chicago
state : il
zip : 60610

name : conlon public strategies, inc.
address : 350 n. lasalle, suite 1420
city : chicago, il
state : il
zip : 60664

32/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [12]:
with open(training_file, 'w') as tf:
    deduper.write_training(tf)

In [13]:
deduper.train(recall=0.9)

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
  * (true_distinct + false_distinct)))
INFO:rlr.crossvalidation:optimum alpha: 0.010000, score 0.4
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, name), SimplePredicate: (sortedAcronym, name))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (sameSevenCharStartPredicate, address))
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, name), SimplePredicate: (twoGramFingerprint, address))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, name), SimplePredicate: (commonTwoTokens, city))


In [14]:
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)

In [15]:
deduper.cleanup_training()

In [16]:
for field in deduper.fingerprinter.index_fields:
    print("hi")

In [17]:
with write_con:
    with write_con.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS blocking_map")
        cur.execute("CREATE TABLE blocking_map (block_key text, donor_id INTEGER)")

# If dedupe learned a Index Predicate, we have to take a pass
# through the data and create indices.
print('creating inverted index')

for field in deduper.fingerprinter.index_fields:
    with read_con.cursor('field_values') as cur:
        cur.execute("SELECT DISTINCT %s FROM processed_donors" % field)
        field_data = (row[field] for row in cur)
        deduper.fingerprinter.index(field_data, field)
        
# Now we are ready to write our blocking map table by creating a
# generator that yields unique `(block_key, donor_id)` tuples.
print('writing blocking map')
with read_con.cursor('donor_select') as read_cur:
    read_cur.execute(DONOR_SELECT)
    full_data = ((row['donor_id'], row) for row in read_cur)
    b_data = deduper.fingerprinter(full_data)
    with write_con:
        with write_con.cursor() as write_cur:
            write_cur.copy_expert('COPY blocking_map FROM STDIN WITH CSV',
                                  Readable(b_data),
                                  size=10000)

creating inverted index
writing blocking map


INFO:dedupe.blocking:10000, 0.6784822 seconds
INFO:dedupe.blocking:20000, 1.3560322 seconds
INFO:dedupe.blocking:30000, 2.0488882 seconds
INFO:dedupe.blocking:40000, 2.7724692 seconds
INFO:dedupe.blocking:50000, 3.6042072 seconds
INFO:dedupe.blocking:60000, 4.4430732 seconds
INFO:dedupe.blocking:70000, 5.2672502 seconds
INFO:dedupe.blocking:80000, 6.0286942 seconds
INFO:dedupe.blocking:90000, 6.7340972 seconds
INFO:dedupe.blocking:100000, 7.4186132 seconds
INFO:dedupe.blocking:110000, 8.1199932 seconds
INFO:dedupe.blocking:120000, 8.7931572 seconds
INFO:dedupe.blocking:130000, 9.4988272 seconds
INFO:dedupe.blocking:140000, 10.1969882 seconds
INFO:dedupe.blocking:150000, 10.8838392 seconds
INFO:dedupe.blocking:160000, 11.5731002 seconds
INFO:dedupe.blocking:170000, 12.3873882 seconds
INFO:dedupe.blocking:180000, 13.1410292 seconds
INFO:dedupe.blocking:190000, 13.8521682 seconds
INFO:dedupe.blocking:200000, 14.5567732 seconds
INFO:dedupe.blocking:210000, 15.3273952 seconds
INFO:dedupe.bl

In [57]:
read_cur = read_con.cursor('donor_select')
read_cur.execute(DONOR_SELECT)
full_data = ((row['donor_id'], row) for row in read_cur)
#     for i, record in enumerate(full_data):
#         record_id, instance = record
#         if i % 600000 == 0:
#             print(f'{record_id}, {instance}')
        
#         for pred_id, predicate in preds:
#             print(f'{pred_id}, {predicate}')
            
            
#     b_data = deduper.fingerprinter(full_data)
#     with write_con:
#         with write_con.cursor() as write_cur:
#             write_cur.copy_expert('COPY blocking_map FROM STDIN WITH CSV',
#                                   Readable(b_data),
#                                   size=10000)

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [54]:
for i, record in enumerate(full_data):
        record_id, instance = record
        if i % 600000 == 0:
            print(f'{record_id}, {instance}')

In [58]:
print(read_cur.open)

AttributeError: 'RealDictCursor' object has no attribute 'open'