
https://www.kdnuggets.com/2016/06/regularization-logistic-regression.html

https://github.com/dedupeio/dedupe/tree/master/docs

https://github.com/dedupeio/dedupe-examples/tree/master/csv_example

https://dedupeio.github.io/dedupe-examples/docs/csv_example.html

https://dedupe.io/developers/library/en/latest/API-documentation.html#


In [1]:
import os
import re
import csv
import string
import dedupe
from unidecode import unidecode
import pandas as pd
import numpy as np

In [2]:
path = os.getcwd()
path

'C:\\Github\\dedupe'

In [3]:


def preProcess(column):

#

    try : # python 2/3 string differences
        column = column.decode('utf8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

#If data is missing, indicate that by setting the value to None

    if not column:
        column = None
    return column

#Read in our data from a CSV file and create a dictionary of records, where the key is a unique record ID and each value is dict

def readData(filename):

#

    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d

print('importing data ...')
data_dd = readData(path+'\\raw.txt')



importing data ...


In [4]:
data_dd[0]['Address']

'1 n ogden ave'

In [5]:
raw = pd.read_csv(path+'\\raw.txt', sep=',', index_col=None, encoding='utf-8', 
                 dtype={'Site name':'object', 'Address':'object',
                        'Zip':'float', 'Phone':'float'})
print(len(raw))
match_cols =  ['Site name', 'Address', 'Zip', 'Phone']
print(raw[match_cols].dtypes)
print(raw[match_cols].isnull().sum())

3337
Site name     object
Address       object
Zip          float64
Phone        float64
dtype: object
Site name       0
Address         0
Zip          1333
Phone         146
dtype: int64


In [6]:
def cleanse_raw(df, cols):
    for col in cols:
        if df[col].dtype ==  np.object_:
            df[col+'_'] = df[col].copy().str.lower().str.replace('[^a-zA-Z0-9]', ' ').copy()
        else:
            df[col+'_'] = df[col].copy()
    return df

In [7]:
cleansed = cleanse_raw(raw, match_cols)
clean_cols = [x for x in cleansed.columns if '_' in x]

In [8]:
data_d = cleansed[clean_cols].to_dict(orient='index')
data_d[0]

{'Address_': u'1 n ogden ave ',
 'Phone_': 2262649.0,
 'Site name_': u' salvation army   temple   salvation army',
 'Zip_': nan}

In [9]:
data_d[k].keys()

NameError: name 'k' is not defined

In [11]:
for k in data_d.keys():
    for k1 in ['Address_', 'Site name_']:
        data_d[k][k1] = str(data_d[k][k1])
    for k2 in ['Zip_', 'Phone_']:
        if np.isnan(data_d[k][k2]):
            data_d[k][k2] = None

# Dedupe

In [13]:
# define fields to use for de-dedup
fields = [
    {'field' : 'Site name', 'type': 'String'},
    {'field' : 'Address', 'type': 'String'},
    {'field' : 'Zip', 'type': 'Exact', 'has missing' : True},
    {'field' : 'Phone', 'type': 'String', 'has missing' : True},
    ]

In [14]:
# Create a new deduper object and pass our data model to it.
deduper = dedupe.Dedupe(fields)

In [15]:
deduper.sample(data_dd, sample_size=10000)

  % (sample_size, len(blocked_sample)))
INFO:dedupe.canopy_index:Removing stop word  s


In [16]:
dedupe.consoleLabel(deduper)

Site name : henry booth house whiz kids nursery 518 w 103 rd st
Address : 518 w 103rd street
Zip : 60628
Phone : 2339445

Site name : henry booth house whiz kids nursery 518 w 103 rd st
Address : 518 w 103rd st.
Zip : 60628
Phone : 2339445

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


Site name : chicago public schools hamline, john h
Address : 4747 s bishop st
Zip : 60609
Phone : 5354565

Site name : chicago public schools hamline, john h
Address : 4652 s. bishop
Zip : 60609
Phone : 5354520

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : chicago public schools ryerson, martin a.
Address : 646 n lawndale
Zip : 60624
Phone : 5346700

Site name : aunt martha's - riverdale site
Address : 14424 wentworth avenue riverdale, il 60827
Zip : None
Phone : 8496019

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : chicago commons association - nia family center
Address : 744 n monticello ave
Zip : None
Phone : 8266971

Site name : belding
Address : 4257 n. tripp
Zip : 60641
Phone : 5343590

2/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Site name : carole robertson center for learning - carole robertson center- ogden 3701
Address : 3701 w ogden ave
Zip : None
Phone : 5228400

Site name : home of life community dev. corp. home of life just for you (773)-626-8655
Address : 4647 w. washington
Zip : 60644
Phone : None

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Site name : chicago youth centers - fellowship house / cyc
Address : 844 w 32nd st
Zip : None
Phone : 3262282

Site name : chicago youth centers fellowship house
Address : 844 w 32nd st
Zip : 60608
Phone : 3262282

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : marcy newberry association - new birth
Address : 1500 w 69th st
Zip : None
Phone : 4710699

Site name : marcy newberry association - new birth
Address : 1500 w 69th st
Zip : None
Phone : 4710699

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, Address), LevenshteinCanopyPredicate: (4, Site name))
INFO:dedupe.training:(SimplePredicate: (commonIntegerPredicate, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : henry booth house little folks day care
Address : 2527 e 73rd st.
Zip : 60649
Phone : None

Site name : ymca south side
Address : 6330 s stony island avenue
Zip : 60637
Phone : 9470700

4/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Site name : north avenue day nursery north avenue day nursery
Address : 2001 w pierce
Zip : 60622
Phone : 3424499

Site name : north avenue day nursery
Address : 2001 w pierce street
Zip : 60622
Phone : 3424499

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : mary crane west
Address : 2820 n. leavitt
Zip : None
Phone : 3485528

Site name : mary crane north
Address : 2905 n. leavitt
Zip : 60618
Phone : 3485528

5/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, Site name), SimplePredicate: (wholeFieldPredicate, Address))
Site name : family rescue - ridgeland head start
Address : 6824 s ridgeland ave
Zip : None
Phone : 6671073

Site name : family rescue ridgeland
Address : 6824 s ridgeland avenue
Zip : 60649
Phone : 6671073

6/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (suffixArray, Address))
Site name : north avenue day nursery - north avenue day nursery
Address : 2001 w pierce ave
Zip : None
Phone : 3424499

Site name : north avenue day nursery
Address : 2001 w pierce street
Zip : 60622
Phone : 3424499

7/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sortedAcronym, Address), TfidfTextCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : erie house
Address : 1701 w. superior
Zip : 60622
Phone : 5635800

Site name : erie house i/t
Address : 1701 w. superior
Zip : 60622
Phone : 5635800

8/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Phone), TfidfTextCanopyPredicate: (0.8, Site name))
Site name : perez
Address : 1241 w. 19th st.
Zip : None
Phone : 5347650

Site name : colemon, johnnie
Address : 1441 w. 119th st.
Zip : 60643
Phone : 5353975

9/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Phone), TfidfTextCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (3, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : catholic charities of the archdiocese of chicago - grace mission
Address : 5332 s western ave
Zip : None
Phone : 4761990

Site name : catholic charities-grace mission
Address : 5332 s. western
Zip : 60609
Phone : 4761900

9/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : firman community services - firman house south
Address : 5401 s wentworth ave
Zip : None
Phone : 4513400

Site name : firman community services firman house south
Address : 5401 s wentworth ave
Zip : 60609
Phone : 3733400

10/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(TfidfNGramCanopyPredicate: (0.6, Site name), TfidfTextCanopyPredicate: (0.4, Address))
Site name : trinity united church of christ - trinity united
Address : 532 w 95th st
Zip : None
Phone : 4883511

Site name : trinity united church of christ trinity ucc
Address : 532 w 95th st
Zip : 60628
Phone : None

11/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : chicago commons association - nia family center
Address : 744 n monticello ave
Zip : None
Phone : 8266971

Site name : chicago commons nia
Address : 744 n. monticelo
Zip : 60624
Phone : 8263770

12/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, Site name), TfidfTextCanopyPredicate: (0.4, Address))
Site name : ada s. mckinley community services - little hands
Address : 7146 s ashland ave
Zip : None
Phone : None

Site name : childserv englewood
Address : 7928 s ashland avenue
Zip : 60620
Phone : None

13/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Site name : ymca high ridge
Address : 2424 w. touhy
Zip : None
Phone : 2628300

Site name : ymca of metropolitan chicago high ridge
Address : 2424 w. touhy
Zip : 60645
Phone : 2628300

13/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : ymca of metropolitan chicago - garfield head start/child developmental center- ymca
Address : 7 n homan ave
Zip : None
Phone : 2653900

Site name : ymca garfield
Address : 7 n homan avenue
Zip : 60624
Phone : 2653900

14/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, Site name), TfidfTextCanopyPredicate: (0.4, Address))
Site name : mary crane league mary crane center (east)
Address : 2974 n. clybourn
Zip : 60618
Phone : 9388130

Site name : mary crane league mary crane center (north)
Address : 2905 n. leavitt
Zip : 60618
Phone : 9388130

15/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : first church of love and faith first church of love and faith
Address : 2140 w 79th st
Zip : 60620
Phone : 8739155

Site name : first church of love and faith
Address : 2140 w 79th street
Zip : 60620
Phone : 8739155

16/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, Site name), TfidfTextCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : chicago public schools linne, carl von
Address : 3221 n sacramento
Zip : 60618
Phone : 5345262

Site name : linne
Address : 3221 n. sacramento
Zip : 60618
Phone : 5345262

17/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, Address), SimplePredicate: (firstTokenPredicate, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : smith
Address : 744 e. 103rd st.
Zip : None
Phone : 5355689

Site name : henry booth house little folks day care
Address : 2527 e 73rd st.
Zip : 60649
Phone : None

18/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (firstTokenPredicate, Site name), TfidfTextCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfTextCanopyPredicate: (0.4, Phone))
Site name : el valor - little tykes i
Address : 1711 w 35th st
Zip : None
Phone : 2547700

Site name : kkp - little tykes i
Address : 1711 w. 35th st.
Zip : 60609
Phone : 2548396

18/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : erie neighborhood house fcch-analida sanchez site
Address : 1914 n sayre
Zip : 60707
Phone : 4322213

Site name : erie neighborhood house fcch-margarita del valle site
Address : 1004 n kedzie ave
Zip : 60651
Phone : 4322213

19/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, Address), TfidfTextCanopyPredicate: (0.2, Site name))
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, Address), TfidfTextCanopyPredicate: (0.4, Phone))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : catholic charities of the archdiocese of chicago - our lady of tepeyac
Address : 2414 s albany ave
Zip : None
Phone : 2775888

Site name : our lady of tepeyac
Address : 2414 south albany avenue
Zip : 60623
Phone : 2775888

19/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : disney magnet ii
Address : 3815 n. kedvale
Zip : None
Phone : 5358650

Site name : disney magnet
Address : 3815 n. kedvale ave.
Zip : 60613
Phone : 5345840

20/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), SimplePredicate: (sameThreeCharStartPredicate, Phone))
INFO:dedupe.training:(SimplePredicate: (doubleMetaphone, Address), TfidfTextCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : chicago youth centers pathways to learning cc1
Address : 3460 w 79th st
Zip : 60652
Phone : 4369244

Site name : chicago youth centers pathways to learning cc
Address : 3418 w 79th st
Zip : 60652
Phone : 7765439

21/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfNGramCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (sameFiveCharStartPredicate, Phone))
Site name : mary crane league mary crane center (west)
Address : 2820 n leavitt
Zip : 60618
Phone : 3485528

Site name : mary crane north
Address : 2905 n. leavitt
Zip : 60618
Phone : 3485528

22/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), SimplePredicate: (sortedAcronym, Phone))
INFO:dedupe.training:(SimplePredicate: (suffixArray, Address), TfidfNGramCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (4, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : mary crane league - mary crane - east
Address : 2974 n clybourn ave
Zip : None
Phone : 3485528

Site name : mary crane
Address : 2905 n. leavitt
Zip : 60618
Phone : 3485528

23/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : bunnyland developmental childcare
Address : 545 w. 119th street
Zip : None
Phone : 5685200

Site name : henry booth house bunnyland land day care
Address : 545 w 119th street
Zip : 60628
Phone : 5685200

24/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfTextCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Phone), TfidfTextCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, Site name), SimplePredicate: (wholeFieldPredicate, Zip))
Site name : carole robertson center for learning fcch-darlene mcghee
Address : 8055 s wood
Zip : 60620
Phone : 9945193

Site name : carole robertson center for learning fcch-tosha kelly
Address : 8050 s honore
Zip : 60620
Phone : None

25/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : haymarket center wholly innocence day care center
Address : 34 n sangamon
Zip : 60607
Phone : 2267984

Site name : haymarket center haymarket center
Address : 120 n sangamon
Zip : 60607
Phone : 2267984

26/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfTextCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfNGramCanopyPredicate: (0.8, Site name))
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Phone), TfidfTextCanopyPredicate: (0.8, Site name))
Site name : henry booth house - little hands & feet
Address : 7801 s wolcott ave
Zip : None
Phone : None

Site name : henry booth house little hands & feet
Address : 7801 s wolcott
Zip : 60620
Phone : 9948561

27/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfTextCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfTextCanopyPredicate: (0.6, Site name))
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Phone), TfidfTextCanopyPredicate: (0.8, Site name))
Site name : carole robertson center for learning - carole robertson center 2929
Address : 2929 w 19th st
Zip : None
Phone : 5211600

Site name : carole robertson center for learning fcch-rhonda culverson
Address : 3826 w 85th street
Zip : 60629
Phone : 5211600

28/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfNGramCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfTextCanopyPredicate: (0.6, Site name))
INFO:dedupe.training:(TfidfTextCanopyPredicate: (0.4, Phone), TfidfTextCanopyPredicate: (0.8, Site name))
Site name : chicago public schools henson, matthew a.
Address : 1326 s avers
Zip : 60623
Phone : 5341804

Site name : chicago public schools johnson, james weldon
Address : 1504 s albany
Zip : 60623
Phone : 5341833

29/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (metaphoneToken, Address), TfidfTextCanopyPredicate: (0.4, Phone))
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, Address), TfidfNGramCanopyPredicate: (0.8, Address))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfNGramCanopyPredicate: (0.8, Site name))
Site name : dumas
Address : 6615 s. kenwood ave
Zip : None
Phone : 5350802

Site name : dumas
Address : 6650 s. ellis
Zip : 60637
Phone : 5350750

29/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Site name : catholic charities of the archdiocese of chicago st. joseph
Address : 4800 s paulina
Zip : 60609
Phone : 9272524

Site name : catholic charities of the archdiocese of chicago grace mission
Address : 5332 s. western
Zip : 60609
Phone : 4761990

29/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


Site name : jolly fun house playschool
Address : 7559 w. addison
Zip : None
Phone : 6376115

Site name : jolly fun house playschool
Address : 7559 w addison street
Zip : 60634
Phone : None

29/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : salvation army - family outreach initiative (sal army)
Address : 845 w 69th st
Zip : None
Phone : 8324700

Site name : salvation army - red shield
Address : 945 w 69th st
Zip : None
Phone : 3583203

30/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameSevenCharStartPredicate, Address), TfidfNGramCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, Site name), TfidfTextCanopyPredicate: (0.4, Phone))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfNGramCanopyPredicate: (0.8, Site name))
Site name : centers for new horizons - ida b. wells learning center
Address : 3601 s rhodes st
Zip : None
Phone : 3733640

Site name : centers for new horizons ida b. wells
Address : 3641 s rhodes avenue
Zip : 60653
Phone : 3733460

30/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Site name : care-a-lot child development center
Address : 5522 n milwaukee avenue
Zip : 60630
Phone : 7630888

Site name : care-a-lot child development center
Address : 6441 n central avenue
Zip : 60646
Phone : 7638888

31/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfNGramCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, Phone), TfidfTextCanopyPredicate: (0.6, Site name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, Zip), TfidfNGramCanopyPredicate: (0.8, Site name))


In [17]:
 deduper.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.659673743625
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (sameFiveCharStartPredicate, Address), TfidfNGramCanopyPredicate: (0.4, Address))
INFO:dedupe.training:(SimplePredicate: (oneGramFingerprint, Phone), TfidfTextCanopyPredicate: (0.6, Site name))
INFO:dedupe.training:(LevenshteinCanopyPredicate: (1, Site name), SimplePredicate: (wholeFieldPredicate, Zip))


In [18]:
threshold = deduper.threshold(data_dd, recall_weight=1)



INFO:dedupe.canopy_index:Removing stop word  s
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.977
INFO:dedupe.api:precision: 0.970
INFO:dedupe.api:With threshold: 0.487


In [25]:
output_file = '\\csv_example_output.csv'
settings_file = '\\csv_example_learned_settings'
training_file = '\\csv_example_training.json'

In [26]:


with open(path+training_file, 'w') as tf:
    deduper.writeTraining(tf)

#Save our weights and predicates to disk. If the settings file exists, we will skip all the training and learning next time we run this file.

with open(path+settings_file, 'wb') as sf:
    deduper.writeSettings(sf)



In [19]:
print('clustering...')
clustered_dupes = deduper.match(data_dd, threshold)

clustering...


INFO:dedupe.canopy_index:Removing stop word  s


In [21]:
clustered_dupes[0]

((0, 1, 215, 509, 510, 1225, 1226, 1879, 2758, 3255),
 array([ 0.99787538,  0.99787538,  0.99332574,  0.99787538,  0.99787538,
         0.99787538,  0.99787538,  0.999742  ,  0.9942195 ,  0.99696296]))

In [24]:
data_dd[215]

{'Address': '1 n. ogden',
 'Agency': None,
 'CC fund': None,
 'Center Director': None,
 'Column': None,
 'Column2': None,
 'Director': None,
 'ECE Available Programs': None,
 'Eearly Head Start Fund': None,
 'Email Address': None,
 'Executive Director': None,
 'Fax': None,
 'Funded Enrollment': None,
 'Head Start Fund': None,
 'IDHS Provider ID': None,
 'Id': '215',
 'Length of Day': '8-11 hours, varies by facility',
 'NAEYC Program Id': None,
 'NAEYC Valid Until': None,
 'Neighborhood': None,
 'Number per Site EHS': None,
 'Number per Site HS': None,
 'Ounce of Prevention Description': None,
 'Phone': '2262649',
 'Progmod': None,
 'Program Name': 'community partnerships',
 'Program Option': None,
 'Purple binder service type': None,
 'Site name': 'salvation army temple',
 'Source': 'cps_early_childhood_portal_scrape.csv',
 'Website': None,
 'Zip': None}