In [78]:
import dedupe

In [79]:
import os
import csv
import re
import logging
import optparse
import dedupe
from unidecode import unidecode

In [88]:
input_file = 'Custodian_Mapping_Table.csv'
output_file = 'output.csv'
settings_file = 'settings.csv'
training_file = 'train.csv'

In [81]:
def preProcess(column):

    try:
        column = column.decode('utf-8',errors="ignore")
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

    if not column:
        column = None
    return column


In [285]:
def readData(filename):
    data_d = {}
    with open(filename,encoding='utf-8',errors="ignore") as f:
        reader = csv.DictReader(f)
        for row in reader:
            (num,input_name, cls)= row.items()
            clean_row = (num,input_name, cls)
            row_id = int(row['Key'])
            data_d[row_id] = dict(clean_row)
    return data_d

In [233]:
print('importing data ...')
data_d = readData(input_file)
print(data_d)
print(len(data_d))

importing data ...
{1: {'Key': '1', 'Unique Custodians': 'ABN AMRO BANK', 'Match': 'ABN AMRO Bank'}, 2: {'Key': '2', 'Unique Custodians': 'ABN AMRO CLEARING CHICAGO LLC', 'Match': 'ABN AMRO Bank'}, 3: {'Key': '3', 'Unique Custodians': 'ADVANTAGE FUTURES LLC', 'Match': 'Advantage Futures'}, 4: {'Key': '4', 'Unique Custodians': 'ALBERT FRIED & COMPANY, LLC', 'Match': 'Albert Fried & Company'}, 5: {'Key': '5', 'Unique Custodians': 'ALIANZA FIDUCIARIA', 'Match': 'Alianza Fiduciaria'}, 6: {'Key': '6', 'Unique Custodians': 'ALTREE CUSTODY SERVICE LTD', 'Match': 'Altree Custody Services'}, 7: {'Key': '7', 'Unique Custodians': 'ALTREE CUSTODY SERVICES LTD', 'Match': 'Altree Custody Services'}, 8: {'Key': '8', 'Unique Custodians': 'ALTREE CUSTODY SERVICES LTD.', 'Match': 'Altree Custody Services'}, 9: {'Key': '9', 'Unique Custodians': 'AMERICAN STOCK TRANSFER & TRUST CO., LLC', 'Match': 'American Stock Transfer & Trust Company'}, 10: {'Key': '10', 'Unique Custodians': 'AMERICAN STOCK TRANSFER &

In [85]:
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    fields = [
        {'field' : 'Unique Custodians', 'type': 'String'},
        {'field' : 'Match', 'type': 'String'} 
        ]
    deduper = dedupe.Dedupe(fields)
    deduper.sample(data_d, 100)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print('starting active labeling...')

    dedupe.consoleLabel(deduper)

    deduper.train()

  % (sample_size, len(blocked_sample)))
Unique Custodians : BANK OF AMERICA SECURITIES LLC
Match : Bank of America Merrill Lynch

Unique Custodians : BANK OF AMERICA, N.A.
Match : Bank of America Merrill Lynch

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...
y


Unique Custodians : NORTHERN TRUST BANK NA
Match : Northern Trust Custody Services

Unique Custodians : NORTHERN TRUST FIDUCIARY SERVICES (IRELAND) LIMITED
Match : Northern Trust Custody Services

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : FIDELITY CAPITAL MARKETS
Match : Fidelity Investments

Unique Custodians : FIDELITY INVESTMENTS, INC.
Match : Fidelity Investments

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious





(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : BMO HARRIS BANK NA.
Match : BMO Harris Bank

Unique Custodians : CITIGROUP INC.
Match : Citi Transaction Services

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : SGI HUDSON & CIE
Match : Hudson & Cie

Unique Custodians : WELLS FARGO N.A.
Match : Wells Fargo

3/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : DEUTSCHE BANK (MAURITIUS) LIMITED
Match : Deutsche Bank

Unique Custodians : U.S. BANK, N.A.
Match : U.S. Bank National Association

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BTIG, LLC
Match : BTIG

Unique Custodians : MORGAN STANLEY CAPITAL PRODUCTS LLC
Match : Morgan Stanley

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : STATE STREET INTERNATIONAL (IRELAND) LIMITED
Match : State Street Custody Services

Unique Custodians : UBS LIMITED
Match : UBS

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : MUFG INVESTOR SERVICES
Match : MUFG Union Bank

Unique Custodians : MUFG UNION BANK, N.A.
Match : MUFG Union Bank

3/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : JP MORGAN MARKETS LIMITED
Match : J.P. Morgan

Unique Custodians : JPMORGAN CHASE BANK, NATIONAL ASSOCIATION
Match : J.P. Morgan

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : THE PRIVATE BANK AND TRUST COMPANY
Match : The Private Bank

Unique Custodians : THE PRIVATEBANK AND TRUST COMPANY
Match : The PrivateBank and Trust

5/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : NORTHERN TRUST CO.
Match : Northern Trust Custody Services

Unique Custodians : PNC BANK, N.A.
Match : PNC

6/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BGL BNP PARIBAS SOCIT ANONYME
Match : BNP Paribas

Unique Custodians : BNP PARIBAS - LONDON BRANCH
Match : BNP Paribas

6/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : MORGAN STANLEY CAPITAL SERVICES LLC.
Match : Morgan Stanley

Unique Custodians : MORGAN STANLEY SECURITIES LIMITED
Match : Morgan Stanley

7/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : NORTHERN TRUST INTERNATIONAL BANKING
Match : Northern Trust Custody Services

Unique Custodians : NORTHERN TRUST SECURITIES, INC.
Match : Northern Trust Custody Services

8/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : HSBC INSTITUTIONAL TRUST SERVICES (SINGAPORE) LIMITED
Match : HSBC

Unique Custodians : WILMINGTON TRUST COMPANY
Match : Wilmington Trust

9/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BNP PARIBAS PRIME BROKERAGE INTERNATIONAL, LTD
Match : BNP Paribas

Unique Custodians : UBS AG (LONDON)
Match : UBS

9/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : THE BANK OF NEW YORK
Match : BNY Mellon

Unique Custodians : THE NORTHERN TRUST INTERNATIONAL BANKING CORP.
Match : Northern Trust Custody Services

9/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : NORTHERN TRUST GLOBAL SERVICES
Match : Northern Trust Custody Services

Unique Custodians : THE NORTHERN TRUST CO.
Match : Northern Trust Custody Services

9/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : J.P. MORGAN CHASE, NA
Match : J.P. Morgan

Unique Custodians : JP MORGAN CHASE & CO
Match : J.P. Morgan

10/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CITIBANK NA
Match : Citi Transaction Services

Unique Custodians : CITIGROUP GLOBAL MARKETS, INC.
Match : Citi Transaction Services

11/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : WELLS FARGO CLEARING SERVICES, LLC
Match : Wells Fargo

Unique Custodians : WELLS FARGO SECURITIES, LLC
Match : Wells Fargo

12/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : HARRIS N.A.
Match : BMO Harris Bank

Unique Custodians : U.S. BANK INSTITUTIONAL TRUST & CUSTODY
Match : U.S. Bank Institutional Trust & Custody

13/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : JEFFERIES BACHE FINANCIAL SERVICES, INC.
Match : Jefferies

Unique Custodians : MERRILL LYNCH GOVERNMENT SECURITIES INC.
Match : Merrill Lynch International

13/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : UBS SECURITIES AG
Match : UBS

Unique Custodians : UBS, AG
Match : UBS

13/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CITCO BANK CANADA
Match : Citco Global Custody

Unique Custodians : CITCO BANKING CORPORATION N.V.
Match : Citco Global Custody

14/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CITCO BANK AND TRUST COMPANY (BAHAMAS) LIMITED
Match : Citco Global Custody

Unique Custodians : CITCO BANK NEDERLAND VC
Match : Citco Global Custody

15/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : HSBC BANK USA
Match : HSBC

Unique Custodians : THE BANK OF NEW YORK MELLON (LONDON BRANCH)
Match : BNY Mellon

16/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BANK OF VALLETTA
Match : Bank of Valletta

Unique Custodians : TRINIDAD & TOBAGO CENTRAL DEPOSITORY
Match : Trinidad and Tobago Central Depository

16/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : GOLDMAN SACHS BANK USA
Match : Goldman Sachs

Unique Custodians : GOLDMAN SACHS INTERNATIONAL BANK
Match : Goldman Sachs

16/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : HSBC BANK NA USA
Match : HSBC

Unique Custodians : HSBC BANK USA NA
Match : HSBC

17/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : DEUTSCHE BANK (MAURITIUS) LIMITED
Match : Deutsche Bank

Unique Custodians : DEUTSCHE BANK SECURITIES INC.
Match : Deutsche Bank

18/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : JP MORGAN CCVM SA
Match : J.P. Morgan

Unique Custodians : VANGUARD CAPITAL
Match : Vanguard Marketing Corporation

19/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : JP MORGAN CHASE BANK, NA
Match : J.P. Morgan

Unique Custodians : NORTHERN TRUST GLOBAL SERVICES LIMITED
Match : Northern Trust Custody Services

19/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : CITIGROUP, INC.
Match : Citi Transaction Services

Unique Custodians : NORTHERN TRUST INTERNATIONAL BANK CORPORATION
Match : Northern Trust Custody Services

19/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : MORGAN STANLEY CAPITAL GROUP INC.
Match : Morgan Stanley

Unique Custodians : MORGAN STANLEY CAPITAL SERVICES LLC.
Match : Morgan Stanley

19/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : JP MORGAN CHASE BANK, N.A.
Match : J.P. Morgan

Unique Custodians : JPMORGAN CHASE & CO.
Match : J.P. Morgan

20/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : BARCLAYS BANK MAURITIUS LIMITED
Match : Barclays

Unique Custodians : SG AMERICAS SECURITIES LLC
Match : SG Americas Securities

21/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : THE BANK OF NEW YORK MELLON TRUST COMPANY, N.A.
Match : BNY Mellon

Unique Custodians : TUTUNSKA BANKA
Match : NLB Tutunska banka

21/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BNY MELLON ALTERNATIVE INVESTMENT SERVICES
Match : BNY Mellon

Unique Custodians : HSBC BANK USA NA
Match : HSBC

21/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : J.P. MORGAN CHASE BANK, NA
Match : J.P. Morgan

Unique Custodians : JPMORGAN CHASE BANK, NA
Match : J.P. Morgan

21/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : PNC BANK, NA
Match : PNC

Unique Custodians : PNC BANK,NA
Match : PNC

22/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CITIGROUP GLOBAL MARKETS INC.
Match : Citi Transaction Services

Unique Custodians : CITIGROUP GLOBAL MARKETS LIMITED
Match : Citi Transaction Services

23/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CREDIT SUISSE AG, DUBLIN BRANCH
Match : Credit Suisse Prime Fund Services

Unique Custodians : DEUTSCHE BANK AG - LONDON BRANCH
Match : Deutsche Bank

24/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BTG PACTUAL US CAPITAL, LLC
Match : BTG Pactual

Unique Custodians : MERRILL LYNCH INTERNATIONAL INC. SEOUL BRANCH
Match : Merrill Lynch International

24/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BANK OF NEW YORK MELLON TRUST COMPANY
Match : BNY Mellon

Unique Custodians : BMO HARRIS BANK
Match : BMO Harris Bank

24/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : MORGAN STANLEY & CO INTERNATIONAL PLC
Match : Morgan Stanley

Unique Custodians : MORGAN STANLEY & CO INTL PLC
Match : Morgan Stanley

24/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : STATE STREET BANK &TRUST COMPANY
Match : State Street Custody Services

Unique Custodians : STATE STREET BANK AND TRUST COMPANY
Match : State Street Custody Services

25/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CITCO BANK NEDERLAND N.V.
Match : Citco Global Custody

Unique Custodians : CITCO BANK NEDERLAND VC
Match : Citco Global Custody

26/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : CREDIT SUISSE SECURITIES (EUROPE) LTD.
Match : Credit Suisse Prime Fund Services

Unique Custodians : US BANCORP FUND SERVICES, LLC
Match : US Bancorp Fund Services

27/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : ROSENTHAL COLLINS GROUP, LLC
Match : Rosenthal Collins Group

Unique Custodians : WELLS FARGO BANK NA
Match : Wells Fargo

27/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : MORGAN STANLEY & CO. LLC
Match : Morgan Stanley

Unique Custodians : STATE STREET GLOBAL MARKETS, LLC
Match : State Street Custody Services

27/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : BMO HARRIS BANK N.A
Match : BMO Harris Bank

Unique Custodians : BMO HARRIS BANK, N.A.
Match : BMO Harris Bank

27/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : NORTHERN TRUST INTERNATIONAL BANKING CORP
Match : Northern Trust Custody Services

Unique Custodians : THE NORTHERN TRUST INTERNATIONAL BANKING CORPORATION
Match : Northern Trust Custody Services

28/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : DEUTSCHE BANK AG - NEW YORK BRANCH
Match : Deutsche Bank

Unique Custodians : DEUTSCHE BANK AG, NEW YORK BRANCH
Match : Deutsche Bank

29/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


Unique Custodians : DEUTSCHE BANK (CAYMAN) LIMITED
Match : Deutsche Bank

Unique Custodians : HARRIS NA
Match : BMO Harris Bank

30/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : KOTAK MAHINDRA, LTD
Match : Kotak Mahindra Bank

Unique Custodians : THE NORTHERN TRUST INT'L BANKING CORPORATION
Match : Northern Trust Custody Services

30/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


Unique Custodians : FIDELITY INVESTMENTS (THE BANK OF NEW YORK, MELLON)
Match : Fidelity Investments

Unique Custodians : US BANK NA
Match : U.S. Bank National Association

30/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.981583196666
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.4, Match)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.6, Match)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.2, Match)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, Match)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.8, Match)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.6, Match)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.4, Match)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.2, Match)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (4, Match)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (3, Match)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, Match)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (2, Match)
INFO:dedupe.blocki

FileNotFoundError: [Errno 2] No such file or directory: ''

In [89]:
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

In [90]:
threshold = deduper.threshold(data_d, recall_weight=1)

INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, Unique Custodians)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, Unique Custodians)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.8, Unique Custodians)
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 1.000
INFO:dedupe.api:precision: 0.986
INFO:dedupe.api:With threshold: 0.846


In [92]:
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))


INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, Unique Custodians)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, Unique Custodians)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.8, Unique Custodians)


clustering...
# duplicate sets 90


In [95]:
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w',encoding='utf-8',errors='ignore') as f_output, open(input_file,encoding='utf-8',errors='ignore') as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)

115

82

98

50

63

48

135

136

137

192

197

31

69

71

99

41

46

98

41

82

81

47

98

114

112

115

126

130

138

163

139

161

158

131

129

134

141

134

132

150

176

173

163

44

49

84

91

102

103

97

105

99

150

46

53

82

55

118

51

119

45

85

38

43

57

54

22

40

45

24

29

100

117

120

49

97

103

107

107

108

106

107

110

111

109

125

106

100

105

82

98

117

110

96

96

101

119

106

99

98

99

117

119

87

101

79

111

95

98

102

110

82

111

111

85

83

132

145

57

97

132

137

139

152

167

138

45

106

30

78

93

55

61

99

115

61

66

39

57

67

132

160

135

132

91

196

87

82

71

122

128

136

76

38

64

116

135

143

133

114

121

121

140

120

116

115

132

127

137

125

132

118

122

131

129

123

118

116

115

106

119

116

117

111

121

131

109

116

128

119

114

114

112

138

127

130

127

129

130

112

132

115

96

99

106

113

23

145

147

41

49

64

124

127

61

170

186

210

212

132

135

152

152

144

144

139

173

145

168

152

160

156

157

153

150

147

55

130

160

158

107

99

84

80

81

94

113

92

107

108

105

106

112

109

97

95

95

111

113

104

111

111

112

112

114

115

115

118

109

108

111

111

115

114

115

98

95

104

71

54

79

69

35

73

86

53

145

139

146

151

168

174

143

138

50

106

111

119

132

120

63

112

72

92

96

97

101

112

97

101

113

121

118

106

115

111

100

100

137

37

36

42

105

100

98

39

62

36

28

48

44

79

32

47

32

78

79

110

81

82

113

142

70

143

72

31

85

40

27

36

45

43

35

82

41

43

71

273

235

72

77

82

86

115

130

121

127

129

119

122

73

76

82

97

69

93

87

95

99

81

100

87

92

93

92

97

94

100

101

98

95

92

96

94

96

96

96

99

97

39

98

102

66

60

44

55

37

79

87

85

90

91

90

95

93

99

98

96

114

89

92

94

103

95

90

106

94

95

94

97

105

113

84

90

89

94

92

97

94

113

92

93

35

29

32

73

141

138

85

119

124

132

126

121

150

127

81

68

31

29

51

72

41

162

158

158

159

160

147

151

165

155

160

161

127

135

131

136

139

76

58

106

96

109

114

119

119

122

110

105

122

108

115

110

116

120

121

106

142

110

115

116

113

116

117

117

118

121

120

126

118

114

112

109

121

111

130

117

115

133

134

115

105

73

123

164

160

161

152

149

139

132

137

140

140

130

33

126

147

150

151

132

140

156

134

142

130

149

138

138

141

133

138

142

167

166

146

153

154

161

152

157

157

164

149

39

46

102

124

106

109

52

26

68

34

71

70

33

47

69

75

106

103

132

136

139

130

41

77

54

89

114

110

111

41

71

129

132

75

86

39

90

114

53

147

65

57

57

112

106

127

132

140

152

122

155

127

135

138

139

143

142

141

151

145

152

144

140

134

159

164

144

154

127

147

61

94

102

103

117

115

60

48

29

92

109

106

100

68

76

74

86

91

111

116

119

117

89

96

115

112

108

119

118

134

111

138

141

158

168

161

162

168

174

159

159

131

122

140

121

73

134

189

112

133

154

153

190

147

162

156

150

147

197

166

21

52

70

68

41

44

48

40

94

97

29

35

50

36

36

55

145

151

145

143

161

146

164

132

126

140

30

58

116

119

65

84

90

94

62

89

93

94

92

96

97

95

113

109

92

119

104

89

99

103

106

97

95

99

102

90

92

125

126

98

106

78

109

113

126

116

In [267]:
output_file='output_3.csv'

def read_output_file(filename):
    data_d = []
    with open(filename,encoding='utf-8',errors="ignore") as f:
        reader = csv.DictReader(f)
        for row in reader:
            data_d.append(row.items())
        return data_d
    
data=read_output_file(output_file)
    

In [268]:
# # data=data[0:3]
print(len(data))


760


In [269]:
import itertools
def get_all_record_pairs(data):
    return [list(x) for x in itertools.combinations(data, 2)]

In [270]:
def are_records_same_in_name(record_pair):
        a,b=record_pair[0]
        c,d=record_pair[1]
        return b==d
def are_records_same_in_cluster(record_pair):
        a,b=record_pair[0]
        c,d=record_pair[1]
       
        return a==c

def are_records_similar(record_pair):
        return (are_records_same_in_name(record_pair) and are_records_same_in_cluster(record_pair))

# for record_pair in record_pairs_list:
#     print(are_records_in_same_cluster(record_pair))

In [271]:
def get_true_positive(record_pair_list):
    tp_count=0
    for record_pair in record_pair_list:
        if are_records_similar(record_pair):
                tp_count +=1
    return tp_count
def get_true_negative(record_pair_list):
    tn_count=0
    for record_pair in record_pair_list:
        if (not are_records_in_same_cluster(record_pair) and not are_records_same_in_name(record_pair)):
                tn_count +=1
    return tn_count



In [272]:
def get_false_positive(record_pair_list):
    fp_count=0
    for record_pair in record_pair_list:
        if (are_records_same_in_cluster(record_pair) and not are_records_same_in_name(record_pair)):
                fp_count +=1
    return fp_count
def get_false_negative(record_pair_list):
    fp_count=0
    for record_pair in record_pair_list:
        if (not are_records_same_in_cluster(record_pair) and are_records_same_in_name(record_pair)):
                fp_count +=1
    return fp_count


In [273]:
# data=(data[:2])

In [284]:
# data=data[0:3]

# print(data)
record_pair=get_all_record_pairs(data)
print(len(record_pair))
tp=get_true_positive(record_pair)
tn=get_true_negative(record_pair)
fp=get_false_positive(record_pair)
fn=get_false_negative(record_pair)
print(fn)

288420
578


In [294]:
def get_rand_index(tp,tn,fp,fn):
    return(tp + tn)/(tp+tn+fp+fn)
def get_precision(tp,fp):
    return (tp)/(tp+fp)
def get_recall(tp,fn):
    return (tp)/(tp + fn)

def get_f(prec,recall, beta=1):
    num=((beta + 1)*(prec * recall))
    den=((beta**2 * prec + recall))
    return num/den

In [296]:
print(get_rand_index(tp,tn,fp,fn))
prec=get_precision(tp,fp)
recall=get_recall(tp,fn)
score=get_f(prec,recall)
print(prec,recall,score)

0.9979959780875113
1.0 0.912464031500833 0.9542286981311372
