In [4]:
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode
import pandas as pd

# Run Dedupe

In [5]:
def preProcess(column):
    """
    Normalise one raw CSV cell so formatting noise does not affect comparison.

    The text is transliterated with ``unidecode``; newlines, slashes and
    colons are turned into spaces; hyphens, apostrophes and commas are
    removed; runs of two or more spaces are collapsed to one; surrounding
    whitespace and quote characters are stripped; and the result is
    lower-cased.

    Parameters
    ----------
    column : str or None
        Raw cell text. ``None`` is accepted because ``csv.DictReader`` fills
        cells that are missing from a short row with ``None``.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when the input is ``None`` or nothing
        is left after cleaning.
    """
    # A missing cell stays missing instead of crashing inside unidecode().
    if column is None:
        return None

    column = unidecode(column)
    # Characters that separate tokens become a single space ...
    column = re.sub(r"[\n/:]", ' ', column)
    # ... while punctuation inside a token is dropped without leaving a gap.
    column = re.sub(r"[-',]", '', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # An empty result is reported as None rather than as ''.
    return column if column else None

def readData(filename):
    """
    Load a CSV file into a dictionary of cleaned records.

    Every row is read with ``csv.DictReader`` and each cell is passed through
    ``preProcess``. The record is stored under the key
    ``filename + str(row_index)``, where ``row_index`` counts data rows
    starting at 0.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read; also used as the record-id prefix.

    Returns
    -------
    dict
        Maps each record id to a ``{column name: cleaned value}`` dict.
    """
    records = {}
    with open(filename) as handle:
        for row_number, raw_row in enumerate(csv.DictReader(handle)):
            record_id = filename + str(row_number)
            records[record_id] = {
                column_name: preProcess(cell)
                for column_name, cell in raw_row.items()
            }
    return records

def descriptions():
    """
    Yield the 'description' value of every record in data_1, then data_2.

    Reads the module-level dictionaries ``data_1`` and ``data_2`` when the
    generator is consumed.
    NOTE(review): no visible cell calls this generator - confirm it is still
    needed.
    """
    for dataset in (data_1, data_2):
        yield from (record['description'] for record in dataset.values())

In [6]:
# Take a start timestamp, point at the two input CSV files and load both
# through readData.
# NOTE(review): `time` is imported here rather than in the import cell at the
# top of the notebook.
import time
# NOTE(review): `start` is not read by any visible cell.
start = time.time()
# Left-hand and right-hand inputs of the record linkage (the paths suggest an
# MFI extract and a GLEIF extract - confirm).
left_file = '/home/colombo/BusinessRegistryTool/Data/mfi/mfi_essential.csv'
right_file = '/home/colombo/BusinessRegistryTool/Data/gleif/gleif_essential.csv'
print('importing data ...')
# readData keys each record as "<file path><row index>"; the export cells
# rebuild the same ids when they re-read these files.
data_1 = readData(left_file)
data_2 = readData(right_file)

importing data ...


### Dedupe Precision

In [7]:
# File locations for the "precision" experiment: the linked-records CSV that
# is written after clustering, the saved dedupe settings (binary), and the
# labeled training pairs (JSON).
output_file = '/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/precision/data_matching_output.csv'
settings_file = '/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/precision/data_matching_learned_settings'
training_file = '/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/precision/data_matching_training.json'

In [8]:
# Build the record linker for the "precision" configuration: train
# interactively when no saved settings exist, otherwise load the saved
# settings from disk.
if not os.path.exists(settings_file):
    # All five compared columns are declared with the 'String' type.
    fields = [
        {'field': column_name, 'type': 'String'}
        for column_name in ('name', 'address', 'country', 'post', 'city')
    ]
    linker = dedupe.RecordLink(fields)

    if not os.path.exists(training_file):
        linker.prepare_training(data_1, data_2, sample_size=15000)
    else:
        # Start from the pairs that were labeled in an earlier session.
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.prepare_training(data_1, data_2,
                                    training_file=tf,
                                    sample_size=15000)
    # NOTE(review): `end` is not read by any visible cell.
    end = time.time()
    print('starting active labeling...')

    # Interactive step: asks for y/n/u/f answers on the console.
    dedupe.console_label(linker)

    linker.train()
    # Persist the labeled pairs and the learned settings for later runs.
    with open(training_file, 'w') as tf:
        linker.write_training(tf)
    with open(settings_file, 'wb') as sf:
        linker.write_settings(sf)
else:
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

reading from /home/colombo/BusinessRegistryTool/Competitors/DedupeExp/precision/data_matching_learned_settings


In [9]:
# Link the two datasets, then write every input row to output_file together
# with its cluster id and link score (empty for rows that were not linked).
# start2/end2 bracket both the linking and the CSV export.
start2 = time.time()
print('clustering...')
# NOTE(review): dedupe documents the threshold as a probability cut-off where
# lower values favour recall; confirm 0.0 is intended for the "precision" run.
linked_records = linker.join(data_1, data_2, 0.0)

print('# duplicate sets', len(linked_records))

# Map every linked record id to the index of its link and the link's score.
cluster_membership = {}
for cluster_id, (cluster, score) in enumerate(linked_records):
    for record_id in cluster:
        cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                         'Link Score': score}

# newline='' is what the csv module requires for files handed to a writer;
# without it, platforms that translate '\n' emit an extra '\r' per row.
with open(output_file, 'w', newline='') as f:

    header_unwritten = True

    for fileno, filename in enumerate((left_file, right_file)):
        with open(filename) as f_input:
            reader = csv.DictReader(f_input)

            if header_unwritten:
                # The header comes from the first (left) file only, so the
                # writer rejects rows carrying columns that file lacks.
                fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                              reader.fieldnames)

                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()

                header_unwritten = False

            for row_id, row in enumerate(reader):
                # Same "<file path><row index>" id scheme as readData.
                record_id = filename + str(row_id)
                cluster_details = cluster_membership.get(record_id, {})
                # 0 for rows of left_file, 1 for rows of right_file.
                row['source file'] = fileno
                row.update(cluster_details)

                writer.writerow(row)

end2 = time.time()

clustering...
# duplicate sets 82259


In [10]:
# Elapsed wall-clock seconds between start2 and end2 (linking plus CSV export).
end2-start2

1564.4845969676971

### Dedupe Recall

In [6]:
# File locations for the "recall" experiment; these rebind the same three
# names used by the precision experiment: linked-records CSV, saved dedupe
# settings (binary), and labeled training pairs (JSON).
output_file = '/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/recall/data_matching_output.csv'
settings_file = '/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/recall/data_matching_learned_settings'
training_file = '/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/recall/data_matching_training.json'

In [10]:
# Build the record linker for the "recall" configuration: train interactively
# when no saved settings exist, otherwise load the saved settings from disk.
if not os.path.exists(settings_file):
    # Only the name and the country are compared in this configuration.
    fields = [
        {'field': column_name, 'type': 'String'}
        for column_name in ('name', 'country')
    ]
    linker = dedupe.RecordLink(fields)

    if not os.path.exists(training_file):
        linker.prepare_training(data_1, data_2, sample_size=15000)
    else:
        # Start from the pairs that were labeled in an earlier session.
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.prepare_training(data_1, data_2,
                                    training_file=tf,
                                    sample_size=15000)

    print('starting active labeling...')

    # Interactive step: asks for y/n/u/f answers on the console.
    dedupe.console_label(linker)

    linker.train()
    # Persist the labeled pairs and the learned settings for later runs.
    with open(training_file, 'w') as tf:
        linker.write_training(tf)
    with open(settings_file, 'wb') as sf:
        linker.write_settings(sf)
else:
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

# Link the two datasets, then write every input row to output_file together
# with its cluster id and link score (empty for rows that were not linked).
print('clustering...')
# NOTE(review): dedupe documents the threshold as a probability cut-off
# between 0 and 1 where higher values favour precision; confirm that 1.0 is
# intended for the "recall" run.
linked_records = linker.join(data_1, data_2, 1.0)

print('# duplicate sets', len(linked_records))

# Map every linked record id to the index of its link and the link's score.
cluster_membership = {}
for cluster_id, (cluster, score) in enumerate(linked_records):
    for record_id in cluster:
        cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                         'Link Score': score}

# newline='' is what the csv module requires for files handed to a writer;
# without it, platforms that translate '\n' emit an extra '\r' per row.
with open(output_file, 'w', newline='') as f:

    header_unwritten = True

    for fileno, filename in enumerate((left_file, right_file)):
        with open(filename) as f_input:
            reader = csv.DictReader(f_input)

            if header_unwritten:
                # The header comes from the first (left) file only, so the
                # writer rejects rows carrying columns that file lacks.
                fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                              reader.fieldnames)

                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()

                header_unwritten = False

            for row_id, row in enumerate(reader):
                # Same "<file path><row index>" id scheme as readData.
                record_id = filename + str(row_id)
                cluster_details = cluster_membership.get(record_id, {})
                # 0 for rows of left_file, 1 for rows of right_file.
                row['source file'] = fileno
                row.update(cluster_details)

                writer.writerow(row)

name : None
country : ie

name : syquant icav
country : ie

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


starting active labeling...


 n


name : None
country : ie

name : syquant icav
country : ie

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : nrcpd properties limited
country : ie

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : csm diversifie
country : fr

name : holding forest
country : fr

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : warburg rw 1 fonds
country : de

name : arvos holding gmbh
country : de

0/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : payoma ireland limited
country : ie

0/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : dara rock unlimited company
country : ie

0/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : irp auto obligations 1
country : fr

name : irp auto obligations 1
country : fr

0/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


name : actis long life infrastructure 2 a scsp
country : lu

name : actis long life infrastructure 2 a scsp
country : lu

1/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


name : allianz ppk 2045
country : pl

name : allianz ppk 2045
country : pl

2/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


name : None
country : ie

name : sanlam universal funds plc sanlam african frontier markets fund
country : ie

3/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ct responsible china ashares equity fund
country : ie

3/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ge capital european treasury services ireland unlimited company
country : ie

3/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 


(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : overdrivex limited
country : ie

3/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : impax listed infrastructure fund
country : ie

3/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ldi solutions plus icav ldi solutions plus nominal funds 20262030
country : ie

3/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : darren lynch executive pension scheme
country : ie

3/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : keerajac limited
country : ie

3/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : impax listed infrastructure fund
country : ie

3/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : alphas managed accounts platform lxxxiii limited clark fixed income total return segregated portfolio
country : ie

3/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : towers watson diversifying strategies master fund
country : ie

3/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : avi japanese special situations fund
country : ie

3/10 positive, 18/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : permal managed account platform icav p swm europe ie
country : ie

3/10 positive, 19/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : dara rock unlimited company
country : ie

3/10 positive, 20/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : nuveen global clean infrastructure impact fund
country : ie

3/10 positive, 21/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : nuveen global clean infrastructure impact fund
country : ie

3/10 positive, 22/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : keerajac limited
country : ie

3/10 positive, 23/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ishares msci world parisaligned climate ucits etf
country : ie

3/10 positive, 24/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : anchorage capital europe clo 8 designated activity company
country : ie

3/10 positive, 25/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : chana
country : fr

name : nam.r
country : fr

3/10 positive, 26/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : mbs invest 2
country : de

name : zvkkvs2fonds
country : de

3/10 positive, 27/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : alphas managed accounts platform lxxxiii limited clark fixed income total return segregated portfolio
country : ie

3/10 positive, 28/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : anchorage capital europe clo 8 designated activity company
country : ie

3/10 positive, 29/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : syquant icav
country : ie

3/10 positive, 30/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ge capital european treasury services ireland unlimited company
country : ie

3/10 positive, 31/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : last mile logistics cmbs 20231 uk designated activity company
country : ie

3/10 positive, 32/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : alphas managed accounts platform lxxxiii limited clark fixed income total return segregated portfolio
country : ie

3/10 positive, 33/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : avi japanese special situations fund
country : ie

3/10 positive, 34/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : wellington leasing no. 36 limited
country : ie

3/10 positive, 35/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ct responsible china ashares equity fund
country : ie

3/10 positive, 36/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 


(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 nn


(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : anchorage capital europe clo 8 designated activity company
country : ie

3/10 positive, 37/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : polen capital global growth fund
country : ie

3/10 positive, 38/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ishares msci world parisaligned climate ucits etf
country : ie

3/10 positive, 39/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : new ireland life management services limited
country : ie

3/10 positive, 40/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : new ireland life management services limited
country : ie

3/10 positive, 41/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ldi solutions plus icav ldi solutions plus nominal funds 20262030
country : ie

3/10 positive, 42/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : anchorage capital europe clo 8 designated activity company
country : ie

3/10 positive, 43/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ldi solutions plus icav ldi solutions plus nominal funds 20262030
country : ie

3/10 positive, 44/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : sanlam universal funds plc sanlam african frontier markets fund
country : ie

3/10 positive, 45/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ge capital european treasury services ireland unlimited company
country : ie

3/10 positive, 46/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : impax listed infrastructure fund
country : ie

3/10 positive, 47/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : psm
country : fr

name : m4d
country : fr

3/10 positive, 48/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : psm
country : fr

name : 2jp
country : fr

3/10 positive, 49/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : oakview capital l6 designated activity company
country : ie

3/10 positive, 50/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : metzler international investments plc metzler european equity enhanced
country : ie

3/10 positive, 51/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : gerard kenneally executive pension scheme
country : ie

3/10 positive, 52/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ge capital european treasury services ireland unlimited company
country : ie

3/10 positive, 53/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : impax listed infrastructure fund
country : ie

3/10 positive, 54/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : last mile logistics cmbs 20231 uk designated activity company
country : ie

3/10 positive, 55/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : gerard kenneally executive pension scheme
country : ie

3/10 positive, 56/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : overdrivex limited
country : ie

3/10 positive, 57/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ldi solutions plus icav ldi solutions plus nominal funds 20262030
country : ie

3/10 positive, 58/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ishares msci world parisaligned climate ucits etf
country : ie

3/10 positive, 59/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : fundlogic alternatives plc mariner investment diversifying alternative ucits fund
country : ie

3/10 positive, 60/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : wellington leasing no. 36 limited
country : ie

3/10 positive, 61/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : keerajac limited
country : ie

3/10 positive, 62/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : coach stores ireland limited
country : ie

3/10 positive, 63/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : ge capital european treasury services ireland unlimited company
country : ie

3/10 positive, 64/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : bpc ireland lending ii designated activity company
country : ie

3/10 positive, 65/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : wellington leasing no. 36 limited
country : ie

3/10 positive, 66/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : oakview capital l6 designated activity company
country : ie

3/10 positive, 67/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : anchorage capital europe clo 8 designated activity company
country : ie

3/10 positive, 68/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : gerard kenneally executive pension scheme
country : ie

3/10 positive, 69/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


name : None
country : ie

name : darren lynch executive pension scheme
country : ie

3/10 positive, 70/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 


(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 f


Finished labeling
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"

clustering...


KeyboardInterrupt: 

# Analyse Results

In [12]:
# Load the precision-run output and keep only the rows that carry a cluster id.
prec = pd.read_csv(
    "/home/colombo/BusinessRegistryTool/Competitors/DedupeExp/precision/data_matching_output.csv",
    low_memory=False,
).dropna(subset=['Cluster ID'])
# Count the non-null link scores per cluster and keep the ids of clusters
# that have more than one of them.
pairs = prec.groupby('Cluster ID')['Link Score'].count()
pairs = pairs[pairs > 1].reset_index()['Cluster ID']

In [17]:
# For every multi-row cluster collect 'code' and 'lei' from its first row and
# 'LEI' from its second row.
# NOTE(review): this presumes the first row is the left-file record and the
# second the right-file record (the export writes left-file rows first) -
# confirm that every cluster holds one row from each file.
lei = []
LEI = []
code = []
# Group once instead of re-scanning the whole frame three times per cluster;
# get_group returns a cluster's rows in their original order.
members_by_cluster = prec.groupby('Cluster ID', sort=False)
for i in pairs:
    members = members_by_cluster.get_group(i)
    code.append(members.iloc[0]['code'])
    lei.append(members.iloc[0]['lei'])
    LEI.append(members.iloc[1]['LEI'])

In [18]:
# Mapping table with one row per multi-row cluster: `lei` and `code` come from
# the cluster's first row, `LEI` from its second row.
gg = pd.DataFrame({'lei':lei,'LEI':LEI,'code':code})

In [60]:
# gg.to_excel('/home/colombo/BusinessRegistryTool/Data/results/MFI_GLEIF_DEDUPE.xlsx')

In [None]:
# Load the full MFI and GLEIF tables that the ground truth is built from.
BR_MFI = pd.read_csv("/home/colombo/BusinessRegistryTool/Data/mfi/mfi.csv")
BR_GLEIF = pd.read_csv(
    "/home/colombo/BusinessRegistryTool/Data/gleif/gleif.csv",
    low_memory=False,
)

# MFI side: identifier, LEI, address and name; duplicate rows and rows with
# any missing value are dropped.
data1 = (
    BR_MFI[['code', 'lei', 'address', 'name']]
    .drop_duplicates()
    .dropna()
)

# GLEIF side: only entities whose status is 'ACTIVE', cleaned the same way.
data2 = (
    BR_GLEIF.loc[
        BR_GLEIF['Entity.EntityStatus'] == 'ACTIVE',
        ['LEI', 'Entity.LegalAddress.FirstAddressLine', 'Entity.LegalName'],
    ]
    .drop_duplicates()
    .dropna()
)

In [None]:
# Ground truth: MFI rows whose 'lei' equals the 'LEI' of an active GLEIF row.
matTrue = data1.merge(data2, how='inner', left_on='lei', right_on='LEI')
# Number of distinct MFI codes in the ground truth (denominator of recall).
n = matTrue['code'].drop_duplicates().shape[0]

In [59]:
# Compare the dedupe mapping table `gg` with the ground truth `matTrue`.
print("Dedupe Results")
print(f"Size of mapping table: {len(gg)}")

# Mapping rows whose code is in the ground truth (inner join), and all mapping
# rows with ground-truth columns attached where available (left join).
df1_p = gg.merge(matTrue, how='inner', on='code')
df1_add = gg.merge(matTrue, how='left', on='code')

# Distinct (code, LEI) pairs present in both the mapping and the ground truth.
truepositive = (
    gg[['LEI', 'code']]
    .drop_duplicates()
    .merge(
        matTrue[['code', 'LEI']].drop_duplicates(),
        how='inner',
        on=['code', 'LEI'],
    )
    .drop_duplicates()
)

# Outer join on code; '_merge' tells on which side each code was found.
outer_join = (
    df1_add[['code']]
    .drop_duplicates()
    .merge(
        matTrue[['code']].drop_duplicates(),
        how='outer',
        indicator=True,
    )
)

# Recall: true positives over the number of distinct ground-truth codes.
recall = len(truepositive) / n
print(f"Size of inner matches: {len(truepositive)} -> {round(recall * 100, 1)}%")
print(f"Additional matches (new links discovered): {len(outer_join[outer_join['_merge'] == 'left_only'])}")

# Precision: true positives over the mapping rows that have a ground-truth code.
precision = len(truepositive) / len(df1_p)
print(f"Incorrect matches: {len(df1_p[df1_p['LEI_x'].ne(df1_p['LEI_y'])])} -> Precision = {round(precision * 100, 1)}%")
print(f"F1-Score: {2 * (precision * recall) / (precision + recall)}")

Dedupe Results
Size of mapping table: 69833
Size of inner matches: 40443 -> 66.4%
Additional matches (new links discovered): 18877
Incorrect matches: 10525 -> Precision = 78.9%
F1-Score: 0.7213012422083307


In [22]:
# Same evaluation as the cell above, run against whatever `gg`, `matTrue` and
# `n` hold when this cell executes.
print("Dedupe Results")
print(f"Size of mapping table: {len(gg)}")

# Mapping rows whose code is in the ground truth (inner join), and all mapping
# rows with ground-truth columns attached where available (left join).
df1_p = gg.merge(matTrue, how='inner', on='code')
df1_add = gg.merge(matTrue, how='left', on='code')

# Distinct (code, LEI) pairs present in both the mapping and the ground truth.
truepositive = (
    gg[['LEI', 'code']]
    .drop_duplicates()
    .merge(
        matTrue[['code', 'LEI']].drop_duplicates(),
        how='inner',
        on=['code', 'LEI'],
    )
    .drop_duplicates()
)

# Outer join on code; '_merge' tells on which side each code was found.
outer_join = (
    df1_add[['code']]
    .drop_duplicates()
    .merge(
        matTrue[['code']].drop_duplicates(),
        how='outer',
        indicator=True,
    )
)

# Recall: true positives over the number of distinct ground-truth codes.
recall = len(truepositive) / n
print(f"Size of inner matches: {len(truepositive)} -> {round(recall * 100, 1)}%")
print(f"Additional matches (new links discovered): {len(outer_join[outer_join['_merge'] == 'left_only'])}")

# Precision: true positives over the mapping rows that have a ground-truth code.
precision = len(truepositive) / len(df1_p)
print(f"Incorrect matches: {len(df1_p[df1_p['LEI_x'].ne(df1_p['LEI_y'])])} -> Precision = {round(precision * 100, 1)}%")
print(f"F1-Score: {2 * (precision * recall) / (precision + recall)}")

Dedupe Results
Size of mapping table: 82259
Size of inner matches: 40383 -> 66.3%
Additional matches (new links discovered): 24275
Incorrect matches: 17697 -> Precision = 69.2%
F1-Score: 0.6774079914114135


In [125]:
# Attach the MFI reference columns (data1) to `k` by matching on 'lei'.
# NOTE(review): `k` is not defined in any visible cell before this one, and
# the following cell rebinds `k` from `k2`, so this cell depends on kernel
# state - confirm where `k` comes from.
k2 = pd.merge(k,data1, how = 'inner', on = 'lei')

In [127]:
# Attach the GLEIF reference columns (data2) by matching on 'LEI'; this
# rebinds `k` to the merged frame.
k = pd.merge(k2,data2, how = 'inner', on = 'LEI')

In [128]:
# Number of rows in `k` whose 'lei' and 'LEI' values differ.
sum(k['lei'] != k['LEI'])

7495

In [134]:
# Total number of rows in `k`.
len(k)

47243

In [132]:
# Display the merged frame for inspection.
k

Unnamed: 0,lei,LEI,name1,name2,Entity.LegalAddress.FirstAddressLine_x,Entity.LegalName_x,code,address,name,Entity.LegalAddress.FirstAddressLine_y,Entity.LegalName_y
0,959800AKN7AMRT0WTG43,959800AKN7AMRT0WTG43,"FONDLORETO EMPLEO, FONDO DE PENSIONES","FONDLORETO EMPLEO, FONDO DE PENSIONES",Paseo De La Castellana 40,FONDLORETO EMPLEO FONDO DE PENSIONES,ESV82838533,"Paseo de la Castellana, 40","FONDLORETO EMPLEO, FONDO DE PENSIONES",Paseo De La Castellana 40,FONDLORETO EMPLEO FONDO DE PENSIONES
1,95980020140005358942,95980020140005358942,"FONDOMEGA, FONDO DE PENSIONES","FONDOMEGA, FONDO DE PENSIONES",PASEO DE LA CASTELLANA 189,"FONDOMEGA, FONDO DE PENSIONES",ESV82498270,"PASEO DE LA CASTELLANA, 189","FONDOMEGA, FONDO DE PENSIONES",PASEO DE LA CASTELLANA 189,"FONDOMEGA, FONDO DE PENSIONES"
2,549300MUNXT61CPKOS18,549300MUNXT61CPKOS18,ASG Merkel I S.à r.l.,ASG Merkel I S.à r.l.,"35a, avenue J.F. Kennedy",ASG Merkel I S.à r.l.,LURCSB0243812,"35a, avenue J.F. Kennedy",ASG Merkel I S.à r.l.,"35a, avenue J.F. Kennedy",ASG Merkel I S.à r.l.
3,213800DMFTNKD1XP6V16,213800DMFTNKD1XP6V16,AXA IM InMotion RCF S.à r.l.,AXA IM InMotion RCF S.à r.l.,"2-4, RUE EUGÈNE RUPPERT",AXA IM INMOTION RCF S.À R.L.,LURCSB0243669,"2-4, rue Eugène Ruppert",AXA IM InMotion RCF S.à r.l.,"2-4, RUE EUGÈNE RUPPERT",AXA IM INMOTION RCF S.À R.L.
4,52990093GF8L3K83HU38,52990093GF8L3K83HU38,Volksbank Braunlage e.G.,Volksbank Braunlage e.G.,Herzog-Wilhelm-Straße 19,Volksbank Braunlage e.G.,DE01499,Herzog-Wilhelm-Straße 19,Volksbank Braunlage e.G.,Herzog-Wilhelm-Straße 19,Volksbank Braunlage e.G.
...,...,...,...,...,...,...,...,...,...,...,...
47238,549300GIUUQLJYFN4I12,549300K11J6OGTBDEE09,BLACKROCK GLOBAL FUNDS - SUSTAINABLE WORLD BON...,BLACKROCK GLOBAL FUNDS - SUSTAINABLE WORLD BON...,100 Bellevue Parkway,BlackRock Global Allocation Variable Series V....,LUO000003C00081,"2-4, rue Eugène Ruppert",BLACKROCK GLOBAL FUNDS - SUSTAINABLE WORLD BON...,100 Bellevue Parkway,BlackRock Global Allocation Variable Series V....
47239,549300ZIDTW9IQG08Q84,MODQGKIZ8C8UYQFQPL52,A&G GLOBAL II SICAV - SIF - ARTEMIS FUND,A&G GLOBAL II SICAV - SIF - ARTEMIS FUND,C/O The Corporation Trust Company,"AG Global Debt Strategy Partners, L.P.",LUO008913C00006,"56, Grand-rue",A&G GLOBAL II SICAV - SIF - ARTEMIS FUND,C/O The Corporation Trust Company,"AG Global Debt Strategy Partners, L.P."
47240,22210014L5NNUVXN5250,213800ZTGT6NCC74OF36,THE FUND SICAV SIF S.A. - RESILIANCE FUND,THE FUND SICAV SIF S.A. - RESILIANCE FUND,C/O RYSAFFE TRUSTEE COMPANY (C.I.) LIMITED,THE FUND OF FAMILY HAPPINESS TRUST,LUO008269C00008,"49, boulevard Prince Henri",THE FUND SICAV SIF S.A. - RESILIANCE FUND,C/O RYSAFFE TRUSTEE COMPANY (C.I.) LIMITED,THE FUND OF FAMILY HAPPINESS TRUST
47241,213800LINXZ2GGCA7L71,213800EO6GQHD4AIED29,HSBC GLOBAL INVESTMENT FUNDS - GLOBAL EMERGING...,HSBC GLOBAL INVESTMENT FUNDS - GLOBAL EMERGING...,GARTENSTRASSE 26,HSBC GLOBAL ASSET MANAGEMENT (SWITZERLAND) AG,LUO000256C00029,"4, rue Peternelchen",HSBC GLOBAL INVESTMENT FUNDS - GLOBAL EMERGING...,GARTENSTRASSE 26,HSBC GLOBAL ASSET MANAGEMENT (SWITZERLAND) AG
