### Question 2: Dedupe

In [1]:
#!pip install dedupe

In [2]:
#!pip install numpy --upgrade
#!pip install unidecode

In [3]:
import os
import csv
import re
import logging
import optparse
import py_entitymatching as em
import dedupe
import numpy
from unidecode import unidecode
import pandas as pd



In [4]:
# Show which NumPy version the kernel has loaded.
numpy.__version__

'1.23.4'

In [5]:
def readData(filename):
    """
    Read a CSV file into a dictionary of cleaned records.

    Every field value is cleaned with ``preProcess`` and the resulting record
    is stored under the integer value of the row's ``rec_id`` column. If two
    rows share a ``rec_id``, the later row replaces the earlier one.

    Parameters
    ----------
    filename : str
        Path of the CSV file. The first row is used as the header and must
        contain a ``rec_id`` column whose values parse as integers.

    Returns
    -------
    dict
        ``{int(rec_id): {column_name: cleaned_value_or_None}}``. Column names
        are kept exactly as they appear in the header row (including any
        leading spaces).

    Raises
    ------
    KeyError
        If the header has no ``rec_id`` column.
    ValueError
        If a ``rec_id`` value cannot be converted to ``int``.
    """
    data_d = {}
    with open(filename) as f:
        for row in csv.DictReader(f):
            # DictReader pads rows that are shorter than the header with
            # None. Treat those fields as missing instead of handing None to
            # preProcess, which expects a string.
            clean_row = {
                k: (preProcess(v) if v is not None else None)
                for (k, v) in row.items()
            }
            # The key comes from the raw (uncleaned) rec_id value.
            data_d[int(row['rec_id'])] = clean_row

    return data_d

In [6]:
def preProcess(column):
    """
    Normalise one raw CSV value so that cosmetic differences are ignored.

    The value is passed through ``unidecode``, newlines are replaced by
    spaces, runs of spaces are collapsed to one, surrounding whitespace and
    quote characters are stripped, and the text is lower-cased.

    Parameters
    ----------
    column : str or None
        Raw field value. ``None`` is accepted and returned unchanged.

    Returns
    -------
    str or None
        The cleaned value, or ``None`` when nothing is left after cleaning
        (i.e. the field is missing).
    """
    # A field that is already missing stays missing.
    if column is None:
        return None
    column = unidecode(column)
    # Replace newlines *before* collapsing runs of spaces; in the opposite
    # order "a \n b" would keep three spaces between the two words.
    column = re.sub('\n', ' ', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column

In [7]:
# Load the people CSV into a dict of cleaned records keyed by integer rec_id.
input_file = 'people.csv' 
data_d = readData(input_file)

In [8]:
# Check the schema by looking at the record stored under rec_id 1.
# Every column name except 'rec_id' keeps a leading space (e.g. ' given_name'),
# so any lookup or dedupe field definition has to include that space.
data_d[1]

{'rec_id': '1',
 ' given_name': 'mitchell',
 ' surname': 'green',
 ' street_number': '7',
 ' address_1': 'wallaby place',
 ' address_2': 'delmar',
 ' suburb': 'cleveland',
 ' postcode': '2119',
 ' state': 'sa',
 ' date_of_birth': '19560409'}

In [9]:
# Files used by the dedupe cell below.
# Labelled training examples: read if present, rewritten after training.
training_file = 'people_training.json'
# Learned settings: when this file exists it is loaded and training is skipped.
settings_file = 'people_learned_settings'
# Destination CSV: the input rows plus cluster id and confidence columns.
output_file = 'dedupe.csv'

In [10]:
# Is there a saved settings file? If so, the dedupe cell loads it and skips training.
os.path.exists(settings_file)

True

In [11]:
# Is there a saved file of labelled examples? It is only read when training has to run.
os.path.exists(training_file)

True

In [12]:
# Build `deduper`: train a new model when no settings file has been saved,
# otherwise load the previously learned settings.
if not os.path.exists(settings_file):
    # Fields dedupe compares. The names carry a leading space because that is
    # how the columns are keyed in data_d. ' suburb' and ' state' are not
    # part of the comparison.
    fields = [
        {'field': ' given_name', 'type': 'String'},
        {'field': ' surname', 'type': 'String'},
    ] + [
        {'field': name, 'type': 'String', 'has missing': True}
        for name in (' address_1', ' address_2', ' date_of_birth')
    ]
    deduper = dedupe.Dedupe(fields)

    # Seed training with labelled examples from an earlier run when they
    # exist. To train from scratch, delete training_file first.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as labeled_fh:
            deduper.prepare_training(data_d, labeled_fh)
    else:
        deduper.prepare_training(data_d)

    # Interactive step: label candidate pairs at the console.
    dedupe.console_label(deduper)

    # Fit the model on the labelled pairs.
    deduper.train()

    # Persist both the labelled examples and the learned settings so the
    # next run can skip the steps above.
    with open(training_file, 'w') as training_fh:
        deduper.write_training(training_fh)

    with open(settings_file, 'wb') as settings_fh:
        deduper.write_settings(settings_fh)
else:
    print('reading from', settings_file)
    with open(settings_file, 'rb') as settings_fh:
        deduper = dedupe.StaticDedupe(settings_fh)

print('clustering...')
# 0.5 is passed as partition's second argument
# (the clustering threshold in dedupe's API — confirm for the installed version).
clustered_dupes = deduper.partition(data_d, 0.5)
print('# duplicate sets', len(clustered_dupes))

# Map every record id to the index of the cluster it landed in and to the
# score reported for it; these become the 'Cluster ID' and
# 'confidence_score' columns of the output.
cluster_membership = {
    member_id: {"Cluster ID": cluster_idx, "confidence_score": member_score}
    for cluster_idx, (member_ids, member_scores) in enumerate(clustered_dupes)
    for member_id, member_score in zip(member_ids, member_scores)
}
        
# Copy the input rows to output_file, adding each row's 'Cluster ID' and
# 'confidence_score' as the first two columns.
# Both files are opened with newline='' as the csv module requires: without
# it the writer's '\r\n' row endings turn into '\r\r\n' on Windows (a blank
# line after every row) and newlines inside quoted fields are not read back
# faithfully.
with open(output_file, 'w', newline='') as f_output, \
        open(input_file, newline='') as f_input:
    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # cluster_membership is keyed by the integer rec_id.
        row_id = int(row['rec_id'])
        row.update(cluster_membership[row_id])
        writer.writerow(row)
        

reading from people_learned_settings
clustering...
# duplicate sets 1604


In [13]:
# Read the clustered CSV written by the dedupe cell back into a DataFrame.
final_clusters = pd.read_csv(output_file)

In [14]:
# Display the clustered records (pandas renders a truncated view of the frame).
final_clusters

Unnamed: 0,Cluster ID,confidence_score,rec_id,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth
0,6,0.427840,1,mitchell,green,7,wallaby place,delmar,cleveland,2119,sa,19560409
1,7,0.454876,2,harley,mccarthy,177,pridhamstreet,milton,marsden,3165,nsw,19080419
2,8,0.691934,3,madeline,mason,54,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,19081128
3,719,0.607611,4,isabelle,,23,gundulu place,currin ga,utakarra,2193,wa,19921119
4,997,1.000000,5,taylor,hathaway,7,yuranigh court,brentwood vlge,,4220,nsw,19991207
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,576,0.674437,4996,jack,campbell,169,marr street,rhosewyn,oakleigh,3356,vic,19770109
4996,54,0.619670,4997,william,lazaroff,12,leah ylose,milwlood,forbes,7256,qld,
4997,450,0.613152,4998,destynii,bowerman,12,halford crescent,sutton,nollamara,2431,qld,19880821
4998,73,0.418188,4999,gianni,dooley,38,ashburton circuit,brentwood vlge,ryde,6025,qld,19371212


In [15]:
# Count the distinct cluster ids in the output; this should equal the
# "# duplicate sets" figure printed by the clustering step.
print("Unique cluster ids:")
final_clusters['Cluster ID'].nunique()

Unique cluster ids:


1604