In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This code demonstrates the Gazetteer.

We will use one of the sample files from the RecordLink example as the
canonical set.

"""

import os
import csv
import re
import logging
import optparse
import collections

import dedupe
from unidecode import unidecode


def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """

    column = unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column


def readData(filename):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID.
    """

    data_d = {}

    with open(filename) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
            if clean_row['price']:
                clean_row['price'] = float(clean_row['price'][1:])
            data_d[filename + str(i)] = dict(clean_row)

    return data_d

In [2]:



output_file = 'gazetteer_output.csv'
settings_file = 'gazetteer_learned_settings'
training_file = 'gazetteer_training.json'

In [3]:
canon_file = os.path.join('data', 'AbtBuy_Buy.csv')
messy_file = os.path.join('data', 'AbtBuy_Abt.csv')

print('importing data ...')
messy = readData(messy_file)

print('N data 1 records: {}'.format(len(messy)))

canonical = readData(canon_file)
print('N data 2 records: {}'.format(len(canonical)))

importing data ...
N data 1 records: 1081
N data 2 records: 1092


In [4]:
print(messy)

{'data/AbtBuy_Abt.csv0': {'unique_id': '1', 'title': 'linksys etherfast 8port 10 100 switch ezxs88w', 'description': 'linksys etherfast 8port 10 100 switch ezxs88w 10 100 dualspeed perport perfect for optimizing 10baset and 100basetx hardware on the same network speeds of up to 200mbps in full duplex operation eliminate bandwidth constraints and clear up bottlenecks', 'price': 44.0}, 'data/AbtBuy_Abt.csv1': {'unique_id': '2', 'title': 'linksys etherfast10 100 5port autosensing switch ezxs55w', 'description': 'linksys etherfast10 100 5port autosensing switch ezxs55w 5 port 10 100 autosensing ports with both half and full duplex modes perfect for integrating your 10baset and 100basetx network hardware switched 10 100 ports run at 10mbps 20mbps 100mbps up to 200mbps', 'price': 29.0}, 'data/AbtBuy_Abt.csv2': {'unique_id': '3', 'title': 'netgear prosafe 5 port 10 100 desktop switch fs105', 'description': 'netgear prosafe 5 port 10 100 desktop switch fs105 5 auto speedsensing 10 100 utp port

In [5]:
def descriptions():
        for dataset in (messy, canonical):
            for record in dataset.values():
                yield record['description']



In [17]:

fields = [
    {'field': 'title', 'type': 'String'},
    {'field': 'title', 'type': 'Text', 'corpus': descriptions()},
    {'field': 'description', 'type': 'Text',
     'has missing': True, 'corpus': descriptions()},
    {'field': 'price', 'type': 'Price', 'has missing': True}]

# Create a new gazetteer object and pass our data model to it.
gazetteer = dedupe.Gazetteer(fields)

# If we have training data saved from a previous run of gazetteer,
# look for it an load it in.
# __Note:__ if you want to train from scratch, delete the training_file

print('reading labeled examples from ', training_file)
with open(training_file) as tf:
    gazetteer.prepare_training(messy, canonical, training_file=tf)

gazetteer.train()
gazetteer.cleanup_training()

gazetteer.index(canonical)

results = gazetteer.search(messy, n_matches=2, generator=True)



reading labeled examples from  gazetteer_training.json


In [18]:
cluster_membership = {}
cluster_id = 0

for cluster_id, (messy_id, matches) in enumerate(results):
    for canon_id, score in matches:
        cluster_membership[messy_id] = {'Cluster ID': cluster_id,
                                        'Link Score': score}
        cluster_membership[canon_id] = {'Cluster ID': cluster_id,
                                        'Link Score': score}
        cluster_id += 1

with open(output_file, 'w') as f:

    header_unwritten = True

    for fileno, filename in enumerate((messy_file, canon_file)):
        with open(filename) as f_input:
            reader = csv.DictReader(f_input)

            if header_unwritten:

                fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                              reader.fieldnames)

                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()

                header_unwritten = False

            for row_id, row in enumerate(reader):

                record_id = filename + str(row_id)
                cluster_details = cluster_membership.get(record_id, {})
                row['source file'] = fileno
                
                row.update(cluster_details)
                print(fileno)

#                 writer.writerow(row)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
