In [5]:

import os
import csv
import re
import logging
import optparse
import collections

import dedupe
from unidecode import unidecode


def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """

    column = unidecode(column)
    column = re.sub('\n', ' ', column)
    column = re.sub('-', '', column)
    column = re.sub('/', ' ', column)
    column = re.sub("'", '', column)
    column = re.sub(",", '', column)
    column = re.sub(":", ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column


def readData(filename):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID.
    """

    data_d = {}

    with open(filename) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            print(row)
#             clean_row = dict([(k, preProcess(v)) for (k, v) in row.items()])
#             if clean_row['price']:
#                 clean_row['price'] = float(clean_row['price'][1:])
#             data_d[filename + str(i)] = dict(clean_row)

    return data_d

In [6]:
canon_file = os.path.join('data', 'AbtBuy_Buy.csv')
canonical = readData(canon_file)
print('N data 2 records: {}'.format(len(canonical)))

{'unique_id': '1', 'title': 'Linksys EtherFast EZXS88W Ethernet Switch - EZXS88W', 'description': 'Linksys EtherFast 8-Port 10/100 Switch (New/Workgroup)', 'price': ''}
{'unique_id': '2', 'title': 'Linksys EtherFast EZXS55W Ethernet Switch', 'description': '5 x 10/100Base-TX LAN', 'price': ''}
{'unique_id': '3', 'title': 'Netgear ProSafe FS105 Ethernet Switch - FS105NA', 'description': 'NETGEAR FS105 Prosafe 5 Port 10/100 Desktop Switch', 'price': ''}
{'unique_id': '4', 'title': 'Belkin Pro Series High Integrity VGA/SVGA Monitor Extension Cable - F3H982-10', 'description': '1 x HD-15 - 1 x HD-15 - 10ft - Beige', 'price': ''}
{'unique_id': '5', 'title': 'Netgear ProSafe JFS516 Ethernet Switch', 'description': 'Netgear ProSafe 16 Port 10/100 Rackmount Switch- JFS516NA', 'price': ''}
{'unique_id': '6', 'title': 'LaCie Pocket Floppy Disk Drive - 706018', 'description': 'LaCie Pocket USB Floppy 1.44 MB', 'price': ''}
{'unique_id': '7', 'title': 'Canon KP 36IP Print Cartridge / Paper Kit - 7

In [None]:
cluster_membership = {}
cluster_id = 0

for cluster_id, (messy_id, matches) in enumerate(results):
    for canon_id, score in matches:
        cluster_membership[messy_id] = {'Cluster ID': cluster_id,
                                        'Link Score': score}
        cluster_membership[canon_id] = {'Cluster ID': cluster_id,
                                        'Link Score': score}
        cluster_id += 1

with open(output_file, 'w') as f:

    header_unwritten = True

    for fileno, filename in enumerate((messy_file, canon_file)):
        with open(filename) as f_input:
            reader = csv.DictReader(f_input)

            if header_unwritten:

                fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                              reader.fieldnames)

                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()

                header_unwritten = False

            for row_id, row in enumerate(reader):

                record_id = filename + str(row_id)
                cluster_details = cluster_membership.get(record_id, {})
                row['source file'] = fileno
                row.update(cluster_details)

                writer.writerow(row)