
https://www.kdnuggets.com/2016/06/regularization-logistic-regression.html

https://github.com/dedupeio/dedupe/tree/master/docs

https://github.com/dedupeio/dedupe-examples/tree/master/csv_example

https://dedupeio.github.io/dedupe-examples/docs/csv_example.html

https://dedupe.io/developers/library/en/latest/API-documentation.html#


In [None]:
import os
import re
import csv
import string
import dedupe
from unidecode import unidecode
import pandas as pd
import numpy as np

In [None]:
# Remember the current working directory; later cells build the input and
# output file locations from it.
path = os.getcwd()
path

In [None]:


def preProcess(column):
    """Normalise one raw field value so that records compare cleanly.

    The value is transliterated to ASCII with ``unidecode``, newlines are
    turned into spaces, runs of spaces are collapsed, surrounding
    whitespace and quote characters are stripped, and the text is
    lower-cased.

    Args:
        column: The raw field value (``str``, UTF-8 ``bytes``, or ``None``).

    Returns:
        The cleaned string, or ``None`` when the value is missing or
        empty after cleaning.
    """
    # csv.DictReader pads short rows with None; treat that as missing data
    # instead of letting the text helpers below fail on it.
    if column is None:
        return None

    try:  # python 2/3 string differences: only byte strings have .decode
        column = column.decode('utf8')
    except AttributeError:
        pass

    column = unidecode(column)
    # Replace newlines *before* collapsing spaces; otherwise "a \n b"
    # would be left with a run of spaces in the middle.
    column = re.sub('\n', ' ', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

    # If data is missing, indicate that by returning None
    return column or None

# Read in our data from a CSV file and create a dictionary of records, where each key is a unique record ID and each value is a dict of that record's cleaned field values.

def readData(filename):
    """Load a CSV file into a dict of cleaned records keyed by record id.

    Every field value of every row is passed through ``preProcess``; the
    cleaned row is stored under ``int(row['Id'])``, taken from the raw
    (uncleaned) ``Id`` field.

    Args:
        filename: Path of a CSV file whose header row includes an ``Id``
            column.

    Returns:
        dict mapping the integer record id to a dict of
        ``{column name: cleaned value}``.

    Raises:
        KeyError: If the file has no ``Id`` column.
        ValueError: If an ``Id`` value cannot be parsed as an integer.
    """
    data_d = {}
    # newline='' lets the csv module handle newlines embedded in quoted
    # fields itself (as the csv documentation requires); the explicit
    # encoding matches how the same file is read with pandas in this
    # notebook instead of depending on the platform default.
    with open(filename, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            row_id = int(row['Id'])
            data_d[row_id] = {k: preProcess(v) for k, v in row.items()}

    return data_d

print('importing data ...')
# os.path.join picks the right separator for the current platform instead
# of hard-coding the Windows backslash.
data_dd = readData(os.path.join(path, 'raw.txt'))



In [None]:
# Spot-check: the cleaned 'Address' value of the record stored under id 0.
data_dd[0]['Address']

In [None]:
# Load the same raw file with pandas, forcing the text columns to object
# and the numeric-looking columns to float (so missing values become NaN).
# os.path.join keeps the path portable instead of hard-coding '\\'.
raw = pd.read_csv(
    os.path.join(path, 'raw.txt'),
    sep=',',
    index_col=None,
    encoding='utf-8',
    dtype={
        'Site name': 'object',
        'Address': 'object',
        'Zip': 'float',
        'Phone': 'float',
    },
)
print(len(raw))

# Columns that take part in the matching; report their dtypes and how
# many values are missing in each.
match_cols = ['Site name', 'Address', 'Zip', 'Phone']
print(raw[match_cols].dtypes)
print(raw[match_cols].isnull().sum())

In [None]:
def cleanse_raw(df, cols):
    """Add a cleaned copy of each column in *cols* to *df*.

    For every name in *cols* a new column called ``name + '_'`` is added.
    Text columns are lower-cased and every character that is not an ASCII
    letter or digit is replaced by a single space; all other columns are
    copied unchanged. Missing values stay missing.

    Args:
        df: The DataFrame to extend. It is modified in place.
        cols: Names of the columns to clean.

    Returns:
        The same DataFrame object, now carrying the extra columns.
    """
    for col in cols:
        series = df[col]
        # Accept both plain object columns and pandas' dedicated string
        # dtype as text.
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if is_text:
            # regex=True is required: since pandas 2.0 Series.str.replace
            # treats the pattern as a literal string by default, which
            # would leave the text uncleaned.
            df[col + '_'] = series.str.lower().str.replace(
                r'[^a-zA-Z0-9]', ' ', regex=True)
        else:
            df[col + '_'] = series.copy()
    return df

In [None]:
cleansed = cleanse_raw(raw, match_cols)
# Name the generated columns explicitly. Testing for '_' anywhere in the
# name would also pick up any unrelated column that happens to contain an
# underscore.
clean_cols = [col + '_' for col in match_cols]

In [None]:
# Turn the cleaned columns into a dict keyed by the DataFrame index, each
# value being a {column name: value} dict for that row.
data_d = cleansed[clean_cols].to_dict(orient='index')
# Spot-check the record stored under index 0.
data_d[0]

In [None]:
# Show the field names of one record. Key 0 is used (as in the preview
# above) because no loop variable is defined yet at this point.
data_d[0].keys()

In [None]:
# Normalise the value types of every record in place.
for k, record in data_d.items():
    # Text fields are always stored as str.
    # NOTE(review): str() also turns a missing (NaN) value into the text
    # 'nan' - confirm that is intended.
    for text_key in ('Address_', 'Site name_'):
        record[text_key] = str(record[text_key])
    # Numeric fields use None rather than NaN to mark missing data.
    for number_key in ('Zip_', 'Phone_'):
        if np.isnan(record[number_key]):
            record[number_key] = None

# Dedupe

In [None]:
# Fields to use for de-duplication: one definition per column, giving the
# column name, the comparison type and, where applicable, the
# 'has missing' flag.
fields = [
    {'field': 'Site name', 'type': 'String'},
    {'field': 'Address', 'type': 'String'},
    {'field': 'Zip', 'type': 'Exact', 'has missing': True},
    {'field': 'Phone', 'type': 'String', 'has missing': True},
]

In [None]:
# Create a new deduper object and pass our data model (the `fields`
# definitions) to it.
deduper = dedupe.Dedupe(fields)

In [None]:
# Hand the records to the deduper with a sample size of 10000 - presumably
# this draws the candidate record pairs used for labelling.
# NOTE(review): confirm this call against the installed dedupe version.
deduper.sample(data_dd, sample_size=10000)

In [None]:
# Label example pairs for the deduper - presumably an interactive console
# session that waits for user input; TODO confirm.
dedupe.consoleLabel(deduper)

In [None]:
 deduper.train()

In [None]:
# Ask the deduper for a match threshold on the full data set.
# NOTE(review): recall_weight=1 presumably weights recall and precision
# equally - confirm against the dedupe documentation.
threshold = deduper.threshold(data_dd, recall_weight=1)



In [None]:
# File names for the results, the learned settings and the training data.
# Each starts with the platform's path separator because the code that
# uses them builds the full path by plain concatenation (path + name);
# os.sep keeps that working outside Windows too.
output_file = os.sep + 'csv_example_output.csv'
settings_file = os.sep + 'csv_example_learned_settings'
training_file = os.sep + 'csv_example_training.json'

In [None]:


# Write the deduper's training data to disk as text.
training_path = path + training_file
with open(training_path, 'w') as training_out:
    deduper.writeTraining(training_out)

# Save our weights and predicates to disk (binary file) so that a later
# run could load them instead of training again; nothing in this file
# reads them back.
settings_path = path + settings_file
with open(settings_path, 'wb') as settings_out:
    deduper.writeSettings(settings_out)



In [None]:
print('clustering...')
# Match the records using the threshold computed earlier.
# NOTE(review): presumably returns one entry per cluster of duplicate
# record ids - confirm the result shape against the dedupe documentation.
clustered_dupes = deduper.match(data_dd, threshold)

In [None]:
# Inspect the first entry of the matching result.
clustered_dupes[0]

In [None]:
# Look up the cleaned record stored under id 215.
data_dd[215]