In [1]:
import pandas as pd
import numpy as np
from tqdm import notebook
import ast

import re
from fuzzywuzzy import fuzz
import os
import tensorflow as tf
import tensorflow_ranking as tfr
from tensorflow.compat.v1 import Session
from tensorflow.python.saved_model import loader
from sklearn.model_selection import KFold
tf.get_logger().setLevel('ERROR')

In [None]:
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
notebook.tqdm.pandas()
# Cleaned source records; the name-extraction cell reads its `namen` and `uuid` columns.
clean = pd.read_csv('../clean_data.csv')
# VOC person table; fuzzy_search reads `fullNameNormalized` and `fullNameOriginal` from it.
voc = pd.read_csv('vocop-clustered-new.csv', sep='	')

In [None]:
# Flatten the per-record name dictionaries stored in `clean.namen` into one
# (uuid, full name) row per person.
uuid = []
name = []
for _, record in notebook.tqdm(clean.iterrows(), total=len(clean)):
    # `namen` holds the string repr of a list of dicts, hence literal_eval.
    for person in ast.literal_eval(record.namen):
        first = person['voornaam']
        infix = person['tussenvoegsel']
        last = person['achternaam']
        if infix is not None:
            # First/last name may be missing even when an infix is present;
            # concatenating None would raise TypeError, so skip such entries.
            if first is None or last is None:
                continue
            full_name = first + " " + infix + " " + last
        elif first and last is not None:
            full_name = first + " " + last
        else:
            continue
        name.append(full_name)
        uuid.append(record.uuid)
name_list = pd.DataFrame(data={'uuid':uuid, 'name':name}, columns=['uuid', 'name'])

In [None]:
# Attach the extracted full names to the cleaned records. No `on=` is given,
# so pandas joins on the columns the two frames have in common.
name_df = clean.merge(name_list)

In [None]:
def fuzzy_search(name, distance):
    """Find rows of the global `voc` table whose name resembles `name`.

    A row is a hit when `fuzz.ratio` between `name` and either its
    `fullNameNormalized` or its (non-null) `fullNameOriginal` is at least 90.

    Parameters
    ----------
    name : str
        The name to look up.
    distance :
        Accepted for interface compatibility; the body never reads it and
        the cut-off is fixed at 90.

    Returns
    -------
    tuple
        `(name, hits)` where `hits` is the raw result of `np.where` on the
        combined boolean mask.
    """
    normalized_hits = voc.fullNameNormalized.apply(fuzz.ratio, args=[name]) >= 90
    original_hits = voc.fullNameOriginal.dropna().apply(fuzz.ratio, args=[name]) >= 90
    return (name, np.where(normalized_hits | original_hits))

def find_matches(names, distance):
    """Run `fuzzy_search` for every entry of `names`, searching each distinct name once.

    Parameters
    ----------
    names : iterable
        Names to look up; duplicates reuse the first lookup's result.
    distance :
        Forwarded unchanged to `fuzzy_search`.

    Returns
    -------
    list
        One `(name, fuzzy_search(name, distance))` tuple per input entry,
        in input order.
    """
    cache = {}
    matches = []
    for candidate in notebook.tqdm(names):
        if candidate not in cache:
            cache[candidate] = fuzzy_search(candidate, distance)
        matches.append((candidate, cache[candidate]))
    return matches

## Ranking

### Preparing the data

In [None]:
# Build the learning-to-rank table: every candidate row from the pre-ranking
# step plus one synthetic "NIL" candidate per query (notary_id), then split
# the queries 80/10/10 into train / test / validation.
SPLIT_SEED = 42  # fixed so the split survives Restart & Run All

df = pd.read_csv('preranking.csv')
df['is_nil'] = 0

# Collect the NIL rows first and concatenate once; growing `df` inside the
# loop re-copies the whole frame for every query.
nil_rows = []
for notary_id in df.notary_id.unique():
    values = df[df.notary_id == notary_id]
    # Start from the per-column maximum over the query's candidates ...
    nil = {column: values[column].max() for column in df}
    # ... but take the minimum for these two features.
    nil['name_count'] = values.name_count.min()
    nil['day_dif'] = values.day_dif.min()
    nil['voc_id'] = 'NIL'
    nil['is_nil'] = 1
    # NIL is the correct answer exactly when no real candidate matches.
    nil['match'] = 0 if values.match.mean() > 0 else 1
    nil_rows.append(nil)

df = pd.concat([df, pd.DataFrame(nil_rows)]).reset_index(drop=True)

# Split by query id so all candidates of one query land in the same subset.
rng = np.random.default_rng(SPLIT_SEED)
query_ids = df.notary_id.unique()
indexes = list(rng.choice(query_ids, int(len(query_ids) * 0.8), replace=False))
train = df[df.notary_id.isin(indexes)]
testval = df[~df.notary_id.isin(indexes)]

holdout_ids = testval.notary_id.unique()
test_indexes = list(rng.choice(holdout_ids, len(holdout_ids) // 2, replace=False))
test = testval[testval.notary_id.isin(test_indexes)]
val = testval[~testval.notary_id.isin(test_indexes)]



In [None]:
# Persisted split. Uncomment the three `to_json` lines to snapshot a freshly
# generated split; as written, the cell reloads the saved "7" split and
# replaces whatever `train` / `test` / `val` currently hold.
# train.to_json('train7.json')
# test.to_json('test7.json')
# val.to_json('val7.json')

# The commented `*ltr.json` reads are an alternative saved split.
# train = pd.read_json('trainltr.json')
train = pd.read_json('train7.json')
# test = pd.read_json('testltr.json')
test = pd.read_json('test7.json')
# val = pd.read_json('valltr.json')
val = pd.read_json('val7.json')

In [None]:
# train1 = train
# test1 = test
# val1 = val
# train2 = train
# test2 = test
# val2 = val
#train3 = train
#test3 = test
#val3 = val

In [None]:
# Feature columns fed to the ranker. Their order matters: the 1-based position
# in this list is the feature id used by the text-file writers and by
# serialize_example. The commented lists are alternative feature sets.
# subcolumns = ['name_ratio', 'name_count', 'day_dif', 'location', 'rank', 'numships', 'is_nil'] 
# subcolumns = ['name_ratio', 'name_count', 'day_dif', 'location', 'rank', 'numships'] 
subcolumns = ['name_ratio', 'name_count', 'day_dif', 'location', 'rank', 'numships', 'keywords', 'is_nil'] 

In [None]:
def write_libsvm_file(frame, path, columns):
    """Write `frame` to `path` as LibSVM-style ranking text.

    Each row becomes ``<match> qid:<notary_id> <k>:<value> ...`` where ``k``
    is the 1-based position of the feature within `columns`. Features whose
    value equals 0 are omitted (sparse encoding). Lines are separated by
    newlines and the file carries no trailing newline.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide `match`, `notary_id` and every name in `columns`.
    path : str
        Destination file; overwritten if it exists.
    columns : sequence of str
        Feature columns, in feature-id order.
    """
    lines = []
    for row in frame.itertuples():
        parts = [str(row.match) + ' qid:' + str(row.notary_id)]
        for position, column in enumerate(columns, start=1):
            value = getattr(row, column)
            if value != 0:
                parts.append(str(position) + ':' + str(value))
        lines.append(' '.join(parts))
    # The context manager closes the handle even if a row fails to format.
    with open(path, 'w') as handle:
        handle.write('\n'.join(lines))


# One file per split. The previous copies decided on the trailing newline by
# comparing row labels with the size of the unrelated global `df`; joining the
# lines makes the end-of-file handling identical for all three files.
write_libsvm_file(train, 'train.txt', subcolumns)
write_libsvm_file(test, 'test.txt', subcolumns)
write_libsvm_file(val, 'vali.txt', subcolumns)

In [None]:
# !saved_model_cli show \
#     --dir six_features/export/1590418714 \
#     --tag_set serve \
#     --signature_def predict 

In [None]:
def serialize_example(values, subcolumn):
    """Serialise one candidate's features as a tf.train.Example byte string.

    Parameters
    ----------
    values :
        Mapping-like object (e.g. a DataFrame row) indexable by column name.
    subcolumn : sequence of str
        Feature columns; the feature at position ``i`` (0-based) is stored
        under the key ``str(i + 1)`` as a single-element float list.

    Returns
    -------
    bytes
        The serialised tf.train.Example.
    """
    feature = {
        str(position): tf.train.Feature(
            float_list=tf.train.FloatList(value=[values[column]]))
        for position, column in enumerate(subcolumn, start=1)
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()

def predict(rows, id_list, directory):
    """Score every candidate, plus a synthetic NIL candidate, for each query.

    Parameters
    ----------
    rows : pandas.DataFrame
        Candidate rows; must contain `notary_id`, `voc_id`, `name_count`,
        `day_dif` and the columns listed in the global `subcolumns`.
    id_list : iterable
        The `notary_id` values to score.
    directory : str
        SavedModel export directory, loaded with the "serve" tag.

    Returns
    -------
    dict
        Maps each id to a list of `(score, voc_id)` tuples, the NIL
        candidate last.
    """
    scores_by_query = {}
    with Session() as sess:
        loader.load(sess, ["serve"], directory)
        for query_id in id_list:
            candidates = rows[rows.notary_id == query_id]
            # NIL row: column-wise maximum of the query's candidates, except
            # for the two features overridden with the minimum below.
            nil_row = {
                column: candidates[column].max()
                for column in candidates
                if column != 'voc_id'
            }
            nil_row['name_count'] = candidates.name_count.min()
            nil_row['day_dif'] = candidates.day_dif.min()
            nil_row['voc_id'] = 'NIL'
            nil_row['is_nil'] = 1
            candidates = pd.concat([candidates, pd.DataFrame(data=nil_row, index=[-1])])

            serialized_examples = [
                serialize_example(candidates.iloc[position], subcolumns)
                for position in range(len(candidates))
            ]
            outputs = sess.run(
                'groupwise_dnn_v2/accumulate_scores/div_no_nan:0',
                feed_dict={'input_example_tensor:0': serialized_examples},
            )
            scores_by_query[query_id] = [
                (outputs[position][0], candidates.iloc[position].voc_id)
                for position in range(len(outputs))
            ]
    return scores_by_query

def test_ranker(df, directory):
    tp = 0
    fp = 0
    fn1 = 0
    fn2 = 0
    tn = 0
    id_list = df.notary_id.unique()
    
    ranking = predict(df, id_list, directory)
    for x in id_list:
        try:
            predicted_match = max(ranking[x])
        except:
            for y in ranking[x]:
                #print(y[1])
                if y[1] == "NIL":
                    predicted_match=y

        #predicted_match[1] == 'NIL' 
        if predicted_match[1] == 'NIL':
            if df[df.notary_id == x].match.mean() > 0:
                fn1 += 1
            else:
                tn += 1
        else:
            target = df[(df.voc_id == predicted_match[1]) & (df.notary_id == x)]
            if target.match.iloc[0] == 1:
                tp += 1
            else:
                fp += 1
                if df[df.notary_id == x].match.mean() > 0:
                    fn2 += 1
                    
    recall = tp / (tp + (fn1 + fn2))
    if tp == 0:
        precision = 0
    else:
        precision = tp / (tp + fp)
    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    if precision == 0:
        print('F1: 0')
        f1 = 0
    else:
        f1 = 2*((precision*recall) / (precision + recall))
        print('F1: ' + str(f1))
    return {'true_positives':tp, 
            'false_positives':fp, 
            'false_negatives_threshold':fn1, 
            'false_negatives_ranker':fn2, 
            'true_negatives':tn, 
            'recall':recall, 
            'precision':precision,
            'f1':f1}
    

In [None]:
# Fraction of rows in the test split that are real (non-NIL) candidates labelled as a match.
print(len(test[(test['voc_id'] != 'NIL') & (test.match == 1)])/ len(test))
# Stored NIL rows are filtered out before evaluation; predict() appends its own NIL candidate per query.
print(test_ranker(test[test['voc_id'] != 'NIL'], 'LTR_models/nil_test2/export/1591714572'))
print('______________________________________________________________________________________')
# Earlier runs against other exported models, kept for reference.
# print(len(test1[(test1['voc_id'] != 'NIL') & (test1.match == 1)]) / len(test1))
# print(test_ranker(test1[test1['voc_id'] != 'NIL'], 'LTR_models/nil_test/export/1591713231'))
# print('______________________________________________________________________________________')
# print(len(test[(test['voc_id'] != 'NIL') & (test.match == 1)])/ len(test))
# print(test_ranker(test[test['voc_id'] != 'NIL'], 'LTR_models/nil_test/export/1591707535'))
# print('______________________________________________________________________________________')

# print(len(test3[(test3['voc_id'] != 'NIL') & (test3.match == 1)])/ len(test3))
# print(test_ranker(test3[test3['voc_id'] != 'NIL'], 'LTR_models/nil_test4/export/1591711791'))
# print('______________________________________________________________________________________')

## Results



## Cross Validation

In [None]:
def ranker_validation(df, directory, threshold):
    """Evaluate a ranker using a score threshold as the NIL decision.

    For every query the top-scored candidate returned by `predict` is the
    prediction; if its score is not above `threshold` the query is treated
    as having no match.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate rows with `notary_id`, `voc_id` and `match` columns (plus
        whatever `predict` needs).
    directory : str
        SavedModel export directory passed on to `predict`.
    threshold : number
        Scores less than or equal to this value count as "no match".

    Returns
    -------
    dict
        Counts (`true_positives`, `false_positives`,
        `false_negatives_threshold` = rejected by the threshold although the
        query has a match, `false_negatives_ranker` = wrong candidate
        accepted although the query has a match, `true_negatives`) and
        `recall`, `precision`, `f1`. Recall is 0 when there is nothing to
        recall.
    """
    tp = 0
    fp = 0
    fn1 = 0
    fn2 = 0
    tn = 0
    id_list = df.notary_id.unique()

    ranking = predict(df, id_list, directory)
    for x in id_list:
        try:
            predicted_match = max(ranking[x])
        except TypeError:
            # Tied scores make max() compare the ids, and the string 'NIL'
            # cannot be ordered against non-string ids; decide on score only.
            predicted_match = max(ranking[x], key=lambda candidate: candidate[0])

        has_match = df[df.notary_id == x].match.mean() > 0
        if predicted_match[0] <= threshold:
            if has_match:
                fn1 += 1
            else:
                tn += 1
        else:
            target = df[(df.voc_id == predicted_match[1]) & (df.notary_id == x)]
            if target.match.iloc[0] == 1:
                tp += 1
            else:
                fp += 1
                if has_match:
                    fn2 += 1

    # Guard the denominators: a sample without any positive would otherwise
    # raise ZeroDivisionError.
    positives = tp + fn1 + fn2
    recall = tp / positives if positives else 0
    precision = tp / (tp + fp) if tp else 0
    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    if precision == 0:
        print('F1: 0')
        f1 = 0
    else:
        f1 = 2*((precision*recall) / (precision + recall))
        print('F1: ' + str(f1))
    return {'true_positives':tp, 
            'false_positives':fp, 
            'false_negatives_threshold':fn1, 
            'false_negatives_ranker':fn2, 
            'true_negatives':tn, 
            'recall':recall, 
            'precision':precision,
            'f1':f1}
    
    

In [None]:
def write_libsvm_file(frame, path, columns):
    """Write `frame` to `path` as LibSVM-style ranking text.

    Each row becomes ``<match> qid:<notary_id> <k>:<value> ...`` where ``k``
    is the 1-based position of the feature within `columns`; features equal
    to 0 are omitted. Lines are newline-separated, with no trailing newline.
    Defined in this cell so the cell does not depend on earlier helper cells.
    """
    lines = []
    for row in frame.itertuples():
        parts = [str(row.match) + ' qid:' + str(row.notary_id)]
        for position, column in enumerate(columns, start=1):
            value = getattr(row, column)
            if value != 0:
                parts.append(str(position) + ':' + str(value))
        lines.append(' '.join(parts))
    with open(path, 'w') as handle:
        handle.write('\n'.join(lines))


# NOTE(review): the folds are drawn over ROWS of `df`, so candidates of the
# same query (notary_id) can end up in both the train and the test part of a
# fold. The commented lines below sketch a per-query split; confirm which one
# the experiment should use.
FOLD_DIR = 'ranking_crossvalidation/train_files/'
kf = KFold(n_splits = 10, shuffle=True, random_state=42)  # seeded for reproducible folds
for c, (train_index, test_index) in enumerate(kf.split(df), start=1):
#     train = df[df.notary_id.isin(df.notary_id.unique()[train_index])]
#     test = df[df.notary_id.isin(df.notary_id.unique()[test_index])]
    # KFold yields positional indices, so select by position rather than label.
    train = df.iloc[train_index]
    test = df.iloc[test_index]

    write_libsvm_file(train, FOLD_DIR + 'train' + str(c) + '.txt', subcolumns)
    write_libsvm_file(test, FOLD_DIR + 'test' + str(c) + '.txt', subcolumns)
    # The validation file is written from the same rows as the test file.
    write_libsvm_file(test, FOLD_DIR + 'vali' + str(c) + '.txt', subcolumns)

    test.to_json(FOLD_DIR + 'test' + str(c) + '.json')

In [None]:
# Row counts of the most recently assigned `train` / `test` frames
# (after the K-fold cell, those of the last fold).
print(len(train))
print(len(test))

In [None]:
# Fold 1: evaluate the exported model on that fold's held-out rows.
test = pd.read_json('ranking_crossvalidation/train_files/test1.json')
print('Amount of matches: ' + str(len(test[test.match == 1])))
# Stored NIL rows are dropped; predict() adds its own NIL candidate per query.
print(test_ranker(test[test.voc_id != 'NIL'], 'ranking_crossvalidation/kfold1/export/1591783221'))

In [None]:
# Fold 2: evaluate the exported model on that fold's held-out rows.
test = pd.read_json('ranking_crossvalidation/train_files/test2.json')
# `ranked` is initialised here but not used within this cell.
ranked = []
print('Amount of matches: ' + str(len(test[test.match == 1])))
# Stored NIL rows are dropped; predict() adds its own NIL candidate per query.
print(test_ranker(test[test.voc_id != 'NIL'], 'ranking_crossvalidation/kfold2/export/1591782941'))

In [None]:
# Fold 3: evaluate the exported model on that fold's held-out rows.
test = pd.read_json('ranking_crossvalidation/train_files/test3.json')
print('Amount of matches: ' + str(len(test[test.match == 1])))
# Stored NIL rows are dropped; predict() adds its own NIL candidate per query.
print(test_ranker(test[test.voc_id != 'NIL'], 'ranking_crossvalidation/kfold3/export/1591784374'))

In [None]:
# Fold 3 again, this time against the model exported under `kfold3-2`.
test = pd.read_json('ranking_crossvalidation/train_files/test3.json')
print('Amount of matches: ' + str(len(test[test.match == 1])))
# Stored NIL rows are dropped; predict() adds its own NIL candidate per query.
print(test_ranker(test[test.voc_id != 'NIL'], 'ranking_crossvalidation/kfold3-2/export/1591784587'))


In [None]:
# Fold 4: sweep the score threshold and report the best F1.
# The fold-4 model is evaluated on the fold-4 hold-out; the previous second
# read replaced it with fold 3's rows before they were ever used.
test = pd.read_json('ranking_crossvalidation/train_files/test4.json')
ranked = []
print('Amount of matches: ' + str(len(test[test.match == 1])))
for x in range(-5, 6, 1):
    #print('Threshold: ' +  str(x))
    # ranker_validation is the evaluator that accepts a threshold;
    # test_ranker takes only (df, directory) and would raise TypeError here.
    rankje = ranker_validation(test, 'ranking_crossvalidation/kfold4/export/1591179797', x)
    ranked.append((rankje['f1'], x))
# max() orders the (f1, threshold) tuples by f1 first, then by threshold.
best_f1, best_threshold = max(ranked)
print('Best f1: ' + str(best_f1))
print('Threshold: ' + str(best_threshold))

In [None]:
# Fold 5: report the metrics for every score threshold from -5 to 5.
test = pd.read_json('ranking_crossvalidation/train_files/test5.json')
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print('Amount of matches: ' + str(len(test[test.match == 1])))
    # Use the threshold-aware evaluator and pass the loop's threshold;
    # test_ranker takes only (df, directory), and the constant 0 made every
    # iteration identical.
    print(ranker_validation(test, 'ranking_crossvalidation/kfold5/export/1591180581', x))
    print('_________________________________________________________________________')



In [None]:
# Fold 6: report the metrics for every score threshold from -5 to 5.
test = pd.read_json('ranking_crossvalidation/train_files/test6.json')
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print('Amount of matches: ' + str(len(test[test.match == 1])))
    # Use the threshold-aware evaluator and pass the loop's threshold;
    # test_ranker takes only (df, directory), and the constant 0 made every
    # iteration identical.
    print(ranker_validation(test, 'ranking_crossvalidation/kfold6/export/1591181359', x))
    print('_________________________________________________________________________')

In [None]:
# Fold 7: report the metrics for every score threshold from -5 to 5.
test = pd.read_json('ranking_crossvalidation/train_files/test7.json')
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print('Amount of matches: ' + str(len(test[test.match == 1])))
    # Use the threshold-aware evaluator and pass the loop's threshold;
    # test_ranker takes only (df, directory), and the constant 0 made every
    # iteration identical.
    print(ranker_validation(test, 'ranking_crossvalidation/kfold7/export/1591098103', x))
    print('_________________________________________________________________________')

In [None]:
# Fold 8: report the metrics for every score threshold from -5 to 5.
# NOTE(review): the export id 1591098103 is the same one the fold-7 cell
# uses under kfold7/ — confirm this is the right export for fold 8.
test = pd.read_json('ranking_crossvalidation/train_files/test8.json')
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print('Amount of matches: ' + str(len(test[test.match == 1])))
    # Use the threshold-aware evaluator and pass the loop's threshold;
    # test_ranker takes only (df, directory), and the constant 0 made every
    # iteration identical.
    print(ranker_validation(test, 'ranking_crossvalidation/kfold8/export/1591098103', x))
    print('_________________________________________________________________________')


In [None]:
# Fold 9: report the metrics for every score threshold from -5 to 5.
test = pd.read_json('ranking_crossvalidation/train_files/test9.json')
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print('Amount of matches: ' + str(len(test[test.match == 1])))
    # Use the threshold-aware evaluator and pass the loop's threshold;
    # test_ranker takes only (df, directory), and the constant 0 made every
    # iteration identical.
    print(ranker_validation(test, 'ranking_crossvalidation/kfold9/export/1591035541', x))
    print('_________________________________________________________________________')

In [None]:
# Fold 10: report the metrics for every score threshold from -5 to 5.
test = pd.read_json('ranking_crossvalidation/train_files/test10.json')
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print('Amount of matches: ' + str(len(test[test.match == 1])))
    # Use the threshold-aware evaluator and pass the loop's threshold;
    # test_ranker takes only (df, directory), and the constant 0 made every
    # iteration identical.
    print(ranker_validation(test, 'ranking_crossvalidation/kfold10/export/1591036489', x))
    print('_________________________________________________________________________')


#### Pointwise LTR_models/pointwise/export/1590928427
Threshold: -2  
Recall: 0.8333333333333334  
Precision: 0.8333333333333334  
F1: 0.8333333333333334

#### Pointwise lower learning LTR_models/pointwise_lower_learning/export/1590936596
Threshold: -1  
Recall: 0.7777777777777778  
Precision: 0.7777777777777778  
F1: 0.7777777777777778

#### Pointwise higher dropout LTR_models/pointwise_higher_dropout/export/1590938159
Threshold: -2  
Recall: 0.7222222222222222  
Precision: 0.6842105263157895  
F1: 0.7027027027027027

#### Pointwise lower dropout LTR_models/pointwise_lower_dropout/export/1591009109
Threshold: 0  
Recall: 0.7692307692307693  
Precision: 1.0  
F1: 0.8695652173913044

#### Pointwise higher learning LTR_models/pointwise_higher_learning/export/1591010427
Threshold: -5  
Recall: 0.9230769230769231  
Precision: 0.75  
F1: 0.8275862068965517

#### Pointwise combined1  LTR_models/pointwise_combined1/export/1591011492
Threshold: -1  
Recall: 0.7692307692307693  
Precision: 0.9090909090909091  
F1: 0.8333333333333333

#### Pairwise LTR_models/pairwise/export/1590929599
Threshold: -1  
Recall: 0.6111111111111112  
Precision: 0.7857142857142857  
F1: 0.6875000000000001

#### Listwise LTR_models/listwise/export/1590930860
Threshold: 0  
Recall: 0.5555555555555556  
Precision: 0.8333333333333334  
F1: 0.6666666666666667  

#### Listwise lower learning LTR_models/listwise_lower_learning/export/1590932665
Threshold: 0  
Recall: 0.6111111111111112  
Precision: 0.9166666666666666  
F1: 0.7333333333333334

#### Listwise higher dropout  LTR_models/listwise_higher_dropout/export/1590934369
Threshold: 0  
Recall: 0.6111111111111112  
Precision: 0.8461538461538461  
F1: 0.7096774193548387  

#### Listwise combined LTR_models/listwise_combined/export/1590935654
Threshold: -1  
Recall: 0.7777777777777778  
Precision: 0.8235294117647058  
F1: 0.7999999999999999

#### Listwise lower dropout LTR_models/listwise_lower_dropout/export/1591013445
Threshold: 0  
Recall: 0.7222222222222222  
Precision: 0.8666666666666667  
F1: 0.7878787878787877

#### Listwise higher learning LTR_models/listwise_higher_learning/export/1591015344
Threshold: 0  
Recall: 0.6666666666666666  
Precision: 0.9230769230769231  
F1: 0.7741935483870968

#### Listwise combined2 LTR_models/listwise_combined2/export/1591016667
Threshold: -1  
Recall: 0.7222222222222222  
Precision: 0.8125  
F1: 0.7647058823529411

#### Listwise combined3 LTR_models/listwise_combined3/export/1591020106
Threshold: 0  
Recall: 0.6666666666666666  
Precision: 1.0  
F1: 0.8

## Dedupe

In [13]:
"""
This code demonstrates how to use dedupe with a comma separated values
(CSV) file. All operations are performed in memory, so will run very
quickly on datasets up to ~10,000 rows.

We start with a CSV file containing our messy data. In this example,
it is listings of early childhood education centers in Chicago
compiled from several different sources.

The output will be a CSV with our clustered results.

For larger datasets, see our [mysql_example](mysql_example.html)
"""

import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode


def preProcess(column):
    """
    Normalise one text field for comparison.

    The value is transliterated to ASCII with Unidecode, newlines become
    spaces, runs of spaces are collapsed, surrounding whitespace and quote
    characters are stripped and the result is lower-cased.

    Returns the cleaned string, or `None` when nothing is left.
    """
    column = unidecode(column)
    # Replace newlines BEFORE collapsing spaces; the other order leaves
    # double spaces behind for input such as "a \n b".
    column = re.sub('\n', ' ', column)
    column = re.sub('  +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column


def readData(filename):
    """
    Load a CSV file into a dictionary of cleaned records.

    Every value of every row is passed through `preProcess`. The records are
    keyed by the integer value of the row's `index` column.
    """
    records = {}
    with open(filename) as handle:
        for row in csv.DictReader(handle):
            cleaned = {key: preProcess(value) for key, value in row.items()}
            records[int(row['index'])] = cleaned
    return records


## Record Linking

In [70]:
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

def preProcess(column):
    """
    Normalise one text field for record linkage.

    After ASCII transliteration the substitutions below are applied in
    order: newlines, slashes and colons become spaces; hyphens, apostrophes
    and commas are removed; runs of spaces are collapsed. The result is
    stripped of surrounding whitespace and quotes and lower-cased.

    Returns the cleaned string, or `None` when nothing is left.
    """
    substitutions = (
        ('\n', ' '),
        ('-', ''),
        ('/', ' '),
        ("'", ''),
        (",", ''),
        (":", ' '),
        ('  +', ' '),
    )
    text = unidecode(column)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip().strip('"').strip("'").lower().strip()
    return text if text else None


def readData(filename):
    """
    Load a CSV file into a dictionary of cleaned records.

    Every value of every row is passed through `preProcess`. Each record is
    keyed by `filename` followed by the row's 0-based position in the file.
    """
    records = {}
    with open(filename) as handle:
        for position, row in enumerate(csv.DictReader(handle)):
            records[filename + str(position)] = {
                key: preProcess(value) for key, value in row.items()
            }
    return records

# ## Setup

# Output of the linkage run, and the files dedupe uses to persist its learned
# settings and the labelled training pairs.
output_file = 'data_matching_output.csv'
settings_file = 'less_is_more2'
training_file = 'less_is_more2.json'

# Training-set inputs; immediately superseded by the assignments below.
left_file = 'train_notary.csv'
right_file = 'train_voc.csv'

# These later assignments win: as written, the run links the test files.
left_file = 'test_notary.csv'
right_file = 'test_voc.csv'


print('importing data ...')
# Record ids are the file name followed by the row position (see readData).
data_1 = readData(left_file)
data_2 = readData(right_file)

def descriptions():
    """Yield the 'description' value of every record in `data_1`, then `data_2`."""
    for dataset in (data_1, data_2):
        yield from (record['description'] for record in dataset.values())

# ## Training

if os.path.exists(settings_file):
    # A settings file from a previous run exists: reuse the learned model
    # and skip training entirely.
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

else:
    # Define the fields the linker will pay attention to. All five are
    # compared as plain strings and may be missing.
    fields = [
        {'field': 'name', 'type': 'String', 'has missing': True},
        {'field': 'rank', 'type': 'String', 'has missing': True},
        {'field': 'location', 'type': 'String', 'has missing': True},
        {'field': 'ship_out', 'type': 'String', 'has missing': True},
        {'field': 'ship_return', 'type': 'String', 'has missing': True}
        ]

    # Create a new linker object and pass our data model to it.
    linker = dedupe.RecordLink(fields)

    # If we have training data saved from a previous run of linker,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        # The handle is deliberately NOT called `tf`: in this notebook that
        # name is the TensorFlow module and rebinding it at cell level would
        # break the ranking cells on re-run.
        with open(training_file) as training_fh:
            linker.prepare_training(data_1,
                                    data_2,
                                    training_file=training_fh,
                                    sample_size=15000)
    else:
        linker.prepare_training(data_1, data_2, sample_size=15000)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.console_label(linker)

    linker.train()

    # When finished, save our training away to disk
    with open(training_file, 'w') as training_fh:
        linker.write_training(training_fh)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        linker.write_settings(sf)

# ## Blocking

# ## Clustering

# Find the threshold that will maximize a weighted average of our
# precision and recall.  When we set the recall weight to 2, we are
# saying we care twice as much about recall as we do precision.
#
# If we had more data, we would not pass in all the blocked data into
# this function but a representative sample.

print('clustering...')
# The third positional argument (0.0) is presumably the match-score threshold
# — TODO confirm against the installed dedupe version's `join` signature.
linked_records = linker.join(data_1, data_2, 0.0)

# Each element is later unpacked as (group of record ids, score).
print('# duplicate sets', len(linked_records))
# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

# Map every linked record id to its cluster number and link score.
cluster_membership = {}
for cluster_id, (cluster, score) in enumerate(linked_records):
    for record_id in cluster:
        cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                         'Link Score': score}

# The csv module expects files opened with newline='' so that it controls
# line endings itself (otherwise blank rows can appear on some platforms and
# quoted fields containing newlines are misread).
with open(output_file, 'w', newline='') as f:

    writer = None  # created once, from the first input file's header

    for fileno, filename in enumerate((left_file, right_file)):
        with open(filename, newline='') as f_input:
            reader = csv.DictReader(f_input)

            if writer is None:

                fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                              reader.fieldnames)

                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()

            for row_id, row in enumerate(reader):

                # Same id scheme as readData: file name + row position.
                record_id = filename + str(row_id)
                cluster_details = cluster_membership.get(record_id, {})
                row['source file'] = fileno
                # Unlinked rows get no cluster columns; DictWriter leaves
                # those cells empty.
                row.update(cluster_details)

                writer.writerow(row)


INFO:dedupe.api:((LevenshteinSearchPredicate: (4, name), TfidfNGramSearchPredicate: (0.6, ship_out)), (SimplePredicate: (commonThreeTokens, name), TfidfTextSearchPredicate: (0.2, ship_return)), (SimplePredicate: (sameThreeCharStartPredicate, location), SimplePredicate: (suffixArray, ship_return)), (SimplePredicate: (commonTwoTokens, ship_return), SimplePredicate: (doubleMetaphone, rank)))


importing data ...
reading from less_is_more2
clustering...
# duplicate sets 23


In [71]:
import csv
import collections
import itertools
import os

def evaluateDuplicates(found_dupes, true_dupes):
    """Print the count of found pairs, then precision, recall and F1.

    Parameters
    ----------
    found_dupes : set
        Pairs produced by the linker.
    true_dupes : set
        Ground-truth pairs.

    A metric whose denominator is zero (nothing found, no true pairs, or no
    overlap at all) is reported as 0.0 instead of raising ZeroDivisionError.
    Nothing is returned.
    """
    true_positives = found_dupes.intersection(true_dupes)

    print('found duplicate')
    print(len(found_dupes))

    print('precision')
    # found_dupes is exactly the true positives plus the false positives.
    precision = len(true_positives) / len(found_dupes) if found_dupes else 0.0
    print(precision)

    print('recall')
    recall = len(true_positives) / float(len(true_dupes)) if true_dupes else 0.0
    print(recall)

    print('f1')
    if precision + recall:
        f1 = 2*((precision*recall) / (precision + recall))
    else:
        f1 = 0.0
    print(f1)


def linkPairs(filename, rowname) :
    """Collect cross-source row pairs that share a link id.

    Parameters
    ----------
    filename : str
        CSV file with a `source file` column and the column named by
        `rowname`.
    rowname : str
        Column holding the link / cluster id; rows with an empty value are
        ignored.

    Returns
    -------
    set of frozenset
        For every link id, each combination that takes one row number
        (0-based position in the file) from each source file; ids seen in
        only one source file contribute nothing.
    """
    groups = {}

    with open(filename) as handle:
        rows = csv.DictReader(handle, delimiter=',', quotechar='"')
        for row_number, row in enumerate(rows):
            source_file, link_id = row['source file'], row[rowname]
            if not link_id:
                continue
            by_source = groups.setdefault(link_id, collections.defaultdict(list))
            by_source[source_file].append(row_number)

    return {
        frozenset(pair)
        for by_source in groups.values()
        for pair in itertools.product(*by_source.values())
        if len(pair) > 1
    }

# Evaluate the linkage output written by the record-linking cell.
clusters = 'data_matching_output.csv'

# 'cluster_id' is presumably a ground-truth column carried over from the
# input CSVs — TODO confirm; 'Cluster ID' is the column the linker run adds.
true_dupes = linkPairs(clusters, 'cluster_id')
test_dupes = linkPairs(clusters, 'Cluster ID')
evaluateDuplicates(test_dupes, true_dupes)



found duplicate
23
precision
0.7391304347826086
recall
0.68
f1
0.7083333333333334


## Results

### Threshold 0.0:
#### 30 yes 30 
precision: 0.7391304347826086  
recall: 0.68  
f1: 0.7083333333333334  

#### 55 yes 80 no
precision: 0.6923076923076923  
recall: 0.72  
f1: 0.7058823529411765  


### Threshold 0.5
#### 30 yes 30 
precision: 0.8095238095238095  
recall: 0.68  
f1: 0.7391304347826089  

#### 55 yes 80 no
precision: 0.75  
recall: 0.72  
f1: 0.7346938775510204  

#### 47 yes 30 no
precision: 0.7777777777777778  
recall: 0.56  
f1: 0.6511627906976745  

### Threshold 0.99
#### 30 yes 30 
precision: 0.85  
recall: 0.68  
f1: 0.7555555555555556  

#### 55 yes 80 no
precision: 0.7727272727272727  
recall: 0.68  
f1: 0.7234042553191491  


## Nieuwe data

#### 37 yes 100 no
precision: 0.5925925925925926  
recall: 0.64  
f1: 0.6153846153846153  
