# Begin here

Import the required libraries

In [2]:
import pandas as pd
import numpy as np
import recordlinkage
from recordlinkage.preprocessing import clean, phonetic
import pickle

from bitarray import bitarray

In [3]:
import recordlinkage
from recordlinkage.datasets import load_febrl4

## Import the dataset of interest

Import the dataset and define the working dataframe.

#### Load the FEBRL 4 datasets.

The Freely Extensible Biomedical Record Linkage (Febrl) package is distributed with a dataset generator `recordlinkage.datasets.load_febrl4` and four datasets generated with the generator. This function returns the fourth Febrl dataset as a pandas.DataFrame.

Generated as one data set with 10000 records 

    * 5000 originals (dataset4a.csv) and 
    * 5000 duplicates (dataset4b.csv), with one duplicate per original 


Parameters:	return_links (bool) – When True, the function returns also the true links.
Returns:(pandas.DataFrame, pandas.DataFrame) – A pandas.DataFrame with Febrl dataset4a.csv and a pandas dataframe with Febrl dataset4b.csv. When return_links is True, the function returns also the true links.

In [4]:
dfA, dfB, true_links = load_febrl4(return_links= True)

In [5]:
dfA

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-1070-org,michaela,neumann,8,stanley street,miami,winston hills,4223,nsw,19151111,5304218
rec-1016-org,courtney,painter,12,pinkerton circuit,bega flats,richlands,4560,vic,19161214,4066625
rec-4405-org,charles,green,38,salkauskas crescent,kela,dapto,4566,nsw,19480930,4365168
rec-1288-org,vanessa,parr,905,macquoid place,broadbridge manor,south grafton,2135,sa,19951119,9239102
rec-3585-org,mikayla,malloney,37,randwick road,avalind,hoppers crossing,4552,vic,19860208,7207688
...,...,...,...,...,...,...,...,...,...,...
rec-2153-org,annabel,grierson,97,mclachlan crescent,lantana lodge,broome,2480,nsw,19840224,7676186
rec-1604-org,sienna,musolino,22,smeaton circuit,pangani,mckinnon,2700,nsw,19890525,4971506
rec-1003-org,bradley,matthews,2,jondol place,horseshoe ck,jacobs well,7018,sa,19481122,8927667
rec-4883-org,brodee,egan,88,axon street,greenslopes,wamberal,2067,qld,19121113,6039042


In [6]:
#import our datasets 
# dataset1 = pd.read_csv(r'C:\Users\PVP1\Desktop\CDC_Python_Week1\data\Dataset1.csv')
# dataset2 = pd.read_csv(r'C:\Users\PVP1\Desktop\CDC_Python_Week1\data\Dataset2.csv')

dataset1 = dfA.rename(columns={'given_name': 'firstName', 'surname': 'lastName','postcode':'zip code','soc_sec_id':'social_sn','date_of_birth':'dateOfBirth'})
dataset2 = dfB.rename(columns={'given_name': 'firstName', 'surname': 'lastName','postcode':'zip code','soc_sec_id':'social_sn','date_of_birth':'dateOfBirth'})

In [7]:
#create the linkage dataset1
working_df1 = dataset1[['social_sn','firstName','lastName','zip code','dateOfBirth']]
#create the linkage dataset2
working_df2 = dataset2[['social_sn','firstName','lastName','zip code','dateOfBirth']]
#working_df.head()

In [8]:
working_df1.head()

Unnamed: 0_level_0,social_sn,firstName,lastName,zip code,dateOfBirth
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rec-1070-org,5304218,michaela,neumann,4223,19151111
rec-1016-org,4066625,courtney,painter,4560,19161214
rec-4405-org,4365168,charles,green,4566,19480930
rec-1288-org,9239102,vanessa,parr,2135,19951119
rec-3585-org,7207688,mikayla,malloney,4552,19860208


In [9]:
working_df2.head()

Unnamed: 0_level_0,social_sn,firstName,lastName,zip code,dateOfBirth
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rec-561-dup-0,1551941,elton,,3212,19651013
rec-2642-dup-0,8859999,mitchell,maxon,3355,19390212
rec-608-dup-0,9731855,,white,3159,19620216
rec-3239-dup-0,4970481,elk i,menzies,2585,19980624
rec-2886-dup-0,1366884,,garanggar,2342,19921016


In [10]:
# Add the Id columns
# import random

# working_df1['id'] = pd.Series(['NHS{}'.format(x).zfill(5) for x in random.sample(range(99999), 1000)])   
# working_df2['id'] = pd.Series(['CN{}'.format(x).zfill(5) for x in random.sample(range(9999), 1000)]) 

In [11]:
# working_df1 = working_df1.set_index('id')
# working_df2 = working_df2.set_index('id')

In [12]:
#working_df1

In [13]:
#working_df2

## Data Cleaning

### handle missing values
We will fill all missing values with empty strings. Empty strings are able to encoded in bloom filters 

## update this for the NaN

In [14]:
#fill missing data with empty string
working_df1 = working_df1.fillna(' ')
working_df2 = working_df2.fillna(' ')

### clean the data - Using the `clean` function from the `recordlinkage` library

Clean strings in the Series by removing unwanted tokens, whitespace and brackets. 
The `clean` function is from the `recordlinkage` library. 
When the cleaning is completed, all unwanted characters have been taken out of all the values in the dataset and we are ready to begin the encoding steps

In [15]:
# Dataset 1
working_df1['social_sn_cln'] = clean(working_df1['social_sn'])
working_df1['firstname_cln'] = clean(working_df1['firstName'])
working_df1['lastname_cln'] = clean(working_df1['lastName'])
#working_df1['phonenumber_cln'] = clean(working_df1['phoneNumber'])
#working_df1['gender_cln'] = clean(working_df1['gender'])

working_df1.head()

  s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
  s = s.str.replace(replace_by_none, '')
  s = s.str.replace(replace_by_whitespace, ' ')
  s = s.str.replace(r'\s\s+', ' ')


Unnamed: 0_level_0,social_sn,firstName,lastName,zip code,dateOfBirth,social_sn_cln,firstname_cln,lastname_cln
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rec-1070-org,5304218,michaela,neumann,4223,19151111,5304218,michaela,neumann
rec-1016-org,4066625,courtney,painter,4560,19161214,4066625,courtney,painter
rec-4405-org,4365168,charles,green,4566,19480930,4365168,charles,green
rec-1288-org,9239102,vanessa,parr,2135,19951119,9239102,vanessa,parr
rec-3585-org,7207688,mikayla,malloney,4552,19860208,7207688,mikayla,malloney


In [16]:
# Dataset 2
working_df2['social_sn_cln'] = clean(working_df2['social_sn'])
working_df2['firstname_cln'] = clean(working_df2['firstName'])
working_df2['lastname_cln'] = clean(working_df2['lastName'])
#working_df2['phonenumber_cln'] = clean(working_df2['phoneNumber'])
#working_df2['gender_cln'] = clean(working_df2['gender'])

working_df2.head()

  s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
  s = s.str.replace(replace_by_none, '')
  s = s.str.replace(replace_by_whitespace, ' ')
  s = s.str.replace(r'\s\s+', ' ')


Unnamed: 0_level_0,social_sn,firstName,lastName,zip code,dateOfBirth,social_sn_cln,firstname_cln,lastname_cln
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rec-561-dup-0,1551941,elton,,3212,19651013,1551941,elton,
rec-2642-dup-0,8859999,mitchell,maxon,3355,19390212,8859999,mitchell,maxon
rec-608-dup-0,9731855,,white,3159,19620216,9731855,,white
rec-3239-dup-0,4970481,elk i,menzies,2585,19980624,4970481,elk i,menzies
rec-2886-dup-0,1366884,,garanggar,2342,19921016,1366884,,garanggar


## Phonetic Encoding:

Convert names or strings into phonetic codes.

We are using the phonetic encoding function `phonetic` contained in the `recordlinkage` library. 

Different encoding methods exists for this library and they include `soundex`,`metaphone`,`nysiis` and `match-rating`. However, in this use case, we will use the `soundex` nethod because it is the most commonly used of all the four methods listed.

You can choose to use any of the listed methods for your implementation.

So here we encode `firstname` and `lastname`. The phonetic encoding method is applied to handle errors that may occur due to name misspelling or different variant of names thereby improving the quality of the match process. Example: `Zoey` vs `Zoe`, `Smith` vs `Smyth` etc.  

The output from this is clean and ready to be converted to bloom filters. 

In [17]:
# Dataset 1
working_df1['enc_firstname_cln'] = recordlinkage.preprocessing.phonetic(working_df1['firstname_cln'], 'soundex', concat=True, encoding='utf-8', decode_error='strict')
working_df1['enc_lastname_cln'] = recordlinkage.preprocessing.phonetic(working_df1['lastname_cln'], 'soundex', concat=True, encoding='utf-8', decode_error='strict')

working_df1.head()

  s = s.str.replace(r"[\-\_\s]", "")


Unnamed: 0_level_0,social_sn,firstName,lastName,zip code,dateOfBirth,social_sn_cln,firstname_cln,lastname_cln,enc_firstname_cln,enc_lastname_cln
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-1070-org,5304218,michaela,neumann,4223,19151111,5304218,michaela,neumann,M240,N550
rec-1016-org,4066625,courtney,painter,4560,19161214,4066625,courtney,painter,C635,P536
rec-4405-org,4365168,charles,green,4566,19480930,4365168,charles,green,C642,G650
rec-1288-org,9239102,vanessa,parr,2135,19951119,9239102,vanessa,parr,V520,P600
rec-3585-org,7207688,mikayla,malloney,4552,19860208,7207688,mikayla,malloney,M240,M450


In [18]:
# Dataset 2
working_df2['enc_firstname_cln'] = recordlinkage.preprocessing.phonetic(working_df2['firstname_cln'], 'soundex', concat=True, encoding='utf-8', decode_error='strict')
working_df2['enc_lastname_cln'] = recordlinkage.preprocessing.phonetic(working_df2['lastname_cln'], 'soundex', concat=True, encoding='utf-8', decode_error='strict')

working_df2.head()

  s = s.str.replace(r"[\-\_\s]", "")


Unnamed: 0_level_0,social_sn,firstName,lastName,zip code,dateOfBirth,social_sn_cln,firstname_cln,lastname_cln,enc_firstname_cln,enc_lastname_cln
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-561-dup-0,1551941,elton,,3212,19651013,1551941,elton,,E435,
rec-2642-dup-0,8859999,mitchell,maxon,3355,19390212,8859999,mitchell,maxon,M324,M250
rec-608-dup-0,9731855,,white,3159,19620216,9731855,,white,,W300
rec-3239-dup-0,4970481,elk i,menzies,2585,19980624,4970481,elk i,menzies,E420,M522
rec-2886-dup-0,1366884,,garanggar,2342,19921016,1366884,,garanggar,,G652


### Privacy Presevation
All protected information are converted to bloom filters to preseve the privacy of the records. The protected dataset is retruned and feeds the record linkage ML model

----

## Bloom Filter Method

A bloom filter is a data structure that tells you whether an element is present in a set in a memory efficient way. The base structure of a bloom filter is a bit vector.

0|0|0|1|0|0|1|0
:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:
0|1|2|3|4|5|6|7

The number below is the index while the binary above represents the bits.

To add an element to the bloom filter, the element is hashed and set the bits in the bit vector at the index of those hash values to 1.<br>
For hashing, we used the `fnv` hash function. There are other has functions that can be used in this case like the `mummur hash`, or some other cryptographic hashes like the `MD5`, `SHA1` etc.

To determine the size of bit array:



To determine the number of hash functions:




In [19]:
# hash functions murmurh and fnv
#import mmh3
#import fnvhash as fnv
from hashlib import sha256
from hashlib import md5
from nltk import ngrams
from itertools import combinations
from struct import pack

In [20]:
l = 1000 #bloom filter length
k = 15 #number of hash functions
#H = [sha256(md5(bytes(i)).digest()) for i in range(k)]

# Converts a field to a set of n-grams."""
def n_grams(field):    
    return [''.join(ng) for ng in ngrams(' {} '.format(field), 2)]

# Returns a bit vector with all values set to zero."""
def bit_vector(size):    
    return [0 for _ in range(size)]

# Returns the indices generated by h(x)."""   
def hash_indices(x): 
    s = sha256()
    m = md5()
    if type(x) is str:
        x = x.encode('UTF-8')    
    for h in range(k):
        s.update(x)
        m.update(x)        
        func1 = int(s.hexdigest(), 16)
        func2 = int(m.hexdigest(), 16)
        digest = int((func1 + h*func2) % l) #double hashing function
        yield digest
        
def construct_fbf(field):
    bigram = [n_grams(f) for f in field]
    bitvector = [bitarray(bit_vector(l)) for _ in field]
    for (S, v) in zip(bigram, bitvector):
        for x in S:
            for i in hash_indices(x):
                v[i] = 1
    return bitvector

In [21]:
#convert the selected columns to bloomfilters

# Dataset 1
working_df1['bf_firstname'] = construct_fbf(working_df1['enc_firstname_cln'])
working_df1['bf_lastname'] = construct_fbf(working_df1['enc_lastname_cln'])
working_df1['bf_ssn'] = construct_fbf(working_df1['social_sn_cln'])
working_df1['bf_dob'] = construct_fbf(working_df1['dateOfBirth'])

#-----------------------------------------------------------------------------------------
# Dataset 2
#convert the selected columns to bloomfilters
working_df2['bf_firstname'] = construct_fbf(working_df2['enc_firstname_cln'])
working_df2['bf_lastname'] = construct_fbf(working_df2['enc_lastname_cln'])
working_df2['bf_ssn'] = construct_fbf(working_df2['social_sn_cln'])
working_df2['bf_dob'] = construct_fbf(working_df2['dateOfBirth'])

# -----------------------------------------------------------------------------------------
#create the privacy preserved dataset ready to be shared and for record linkage. 
final_dataset1 = working_df1[['bf_ssn','bf_firstname','bf_lastname','bf_dob','zip code']]
final_dataset2 = working_df2[['bf_ssn','bf_firstname','bf_lastname','bf_dob','zip code']]
final_dataset1.head()

Unnamed: 0_level_0,bf_ssn,bf_firstname,bf_lastname,bf_dob,zip code
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rec-1070-org,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4223
rec-1016-org,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4560
rec-4405-org,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4566
rec-1288-org,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2135
rec-3585-org,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",4552


In [23]:
final_dataset2.head()

Unnamed: 0_level_0,bf_ssn,bf_firstname,bf_lastname,bf_dob,zip code
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rec-561-dup-0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...",3212
rec-2642-dup-0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3355
rec-608-dup-0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",3159
rec-3239-dup-0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2585
rec-2886-dup-0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2342


## Blocking & Indexing will happen here!

In [96]:
from recordlinkage.index import Block
from recordlinkage.base import BaseIndexAlgorithm
from datetime import datetime

In [97]:
def dice_coefficient(bf1, bf2):
    bf1 = np.asarray(bf1).astype(bool)
    bf2 = np.asarray(bf2).astype(bool)

    if bf1.shape != bf2.shape:
        raise ValueError("Shape mismatch: im1 and im2 must have the same shape.")

    # Compute Dice coefficient
    intersection = np.logical_and(bf1, bf2)

    return 2. * intersection.sum() / (bf1.sum() + bf2.sum())

### block on firstname only 

In [98]:
# block on firstname only 
class BlockBloomfiltersOnfirstname(BaseIndexAlgorithm):

    def _link_index(self, df_a, df_b):
        block_lst = []
        for i, bf_b in enumerate(df_b['bf_firstname']):
            block = [(df_a.index[x],df_b.index[i]) for x, bf_a in enumerate(df_a['bf_firstname']) if dice_coefficient(bf_b,bf_a) == 1]                  
            block_lst.extend(block)
        return pd.MultiIndex.from_tuples(block_lst, names=[df_a.index.name, df_b.index.name])

#blocking on firstname
indexerf = BlockBloomfiltersOnfirstname()

begin = datetime.now()

candidatesf = indexerf.index(final_dataset1, final_dataset2)
#candidates

finish = datetime.now()
ttcf = finish-begin
candidatesf_count = len(candidatesf)

### block on lastname only 

In [99]:
# block on lastname only 
class BlockBloomfiltersOnLastname(BaseIndexAlgorithm):

    def _link_index(self, df_a, df_b):
        block_lst = []
        for i, bf_b in enumerate(df_b['bf_lastname']):
            block = [(df_a.index[x],df_b.index[i]) for x, bf_a in enumerate(df_a['bf_lastname']) if dice_coefficient(bf_b,bf_a) == 1]                  
            block_lst.extend(block)
        return pd.MultiIndex.from_tuples(block_lst, names=[df_a.index.name, df_b.index.name])
    
#blocking on lastname
indexerl = BlockBloomfiltersOnLastname()
begin = datetime.now()

candidatesl = indexerl.index(final_dataset1, final_dataset2)
#candidates

finish = datetime.now()
ttcl = finish-begin
candidatesl_count = len(candidatesl)

### block on firstname or lastname 

In [100]:
class BlockBloomfiltersOnfirstorLastname(BaseIndexAlgorithm):

    def _link_index(self, df_a, df_b):
        block_lst = []
        b_data_fn = df_b['bf_firstname']
        b_data_ln = df_b['bf_lastname']
        a_data_fn = df_a['bf_firstname']
        a_data_ln = df_a['bf_lastname']
        
        for i, (fn1, ln1) in enumerate(zip(b_data_fn, b_data_ln)):
            block = [(df_a.index[x],df_b.index[i]) for x, (fn2, ln2) in enumerate(zip(a_data_fn,a_data_ln)) if dice_coefficient(fn2,fn1) == 1 or dice_coefficient(ln2,ln1) == 1]
            block_lst.extend(block)
            
        return pd.MultiIndex.from_tuples(block_lst, names=[df_a.index.name, df_b.index.name])
    
#blocking on firstname or lastname
indexerfl = BlockBloomfiltersOnfirstorLastname()
begin = datetime.now()

candidatesfl = indexerfl.index(final_dataset1, final_dataset2)
#candidates

finish = datetime.now()
ttcfl = finish-begin
candidatesfl_count = len(candidatesfl)

## Comparison

The python `recordlinkage` toolkit has some defined classes for comapring similarities between record pais. This includes methods to compare strings, numerical measures, distance measures etc. <br>
However, because our dataset contains `bloomfilters` which are of type `bytearray`, we needed to find a different way to compare the bloomfilter record pais. In our use case, we are using the `Dice coefficient`. See description below.<br> <br>
We overwrite the abstract method `recordlinkage.base.BaseCompareFeature._compute_vectorized()` in the base class with our user-defined algorithm composed of the `dice coefficient` function. See the `CompareBloomFilters` class below.

---
### Dice coefficient for comparing set similarity.

Computes the Dice coefficient, a measure of set similarity. The formula is given by:

2*|X Y|/(|X|+|Y|)

#### Parameters
----
bf1 : array-like, bool
    
    (Any array of arbitrary size. If not boolean, will be converted.)
    
bf2 : array-like, bool
    
    (Any other array of identical size. If not boolean, will be converted.)

#### Returns
----
dice : float

    Dice coefficient as a float on range [0,1].
    Maximum similarity = 1
    No similarity = 0

#### Notes
----
The order of inputs for `dice` is irrelevant. The result will be identical if `im1` and `im2` are switched.

In [101]:
# A typical Dice Coefficient function will look like this

# def dice_coefficient(bf1, bf2):
#     bf1 = np.asarray(bf1).astype(np.bool)
#     bf2 = np.asarray(bf2).astype(np.bool)

#     if bf1.shape != bf2.shape:
#         raise ValueError("Shape mismatch: im1 and im2 must have the same shape.")

#     # Compute Dice coefficient
#     intersection = np.logical_and(bf1, bf2)

#     return 2. * intersection.sum() / (bf1.sum() + bf2.sum())

In [102]:
#Import the record linkage libraries
from recordlinkage.index import Block
from recordlinkage.compare import Exact, Numeric, String
from recordlinkage.base import BaseCompareFeature

In [103]:
class CompareBloomFilters(BaseCompareFeature):

    def _compute_vectorized(self, df1, df2):

    # If the bloomfilters in both records are identical, the similarity is 1.
    # If they are not identical the similarity is 0. 
    # The closer they bloomfilters are, the similarity tends to 1.

        lst = []
        # we pick a single bloom filter from each of the series, df1, df2, convert them to boolean, 
        # get the intersection which is a logical and operation and then 
        # calculate the similarity of the 2 bloom filters based on the formular stated in the introduction cell above
        for i in range(len(df1)): 
            #converts the bloom filter from the series to boolean types
            array1 = np.asarray(df1.iloc[i]).astype(bool) 
            array2 = np.asarray(df2.iloc[i]).astype(bool) 
            # logical and operation
            intersection = np.logical_and(array1, array2) 
            # calculate the similarity score between the 2 bloom filters
            similarity = 2. * intersection.sum() / (array1.sum() + array2.sum()) # 
            lst.append(similarity)
        
        #convert the list of similarities to a series. The compute method requires either a series or a numpy array
        similarity_list = pd.Series(lst)
    
        return similarity_list

In [104]:
#initialise the compare methods
comp = recordlinkage.Compare()

# initialise similarity measurement algorithms
comp.add(CompareBloomFilters('bf_firstname', 'bf_firstname', label='firstname'))
comp.add(CompareBloomFilters('bf_lastname', 'bf_lastname', label='lasttname'))
comp.add(CompareBloomFilters('bf_ssn', 'bf_ssn', label='ssn'))
#comp.exact('gender_cln', 'gender_cln', label='gender')
#comp.add(CompareBloomFilters('bf_phone', 'bf_phone', label='phoneNumber'))
comp.add(CompareBloomFilters('bf_dob', 'bf_dob', label='dob'))
comp.exact('zip code', 'zip code', label='zipCode')

#verify the features. gives a list of the algorithms to create
comp.features

[<CompareBloomFilters 'firstname'>,
 <CompareBloomFilters 'lasttname'>,
 <CompareBloomFilters 'ssn'>,
 <CompareBloomFilters 'dob'>,
 <Exact 'zipCode'>]

In [105]:
# the method .compute() calculates the similarities across the record pairs and returns the DataFrame with the feature (similarity) vectors.  
# -- candidates - contains the multiIndex record pairs to compare
# -- dataset1, dataset2 - the dataframes you are matching
comparison_outputl = comp.compute(candidatesl, final_dataset1, final_dataset2)
comparison_outputf = comp.compute(candidatesf, final_dataset1, final_dataset2)
comparison_outputfl = comp.compute(candidatesfl, final_dataset1, final_dataset2)


In [106]:
# with open(r'C:\Users\PVP1\Desktop\CDC_Python_Week1\data\comparison_output_fln.pkl', 'wb') as f:
#     pickle.dump(comparison_output, f)

## Classification

## Record linkage Expectation Maximization Classifier

In [107]:
import recordlinkage as rl

#### classification results when blocking on lastname

In [108]:
t_index = len(final_dataset1) * len(final_dataset2)

In [109]:
ecm = rl.ECMClassifier()
data = comparison_outputl
data = data > 0.75 
col_list = ['firstname','lasttname','ssn','dob','zipCode']
for x in col_list:
    data[x] = data[x].map({True:1,False:0})

ecm_links = ecm.fit_predict(data)

comparison_outputl['true_links'] = [True if comparison_outputl.index[x] in list(true_links) else False for x in range(len(comparison_outputl))]
BMl = comparison_outputl['true_links'].sum()
cm = rl.confusion_matrix(true_links, ecm_links,len(data))
TM = cm[0][0]
FN = cm[0][1]
FM = cm[1][0]
TN = cm[1][1]
PCl = BMl/(TM + FN)
PQl = BMl/candidatesl_count
RRl = 1 - candidatesl_count/t_index
# f score
fscorel = rl.fscore(true_links,ecm_links)
# Precision
precisionl = rl.precision(true_links,ecm_links)
# Recall/Sensitivity
recalll = rl.recall(true_links,ecm_links)
#print('F score =',(fscorel), ', Precision =',precisionl,', Sensitivity =', recalll,', Pair Completeness =', PCl)

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


#### classification results when blocking on firstname

In [110]:
ecm = rl.ECMClassifier()
data = comparison_outputf
data = data > 0.75 
col_list = ['firstname','lasttname','ssn','dob','zipCode']
for x in col_list:
    data[x] = data[x].map({True:1,False:0})

    
ecm_links = ecm.fit_predict(data)

comparison_outputf['true_links'] = [True if comparison_outputf.index[x] in list(true_links) else False for x in range(len(comparison_outputf))]
BMf = comparison_outputf['true_links'].sum()
cm = rl.confusion_matrix(true_links, ecm_links,len(data))
TM = cm[0][0]
FN = cm[0][1]
FM = cm[1][0]
TN = cm[1][1]
PCf = BMf/(TM + FN)
PQf = BMf/candidatesf_count
RRf = 1 - candidatesf_count/t_index
# f score
fscoref = rl.fscore(true_links,ecm_links)
# Precision
precisionf = rl.precision(true_links,ecm_links)
# Recall/Sensitivity
recallf = rl.recall(true_links,ecm_links)
#print('F score =',(fscoref), ', Precision =',precisionf,', Sensitivity =', recallf,', Pair Completeness =', PCf)

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


#### classification results when blocking on firstname or lastname

In [111]:
ecm = rl.ECMClassifier()
data = comparison_outputfl
data = data > 0.75 
col_list = ['firstname','lasttname','ssn','dob','zipCode']
for x in col_list:
    data[x] = data[x].map({True:1,False:0})

    
ecm_links = ecm.fit_predict(data)

comparison_outputfl['true_links'] = [True if comparison_outputfl.index[x] in list(true_links) else False for x in range(len(comparison_outputfl))]
BMfl = comparison_outputfl['true_links'].sum()
BNfl = ~comparison_outputfl['true_links'].sum()
cm = rl.confusion_matrix(true_links, ecm_links,len(data))
TM = cm[0][0]
FN = cm[0][1]
FM = cm[1][0]
TN = cm[1][1]
PCfl = BMfl/(TM + FN)
PQfl = BMfl/candidatesfl_count
RRfl = 1 - candidatesfl_count/t_index
# f score
fscorefl = rl.fscore(true_links,ecm_links)
# Precision
precisionfl = rl.precision(true_links,ecm_links)
# Recall/Sensitivity
recallfl = rl.recall(true_links,ecm_links)
#print('F score =',(fscorefl), ', Precision =',precisionfl,', Sensitivity =', recallfl,', Pair Completeness =', PCfl)

  return len(links_true & links_pred)
  return int(total) - len(links_true | links_pred)


## Evaluation



##### Linkage Quality (classification) Measures
F score:  F-score is the harmonic mean of Precision and Recall, calculated as F-measure = 2 × (Precision × Recall)/(Precision + Recall)</br></br>
Precision : Precision is the fraction of record pairs classified as matches by a decision model that are true matches: Precision = TM/(TM + FM)</br></br>
Sensitivity/Recall: Recall is the fraction of true matches that are correctly classified as
matches by a decision model: Recall = TM/(TM + FN)</br></br>

##### Blocking technique Measures
Pairs completeness (PC): measures the effectiveness of a blocking technique in the record linkage process </br>
PC = BM/(TM + FN)  </br></br>
Pairs Quality (PQ): measures the efficiency of a blocking technique </br>
PQ = BM/(BM + BN) </br></br>

##### Scalability Measures
Reduction ratio: how much a blocking technique is able to reduce the number of candidate record pairs that are beign generated</br>
RR = 1.0 - (BM + BN) / (TM + FN + FM + TN)</br></br>

BM: true matches included in the candidate record pairs generated by blocking </br>
BN: true non-matches included in the candidate record pairs </br>
TM: True matches based on the classification model</br>
TN: True non-matches based on the classification model</br>
FM: False matches based on the classification model</br>
FN: False non-matches based on the classification model

In [112]:
Evaluation_results = {'Blocking technique':['firstname','lastname','firstname or lastname'], \
                      'Time to complete':[ttcf,ttcl,ttcfl], \
                     'Number of candidate pairs':[candidatesf_count,candidatesl_count,candidatesfl_count], \
                     'Pairs Completeness (PC)':[PCf,PCl,PCfl], \
                     'Pairs Quality (PQ)':[PQf,PQl,PQfl], \
                     'F score':[fscoref,fscorel,fscorefl], \
                     'Precision':[precisionf,precisionl,precisionfl], \
                     'Sensitivity (Recall)':[recallf,recallf,recallfl],\
                     'Reduction ratio (RR)':[RRf,RRl,RRfl]}

pd.DataFrame(Evaluation_results)

Unnamed: 0,Blocking technique,Time to complete,Number of candidate pairs,Pairs Completeness (PC),Pairs Quality (PQ),F score,Precision,Sensitivity (Recall),Reduction ratio (RR)
0,firstname,0 days 00:06:20.878405,186263,0.7558,0.020289,0.845339,0.995393,0.7346,0.992549
1,lastname,0 days 00:06:50.372987,120412,0.7786,0.032331,0.854897,0.992857,0.7346,0.995184
2,firstname or lastname,0 days 00:12:25.532275,302659,0.9022,0.014905,0.917916,0.996254,0.851,0.987894


## Record Linkage NaiveBayes Classifier

In [None]:
from __future__ import print_function
import numpy as np
import recordlinkage as rl
from recordlinkage.datasets import binary_vectors

In [None]:
# Initialise the NaiveBayesClassifier.
nbc = rl.NaiveBayesClassifier()
nbc.fit(data, true_links)
nbc_pred = nbc.predict(data)
nbc_pred

In [None]:
cm = rl.confusion_matrix(true_links, links_pred, total=len(data))
# f score
fscore = rl.fscore(true_links,links_pred)
# Precision
precision = rl.precision(true_links,links_pred)
# Recall/Sensitivity
recall = rl.recall(true_links,links_pred)
print("Confusion matrix:\n", cm)

In [None]:
print('When blocking on firstname only: F score =',(fscore), ', Precision =',precision,', Sensitivity =', recall)

In [None]:
hardening technique - https://github.com/gen-too/primat
