# CMPE 297 HW 7 - Active Learning end2end
## Abhishek Bais

#1.0 Setup

In [1]:
# Runtime restart might be required
!pip install numpy --upgrade



# 2.0 Import libraries

In [2]:
!pip install altair dedupe dedupe-variable-name jellyfish recordlinkage



In [3]:
!curl -O https://raw.githubusercontent.com/rachhouse/intro-to-data-linking/main/tutorial_notebooks/linking_tutorial_functions.py
%load linking_tutorial_functions.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 18508  100 18508    0     0   286k      0 --:--:-- --:--:-- --:--:--  291k


In [4]:
import datetime
import itertools
import os
import pathlib
import re
from typing import Any, Dict, Optional

import dedupe
import pandas as pd

import linking_tutorial_functions as tutorial

INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt


#3.0 Define Working env,filepaths

For convenience, we'll define a `pathlib.Path` to reference our current working directory.

In [5]:
WORKING_DIR = pathlib.Path(os.path.abspath(''))
WORKING_DIR

PosixPath('/content')

#4.0 Load Training Dataset and Ground Truth Labels

In [7]:
df_A, df_B, df_ground_truth = tutorial.load_febrl_training_data(True)

Let's take a quick look at our training dataset to refresh on the columns, formats, and data.

In [13]:
df_A.head()

Unnamed: 0_level_0,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
person_id_A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
fbc4143d-15f9-4f27-b5f0-dedbadce6616,matilda,struck,8,ballard place,,west perth,2470,qld,10/02/61,32.0,03 05903135,8276847
48a56cad-7ba6-45e1-97cd-517ba65bdab5,lachlan,eglinton,36,kambalda crescent,villa 427,auburn,5109,,01/08/26,27.0,,9937958
b1792d21-e4be-4b86-8dea-454ffa5194c5,mikayla,asher,588,britten-jones drive,,miami,4218,nsw,11/02/25,32.0,03 33770501,7017310
96653d73-bebc-4459-94f3-c3f0a8c514d4,grace,bristow,7,,wandella park snowy,cardiff,6163,nsw,01/20/40,,07 37864073,3535974
41f038b8-77c0-45a5-9e1f-e62b8637ffd1,wilson,bishop,11,chisholm street,,bronte,2490,nsw,03/05/21,27.0,04 15209769,5573522


# 5.0 Perform Data Augmentation

We'll do minimal data augmentation before feeding our training data to `dedupe`; we just want to format the date of birth data as `mm/dd/yy`, and ensure all columns are in string format and stripped of trailing/leading whitespace. Additionally, `dedupe` requires input data to be in dictionaries, using the record id as the key and the record metadata as the value. So, we'll convert our dataframes to this format.

In [14]:
def format_dob(dob: str) -> Optional[str]:
    """ Transform date of birth format from YYYYMMDD to mm/dd/yy.
        If DOB cannot be transformed, return None.
    """
    try:
        if re.match(r"\d{8}", dob):
            return (datetime.datetime.strptime(dob, "%Y%m%d")).strftime("%m/%d/%y")
    except:
        pass

    return None

def strip_and_null(x: Any) -> Optional[str]:
    """ Stringify incoming variable, remove trailing/leading whitespace
        and return resulting string. Return None if resulting string is empty.
    """
    x = str(x).strip()
    
    if x == "":
        return None
    else:
        return x
    
def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict]:
    """ Convert pandas DataFrame to dict keyed by record id.
        Convert all fields to strings or Nones to satisfy dedupe.
        Transform date format of date_of_birth field.
    """    

    for col in df.columns:
        df[col] = df[col].apply(lambda x: strip_and_null(x))

    df["date_of_birth"] = df["date_of_birth"].apply(lambda x: format_dob(x))    

    return df.to_dict("index")

In [15]:
records_A = convert_df_to_dict(df_A)
records_B = convert_df_to_dict(df_B)

We can examine a small sample of the resulting transformed records:

In [16]:
[records_A[k] for k in list(records_A.keys())[0:2]]

[{'address_1': 'ballard place',
  'address_2': 'None',
  'age': '32',
  'date_of_birth': None,
  'first_name': 'matilda',
  'phone_number': '03 05903135',
  'postcode': '2470',
  'soc_sec_id': '8276847',
  'state': 'qld',
  'street_number': '8',
  'suburb': 'west perth',
  'surname': 'struck'},
 {'address_1': 'kambalda crescent',
  'address_2': 'villa 427',
  'age': '27',
  'date_of_birth': None,
  'first_name': 'lachlan',
  'phone_number': 'None',
  'postcode': '5109',
  'soc_sec_id': '9937958',
  'state': 'None',
  'street_number': '36',
  'suburb': 'auburn',
  'surname': 'eglinton'}]

# 6.0 Prepare Training

When we linked our data via SimSum and supervised learning, we defined our blockers and comparators manually with `recordlinkage`. The `dedupe` library takes an active learning approach to blocking and classification and will use our feedback gathered during the labeling session to learn blocking rules and train a classifier. 

To prepare our `dedupe.RecordLink` object for training, first we'll define the fields that we think `dedupe` should pay attention to when matching records - these definitions will serve as the comparators. The `field` contains the name of the attribute to use for comparison, and the `type` defines the comparison type.

In [17]:
%%time

fields = [
    { "field" : "first_name", "type" : "Name" },
    { "field" : "surname", "type" : "Name" },
    { "field" : "address_1", "type" : "ShortString" },
    { "field" : "address_2", "type" : "ShortString" },
    { "field" : "suburb", "type" : "ShortString" },
    { "field" : "postcode", "type" : "Exact" },
    { "field" : "state", "type" : "Exact" },
    { "field" : "date_of_birth", "type" : "DateTime" },
    { "field" : "soc_sec_id", "type" : "Exact" },
]

linker = dedupe.RecordLink(fields)
linker.prepare_training(records_A, records_B)

INFO:dedupe.canopy_index:Removing stop word re
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)


CPU times: user 47.2 s, sys: 437 ms, total: 47.6 s
Wall time: 47.5 s


#7.0 Do active learning

At this point, we're ready to provide feedback to `dedupe` via an active learning labeling session. For this, `dedupe` supplies a convenience method to iterate through pairs it is uncertain about. As you provide feedback for each pair, dedupe learns blocking rules and recalculates its linking model weights.

You can use `y` (yes, match), `n` (no, not match), and `u` (unsure) to provide feedback on candidate links. When you're ready to exit the labeling session, use `f`.

In [18]:
dedupe.console_label(linker)

first_name : liam
surname : kapeller
address_1 : None
address_2 : None
suburb : goshen
postcode : 2831
state : nsw
date_of_birth : None
soc_sec_id : 8403096

first_name : ali
surname : kapelldr
address_1 : None
address_2 : None
suburb : goshen
postcode : None
state : nsw
date_of_birth : None
soc_sec_id : 8403569

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


no


(y)es / (n)o / (u)nsure / (f)inished


no


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


no


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


no


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


yes


(y)es / (n)o / (u)nsure / (f)inished


finished


(y)es / (n)o / (u)nsure / (f)inished


u


first_name : rourke
surname : collo
address_1 : burston place
address_2 : None
suburb : belmont
postcode : 6170
state : qld
date_of_birth : None
soc_sec_id : 7418040

first_name : rourje
surname : collo
address_1 : burston place
address_2 : None
suburb : belmont
postcode : 6170
state : qld
date_of_birth : None
soc_sec_id : 7418004

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : lara
surname : paterson
address_1 : ferber place
address_2 : the willows
suburb : woombye
postcode : 4455
state : vic
date_of_birth : None
soc_sec_id : 8366330

first_name : lara
surname : patezln
address_1 : ferber place
address_2 : the willows
suburb : woombye
postcode : 4455
state : vic
date_of_birth : None
soc_sec_id : 3124171

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, address_1)
first_name : sarsha
surname : mortlock
address_1 : None
address_2 : None
suburb : mckail
postcode : 2456
state : vic
date_of_birth : None
soc_sec_id : 3972303

first_name : sarsha
surname : mortlock
address_1 : None
address_2 : None
suburb : mckail
postcode : 2456
state : vic
date_of_birth : None
soc_sec_id : 3973203

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : thomas
surname : white
address_1 : None
address_2 : None
suburb : scarborough
postcode : 4817
state : qld
date_of_birth : None
soc_sec_id : 3479432

first_name : thomas
surname : wight
address_1 : None
address_2 : None
suburb : scarborough
postcode : 4817
state : qld
date_of_birth : None
soc_sec_id : 3479542

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:PartialPredicate: (commonFourGram, first_name, Surname)
first_name : bailey
surname : godfrey
address_1 : woinarski place
address_2 : None
suburb : patterson lakes
postcode : 5082
state : vic
date_of_birth : None
soc_sec_id : 2302451

first_name : godfrey
surname : bailexy
address_1 : woinarski place
address_2 : None
suburb : patterson lakes
postcode : 5082
state : vic
date_of_birth : None
soc_sec_id : 2302451

3/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : emiily
surname : lowe
address_1 : pollock street
address_2 : beechworth
suburb : madora
postcode : 5161
state : vic
date_of_birth : None
soc_sec_id : 6055911

first_name : philipp
surname : lowe
address_1 : pollock street
address_2 : beechworth
suburb : madora
postcode : 5161
state : vic
date_of_birth : None
soc_sec_id : 6055911

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : wil
surname : cure
address_1 : northbourne avenue
address_2 : None
suburb : witchcliffe
postcode : 3355
state : qld
date_of_birth : None
soc_sec_id : 5949749

first_name : wil
surname : cu fje
address_1 : northbourne avenue
address_2 : None
suburb : witchcliffe
postcode : 3355
state : qld
date_of_birth : None
soc_sec_id : 5949749

4/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:PartialPredicate: (commonSixGram, first_name, Surname)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, address_1)
first_name : yana
surname : matthews
address_1 : maribyrnong avenue
address_2 : longstay caravn park
suburb : cooroy
postcode : 6109
state : qld
date_of_birth : None
soc_sec_id : 3450547

first_name : yana
surname : matthepws
address_1 : maribyrnonvgavenue
address_2 : longstay caravn park
suburb : cooroy
postcode : 6109
state : qld
date_of_birth : None
soc_sec_id : 3450547

5/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : rhys
surname : ryan
address_1 : mckail crescent
address_2 : glenview
suburb : lennox head
postcode : 2165
state : vic
date_of_birth : None
soc_sec_id : 4198475

first_name : rhys
surname : ryabe
address_1 : mckail ckrscent
address_2 : glenview
suburb : lennox head
postcode : 2165
state : vic
date_of_birth : None
soc_sec_id : 4198475

6/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:PartialPredicate: (commonSixGram, first_name, Surname)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, address_1)
INFO:dedupe.training:SimplePredicate: (commonThreeTokens, address_2)
first_name : blade
surname : lowry
address_1 : palmer street
address_2 : None
suburb : plumpton
postcode : 2537
state : vic
date_of_birth : None
soc_sec_id : 3702857

first_name : blade
surname : loqau
address_1 : palmer srreet
address_2 : kurrajong
suburb : plumpton
postcode : 2537
state : vima
date_of_birth : None
soc_sec_id : 3702857

7/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:PartialPredicate: (commonFourGram, first_name, Surname)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : claudia
surname : moscatt
address_1 : emu bank street
address_2 : brookfield
suburb : cundletown
postcode : 2500
state : None
date_of_birth : None
soc_sec_id : 2577386

first_name : claudis
surname : moscatt
address_1 : emu bank street
address_2 : brookfield
suburb : cundletown
postcode : 2500
state : None
date_of_birth : None
soc_sec_id : 2577368

8/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : ella
surname : fenwick
address_1 : antill street
address_2 : None
suburb : wilson
postcode : 3658
state : nsw
date_of_birth : None
soc_sec_id : 4327554

first_name : fenwick
surname : ella
address_1 : antill street
address_2 : None
suburb : wilson
postcode : 3658
state : nsll
date_of_birth : None
soc_sec_id : 4326554

9/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:PartialPredicate: (commonFourGram, first_name, Surname)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
INFO:dedupe.training:PartialIndexTfidfNGramSearchPredicate: (0.6, surname, CorporationName)
first_name : ryan
surname : d'sylau
address_1 : starke street
address_2 : None
suburb : chain valley bay
postcode : 4116
state : qld
date_of_birth : None
soc_sec_id : 1566653

first_name : ran
surname : d'sylau
address_1 : starke street
address_2 : None
suburb : chain valley bay
postcode : 4116
state : qml
date_of_birth : None
soc_sec_id : 1565554

10/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, suburb)


We can now train our linker, based on the labeling session feedback.

In [19]:
%%time
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000100, score 0.1388888888888889
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (twoGramFingerprint, suburb), SimplePredicate: (wholeFieldPredicate, postcode), SimplePredicate: (sortedAcronym, address_1))
INFO:dedupe.training:(SimplePredicate: (commonThreeTokens, address_2), SimplePredicate: (commonSixGram, surname))


CPU times: user 5.66 s, sys: 553 ms, total: 6.21 s
Wall time: 5.68 s


Let's persist our training data (captured during in the labeling session), as well as the learned model weights.

In [20]:
ACTIVE_LEARNING_DIR = WORKING_DIR / "dedupe_active_learning"
ACTIVE_LEARNING_DIR.mkdir(parents=True, exist_ok=True)

SETTINGS_FILE = ACTIVE_LEARNING_DIR / "dedupe_learned_settings"
TRAINING_FILE = ACTIVE_LEARNING_DIR / "dedupe_training.json"

with open(TRAINING_FILE, "w") as fh:
    linker.write_training(fh)
    
with open(SETTINGS_FILE, "wb") as sf:
    linker.write_settings(sf)

## Examine Learned Blockers

Now, let's take a look at the predicates (blockers) that `dedupe` learned during our active learning labeling session. Note that `dedupe` can learn composite predicates/blockers, i.e. individual predicates can be combined with logical operators.

In [21]:
linker.predicates

((SimplePredicate: (twoGramFingerprint, suburb),
  SimplePredicate: (wholeFieldPredicate, postcode),
  SimplePredicate: (sortedAcronym, address_1)),
 (SimplePredicate: (commonThreeTokens, address_2),
  SimplePredicate: (commonSixGram, surname)))

Next, let's examine the resulting candidate pairs and look at our blocking efficiency. The `.pairs` method will give us all candidate record pairs that are generated by blocking with the learned blockers.

In [22]:
candidate_pairs = [x for x in linker.pairs(records_A, records_B)]
print(f"{len(candidate_pairs):,} candidate pairs generated from blocking.")

1,244 candidate pairs generated from blocking.


You'll notice that, in contrast to `recordlinkage`, our post-blocking candidate pairs contain both the record ids as well as the record metadata.

In [23]:
candidate_pairs[0]

(('48a56cad-7ba6-45e1-97cd-517ba65bdab5',
  {'address_1': 'kambalda crescent',
   'address_2': 'villa 427',
   'age': '27',
   'date_of_birth': None,
   'first_name': 'lachlan',
   'phone_number': 'None',
   'postcode': '5109',
   'soc_sec_id': '9937958',
   'state': 'None',
   'street_number': '36',
   'suburb': 'auburn',
   'surname': 'eglinton'}),
 ('c77c2c04-4415-4c4d-b248-18dc28fd63d0',
  {'address_1': 'kambalda crescent',
   'address_2': 'None',
   'age': 'None',
   'date_of_birth': None,
   'first_name': 'lachlan',
   'phone_number': 'None',
   'postcode': '5109',
   'soc_sec_id': '9937958',
   'state': 'None',
   'street_number': '366',
   'suburb': 'auburn',
   'surname': 'eglinton'}))

We can assemble our candidate pair ids into an indexed pandas dataframe for easier comparision with our known true links.

In [24]:
df_candidate_links = pd.DataFrame(
    [(x[0][0], x[1][0]) for x in candidate_pairs]
).rename(columns={0 : "person_id_A", 1 : "person_id_B"}).set_index(["person_id_A", "person_id_B"])

df_candidate_links.head()

person_id_A,person_id_B
48a56cad-7ba6-45e1-97cd-517ba65bdab5,c77c2c04-4415-4c4d-b248-18dc28fd63d0
b4e3efc2-9c8f-4e3e-8b98-9bfa842094f9,e63f19ca-3f5b-4021-ac1e-05fc7495bd48
4091b2cd-f68c-447e-80ff-5ee4dde4f057,de64cc87-e3f4-4546-8e20-1294b19f9cac
75d0094f-ff34-4ebb-947f-387c040e463e,d2f7b732-f4fa-455a-a5c4-9dd4c5e3f6bf
77d7922c-c550-4420-b66d-bc6249d57b30,a1d7a5aa-9a71-4a07-977b-a405468d0cd9


Now, let's take a look at our learned blocker performance.

In [25]:
max_candidate_pairs = df_A.shape[0]*df_B.shape[0]

print(f"{max_candidate_pairs:,} total possible pairs.")

# Calculate search space reduction.
search_space_reduction = round(1 - len(candidate_pairs)/max_candidate_pairs, 6)
print(f"\n{len(candidate_pairs):,} pairs after full blocking: {search_space_reduction}% search space reduction.")

# Calculate retained true links percentage.
total_true_links = df_ground_truth.shape[0]
true_links_after_blocking = pd.merge(
    df_ground_truth,
    df_candidate_links,
    left_index=True,
    right_index=True,
    how="inner"
).shape[0]

retained_true_link_percent = round((true_links_after_blocking/total_true_links) * 100, 2)
print(f"{retained_true_link_percent}% true links retained after blocking.")

10,562,500 total possible pairs.

1,244 pairs after full blocking: 0.999882% search space reduction.
41.47% true links retained after blocking.


## Score Pairs and Examine Learned Classifier

After `dedupe` has trained blockers and a classification model based on our labeling session, we can link the records in our training dataset via the `.join` method.

In [26]:
%%time
linked_records = linker.join(records_A, records_B, threshold=0.0, constraint="one-to-one")

CPU times: user 225 ms, sys: 55.6 ms, total: 281 ms
Wall time: 694 ms


`linker.join` will return the links, along with a model confidence.

In [27]:
linked_records[0:3]

[(('ffd668ac-2f63-4c05-a6a3-58ebcf1f4a80',
   '3e8c4b67-3611-4a08-84c8-b082b627bb21'),
  1.0),
 (('fe729c15-1e07-4b57-9a23-6c74a395f3d2',
   '00f1473f-0b83-4eef-aa13-22fc3702b521'),
  1.0),
 (('fc8bd23d-afdf-4a67-81d8-31fafe560c76',
   '241eb6d8-c988-4838-96e3-1fff5e778bd4'),
  1.0)]

We'll format the `dedupe` linker predictions into a format that we can use with our existing evaluation functions.

In [28]:
df_predictions = pd.DataFrame(
    [ {"person_id_A" : x[0][0], "person_id_B" : x[0][1], "model_score" : x[1]} for x in linked_records]
)

df_predictions = df_predictions.set_index(["person_id_A", "person_id_B"])

df_predictions = pd.merge(
    df_predictions,
    df_ground_truth,
    left_index=True,
    right_index=True,
    how="left",
)

df_predictions["ground_truth"].fillna(False, inplace=True)
df_predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,model_score,ground_truth
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1
ffd668ac-2f63-4c05-a6a3-58ebcf1f4a80,3e8c4b67-3611-4a08-84c8-b082b627bb21,1.000000e+00,True
fe729c15-1e07-4b57-9a23-6c74a395f3d2,00f1473f-0b83-4eef-aa13-22fc3702b521,1.000000e+00,True
fc8bd23d-afdf-4a67-81d8-31fafe560c76,241eb6d8-c988-4838-96e3-1fff5e778bd4,1.000000e+00,True
fbca8c32-e03b-4e0c-8a58-72d3705433dc,c3fcea96-187d-4abb-a434-258b8cb7ea87,1.000000e+00,True
f7debc6e-07c2-4ea0-b368-6ca2c12d1129,12256412-3fc2-4fdf-9923-92bdecb23217,1.000000e+00,True
...,...,...,...
a7f99357-e10c-451c-9e60-5a98744c29b4,8fd53b5b-c362-461a-b39b-1e01eec4f190,5.652306e-05,True
f413d6e2-765f-4db1-86f6-133cf5812a7e,e1c87188-70ed-4018-b87e-ebb6e6cf9371,5.241123e-05,True
f9904199-572b-4d31-9c12-2deeff715732,a0b36d3e-3b69-45f8-970c-6b39f5bd7d5a,4.813147e-06,True
9e2ed7fd-982d-44a6-ab53-79fa265a38a7,c0da0a12-71fa-4176-9f58-c57d651cbcb7,3.466260e-06,True


## Choosing a Linking Model Score Threshold

The `dedupe` `.join` method that we used to score our training data directly incorporates the learned blockers. Thus, note that the scored pairs appearing on the distribution represent blocked pairs, and that our blockers *significantly* reduced the candidate pair search space.

### Model Score Distribution

In [29]:
df_predictions["ground_truth"].value_counts()

True    1244
Name: ground_truth, dtype: int64

In [30]:
tutorial.plot_model_score_distribution(df_predictions)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


### Precision and Recall vs. Model Score

In [31]:
df_eval = tutorial.evaluate_linking(
    df=df_predictions
)

In [32]:
df_eval.head()

Unnamed: 0,threshold,tp,fp,tn,fn,precision,recall,f1
0,0.0,1244,0,0,0,1.0,1.0,1.0
1,0.020408,1220,0,0,24,1.0,0.980707,0.99026
2,0.040816,1219,0,0,25,1.0,0.979904,0.98985
3,0.061224,1215,0,0,29,1.0,0.976688,0.988207
4,0.081633,1214,0,0,30,1.0,0.975884,0.987795


In [33]:
tutorial.plot_precision_recall_vs_threshold(df_eval)

## Iterating with Active Learning

When using active learning, we iterate on our linking solution, and incorporate progressively more labeled training data. Perhaps we're not satisfied with the current performance of the blockers or classifier, and we'd like to create more labeled examples for dedupe to train on.

Recall that earlier, we saved off our existing training data from the first labeling session. We can load this persisted data into a `dedupe` linker, and kick off another labeling session. Perhaps, after investigating the data during our first cycle, we don't think that dedupe should include `address_1` and `address2` in its comparators.

### Tweak the Linker and Use Existing Training Data

In [34]:
%%time

fields = [
    { "field" : "first_name", "type" : "Name" },
    { "field" : "surname", "type" : "Name" },
    { "field" : "suburb", "type" : "ShortString" },
    { "field" : "postcode", "type" : "Exact" },
    { "field" : "state", "type" : "Exact" },
    { "field" : "date_of_birth", "type" : "DateTime" },
    { "field" : "soc_sec_id", "type" : "Exact" },
]

linker2 = dedupe.RecordLink(fields)

with open(TRAINING_FILE, "r") as fh:
    linker2.prepare_training(records_A, records_B, training_file=fh)

INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, suburb)


CPU times: user 35.3 s, sys: 443 ms, total: 35.7 s
Wall time: 35.6 s


Now, we can kick off a second active learning/labeling session.

In [35]:
dedupe.console_label(linker2)

first_name : vincent
surname : ryan
suburb : drysdale
postcode : 2026
state : qld
date_of_birth : None
soc_sec_id : 9494629

first_name : vincent
surname : ryan
suburb : drysdqle
postcode : 2026
state : qld
date_of_birth : None
soc_sec_id : 9496429

10/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


first_name : joshua
surname : paterson
suburb : westmeadows
postcode : 3315
state : qld
date_of_birth : None
soc_sec_id : 3856870

first_name : oakleigh
surname : catt-green
suburb : westmead
postcode : None
state : qld
date_of_birth : None
soc_sec_id : 5322032

11/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:LevenshteinSearchPredicate: (1, suburb)
first_name : charlotte
surname : schwetlik
suburb : st kilda east
postcode : 2865
state : nsw
date_of_birth : None
soc_sec_id : 1738103

first_name : yt
surname : hodby
suburb : st kilda
postcode : 3083
state : nsw
date_of_birth : None
soc_sec_id : 5085170

12/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfNGramSearchPredicate: (0.6, suburb)
first_name : benjamin
surname : coleman
suburb : strathalbyn
postcode : 3361
state : vic
date_of_birth : None
soc_sec_id : 2343989

first_name : benjamin
surname : coleman
suburb : strathwaivyn
postcode : 3361
state : vic
date_of_birth : None
soc_sec_id : 2343989

13/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : blayke
surname : white
suburb : mount colah
postcode : 2099
state : tas
date_of_birth : None
soc_sec_id : 2064611

first_name : blayke
surname : white
suburb : mountrcllah
postcode : 2099
state : tas
date_of_birth : None
soc_sec_id : 2064161

13/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


first_name : ridley
surname : gibb
suburb : yarraville
postcode : 2263
state : qld
date_of_birth : None
soc_sec_id : 2815724

first_name : ridley
surname : gibb
suburb : yarrabklle
postcode : 2256
state : qld
date_of_birth : None
soc_sec_id : 2815727

13/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


first_name : nicholas
surname : myles
suburb : ivanhoe
postcode : 2145
state : qld
date_of_birth : None
soc_sec_id : 9650072

first_name : nichalpas
surname : myles
suburb : ivanuoebe
postcode : 2154
state : qld
date_of_birth : None
soc_sec_id : 9650072

13/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : marcus
surname : hodby
suburb : lalor park
postcode : 2340
state : qld
date_of_birth : None
soc_sec_id : 1585746

first_name : emma
surname : reid
suburb : lalor
postcode : 2454
state : None
date_of_birth : None
soc_sec_id : 6833893

14/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:TfidfNGramSearchPredicate: (0.6, suburb)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : luke
surname : dixon
suburb : carlingford
postcode : 4178
state : sa
date_of_birth : None
soc_sec_id : 5626376

first_name : maco
surname : ngueyn
suburb : carrun
postcode : 5073
state : nsw
date_of_birth : None
soc_sec_id : 5479979

15/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameFiveCharStartPredicate, suburb)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)


### Retrain the Linker and Examine Blocking Performance

Now, let's retrain, and examine blocker performance. Ideally, we see an improved true link retention following our second labeling session.

In [36]:
%%time
linker2.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
  * (true_distinct + false_distinct)))
INFO:rlr.crossvalidation:optimum alpha: 0.000100, score 0.11584924082047791
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (sameFiveCharStartPredicate, suburb)
INFO:dedupe.training:(PartialPredicate: (fingerprint, surname, Surname), SimplePredicate: (commonFourGram, first_name), SimplePredicate: (commonFourGram, suburb))


CPU times: user 5.42 s, sys: 446 ms, total: 5.87 s
Wall time: 5.36 s


In [37]:
candidate_pairs = [x for x in linker2.pairs(records_A, records_B)]
print(f"{len(candidate_pairs):,} candidate pairs generated from blocking.")

df_candidate_links = pd.DataFrame(
    [(x[0][0], x[1][0]) for x in candidate_pairs]
).rename(columns={0 : "person_id_A", 1 : "person_id_B"}).set_index(["person_id_A", "person_id_B"])

max_candidate_pairs = df_A.shape[0]*df_B.shape[0]

print(f"{max_candidate_pairs:,} total possible pairs.")

# Calculate search space reduction.
search_space_reduction = round(1 - len(candidate_pairs)/max_candidate_pairs, 6)
print(f"\n{len(candidate_pairs):,} pairs after full blocking: {search_space_reduction}% search space reduction.")

# Calculate retained true links percentage.
total_true_links = df_ground_truth.shape[0]
true_links_after_blocking = pd.merge(
    df_ground_truth,
    df_candidate_links,
    left_index=True,
    right_index=True,
    how="inner"
).shape[0]

retained_true_link_percent = round((true_links_after_blocking/total_true_links) * 100, 2)
print(f"{retained_true_link_percent}% true links retained after blocking.")

23,513 candidate pairs generated from blocking.
10,562,500 total possible pairs.

23,513 pairs after full blocking: 0.997774% search space reduction.
89.77% true links retained after blocking.


### Evaluate Classification Performance

In [38]:
%%time
linked_records = linker2.join(records_A, records_B, threshold=0.0, constraint="one-to-one")

CPU times: user 839 ms, sys: 187 ms, total: 1.03 s
Wall time: 5.45 s


In [39]:
df_predictions = pd.DataFrame(
    [ {"person_id_A" : x[0][0], "person_id_B" : x[0][1], "model_score" : x[1]} for x in linked_records]
)

df_predictions = df_predictions.set_index(["person_id_A", "person_id_B"])

df_predictions = pd.merge(
    df_predictions,
    df_ground_truth,
    left_index=True,
    right_index=True,
    how="left",
)

df_predictions["ground_truth"].fillna(False, inplace=True)
df_predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,model_score,ground_truth
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1
ffd668ac-2f63-4c05-a6a3-58ebcf1f4a80,e601f439-c5b9-4ee7-a2fb-d437e6d23b76,1.000000e+00,False
ffa8ddc9-ed8d-4bfc-bd1a-aa7e8b86122b,3b7136be-2898-4b83-a975-de87bc1ae845,1.000000e+00,False
ff86e492-166d-4652-bf5b-61b9eef60e51,55259d5a-42f0-4fec-833b-6fb8c43282eb,1.000000e+00,False
ff6a9b51-79ac-408b-b126-f1e03d5f020e,eca37d5d-8bc7-41a6-a42e-006454943f6c,1.000000e+00,False
fed0c57c-c844-47f9-a338-3777f231729b,ada6a8b4-e8da-4210-85ac-def96af78e3f,1.000000e+00,False
...,...,...,...
6eab79a4-7f70-44c5-a2cc-433b8eadd3ab,6c4aec9e-c3d2-4a63-9840-d706b512e1bb,7.834787e-14,False
a2dd3141-b478-45a7-b9da-0b3f01cf413c,64323d4b-14ca-4e82-be3d-88c39e93ddcc,7.320954e-14,False
46644402-64a8-4f55-a802-d66ea58b3c07,915ea543-4f28-45be-bbeb-318bc336bce9,2.065414e-14,False
457f7c63-550e-4a54-a7da-c5e9e3379700,43b24b8f-edfc-4efd-b7c4-6efcbaf00b79,1.394021e-14,True


In [40]:
df_predictions["ground_truth"].value_counts()

False    1458
True     1357
Name: ground_truth, dtype: int64

In [41]:
tutorial.plot_model_score_distribution(df_predictions)

In [42]:
df_eval = tutorial.evaluate_linking(
    df=df_predictions
)

tutorial.plot_precision_recall_vs_threshold(df_eval)