In [None]:
# default_exp core

In [None]:
#hide
from nbdev.showdoc import *

# ProofZero Core SDK

A feature parser-hasher client to the Proof Zero matching engine.

## Benefits

Generating matches across datasets is difficult. The same entities are described by different data (for example, one person using different phone numbers or addresses). Data are often in different formats -- sometimes just a postal code is stored, sometimes a geo-region, sometimes a street address.

However, all of these differences in physical data representation refer to the same logical entities.

The Proof Zero SDK generates pseudokeys that match entities across data sets containing different physical data representations. These pseudokeys enable a match between incomplete, incorrect, and unclean data sets. The percent strength of these pseudokey-based matches is also indicated.

The SDK also acts as a general-purpose feature hashing engine.

## Workflow

In order to join two datasets using pseudokeys we first `load` our data, and then:

1. `tokenize` it -- decompose our data into standard formats,
2. `index` it -- recompose our tokens into cryptographic indexes (privacy-aware feature hashes),
3. `match` it -- compare indexed data sets to generate matches and measure match quality.

This is the "TIM" workflow: `tokenize`, `index`, `match`.

## Security and data privacy

All SDK features run locally, so data remain within the control of the user. `match` optionally allows use of Proof Zero's compute cluster and matching models to generate matches more quickly and with higher accuracy.

By using `IndexFrame` objects, data remain in the control of the local system, even when using Proof Zero's compute cluster.

## Exported types

We provide several lightweight types that track progress through the TIM workflow:

* `TokenFrame` is a `pandas` DataFrame that has been tokenized.
* `IndexFrame` is a `TokenFrame` that has been hashed/indexed.
* `MatchFrame` is a match between two `IndexFrame`s.

These types help systems and analysts keep track of where a given dataset is in the tokenize, index, match (TIM) workflow.

## Loading data

We load our data into standard `pandas` DataFrames.

In [None]:
#export
import pandas as pd
def load(filename: str) -> pd.DataFrame:
    """
    Read the CSV file at `filename` into a `pandas` DataFrame and return it.

    This is a thin convenience wrapper around `pandas.read_csv` using its default options. Full version supports XLSX, etc.
    """
    frame = pd.read_csv(filename)
    return frame

In [None]:
# Load the first sample data set and check that its first row equals the expected record
# (doubles as a test when the notebook is executed), then display that row.
data_1 = load('./data_1.csv')
assert (data_1[:1].equals(pd.DataFrame({'acct_num': 'PXCG66212484637575', 'name': 'Kristin Sanchez MD', 'address': '31417 Gina Lodge, Bradleytown, MB P7G 4N5', 'phone': '1 (598) 742-6794', 'SIN': '203 268 552', 'DOB': '1943/04/15'}, index=[0])))
data_1[:1]

Unnamed: 0,acct_num,name,address,phone,SIN,DOB
0,PXCG66212484637575,Kristin Sanchez MD,"31417 Gina Lodge, Bradleytown, MB P7G 4N5",1 (598) 742-6794,203 268 552,1943/04/15


## Tokenizing data

We apply parsers to the DataFrames we want to process in order to decompose the physical data into standard types. This lets us re-compose a hashed index later that encodes knowledge of the underlying data structure.

In [None]:
#export
import itertools
import functools
from typing import Callable, NewType

# The `TokenFrame` type is a wrapper around a `pandas` DataFrame that represents a DataFrame that has been tokenized using the `tokenize` function.
TokenFrame = NewType('TokenFrame', pd.DataFrame)

def tokenize(
    df: pd.DataFrame, schema: dict, suffix_delim: str = ""
) -> TokenFrame:
    """
    Takes a `pandas` DataFrame and a schema. The schema is a `dict` that maps columns in the DataFrame to a list of parsers that are executed in order: the first parser receives the raw cell value and each later parser receives the previous parser's result.

    The final result of a parser chain must be an iterable of tokens. A token that is a `tuple` is read as `(value, name)` and produces a column called `<column><suffix_delim><name>`; any other token produces a column called `<column><suffix_delim><position>`, where `position` is the token's index in the parser output.

    Columns of `df` that have no entry in `schema` are left untouched and produce no token columns.

    Denormalizes the passed DataFrame by applying the parsers in the schema.

    Returns the denormalized DataFrame: the original columns followed by the generated token columns.
    """
    def parse_row(row, parsers_by_column, delim):
        names = []
        values = []
        for column in row.index:
            try:
                parsers = parsers_by_column[column]
            except KeyError:
                # No parsers registered for this column: pass it through untokenized
                # instead of failing the whole frame.
                continue
            parsed = functools.reduce(
                lambda data, fxn: fxn(data), parsers, row[column]
            )
            for position, token in enumerate(parsed):
                # Allow parsers to return a named component and use that name to index
                # our new columns, else use the index number as a string.
                if isinstance(token, tuple):
                    value, suffix = token[0], token[1]
                else:
                    value, suffix = token, position
                # str() on the column label keeps non-string labels from breaking the join.
                names.append(delim.join([str(column), str(suffix)]))
                values.append(value)
        return pd.Series(data=values, index=names)

    return pd.concat(
        [
            df,
            df.apply(
                parse_row,
                axis=1,
                args=(schema, suffix_delim),
                result_type="expand",
            ),
        ],
        axis=1,
    )

In [None]:
from proofzero_sdk.util import parseString, parseName, parseAddress, parsePhone, parseSIN, parseDate
# Tokenize only the first row of data_1. Each source column is mapped to a one-element
# parser list, and generated column names are joined to their suffix with '_'.
tokens_1 = tokenize(df=data_1[:1], schema={
    'acct_num': [parseString],
    'name': [parseName],
    'address': [parseAddress],
    'phone': [parsePhone],
    'SIN': [parseSIN],
    'DOB': [parseDate]
}, suffix_delim='_')
tokens_1[:1]

Unnamed: 0,acct_num,name,address,phone,SIN,DOB,acct_num_0,name_title,name_first,name_middle,...,phone_10,phone_11,SIN_0,SIN_1,SIN_2,SIN_3,DOB_0,DOB_1,DOB_2,DOB_3
0,PXCG66212484637575,Kristin Sanchez MD,"31417 Gina Lodge, Bradleytown, MB P7G 4N5",1 (598) 742-6794,203 268 552,1943/04/15,PXCG66212484637575,,Kristin,,...,10,,203-268-552,203,268,552,1943-04-15,1943,4,15


## Indexing data

In [None]:
#export

from proofzero_sdk.util import sha2

# The `IndexFrame` type is a wrapper around a `TokenFrame` (ie, a `pandas` DataFrame) that represents a tokenized DataFrame that has been indexed using the `index` function.
IndexFrame = NewType('IndexFrame', TokenFrame)

def index(
    df: TokenFrame, schema: dict, hasher: Callable = sha2, suffix_delim: str = ""
) -> IndexFrame:
    """
    Applies a hash function to every cell of an already-tokenized DataFrame.

    `df` is expected to be the output of `tokenize`; this function does not tokenize again. `schema` and `suffix_delim` are accepted so that existing callers keep working, but they are currently unused.

    `hasher` is called once per cell. The default hash function is `sha2`, from the Proof Zero utility SDK.

    Returns a new DataFrame with the same rows and columns, holding the hashed values.
    """
    # `DataFrame.applymap` was renamed to `DataFrame.map` in pandas 2.1 and the old name is
    # deprecated. Probe the class rather than the instance so that a column that happens to be
    # called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        return df.map(hasher)
    return df.applymap(hasher)

In [None]:
# Hash every cell of the tokenized first data set. Note that `index` currently ignores the
# `schema` and `suffix_delim` arguments; it only applies the hasher to `df`.
index_1 = index(df=tokens_1, schema={
    'acct_num': [parseString],
    'name': [parseName],
    'address': [parseAddress],
    'phone': [parsePhone],
    'SIN': [parseSIN],
    'DOB': [parseDate]
}, suffix_delim='_')
index_1[:1]

Unnamed: 0,acct_num,name,address,phone,SIN,DOB,acct_num_0,name_title,name_first,name_middle,...,phone_10,phone_11,SIN_0,SIN_1,SIN_2,SIN_3,DOB_0,DOB_1,DOB_2,DOB_3
0,9802534054846731338401687891444750119150976961...,4418369248196592421047499630841732388346555327...,9594784023836219556184433361991357495783756657...,7344459144365265142091263159766424283511268480...,1010505653944236458406177846488691380962181960...,5141391390246777661242717464678167327763772345...,9802534054846731338401687891444750119150976961...,3877226117079751550214273725156091025388555585...,9229611837054111477081027882940374069625694253...,3877226117079751550214273725156091025388555585...,...,9639508933775287426333904353415401342534309318...,6586455234700639270600574409018427069812517107...,1148673251264303964479456609021610855802601825...,2948056107977080813370872483486457740221751010...,5545403145286023377289154093249644763249517306...,1018998564061470247772449997659560822438884579...,7644637171458321163635300620903733517024394144...,1020423811237369131177406386728771151256212182...,6275815013823685249316828430045322558920567663...,9930297093447128312648856684579760333304390435...


## Matching data

In [None]:
#export

# The `MatchFrame` type is a wrapper around a `IndexFrame` (ie, a `TokenFrame` and ultimately a `pandas` DataFrame) that represents an indexed DataFrame that has been matched using the `match` function.
MatchFrame = NewType('MatchFrame', IndexFrame)

def match(
    index_0: IndexFrame, index_1: IndexFrame, sensitivity = 0.5, api_key: str = None
) -> MatchFrame:
    """
    Discover rows of `index_1` that match the single row in `index_0`.

    `index_0` should contain exactly one row. If a whole-frame match is required pass this function to `pandas.DataFrame.apply`.

    Columns of `index_1` whose value is the same in every row are dropped before scoring. Each remaining row is scored with the [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index) between the set of its cell values and the set of cell values in the `index_0` row; values are compared regardless of which column they appear in.

    `sensitivity` is a fraction between 0 and 1. Only rows whose score is strictly greater than `sensitivity` are returned.

    `api_key` is accepted for matching on Proof Zero's cluster but is not used by this local implementation.

    Returns the matching rows of `index_1` (without the dropped constant columns) plus a `_match` column holding the score, sorted from best to worst match.

    Raises `RuntimeError` if `index_0` does not contain exactly one row, or if the two frames share no column names.

    This function is compute-intense. Contact us for cloud scaling help.
    """
    if (len(index_0) != 1):
        raise RuntimeError('Pass exactly one index as index_0.')

    # The column-level Jaccard Index (shared columns / all columns) could be used to normalize
    # the row scores below; for now only the overlap itself is validated.
    if not (set(index_0.columns) & set(index_1.columns)):
        raise RuntimeError('No schema overlap -- some columns must match (parsing functions in schema must emit tags that match across both frames).')

    # The reference row never changes, so build its value set once instead of twice per candidate row.
    reference_values = set(index_0.iloc[0])

    def row_jaccard(row):
        row_values = set(row)
        return len(row_values & reference_values) / len(row_values | reference_values)

    candidates = pd.DataFrame(index_1)
    # NOTE(review): with a single-row `index_1` every column is constant, so all of them are
    # dropped here and nothing can match -- confirm this is intended.
    varying_columns = candidates.columns[candidates.apply(lambda c: len(c.unique()) > 1)]
    # Copy the slice so adding `_match` neither warns about chained assignment nor touches `index_1`.
    candidates = candidates[varying_columns].copy()
    candidates['_match'] = candidates.apply(row_jaccard, axis=1)
    candidates = candidates.sort_values(by='_match', ascending=False)
    return candidates[candidates['_match'] > sensitivity]

In [None]:
# Load the second sample data set and display the row at position 2.
data_2 = load('./data_2.csv')
data_2.iloc[[2]]

Unnamed: 0,acct_num,name,address,phone,SIN,DOB
2,CUVC30129179791293,Kristin Sanchez MD,"31417 Gina Lodge, Bradleytown, MB P7G 4N5",742 598 6794,203 268 552,1943/04/15


In [None]:
# Tokenize every row of data_2 with one parser per column, then replace the index with a
# fresh 0..n-1 range (the old index is dropped) and display the row at position 2.
tokens_2 = tokenize(df=data_2, schema={
    'acct_num': [parseString],
    'name': [parseName],
    'address': [parseAddress],
    'phone': [parsePhone],
    'SIN': [parseSIN],
    'DOB': [parseDate]
}, suffix_delim='_').reset_index(drop=True)
tokens_2.iloc[[2]]

Unnamed: 0,acct_num,name,address,phone,SIN,DOB,DOB_0,DOB_1,DOB_2,DOB_3,...,phone_10,phone_11,phone_2,phone_3,phone_4,phone_5,phone_6,phone_7,phone_8,phone_9
2,CUVC30129179791293,Kristin Sanchez MD,"31417 Gina Lodge, Bradleytown, MB P7G 4N5",742 598 6794,203 268 552,1943/04/15,1943-04-15,1943,4,15,...,20,,742,598,6794,7425986794,,,,742 598 6794


In [None]:
# Hash every cell of the tokenized second data set and display the row at position 2.
# Note that `index` currently ignores the `schema` and `suffix_delim` arguments.
index_2 = index(df=tokens_2, schema={
    'acct_num': [parseString],
    'name': [parseName],
    'address': [parseAddress],
    'phone': [parsePhone],
    'SIN': [parseSIN],
    'DOB': [parseDate]
}, suffix_delim='_')
index_2.iloc[[2]]

Unnamed: 0,acct_num,name,address,phone,SIN,DOB,DOB_0,DOB_1,DOB_2,DOB_3,...,phone_10,phone_11,phone_2,phone_3,phone_4,phone_5,phone_6,phone_7,phone_8,phone_9
2,3118710362407207060820519817066124611602675044...,4418369248196592421047499630841732388346555327...,9594784023836219556184433361991357495783756657...,5103122257192237244690193744677031637888782293...,1010505653944236458406177846488691380962181960...,5141391390246777661242717464678167327763772345...,7644637171458321163635300620903733517024394144...,1020423811237369131177406386728771151256212182...,6275815013823685249316828430045322558920567663...,9930297093447128312648856684579760333304390435...,...,1950749364813865911797004310287114589195880318...,6586455234700639270600574409018427069812517107...,1535860380141259287318590160938114468677934395...,6055181033144981996992600968534808922389255124...,5598189335674161013176229442725892028281044088...,2209634314611670011518640833334290179954075682...,6586455234700639270600574409018427069812517107...,6586455234700639270600574409018427069812517107...,6586455234700639270600574409018427069812517107...,5103122257192237244690193744677031637888782293...


In [None]:
# Score every row of index_2 against the single-row index_1 using the default
# sensitivity (0.5) and show the rows that exceed it.
match_frame = match(index_1, index_2)
match_frame

Unnamed: 0,acct_num,name,address,phone,SIN,DOB,DOB_0,DOB_1,DOB_2,DOB_3,...,phone_0,phone_10,phone_2,phone_3,phone_4,phone_5,phone_6,phone_7,phone_9,_match
2,3118710362407207060820519817066124611602675044...,4418369248196592421047499630841732388346555327...,9594784023836219556184433361991357495783756657...,5103122257192237244690193744677031637888782293...,1010505653944236458406177846488691380962181960...,5141391390246777661242717464678167327763772345...,7644637171458321163635300620903733517024394144...,1020423811237369131177406386728771151256212182...,6275815013823685249316828430045322558920567663...,9930297093447128312648856684579760333304390435...,...,7341275782727882540070624817464065162426234694...,1950749364813865911797004310287114589195880318...,1535860380141259287318590160938114468677934395...,6055181033144981996992600968534808922389255124...,5598189335674161013176229442725892028281044088...,2209634314611670011518640833334290179954075682...,6586455234700639270600574409018427069812517107...,6586455234700639270600574409018427069812517107...,5103122257192237244690193744677031637888782293...,0.692308
