In [None]:
# default_exp core

In [None]:
#hide
from nbdev.showdoc import *

# ProofZero Core SDK

A feature parser-hasher client to the Proof Zero matching engine.

## Why

Generating matches across datasets is difficult. Different data are collected about the same entities (one person enters different phone numbers, or addresses). Data are in different formats -- sometimes just a postal code is stored, sometimes a geo-region, sometimes a street address.

However, all of these differences in physical data representation refer to the same logical entities.

The Proof Zero SDK generates pseudokeys that match entities across data sets containing different physical data representations. These pseudokeys establish a primary/foreign key relationship between incomplete, incorrect, and unclean data sets. A measure of match strength is also created.

## Workflow

In order to join two datasets using pseudokeys we first `load` our data, and then:

1. `tokenize` it -- decompose our data into standard formats,
2. `index` it -- recompose our tokens into cryptographic indexes,
3. `match` it -- compare indexed data sets to generate matches and measure match quality.

This is the TIM: `tokenize`, `index`, `match` workflow.

## Security and data privacy

All SDK features run locally, so data remain within the control of the user. The `match` step can optionally use Proof Zero's compute cluster and matching models to generate matches more quickly and with higher accuracy.

By using `IndexFrame` objects, data remain in the control of the local system, even when using Proof Zero's compute cluster.

## Exported types

We provide several lightweight types that track progress through the TIM workflow:

* `TokenFrame` is a `pandas` DataFrame that has been tokenized.
* `IndexFrame` is a `TokenFrame` that has been hashed/indexed.
* `MatchFrame` is a match between two `IndexFrame`s.

These types help systems and analysts keep track of where a given dataset is in the tokenize, index, match (TIM) workflow.

## Loading data

In [None]:
#export
import pandas as pd
def load(filename: str) -> pd.DataFrame:
    """
    Read a CSV file into a `pandas` DataFrame.

    This is a thin convenience wrapper: `filename` is handed directly to
    `pandas.read_csv` with default options and the resulting DataFrame is
    returned unchanged. Full version supports XLSX, etc.
    """
    frame = pd.read_csv(filename)
    return frame

## Tokenizing data

In [None]:
#export
from typing import Callable, NewType

# The `TokenFrame` type is a wrapper around a `pandas` DataFrame that represents a DataFrame that has been tokenized using the `tokenize` function.
TokenFrame = NewType('TokenFrame', pd.DataFrame)

def tokenize(
    df: pd.DataFrame, schema: dict, suffix_delim: str = ""
) -> TokenFrame:
    """
    Denormalize a `pandas` DataFrame by applying the parsers in a schema.

    `schema` is a `dict` that maps column labels of `df` to a list of parsers.
    For each row, the parsers registered for a column are applied in order to
    that column's cell (the output of one parser is the input of the next).
    The final parser must return an iterable of tokens; every token becomes a
    new column.

    New column labels are built as `<column><suffix_delim><suffix>`, where the
    suffix is the token's first element when the token is a `tuple` (a "named"
    component) and the token's position (as a string) otherwise. Tuple tokens
    are stored whole as the cell value.

    Columns of `df` that have no entry in `schema` are not parsed; they are
    only carried through in the returned frame.

    Returns the original columns of `df` followed by the generated token
    columns.
    """
    # Imported locally because `functools` is not imported at module level in
    # this file; without it the reduction below raises NameError.
    import functools

    def _tokenize_row(row, parsers_by_column, delim):
        labels = []
        values = []
        for column, cell in row.items():
            # Leave columns without parsers alone instead of raising KeyError.
            if column not in parsers_by_column:
                continue
            # Chain the parsers left to right, starting from the raw cell.
            tokens = functools.reduce(
                lambda data, parser: parser(data),
                parsers_by_column[column],
                cell,
            )
            # Iterate the parser output exactly once so that one-shot
            # iterables (e.g. generators) yield matching labels and values.
            for position, token in enumerate(tokens):
                # Allow parsers to return a named component and use that name
                # to index our new columns, else use the position as a string.
                suffix = token[0] if isinstance(token, tuple) else str(position)
                labels.append(delim.join([column, suffix]))
                values.append(token)
        return pd.Series(data=values, index=labels)

    token_columns = df.apply(
        _tokenize_row,
        axis=1,
        args=(schema, suffix_delim),
        result_type="expand",
    )
    return pd.concat([df, token_columns], axis=1)

## Indexing data

In [None]:
#export

from proofzero_sdk.util import sha2

# The `IndexFrame` type is a wrapper around a `TokenFrame` (ie, a `pandas` DataFrame) that represents a tokenized DataFrame that has been indexed using the `index` function.
IndexFrame = NewType('IndexFrame', TokenFrame)

def index(
    df: TokenFrame, schema: dict, hasher: Callable = sha2
) -> IndexFrame:
    """
    Tokenizes the passed DataFrame using `tokenize`, then applies a hash function to the tokenized features.

    `tokenize` is called with `df` and `schema` only, so its default suffix
    delimiter is used. `hasher` is then applied element-wise to every cell of
    the frame returned by `tokenize`, which still contains the original
    columns of `df` alongside the generated token columns.

    The default hash function is `sha2`, from the Proof Zero utility SDK.
    """
    tokenized = tokenize(df, schema)
    # `DataFrame.applymap` is deprecated since pandas 2.1 in favour of the
    # equivalent `DataFrame.map`. Check the class (not the instance) so a
    # column that happens to be named "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "map"):
        return tokenized.map(hasher)
    return tokenized.applymap(hasher)

## Matching data

In [None]:
#export

# `MatchFrame` wraps an `IndexFrame` (which is itself a `TokenFrame`, and ultimately a `pandas` DataFrame). It marks an indexed DataFrame that has been matched using the `match` function.
MatchFrame = NewType('MatchFrame', IndexFrame)

def match(
    index_0: IndexFrame, index_1: IndexFrame, network: bool = False
) -> pd.DataFrame:
    """
    Discover matches between the two passed indexed DataFrames.

    `network` is intended to select matching on Proof Zero's cluster when set
    to `True`.

    NOTE: this implementation is a placeholder. None of the arguments are
    read, and an empty DataFrame is always returned.
    """
    no_matches = pd.DataFrame()
    return no_matches