In [None]:
# default_exp core

In [None]:
#hide
from nbdev.showdoc import *

# ProofZero SDK

> A feature parser-hasher client to the Proof Zero matching engine.

In [None]:
#export
import pandas as pd
def load(filename):
    """
    Read `filename` as CSV with `pandas` and return the resulting DataFrame.

    This is a thin convenience wrapper around `pd.read_csv`; only CSV input is
    handled here (the full version supports XLSX, etc.).
    """
    frame = pd.read_csv(filename)
    return frame

In [None]:
#export
def tokenize(
    df: pd.DataFrame, schema: dict, suffix_delim: str = ""
) -> pd.DataFrame:
    """
    Denormalizes a `pandas` DataFrame by applying the parsers in `schema`.

    `schema` is a `dict` that maps column names of `df` to a list of parsers.
    For every row, the parsers of a column are executed in order, each one
    receiving the output of the previous one (the first receives the cell
    value). The final result must be iterable; every item of it becomes one new
    column.

    New columns are named `<column><suffix_delim><suffix>`. The suffix is the
    first element of the item when the item is a `tuple` (a "named component"),
    otherwise the item's position as a string. The item itself, including the
    whole tuple for named components, is stored as the cell value.

    Columns of `df` that have no entry in `schema` are left untouched.

    Returns the original DataFrame with the new columns appended on the right.
    """
    # Imported locally so the function does not rely on module-level imports
    # of these two standard-library modules.
    import functools
    import itertools

    def _expand_row(row):
        # Only columns that the schema knows about are tokenized.
        mapped = [col for col in row.index if col in schema]
        # Materialise each result: it is walked twice below (labels, values),
        # which would silently exhaust a generator returned by a parser.
        parsed = [
            list(
                functools.reduce(
                    lambda data, parser: parser(data), schema[col], row[col]
                )
            )
            for col in mapped
        ]
        labels = [
            # Allow parsers to return a named component and use that name to
            # index our new columns, else use the position as a string.
            suffix_delim.join(
                [col, item[0] if isinstance(item, tuple) else str(pos)]
            )
            for col, items in zip(mapped, parsed)
            for pos, item in enumerate(items)
        ]
        return pd.Series(
            data=list(itertools.chain.from_iterable(parsed)), index=labels
        )

    return pd.concat(
        [
            df,
            df.apply(_expand_row, axis=1, result_type="expand"),
        ],
        axis=1,
    )

In [None]:
#export
from typing import Callable
def index(
    df: pd.DataFrame, schema: dict, hasher: Callable = None # sha2.apply
) -> pd.DataFrame:
    """
    Tokenizes the passed DataFrame using `tokenize`, then applies a hash function to the tokenized features.

    `hasher` is called once per cell of the tokenized DataFrame, which holds
    the original columns of `df` as well as the generated token columns. Its
    return value replaces the cell.

    A callable `hasher` is required: no default hash implementation is
    available in this module (the signature comment hints at `sha2.apply` as
    the intended one). Raises `TypeError` when `hasher` is not callable.
    """
    # Fail fast with a clear message instead of tokenizing everything and then
    # failing inside pandas.
    if not callable(hasher):
        raise TypeError(
            "index() requires a callable `hasher`, got {!r}".format(hasher)
        )

    tokenized = tokenize(df, schema)
    # `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
    # `DataFrame.map`; fall back to it on older versions that lack `map`.
    elementwise = getattr(tokenized, "map", None)
    if elementwise is None:
        elementwise = tokenized.applymap
    return elementwise(hasher)