In [None]:
# default_exp util

In [None]:
#hide
from nbdev.showdoc import *

# Cleanroom Utility SDK

This module contains parsers and hash functions intended to be used by the `core` module in its workflow.

## Exported types

In [None]:
#export
from typing import NewType, Any
# Distinct static type for hash values; type checkers treat it as a subtype of
# int, while at runtime calling Hash(x) simply returns x.
Hash = NewType("Hash", int)

## Hash functions

This section contains feature hashing functions for consistently turning categorical data into (typically numeric) hashes. This is done to aid comparison of, and sharing of, data.

### No-op

The no-op hash function does not hash passed values. It is primarily included for testing and illustration purposes.

In [None]:
#export
def noop(value: Any) -> Hash:
    """
    Identity "hash": hand the input back untouched.

    No hashing is performed, so the result is the very object that was passed
    in (and therefore keeps its original type). Handy for tests and examples
    where readable, un-hashed output is wanted.
    """
    unchanged = value
    return unchanged

### No-op Testing

In [None]:
# The pass-through must hand back exactly what it was given.
test_noop = noop('no transform')
assert test_noop == 'no transform'

### SHA-256

A secure feature hashing function appropriate for networked use cases.

In [None]:
#export
import sys
import struct
import hashlib
import pandas as pd
def sha2(value: Any, encoding: str = "utf-8") -> Hash:
    """
    Hash `value` with SHA-256 and return the digest as an integer.

    The value is first serialised to a platform-independent byte string, and
    the digest is always interpreted with a fixed byte order, so the same input
    yields the same hash on any hardware or interpreter.

    Serialisation depends on the *exact* type of `value`:

    * `int`   -- 4-byte big-endian two's complement when the value fits in
      32 bits; otherwise the shortest big-endian two's complement encoding
      that can hold it.
    * `float` -- 8-byte big-endian IEEE 754 double.
    * `bool`  -- a single byte.
    * `str`   -- the string encoded with `encoding`.
    * anything else (including subclasses of the above) -- `repr(value)`
      encoded with `encoding`.

    Args:
        value: The value to hash.
        encoding: Text encoding used for `str` values and for the `repr()`
            fallback.

    Returns:
        The 256-bit digest as a non-negative integer (little-endian
        interpretation of the digest bytes).

    Raises:
        LookupError: If `encoding` is not a known codec (text path only).
        UnicodeEncodeError: If the text cannot be encoded with `encoding`.
    """
    # Dispatch on the exact type rather than isinstance(): bool is a subclass
    # of int, and routing True/False through the int branch would change
    # their hashes.
    kind = type(value)

    # The '!' prefix selects network (big-endian) byte order and standard
    # sizes, so the packed bytes do not depend on the host.
    # See https://docs.python.org/3/library/struct.html#struct-format-strings
    if kind is int:
        if -0x80000000 <= value <= 0x7FFFFFFF:
            value_buf = struct.pack("!i", value)
        else:
            # "!i" cannot represent this value (struct.pack would raise
            # struct.error), so widen to as many bytes as are needed while
            # keeping the same big-endian two's complement convention.
            width = value.bit_length() // 8 + 1
            value_buf = value.to_bytes(width, "big", signed=True)
    elif kind is float:
        value_buf = struct.pack("!d", value)
    elif kind is bool:
        value_buf = struct.pack("!?", value)
    else:
        # Strings are hashed as their encoded bytes; every other object falls
        # back to a best-effort repr() and is then treated like a string.
        text = value if kind is str else repr(value)
        value_buf = text.encode(encoding)

    # Return an int rather than a hex string: integer hashes are cheaper to
    # compare and match on. The byte order is pinned to "little" (instead of
    # the host's sys.byteorder) so big-endian machines produce the same
    # numbers as little-endian ones.
    return int.from_bytes(hashlib.sha256(value_buf).digest(), "little")

### SHA256 Testing

In [None]:
# Known-answer check for the SHA-256 wrapper.
cryptotext = sha2('test')
assert cryptotext == 3637832425643895610435099290665119336511696415395986933609875766427977287327

## Parsing functions

This section contains utility functions for decomposing passed values into structured, tagged, lists of the factors that comprise the passed value.

Parsing functions return either a `list` of values or a `list` of `tuple`s. A returned `tuple` should have the form `(value, tag)`, where `value` is a decomposed factor and `tag` is metadata describing `value`. Tags are used to assist matching.

For example, the phone number (647) 927-4901 might be parsed into the list of tuples: `[('647', 'area_code'), ('927', 'central_office'), ('4901', 'line_number')]`.

### String Parsing

This is a general purpose string parser.

In [None]:
#export
from typing import List
def parseString(value: str, delim: str = " ") -> List[str]:
    """
    Split `value` on every occurrence of `delim` and return the pieces in order.

    There is no limit on the number of splits. Adjacent delimiters produce
    empty-string tokens, and a string that does not contain `delim` comes back
    as a single-element list.
    """
    tokens = value.split(delim)
    return tokens

### String Parser Testing

In [None]:
# Custom delimiter.
test_string = parseString('This-is-a-test', delim='-')
assert test_string == ['This', 'is', 'a', 'test']

In [None]:
# Default delimiter (a single space).
test_string = parseString('This is a test')
assert test_string == ['This', 'is', 'a', 'test']

In [None]:
# No delimiter present: the whole string is the only token.
test_string = parseString('abc123')
assert test_string == ['abc123']

### Name Parser

This is an opinionated parser for human names.

In [None]:
#export
from nameparser import HumanName
# The return annotation is quoted so it is not evaluated when the function is
# defined: `Tuple` is only imported further down in this module.
def parseName(value: str) -> "List[Tuple[str, str]]":
    """
    Break a human name into its tagged components.

    The name is parsed with `nameparser.HumanName`, and every entry of the
    resulting `as_dict()` mapping is returned as a `(value, tag)` tuple, in the
    order the mapping yields them. The tag is the mapping key (for example
    `'first'` or `'last'`) and the value is the matching part of the name.

    Args:
        value: The full name as a single string.

    Returns:
        A list of `(value, tag)` tuples, one per component reported by
        `HumanName.as_dict()`.
    """
    parsed = HumanName(value)
    # as_dict() maps tag -> value; flip each pair into the (value, tag)
    # convention used by the parsers in this module.
    return [(part, tag) for tag, part in parsed.as_dict().items()]

### Name Parser Testing.

In [None]:
# Every component of a fully decorated name should be tagged.
test_name = parseName('Mr. Alex "Al" R. Flanagan I')
assert test_name == [
    ('Mr.', 'title'),
    ('Alex', 'first'),
    ('R.', 'middle'),
    ('Flanagan', 'last'),
    ('I', 'suffix'),
    ('Al', 'nickname'),
]

### Address Parser

A tokenizing parser for street addresses. For illustration purposes this is a convenience wrapper around the string parser.

In [None]:
#export
def parseAddress(value: str, delim: str = " ") -> List[str]:
    """
    Split a street address into tokens on `delim`.

    This delegates directly to `parseString`, so punctuation attached to a
    token (such as a trailing comma) is kept as part of that token.
    """
    tokens = parseString(value, delim)
    return tokens

### Address Parser Testing

In [None]:
# Tokens are split on spaces only, so commas stay attached to their token.
test_address = parseAddress('92 Yonge St., Toronto Ontario, M5J 0B1')
assert test_address == ['92', 'Yonge', 'St.,', 'Toronto', 'Ontario,', 'M5J', '0B1']

### Phone Number Parser

This is a convenience wrapper around the Python port of Google's `libphonenumber`. This is a general pattern: where available, use a standard or de facto standard parser.

In [None]:
#export
import phonenumbers
from typing import Tuple, Union
def parsePhone(value: str, region: str = "US") -> List[Tuple[str, str]]:
    """
    Break a phone number into tagged components.

    The number is parsed with `phonenumbers.parse` and returned as a list of
    `(value, tag)` tuples. Every value is a string; attributes that the parsed
    number does not carry are reported as the string `'None'`.

    The `area_code`, `central_office` and `line_number` entries are plain
    slices (first three digits, next three, remainder) of the national number.
    That layout matches North American 3-3-4 numbers; for other regions the
    slices are positional only.

    Args:
        value: The phone number as written.
        region: Default region passed to `phonenumbers.parse`, used when
            `value` carries no international prefix. Defaults to `"US"`.

    Returns:
        A list of `(value, tag)` tuples with the tags `full_phone`,
        `country_code`, `area_code`, `central_office`, `line_number`,
        `national_number`, `extension`, `italian_zero`, `leading_zero_count`,
        `raw`, `country_code_source` and `preferred_domestic_carrier`.

    Raises:
        Whatever `phonenumbers.parse` raises when `value` cannot be parsed
        (NOTE(review): expected to be `NumberParseException` -- confirm).
    """
    # keep_raw_input=True makes the parser retain the original text, which is
    # reported below under the 'raw' tag.
    phone_obj = phonenumbers.parse(value, region, keep_raw_input=True)

    # Use RFC3966 because it handles extensions.
    full_phone = phonenumbers.format_number(
        phone_obj, phonenumbers.PhoneNumberFormat.RFC3966
    )

    # Work on the digits as text so they can be sliced positionally.
    national_number = str(phone_obj.national_number)
    area_code = national_number[:3]
    central_office = national_number[3:6]
    line_number = national_number[6:]

    # str() is applied to every library attribute on purpose: missing values
    # must come out as the text 'None' so that all values share one type.
    return [
        (str(full_phone), 'full_phone'),
        (str(phone_obj.country_code), 'country_code'),
        (area_code, 'area_code'),
        (central_office, 'central_office'),
        (line_number, 'line_number'),
        (national_number, 'national_number'),
        (str(phone_obj.extension), 'extension'),
        (str(phone_obj.italian_leading_zero), 'italian_zero'),
        (str(phone_obj.number_of_leading_zeros), 'leading_zero_count'),
        (str(phone_obj.raw_input), 'raw'),
        (str(phone_obj.country_code_source), 'country_code_source'),
        (str(phone_obj.preferred_domestic_carrier_code), 'preferred_domestic_carrier'),
    ]

### Phone Number Parser Testing

In [None]:
# A ten-digit number with no country prefix, parsed with the default region.
test_phone = parsePhone('647-927-7392')
assert test_phone == [
    ('tel:+1-647-927-7392', 'full_phone'),
    ('1', 'country_code'),
    ('647', 'area_code'),
    ('927', 'central_office'),
    ('7392', 'line_number'),
    ('6479277392', 'national_number'),
    ('None', 'extension'),
    ('None', 'italian_zero'),
    ('None', 'leading_zero_count'),
    ('647-927-7392', 'raw'),
    ('20', 'country_code_source'),
    ('None', 'preferred_domestic_carrier'),
]

### SIN Number Parser

A validating parser for Canadian Social Insurance Numbers (SINs).

In [None]:
#export
from stdnum.ca import sin
def parseSIN(value: str) -> List[Tuple[Union[str, int, None], int]]:
    """
    Validate a Canadian SIN and break it into its dash-separated groups.

    The input is passed through `sin.validate` and then `sin.format`; the
    formatted text is split on "-". The result lists the full formatted SIN
    under tag 0, followed by the first three groups under tags 1, 2 and 3.
    Unlike the other parsers in this module, the tags here are integers.
    """
    # Validation comes first so that a bad number never reaches formatting.
    # NOTE(review): stdnum's validate() is expected to raise on an invalid
    # SIN rather than return a falsy value -- confirm against the library.
    checked = sin.validate(value)
    formatted = sin.format(checked)
    groups = formatted.split("-")

    # Tag 0 is the whole number; tags 1-3 are the groups, left to right.
    return [(formatted, 0)] + [(groups[tag - 1], tag) for tag in (1, 2, 3)]

### SIN Number Parser Validation

In [None]:
# Space-separated input should be normalised to the dashed form.
test_social = parseSIN('046 454 286')
assert test_social == [
    ('046-454-286', 0),
    ('046', 1),
    ('454', 2),
    ('286', 3),
]

### Date Parser

Standard date parser.

In [None]:
#export
from datetime import datetime
import dateparser
def parseDate(value: str) -> List[Tuple[Union[str, None], Union[str, None]]]:
    """
    Break a date string into tagged components.

    The text is handed to `dateparser.parse`. When a date is recognised, the
    result holds the ISO-8601 calendar date plus the year, month and day as
    strings (month and day are not zero-padded).

    Args:
        value: The date as written.

    Returns:
        `[(iso_date, 'iso_format'), (year, 'year'), (month, 'month'),
        (day, 'day')]` when parsing succeeds, or the single placeholder entry
        `[(None, None)]` when `dateparser.parse` returns `None`.
    """
    # dateparser.parse yields None when it cannot interpret the text.
    parsed: Union[datetime, None] = dateparser.parse(value)
    if parsed is None:
        return [(None, None)]

    return [
        # Only the calendar date is kept; any time-of-day part is dropped.
        (parsed.date().isoformat(), 'iso_format'),
        (str(parsed.year), 'year'),
        (str(parsed.month), 'month'),
        (str(parsed.day), 'day'),
    ]

### Date Parser Testing

In [None]:
# Month and day come back without zero padding; the ISO form keeps it.
test_date = parseDate('1999-07-01')
assert test_date == [
    ('1999-07-01', 'iso_format'),
    ('1999', 'year'),
    ('7', 'month'),
    ('1', 'day'),
]