# HMAC SHA256 Hashing Using Pandas

### Imports

In [5]:
import hashlib
import hmac
import re
import subprocess
from pathlib import Path
from typing import Dict, List, Tuple, Union

import pandas as pd

# Raise the pandas display limits (rows, columns, line width) used for notebook output.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

### Load NYU provided secret key

In [6]:
# Read the secret key file, drop surrounding whitespace, and keep the key as UTF-8 bytes.
key = Path('<path to file with key>').read_text().strip().encode('utf-8')

### Functions

In [7]:
# Template for the gpg invocation used by encrypt_csv. Placeholders are filled with str.format:
#   {outfile} - path of the encrypted file gpg writes
#   {gpg}     - key identifier, used for both --trusted-key and --recipient
#   {infile}  - path of the plaintext file to encrypt
_GPG_COMMAND = 'gpg --output {outfile} --encrypt --trusted-key {gpg} --recipient {gpg} {infile}'


def _standard_string_cleaning(value: str) -> str:
    """
    Takes a value, removes all punctuation and, reduces spaces to single space, calls strip
    
    Args:
       value: String value to be cleaned
       
    Returns:
       Cleaned string
    """
    return re.sub(r'\s{2,}', ' ', re.sub(r'[^A-Za-z0-9\s]+', '', value)).strip()


def clean_name(name_value: str) -> str:
    """
    Normalize a name value prior to hashing

    The value goes through the standard string cleaning (punctuation removed,
    repeated whitespace collapsed, trimmed), is upper-cased, and then has every
    character other than A-Z and whitespace removed.

    Note that digits are removed after whitespace has already been collapsed and
    trimmed, so a value containing digits can end up with adjacent or trailing
    spaces (e.g. 'John 3' becomes 'JOHN ').

    Args:
        name_value: Name value to be cleaned

    Returns:
        Cleaned name as string
    """
    upper_cased = _standard_string_cleaning(name_value).upper()
    return re.sub(r'[^A-Z\s]+', '', upper_cased)


def clean_ssn(ssn_value: str) -> str:
    """
    Normalize an SSN value prior to hashing

    The value goes through the standard string cleaning (punctuation removed,
    repeated whitespace collapsed, trimmed) and then has every character other
    than digits and whitespace removed.

    Whitespace inside the value is kept, so '123-45-6789' becomes '123456789'
    while '123 45 6789' stays '123 45 6789'. Letters are removed after whitespace
    has already been collapsed, so removing them can leave adjacent spaces.

    Args:
        ssn_value: SSN value as string

    Returns:
        Cleaned SSN as string
    """
    normalized = _standard_string_cleaning(ssn_value)
    return re.sub(r'[^0-9\s]+', '', normalized)
    
    
def hash_value(value: str, key: Union[bytes, str]) -> str:
    """
    Apply HMAC SHA 256 to an individual value and return the hexdigest

    Args:
        value: String representation of value; encoded as UTF-8 before hashing
        key: Hash key as a bytes object. A str is also accepted and is encoded
            as UTF-8, which yields the same digest as passing the equivalent bytes.

    Returns:
        Hexdigest as string (64 lowercase hexadecimal characters)
    """
    if isinstance(key, str):
        # hmac.new only accepts bytes-like keys; encode text keys the same way
        # the value itself is encoded.
        key = key.encode('utf-8')
    return hmac.new(key, value.encode('utf-8'), digestmod=hashlib.sha256).hexdigest()


def create_hashed_unique_value_dict(df: pd.DataFrame, col: str, typ: str, key: bytes) -> Dict[str, str]:
    """
    Create dict with original value as key and hashed value as value

    Each distinct non-null value of the column is cleaned with the function
    selected by ``typ`` and then hashed. Null entries are skipped, so they do
    not appear in the returned dict.

    Args:
        df: Pandas DataFrame
        col: String name of column/series
        typ: name or ssn
        key: Hash key as bytes object

    Returns:
        Dictionary with original value as key and hashed value as value

    Raises:
        KeyError: If ``col`` is not a column of ``df`` or ``typ`` is not 'name' or 'ssn'
    """
    # Null entries (e.g. NaN for an empty cell) are not strings and would make the
    # regex-based cleaning functions raise TypeError, so drop them first.
    unique_values = df[col].dropna().unique()
    cleaner = {'name': clean_name, 'ssn': clean_ssn}[typ]
    return {value: hash_value(cleaner(value), key) for value in unique_values}
    
    
def encrypt_csv(filepath: Union[str, Path],
                outpath: Union[str, Path],
                columns: List[Tuple[str, str]],
                key: Union[bytes, str],
                gpg_key_name: str, sep: str = ',') -> bool:
    """
    Takes a CSV, hashes the target columns given as (column name, name or ssn) pairs,
    serializes to csv then encrypts via gpg

    The hashed data is first written, comma delimited and without the index, to
    ``tmp_encrypt.csv`` in the directory of ``outpath``. That plaintext file is
    removed again once gpg has run, whether or not gpg succeeded.

    Args:
        filepath: String or Path object representing file path
        outpath: String or Path object to output final product
        columns: List of tuples with column name as element 0 and hash category (name or ssn) as element 1, only give target columns
        key: Hash key as bytes; a str is also accepted and is encoded as UTF-8
        gpg_key_name: gpg key name, passed to gpg as both --trusted-key and --recipient

    Keyword Args:
        sep: Delimiter of the input file, defaults to comma

    Returns:
        Boolean indicating success

    Raises:
        RuntimeError: If gpg exits with a non-zero return code
    """
    if isinstance(key, str):
        # HMAC needs a bytes key; accept text keys as shown in the usage notes.
        key = key.encode('utf-8')
    outpath = Path(outpath)
    tmp_csv_path = outpath.parent / 'tmp_encrypt.csv'
    df = pd.read_csv(filepath, dtype=str, sep=sep)
    for col, typ in columns:
        hashed_lookup = create_hashed_unique_value_dict(df, col, typ, key)
        # Per-value dictionary lookup; entries without a mapping (nulls) stay null.
        df[col] = df[col].map(hashed_lookup)
    try:
        df.to_csv(tmp_csv_path, index=False)
        # Split the template before filling it in, so paths or key names that
        # contain spaces remain a single argument.
        gpg_args = [
            token.format(gpg=gpg_key_name, outfile=outpath.as_posix(), infile=tmp_csv_path.as_posix())
            for token in _GPG_COMMAND.split()
        ]
        run = subprocess.run(gpg_args, capture_output=True)
    finally:
        # Never leave the unencrypted intermediate file behind.
        if tmp_csv_path.exists():
            tmp_csv_path.unlink()
    if run.returncode != 0:
        print(run.stderr)
        raise RuntimeError("Failed to encrypt")
    return True

### Usage

This notebook is intended to provide a single function call that hashes the appropriate fields and then encrypts the overall file.  The expected parameters, in order, are:

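```filepath```: The file path to your target file
```outpath```: The file path to write the final encrypted file to
```columns```: A list of tuples, where the first value is the column name, and the second value is the data type of that column (```name``` or ```ssn```), which selects the cleaning applied before hashing
```key```: Hash key provided by Coleridge, as a bytes object
```gpg_key_name```: Name of the public gpg key used for file encryption (the key must already be imported into gpg)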

KEYWORD (OPTIONAL):

```sep```: Delimiter for input file, defaults to comma

```python
encrypt_csv('<input file path>', '<encrypted file output path>', [('SSN', 'ssn'), ('first_name', 'name'), ('last_name', 'name')], b'<hash key>', '<gpg key>')
```

**EXAMPLE**

Let's encrypt a csv with family information for a TANF household.  In this case, the data provided was ```tab``` delimited.

In [6]:
# The hash key is given as a bytes literal because hmac.new requires a bytes-like key.
encrypt_csv('/home/user/secured_data/tanf_family_data.csv',
            '/home/user/secured_data/tanf_family_encrypted',
            [('RPT_SSN_1', 'ssn'), ('Recipient_First_Name', 'name')],
            b'XXXX|XXXX|XXXX|XXXX',
            '5FED9A4FC02ADD64',
            sep='\t')

True