In [28]:
import pandas as pd
import json
from hashing.preprocessing import pre_phonetic_encoding, pre_string_clean#, pre_tokenize, pre_sort_names

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType, DateType, BinaryType, BooleanType, ByteType, MapType
from pyspark.sql.functions import expr, lit
from pyspark.sql.functions import udf
from pyspark.sql import functions as sf

In [7]:
# Create (or reuse) the Spark session for this notebook and keep a handle on
# its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName('Entity-Resolution-Hasing')
    .getOrCreate()
)
sparkContext = spark.sparkContext

2022-04-22 10:12:04 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Read the first simulated dataset; the first row is the header, and rows that
# cannot be parsed are dropped (DROPMALFORMED) instead of failing the read.
df1 = spark.read.csv(
    "../simulated_data/dataset_1.csv",
    header=True,
    mode="DROPMALFORMED",
)
df1

DataFrame[person_id: string, service_date: string, first_name: string, middle_name: string, last_name: string, dob: string, ssn: string]

In [32]:
# df2 = spark.read.csv("../simulated_data/dataset_2.csv", header=True,
#     mode="DROPMALFORMED",)
# df2

## Parameter Alignment

In [31]:
# Load the matching configuration: the set of unique (exact) and fuzzy
# identifiers that were agreed upon for the matching.
# The `with` block closes the file handle as soon as the JSON has been parsed.
with open("param_config.json") as f:
    fields = json.load(f)
# fields

{'fields': [{'field_origin': 'first_name',
   'field_name': 'name_phonetic',
   'preprocess_steps': ['drop_chars', 'sort', 'phonetic'],
   'matching_type': 'string',
   'ngrams': True,
   'weight': 0.5},
  {'field_origin': 'ssn',
   'field_name': 'ssn_clean',
   'preprocess_steps': ['drop_chars'],
   'matching_type': 'exact',
   'ngrams': False,
   'weight': 0.5}]}

## Preprocessing steps 
Assign each preprocessing step to a UDF, then apply it to the DataFrame.

In [13]:
## UDFs
# Wrap the phonetic-encoding helper as a Spark UDF that yields a string column.
udf_pre_phonetic_encoding = udf(f=pre_phonetic_encoding, returnType=StringType())

In [38]:
def preprocess(field_configs, data):
    """Derive one preprocessed column from a single field configuration.

    The column named by ``field_configs['field_origin']`` is copied into a new
    column named ``field_configs['field_name']``; each entry of
    ``field_configs['preprocess_steps']`` is then applied to that new column
    in order.

    Args:
        field_configs: dict with the keys ``'field_origin'``, ``'field_name'``
            and ``'preprocess_steps'`` (an iterable of step names).
        data: DataFrame-like object supporting ``data[col]`` and
            ``withColumn(name, column)``.

    Returns:
        The DataFrame with the derived column added.

    Raises:
        ValueError: if a step name is not one of ``'drop_chars'``, ``'sort'``
            or ``'phonetic'``.
    """
    print(field_configs)
    target = field_configs['field_name']
    source = field_configs['field_origin']
    data = data.withColumn(target, data[source])

    for step in field_configs['preprocess_steps']:
        if step in ('drop_chars', 'sort'):
            # Recognised but not implemented yet: the column passes through
            # unchanged.
            continue
        if step != 'phonetic':
            raise ValueError("Unknown preprocessing step")
        # Replace the derived column with its phonetic encoding.
        data = data.withColumn(target, udf_pre_phonetic_encoding(target))
    return data
            
    

In [41]:
def preprocess_all(configs, data):
    """Apply ``preprocess`` once per entry of ``configs['fields']``.

    Each call receives the DataFrame produced by the previous one, so the
    derived columns accumulate on a single result.

    Args:
        configs: dict whose ``'fields'`` entry is an iterable of per-field
            configuration dicts.
        data: the DataFrame to preprocess.

    Returns:
        The DataFrame after every field configuration has been applied; the
        input is returned unchanged when ``configs['fields']`` is empty.
    """
    processed = data
    for field_config in configs['fields']:
        processed = preprocess(field_config, processed)
    return processed

In [42]:
# Run every configured preprocessing step against the first dataset.
data_proc = preprocess_all(configs=fields, data=df1)

{'field_origin': 'first_name', 'field_name': 'name_phonetic', 'preprocess_steps': ['drop_chars', 'sort', 'phonetic'], 'matching_type': 'string', 'ngrams': True, 'weight': 0.5}
{'field_origin': 'ssn', 'field_name': 'ssn_clean', 'preprocess_steps': ['drop_chars'], 'matching_type': 'exact', 'ngrams': False, 'weight': 0.5}


## Now to the hashing and linking

### Bloom filter on name

In [43]:
## TO DO: MOVE TO bloom.py
import bitarray
import mmh3
def bloom_hash(item, size, hash_count):
    """Encode ``item`` as a Bloom filter and return it as a bit string.

    ``item`` is hashed ``hash_count`` times with MurmurHash3, using the seeds
    ``0 .. hash_count - 1``; each hash value, taken modulo ``size``, selects
    one bit to set.

    Args:
        item: value to encode (a string accepted by ``mmh3.hash``), or None.
        size: number of bits in the filter.
        hash_count: number of seeded hash functions to apply.

    Returns:
        A ``str`` of length ``size`` made of ``'0'`` and ``'1'`` characters,
        or None when ``item`` is None.
    """
    # Null column values reach the UDF as None; pass them through instead of
    # letting mmh3 raise on a non-string input.
    if item is None:
        return None
    bit_array = bitarray.bitarray(size)
    bit_array.setall(0)
    for seed in range(hash_count):
        bit_array[mmh3.hash(item, seed) % size] = 1
    # Return text rather than raw bytes: this function is registered as a UDF
    # with a string return type, and the produced object must match that type
    # for the column to hold the actual bit pattern.
    return bit_array.to01()

# Spark return type declared for the Bloom-filter UDF.
# (ArrayType(ByteType()) was considered as an alternative.)
bitarray_type = StringType()
bitarray_type

StringType

In [44]:
# Expose bloom_hash to Spark as a UDF with the return type chosen above.
udf_bloom = udf(f=bloom_hash, returnType=bitarray_type)

In [45]:
def bloom_filters(configs, data, key = '_bloomhash', size=100, hash_count=5):
    """Add one Bloom-filter column per configured field.

    For every entry of ``configs['fields']`` the column named by
    ``field['field_name']`` is passed through ``udf_bloom`` and the result is
    stored in a new column named ``field['field_name'] + key``.

    Args:
        configs: dict whose ``'fields'`` entry is an iterable of per-field
            configuration dicts, each with a ``'field_name'`` key.
        data: DataFrame that already contains the ``field_name`` columns.
        key: suffix appended to the field name to build the output column name.
        size: number of bits in each Bloom filter (default 100).
        hash_count: number of hash functions per filter (default 5).

    Returns:
        The DataFrame with the Bloom-filter columns added; the input is
        returned unchanged when ``configs['fields']`` is empty.
    """
    for field in configs['fields']:
        source_col = field['field_name']
        data = data.withColumn(
            source_col + key,
            udf_bloom(source_col, lit(size), lit(hash_count)),
        )
    return data

In [46]:
# Append a '<field_name>_bloomhash' column for every configured field.
data_proc = bloom_filters(configs=fields, data=data_proc, key='_bloomhash')

In [27]:
# Write the hashed dataset as CSV with a header row. "overwrite" replaces the
# output of a previous run; the default save mode raises when "hashed_output"
# already exists, which would stop the notebook from being re-run top to bottom.
data_proc.write.mode("overwrite").option("header", True).csv("hashed_output")

In [36]:
# data_proc = df1.withColumn('index_key', 
#                     sf.concat(sf.col('bloom_1'),sf.lit('_'), sf.col('bloom_2')))

#### Steps
For each bloom filter function:
* Map: "Hash" entities according to algorithm
* Map: Assign block, sum 1s in hash
* Join: inner join to compute pairwise distances (Dice coefficient)
* Groupby: find min distance
* Filter for entities that meet threshold - these are the potential candidates

For all entity dataframes with associated candidates
* join by entity
* Pick a candidate (open question: the selection rule is still to be decided)