In [33]:
import pandas as pd
import json
from hashing.preprocessing import pre_phonetic_encoding, pre_string_clean#, pre_tokenize, pre_sort_names
from hashing.bloom import bloom_hash

In [34]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType, DateType, BinaryType, BooleanType, ByteType, MapType
from pyspark.sql.functions import expr, lit
from pyspark.sql.functions import udf
from pyspark.sql import functions as sf

In [35]:
# Create (or reuse) the Spark session for this notebook and keep a handle to its context.
spark = (
    SparkSession.builder
    .appName('Entity-Resolution-Hasing')
    .getOrCreate()
)
sparkContext = spark.sparkContext

In [36]:
# Relative path of the input CSV that is loaded with spark.read.csv.
dataset = "../simulated_data/dataset_1.csv"

In [37]:
# Read the CSV with a header row; the reader runs in DROPMALFORMED mode.
df1 = (
    spark.read
    .options(header=True, mode="DROPMALFORMED")
    .csv(dataset)
)


## Parameter Alignment

In [38]:
# Load the set of unique and fuzzy identifiers agreed upon for the matching.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("param_config.json", encoding="utf-8") as f:
    fields = json.load(f)

## Preprocessing steps 
Assign each preprocessing step to a UDF, then apply it to the DataFrame.

In [39]:
## UDFs
# Wrap the phonetic-encoding helper as a Spark UDF declared to return a string.
udf_pre_phonetic_encoding = udf(pre_phonetic_encoding, returnType=StringType())
## TO DO: add all udfs here

In [40]:
def preprocess(field_configs, data):
    """Apply all configured preprocessing steps to one column.

    The column ``field_configs['field_origin']`` is first copied into a column
    named ``field_configs['field_name']``; every entry of
    ``field_configs['preprocess_steps']`` is then handled in order.

    Supported steps:
        * ``'drop_chars'`` and ``'sort'`` -- accepted but currently no-ops
          (TODO: attach their UDFs with ``withColumn``).
        * ``'phonetic'`` -- replaces the target column with the result of
          ``udf_pre_phonetic_encoding`` applied to it.

    Returns:
        The data with the target column added or overwritten.

    Raises:
        ValueError: if a step is not one of the names above.
    """
    target = field_configs['field_name']
    frame = data.withColumn(target, data[field_configs['field_origin']])

    for step in field_configs['preprocess_steps']:
        if step in ('drop_chars', 'sort'):
            # Placeholder steps: recognised, but nothing is transformed yet.
            continue
        if step != 'phonetic':
            raise ValueError("Unknown preprocessing step")
        frame = frame.withColumn(target, udf_pre_phonetic_encoding(target))

    return frame
            
def preprocess_all(configs, data):
    """Run ``preprocess`` for every entry of ``configs['fields']``.

    The entries are processed in list order and each call receives the result
    of the previous one. Returns the data after the last entry, or ``data``
    itself when the list is empty.
    """
    frame = data
    for field_config in configs['fields']:
        frame = preprocess(field_config, frame)
    return frame
    
    

In [41]:
# Apply the configured preprocessing of every field to the loaded data.
data_proc = preprocess_all(configs=fields, data=df1)

## Now to the hashing and linking

### Bloom filter on name

In [42]:
# Return type declared for the Bloom-hash UDF: a plain string column.
# (The commented-out alternative was ArrayType(ByteType()).)
bitarray_type = StringType()
bitarray_type

StringType

In [43]:
# Expose bloom_hash as a Spark UDF whose declared return type is bitarray_type.
udf_bloom = udf(bloom_hash, returnType=bitarray_type)

In [44]:
def bloom_filters(configs, data, key = '_bloomhash', hash_args = (100, 5)):
    """Add one Bloom-hash column per configured field.

    For every entry of ``configs['fields']`` a column named
    ``<field_name><key>`` is added, computed by ``udf_bloom`` from the
    ``<field_name>`` column.

    Args:
        configs: mapping with a ``'fields'`` list; each entry provides
            ``'field_name'``.
        data: DataFrame-like object offering ``withColumn``.
        key: suffix appended to the field name to form the new column name.
        hash_args: literal values forwarded to ``udf_bloom`` after the column
            name, each wrapped in ``lit``. Defaults to the previously
            hard-coded ``(100, 5)``; presumably filter length and number of
            hash functions -- confirm against ``hashing.bloom.bloom_hash``.

    Returns:
        ``data`` with the additional hash columns (unchanged when the field
        list is empty).
    """
    for field in configs['fields']:
        source_column = field['field_name']
        literal_args = [lit(value) for value in hash_args]
        data = data.withColumn(source_column + key, udf_bloom(source_column, *literal_args))
    return data

In [45]:
# Suffix that marks the Bloom-hash columns added to the preprocessed data.
key = '_bloomhash'
data_proc = bloom_filters(configs=fields, data=data_proc, key=key)

In [46]:
import datetime as dt

# Epoch seconds taken from a timezone-aware UTC "now". A naive utcnow() value
# is interpreted as local time by timestamp(), which shifts the result by the
# machine's UTC offset on any host that is not running in UTC.
timestamp = str(int(dt.datetime.now(dt.timezone.utc).timestamp()))
output = "hashed_output" + key + "_" + timestamp
print(output)

hashed_output_bloomhash_1650943904


In [47]:
# Write the hashed data as CSV, including a header row, to the path in `output`.
data_proc.write.csv(output, header=True)

                                                                                

In [51]:
# Save the configuration dictionary used for this run as JSON; the file name
# starts with the run's timestamp.
f = timestamp + "_param_config.json"
with open(f, mode='w') as outfile:
    json.dump(fields, fp=outfile)