In [174]:
import pandas as pd
from preprocessing import pre_phonetic_encoding, pre_string_clean#, pre_tokenize, pre_sort_names

In [175]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType, DateType, BinaryType, BooleanType, ByteType, MapType
from pyspark.sql.functions import expr, lit
from pyspark.sql.functions import udf

In [4]:
spark = SparkSession.builder.appName('Entity-Resolution').getOrCreate()
sparkContext=spark.sparkContext

2022-01-07 11:03:50 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


#### For now, we assume that entities have been deduplicated already

In [5]:
df1 = spark.read.csv("../simulated_data/dataset_1.csv", header=True,
    mode="DROPMALFORMED",)
df1

DataFrame[person_id: string, service_date: string, first_name: string, middle_name: string, last_name: string, dob: string, ssn: string]

In [7]:
df2 = spark.read.csv("../simulated_data/dataset_2.csv", header=True,
    mode="DROPMALFORMED",)
df2

DataFrame[person_id: string, service_date: string, first_name: string, middle_name: string, last_name: string, dob: string, ssn: string]

## Preprocessing steps 
Assign each preprocessing step to a udf, then apply to dataframe

In [13]:
## UDFs
udf_pre_phonetic_encoding = udf(pre_phonetic_encoding, StringType()) 

In [14]:
df1 = df1.withColumn("phonetic_name", udf_pre_phonetic_encoding("first_name"))
df1 = df1.withColumn("phonetic_last_name", udf_pre_phonetic_encoding("last_name"))
df1.head()

                                                                                

Row(person_id='1000', service_date='2020-09-02', first_name='christiano', middle_name='Allen', last_name='Marquez', dob='1967-05-29', ssn='119-77-5956', phonetic_name='XRSXN', phonetic_last_name='MRKS')

In [15]:
df2 = df2.withColumn("phonetic_name", udf_pre_phonetic_encoding("first_name"))
df2 = df2.withColumn("phonetic_last_name", udf_pre_phonetic_encoding("last_name"))
df2.head()

Row(person_id='1000', service_date='2020-09-02', first_name='chris', middle_name='Allen', last_name='Marquez', dob='1967-05-29', ssn='119-77-5956', phonetic_name='XRS', phonetic_last_name='MRKS')

## Now to the hashing and linking

### Bloom filter on name

In [131]:
import bitarray
import mmh3
def bloom_hash(item, size, hash_count):
    bit_array = bitarray.bitarray(size)
    bit_array.setall(0)
    for ii in range(hash_count):
        index = mmh3.hash(item, ii) % size
        bit_array[index] = 1
    return bytes(bit_array)

In [132]:
bloom_hash("bahg", 30, 3)

b'\x00@\xa0\x00'

In [133]:
bitarray_type = StringType()#ArrayType(ByteType())
bitarray_type.needConversion()
bitarray_type

StringType

In [134]:
udf_bloom = udf(bloom_hash, bitarray_type)

#### Steps
For each bloom filter function:
* Map: "Hash" entities according to algorithm
* Map: Assign block, sum 1s in hash
* Join: inner join to compute pairwise distances (Dice coefficient)
* Groupby: find min distance
* Filter for entities that meet threshold - these are the potential candidates

For all entity dataframes with associated candidates
* join by entity
* Pick candidate (how tho? :) )

In [135]:
df1 = df1.withColumn("bloom_1", udf_bloom("phonetic_name", lit(100), lit(5)))
df1 = df1.withColumn("bloom_1_binary", df1.bloom_1.cast(BinaryType()))
df1.head()

Row(person_id='1000', service_date='2020-09-02', first_name='christiano', middle_name='Allen', last_name='Marquez', dob='1967-05-29', ssn='119-77-5956', phonetic_name='XRSXN', phonetic_last_name='MRKS', bloom_1='[B@599154e2', bloom_1_binary=bytearray(b'[B@599154e2'))

In [171]:
def assign_block(bits, n_digits = 3):
    if len(bits) < 3:
        raise ValueError("bloom filter bytearray should be longer than 2 digits")
    else:
        digits = [int(x) for x in list(bits[0:4])]
        if digits[0] == 0:
            return 3 + sum(digits)
        else:
            return sum(digits)
        
def sum_digits(bits):
    digits = [int(x) for x in list(bits)]
    return sum(digits)

def dice_coeff(bits1, bits2):
    bits = list(zip(bits1, bits2))
    commons = sum([x[0] == x[1] for x in bits])
    return (2 * commons) / (sum_digits(bits1) + sum_digits(bits2))

In [172]:
print(assign_block('111'))
print(assign_block('000'))
print(assign_block('001'))
print(assign_block('011'))
print(assign_block('101'))

3
3
4
5
2


In [173]:
dice_coeff("1001", "1111")

0.6666666666666666

### Composite Bloom filter

self join code for reference:

In [48]:
df = sparkContext.parallelize(
    [("a", 1,2),("a",1,4),("b",5,6),("b",10,2),("c",1,1)]
  ).toDF()#"id","x","y" )

In [56]:
df.collect()

[Row(_1='a', _2=1, _3=2),
 Row(_1='a', _2=1, _3=4),
 Row(_1='b', _2=5, _3=6),
 Row(_1='b', _2=10, _3=2),
 Row(_1='c', _2=1, _3=1)]

In [55]:
left = df.alias("left")
right = df.alias("right")

left.join(right,"_1").select(df._1,
      (left._2-right._2).alias("dx"),
      (left._3-right._3).alias("dy")).collect()

                                                                                

[Row(_1='c', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0)]