In [34]:
import pandas as pd
import json

In [35]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType, DateType, BinaryType, BooleanType, ByteType, MapType
from pyspark.sql.functions import expr, lit
from pyspark.sql.functions import udf
from pyspark.sql import functions as sf

In [36]:
#from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH

In [37]:
spark = SparkSession.builder.appName('Entity-Resolution-Matching').getOrCreate()
sparkContext=spark.sparkContext

## Master data

In [38]:
df = spark.read.option("header",True).csv("hashed_output")

## Candidates dataset

In [39]:
#trivial matching on the same data
df2 = spark.read.option("header", True).csv("hashed_output")

In [40]:
# df.take(3)

[Row(person_id='1000', service_date='2020-10-20', first_name='vicky', middle_name='Kathy', last_name='Singh', dob='1984-04-01', ssn='895-37-4654', name_phonetic='FK', ssn_clean='895-37-4654', name_phonetic_hash='[B@34d548a', ssn_clean_hash='[B@43a4256f'),
 Row(person_id='1001', service_date='2021-09-06', first_name='Gary', middle_name='Stacey', last_name='Watson', dob='1984-10-22', ssn='220-72-9331', name_phonetic='KR', ssn_clean='220-72-9331', name_phonetic_hash='[B@2166b6b2', ssn_clean_hash='[B@4d63d36b'),
 Row(person_id='1002', service_date='2020-11-07', first_name='Anna', middle_name='Joseph', last_name='Camacho', dob='1996-04-11', ssn='404-44-6841', name_phonetic='AN', ssn_clean='404-44-6841', name_phonetic_hash='[B@48e64f10', ssn_clean_hash='[B@495c4e1f')]

## Parameter Alignment 

In [41]:
# Here, import set of unique identifiers and fuzzy identifiers that are agreed upon in the matching
f = open("param_config.json")
fields = json.load(f)
# fields

In [31]:
matching_fields = [x['field_name']+'_hash' for x in fields]

In [32]:
matching_fields

['name_phonetic_hash', 'ssn_clean_hash']

### Match new records df2 to existing records df

In [17]:
model = Pipeline(stages=[
    RegexTokenizer(
        pattern="", inputCol="text", outputCol="tokens", minTokenLength=1
    ),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh")
]).fit(df)

# db_hashed = model.transform(db)
# query_hashed = model.transform(query)

# model.stages[-1].approxSimilarityJoin(db_hashed, query_hashed, 0.75).show()

In [23]:
# brp = MinHashLSH(inputCol='name_phonetic_hash', outputCol="lsh", numHashTables=3)

# model = brp.fit(df_feat)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(df_feat).show(2)

The hashed dataset where hashed values are stored in the column 'hashes':
+---------+------------+----------+-----------+---------+----------+-----------+-------------+---------+------------------+--------------+-----------+--------------------+--------------------+--------------------+--------------------+
|person_id|service_date|first_name|middle_name|last_name|       dob|        ssn|name_phonetic|ssn_clean|name_phonetic_hash|ssn_clean_hash|   features|              tokens|              ngrams|name_phonetic_hash_2|                 lsh|
+---------+------------+----------+-----------+---------+----------+-----------+-------------+---------+------------------+--------------+-----------+--------------------+--------------------+--------------------+--------------------+
|     1000|  2020-10-20|     vicky|      Kathy|    Singh|1984-04-01|895-37-4654|           10|       10|        [B@34d548a|   [B@43a4256f|[10.0,10.0]|[[, b, @, 3, 4, d...|[[ b @, b @ 3, @ ...|(262144,[23995,44...|[[4.9001

In [26]:
db_hashed = model.transform(df_feat)

In [27]:
model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.75).show()

2022-04-22 10:29:58 WARN  Utils:66 - Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


                                                                                

+--------------------+--------------------+------------------+
|            datasetA|            datasetB|           distCol|
+--------------------+--------------------+------------------+
|[1019, 2021-01-27...|[1019, 2021-01-27...|               0.0|
|[1009, 2021-01-06...|[1009, 2021-01-06...|               0.0|
|[1003, 2021-10-24...|[1003, 2021-10-24...|               0.0|
|[1000, 2020-10-20...|[1000, 2020-10-20...|               0.0|
|[1015, 2021-02-17...|[1015, 2021-02-17...|               0.0|
|[1024, 2021-04-21...|[1024, 2021-04-21...|               0.0|
|[1012, 2021-07-07...|[1012, 2021-07-07...|               0.0|
|[1007, 2021-10-21...|[1007, 2021-10-21...|               0.0|
|[1006, 2020-11-07...|[1006, 2020-11-07...|               0.0|
|[1022, 2021-08-17...|[1022, 2021-08-17...|               0.0|
|[1001, 2021-09-06...|[1001, 2021-09-06...|               0.0|
|[1005, 2021-08-10...|[1005, 2021-08-10...|               0.0|
|[1010, 2020-09-20...|[1010, 2020-09-20...|            

In [24]:
# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
print("Approximately joining dfA and dfB on Jaccard coefficient distance smaller than 1.5:")
model.approxSimilarityJoin(df_feat, df_feat, 1.5, distCol="Jaccard")\
    .select(col("datasetA.person_id").alias("idA"),
            col("datasetB.person_id").alias("idA"),
            col("Jaccard")).show()


Approximately joining dfA and dfB on Jaccard coefficient distance smaller than 1.5:


AttributeError: 'PipelineModel' object has no attribute 'approxSimilarityJoin'

In [31]:
# Now that we have our hashes, we need to decide on a similarity distance and a approximate matching algorithm

In [None]:
## Option 1: home made dice coefficients

In [43]:
def sum_digits(bits):
    digits = [int(x) for x in list(bits)]
    return sum(digits)

def dice_coeff(bits1, bits2):
    return '10'
#     bits = list(zip(bits1, bits2))
#     commons = sum([x[0] == x[1] for x in bits])
#     return (2 * commons) / (sum_digits(bits1) + sum_digits(bits2))

In [44]:
udf_dice = udf(dice_coeff, StringType())

In [34]:
left = df1.alias("left")
right = df2.alias("right")

In [50]:
left.crossJoin(right).select(df1.person_id,
      udf_dice(left.bloom_1, right.bloom_1).alias("dx")).take(5)

[Row(person_id='1000', dx='10'),
 Row(person_id='1000', dx='10'),
 Row(person_id='1000', dx='10'),
 Row(person_id='1000', dx='10'),
 Row(person_id='1000', dx='10')]

self join code for reference:

In [5]:
df = sparkContext.parallelize(
    [("a", 1,2),("a",1,4),("b",5,6),("b",10,2),("c",1,1)]
  ).toDF()#"id","x","y" )

                                                                                

In [6]:
df.collect()

                                                                                

[Row(_1='a', _2=1, _3=2),
 Row(_1='a', _2=1, _3=4),
 Row(_1='b', _2=5, _3=6),
 Row(_1='b', _2=10, _3=2),
 Row(_1='c', _2=1, _3=1)]

In [7]:
left = df.alias("left")
right = df.alias("right")

left.join(right,"_1").select(df._1,
      (left._2-right._2).alias("dx"),
      (left._3-right._3).alias("dy")).collect()

                                                                                

[Row(_1='c', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='b', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0),
 Row(_1='a', dx=0, dy=0)]

## Approx LSH

In [59]:
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

dataA = [(0, Vectors.dense([1.0, 1.0]),),
         (1, Vectors.dense([1.0, -1.0]),),
         (2, Vectors.dense([-1.0, -1.0]),),
         (3, Vectors.dense([-1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(4, Vectors.dense([1.0, 0.0]),),
         (5, Vectors.dense([-1.0, 0.0]),),
         (6, Vectors.dense([0.0, 1.0]),),
         (7, Vectors.dense([0.0, -1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.dense([1.0, 0.0])



In [65]:
brp = MinHashLSH(inputCol="features", outputCol="hashes",
                                  numHashTables=3)
model = brp.fit(dfA)

brp.fit(dfB)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()



The hashed dataset where hashed values are stored in the column 'hashes':
+---+-----------+--------------------+
| id|   features|              hashes|
+---+-----------+--------------------+
|  0|  [1.0,1.0]|[[6.70322104E8], ...|
|  1| [1.0,-1.0]|[[6.70322104E8], ...|
|  2|[-1.0,-1.0]|[[6.70322104E8], ...|
|  3| [-1.0,1.0]|[[6.70322104E8], ...|
+---+-----------+--------------------+



In [66]:
model.transform(dfB).show()

+---+----------+--------------------+
| id|  features|              hashes|
+---+----------+--------------------+
|  4| [1.0,0.0]|[[6.70322104E8], ...|
|  5|[-1.0,0.0]|[[6.70322104E8], ...|
|  6| [0.0,1.0]|[[1.055109162E9],...|
|  7|[0.0,-1.0]|[[1.055109162E9],...|
+---+----------+--------------------+



In [63]:
# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
print("Approximately joining dfA and dfB on Jaccard coefficient distance smaller than 1.5:")
model.approxSimilarityJoin(dfA, dfB, 1.5, distCol="Jaccard")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("Jaccard")).show()



Approximately joining dfA and dfB on Jaccard coefficient distance smaller than 1.5:


                                                                                

+---+---+-------+
|idA|idB|Jaccard|
+---+---+-------+
|  2|  6|    0.5|
|  3|  6|    0.5|
|  0|  5|    0.5|
|  1|  7|    0.5|
|  0|  4|    0.5|
|  2|  7|    0.5|
|  1|  5|    0.5|
|  2|  5|    0.5|
|  0|  7|    0.5|
|  1|  4|    0.5|
|  0|  6|    0.5|
|  2|  4|    0.5|
|  3|  7|    0.5|
|  1|  6|    0.5|
|  3|  4|    0.5|
|  3|  5|    0.5|
+---+---+-------+



In [68]:
# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxNearestNeighbors(transformedA, key, 2)`
print("Approximately searching dfA for 2 nearest neighbors of the key:")
candidates = model.approxNearestNeighbors(dfA, key, 2)
candidates.filter(candidates.distCol > 0.4).show()

Approximately searching dfA for 2 nearest neighbors of the key:
+---+----------+--------------------+-------+
| id|  features|              hashes|distCol|
+---+----------+--------------------+-------+
|  0| [1.0,1.0]|[[6.70322104E8], ...|    0.5|
|  1|[1.0,-1.0]|[[6.70322104E8], ...|    0.5|
+---+----------+--------------------+-------+

