In [0]:
!pip install faker 
!pip install rapidfuzz

In [0]:
# Notebook parameters as (widget name, default value) pairs. Both values are
# text widgets, so they arrive as strings and are converted by the cells that
# read them.
_WIDGET_DEFAULTS = (
    ("ref count", "100000"),
    ("input count", "10000"),
)

for _widget_name, _widget_default in _WIDGET_DEFAULTS:
    dbutils.widgets.text(_widget_name, _widget_default)

In [0]:

# Number of rows to generate for the reference table, read from the
# "ref count" widget (widget values are strings, hence the int() conversion).
row_no = int(dbutils.widgets.get("ref count"))

import datetime
import time
import uuid
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, LongType
import random
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.random import RandomRDDs
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Build the reference table DIM_CUST: `row_no` rows, each holding one
# synthetic person name in the single column `ref_name`.

fake = Faker('en_US')
fake1 = Faker('en_GB')   # To generate phone numbers (not used in this cell)

start = datetime.datetime.now(datetime.timezone.utc)
print("Starting preparation: ", start.strftime("%Y-%m-%d %H:%M:%S.%f"))

# Faker yields a different value on every call, so the UDF is flagged as
# non-deterministic; otherwise Spark is free to assume repeated evaluation
# gives the same result.
# NOTE(review): `fake` is shipped to the workers inside the UDF closure --
# confirm whether every partition replays the same random sequence.
nameUdf = udf(lambda: str(fake.name()), StringType()).asNondeterministic()

# spark.range() only supplies the row skeleton; its `id` column is not kept,
# so the resulting frame has exactly one column, `ref_name`.
data_df = spark.range(row_no).select(nameUdf().alias("ref_name"))

# Overwrite so the cell can be re-run: the default save mode raises an error
# when the table already exists.
data_df.write.format("delta").mode("overwrite").saveAsTable("DIM_CUST")

end = datetime.datetime.now(datetime.timezone.utc)
print("Completed : ", end.strftime("%Y-%m-%d %H:%M:%S.%f"))
print("Time taken : ", (end - start))
spark.sql("SELECT * FROM DIM_CUST").show(5)

In [0]:

# Build the input table v_input_df: `input_no` rows, each holding one
# synthetic person name in the single column `name`.

# Row count comes from the "input count" widget (widget values are strings).
input_no = int(dbutils.widgets.get("input count"))

fake = Faker('en_US')
fake1 = Faker('en_GB')   # To generate phone numbers (not used in this cell)

start = datetime.datetime.now(datetime.timezone.utc)
print("Starting preparation: ", start.strftime("%Y-%m-%d %H:%M:%S.%f"))

# Faker yields a different value on every call, so the UDF is flagged as
# non-deterministic; otherwise Spark is free to assume repeated evaluation
# gives the same result.
# NOTE(review): `fake` is shipped to the workers inside the UDF closure --
# confirm whether every partition replays the same random sequence.
nameUdf = udf(lambda: str(fake.name()), StringType()).asNondeterministic()

# spark.range() only supplies the row skeleton; its `id` column is not kept,
# so the resulting frame has exactly one column, `name`.
data_df = spark.range(input_no).select(nameUdf().alias("name"))

# Overwrite so the cell can be re-run: the default save mode raises an error
# when the table already exists.
data_df.write.format("delta").mode("overwrite").saveAsTable("v_input_df")

end = datetime.datetime.now(datetime.timezone.utc)
print("Completed : ", end.strftime("%Y-%m-%d %H:%M:%S.%f"))
print("Time taken : ", (end - start))
spark.sql("SELECT * FROM v_input_df").show(5)

In [0]:
# PYSPARK CODE
from pyspark.sql.functions import udf,broadcast, col, pandas_udf
import pyspark.sql.functions as F
from pyspark.sql.types import StringType,IntegerType,DoubleType, FloatType
from rapidfuzz import fuzz
import pandas as pd

# Python UDF
def match_string(s1, s2):
    """Score the similarity of two strings with rapidfuzz's token-sort ratio.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        The value of ``fuzz.token_sort_ratio(s1, s2)`` as a Python ``float``.
    """
    # Cast explicitly: the function is registered with FloatType below, and a
    # Python UDF that hands back an int for a FloatType column produces NULL
    # instead of a number.
    return float(fuzz.token_sort_ratio(s1, s2))

# Expose the scorer to Spark SQL under the name `fuzztest1`.
spark.udf.register("fuzztest1", match_string, FloatType())

# Pandas UDF
@pandas_udf(FloatType())
def fuzz_pandas_udf(a: pd.Series, b: pd.Series) -> pd.Series:
    """Batch variant of the scorer: token-sort ratio for each pair of values.

    Args:
        a: Series holding the first string of every pair.
        b: Series holding the second string of every pair.

    Returns:
        A Series with ``fuzz.token_sort_ratio`` applied to corresponding
        elements of ``a`` and ``b``.
    """
    # Series.combine invokes the scorer once per aligned (a, b) element pair.
    pair_scores = a.combine(b, fuzz.token_sort_ratio)
    return pair_scores

# Expose the batch scorer to Spark SQL under the name `fuzztest2`.
spark.udf.register("fuzztest2", fuzz_pandas_udf)

In [0]:
%sql
-- Distinct names from the input table, so each name is scored only once.
CREATE OR REPLACE TEMPORARY VIEW v_distinct_input_names 
AS 
SELECT DISTINCT name AS Input_name FROM v_input_df;

-- Pair every reference name with every distinct input name, with a broadcast
-- hint on the input-name view.
-- Written as an explicit CROSS JOIN: the earlier LEFT JOIN carried no ON clause,
-- which yields the same pairs but also emits NULL-padded rows when the input
-- view is empty.
CREATE OR REPLACE TEMPORARY VIEW v_crossjoin 
AS 
SELECT /*+ BROADCAST(v_distinct_input_names) */
ref_name, Input_name FROM DIM_CUST CROSS JOIN v_distinct_input_names;

-- Spread the candidate pairs over 24 partitions before the UDF is applied.
CREATE OR REPLACE TEMPORARY VIEW vw_fuzzy0 AS
SELECT /*+ REPARTITION(24) */ * FROM v_crossjoin;

-- Score every pair with the registered Python UDF fuzztest1.
CREATE OR REPLACE TEMPORARY VIEW vw_fuzzy1 AS
SELECT ref_name, Input_name, fuzztest1(ref_name, Input_name) AS similarity_score FROM vw_fuzzy0;

-- Keep the best-scoring reference name for each input name.
-- ref_name is a secondary sort key so that ties on similarity_score resolve
-- the same way on every run.
CREATE OR REPLACE TEMPORARY VIEW v_fuzzy2 AS 
 SELECT ref_name, Input_name, similarity_score
 FROM
 (
     SELECT ROW_NUMBER() OVER (PARTITION BY input_name ORDER BY similarity_score DESC, ref_name) as RowNum, *
     FROM vw_fuzzy1
 ) 
 WHERE RowNum = 1;

-- Final column order: input name first, then its best match and score.
CREATE OR REPLACE TEMPORARY VIEW v_fuzzy3 AS
SELECT input_name, ref_name, similarity_score FROM v_fuzzy2;

SELECT * FROM v_fuzzy3 ORDER BY similarity_score DESC;