In [1]:
import os

# Project directory for this notebook. A raw string is required here:
# "\p" is not a valid escape sequence, so the un-prefixed literal emits a
# DeprecationWarning (SyntaxWarning on Python 3.12+) and may become an error.
PROJECT_DIR = r"H:\pyspark_advanced-coding_interview"

# Switch the kernel's working directory, then display it to confirm the change.
os.chdir(PROJECT_DIR)
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Find the closest matching string (best match)

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("ClosestMatchExample")
    .getOrCreate()
)

# Strings to match against each other; ids are assigned 1..N in list order.
# "applle" and "bananna" are deliberate near-duplicates of their neighbours.
fruit_names = [
    "apple",
    "applle",
    "banana",
    "bananna",
    "grape",
    "grapefruit",
    "pineapple",
    "mango",
    "oranges",
    "blueberry",
    "blackberry",
    "pear",
    "peach",
    "apricot",
    "plum",
    "watermelon",
]
data = list(enumerate(fruit_names, start=1))

# Build the DataFrame with explicit column names
df = spark.createDataFrame(data, schema=["id", "fruit_name"])

# Register the same rows as the "fruits" view so Spark SQL can query them
df.createOrReplaceTempView("fruits")

# Display every row without truncating cell values
df.show(truncate=False)


+---+----------+
|id |fruit_name|
+---+----------+
|1  |apple     |
|2  |applle    |
|3  |banana    |
|4  |bananna   |
|5  |grape     |
|6  |grapefruit|
|7  |pineapple |
|8  |mango     |
|9  |oranges   |
|10 |blueberry |
|11 |blackberry|
|12 |pear      |
|13 |peach     |
|14 |apricot   |
|15 |plum      |
|16 |watermelon|
+---+----------+



In [None]:
# Mark df for caching: the cross-join cell that follows reads it twice
# (once as alias "t1" and once as alias "t2").
df.cache()

In [3]:
from pyspark.sql.functions import levenshtein, col

# Pair every fruit with every *other* fruit and score each pair by
# Levenshtein distance, sorted so the nearest candidates come first per fruit.
t1 = df.alias("t1")
t2 = df.alias("t2")
input_name = col("t1.fruit_name")
candidate_name = col("t2.fruit_name")

df_joined = (
    t1.crossJoin(t2)
    .where(input_name != candidate_name)  # drop pairs with identical names
    .withColumn("distance", levenshtein(input_name, candidate_name))
    .orderBy("t1.fruit_name", "distance")
)

# Preview the first 15 scored pairs with friendlier column names
df_joined.select(
    input_name.alias("input_fruit"),
    candidate_name.alias("closest_match"),
    col("distance"),
).show(15, truncate=False)


+-----------+-------------+--------+
|input_fruit|closest_match|distance|
+-----------+-------------+--------+
|apple      |applle       |1       |
|apple      |plum         |4       |
|apple      |pear         |4       |
|apple      |pineapple    |4       |
|apple      |grape        |4       |
|apple      |mango        |5       |
|apple      |apricot      |5       |
|apple      |peach        |5       |
|apple      |banana       |5       |
|apple      |oranges      |6       |
|apple      |bananna      |6       |
|apple      |blueberry    |8       |
|apple      |watermelon   |8       |
|apple      |grapefruit   |8       |
|apple      |blackberry   |8       |
+-----------+-------------+--------+
only showing top 15 rows



In [None]:
# Spark SQL equivalent of the DataFrame cross-join: pair each fruit in the
# "fruits" view with every differently-named fruit and score the pair with
# LEVENSHTEIN.
#
# - No trailing ';' inside the query text: spark.sql() takes a single
#   statement, and older Spark parsers reject the terminator.
# - t2.fruit_name is added as a final sort key so rows that tie on
#   (t1.fruit_name, distance) come back in a reproducible order under LIMIT.
res = spark.sql("""
    SELECT t1.fruit_name AS input_fruit,
           t2.fruit_name AS closest_match,
           LEVENSHTEIN(t1.fruit_name, t2.fruit_name) AS distance
    FROM fruits t1
    JOIN fruits t2
      ON t1.fruit_name != t2.fruit_name
    ORDER BY t1.fruit_name, distance, t2.fruit_name
    LIMIT 15
""")
res.show()