In [1]:
!ls /usr/lib/jvm

java-1.11.0-openjdk-amd64  java-11-openjdk-amd64


In [2]:
# !apt-get update
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
# !wget -q https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
#  # Unzip file
# !tar -xvzf spark-3.5.0-bin-hadoop3.tgz

In [4]:
#!pip install -q findspark
!pip install pyspark



In [5]:
!pip install python-levenshtein



In [65]:
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import split
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col, concat_ws, substring, row_number, concat
from pyspark.sql.functions import lower, count, when, lit
from pyspark.sql.window import Window
from pyspark.sql.functions import regexp_replace, regexp_extract, collect_list, explode, udf
from pyspark.sql.types import DoubleType, BooleanType
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import array
from collections import Counter
import math
import time
from Levenshtein import distance, ratio
import os

In [7]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
!ls /content/gdrive/MyDrive/'Colab Notebooks'/Dia_Exercise/

 acm_1995_2004.csv		  dblp_1995_2004.csv	     match_blocked_dblp_acm_1995_2004.csv
 acm_resolved.csv		  dblp.txt		     match_dblp_acm_1995_2004.csv
 acm.txt			  dia_assignments.ipynb      Matched_Entities.csv
'Copy of dia_assignments.ipynb'   dia_assignments_v2.ipynb


In [9]:
!ls /content/gdrive/MyDrive/TestMamun.txt

/content/gdrive/MyDrive/TestMamun.txt


In [10]:
file_path_dblp = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/dblp.txt"
file_path_acm = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/acm.txt"

In [11]:
csv_dlp = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/dblp_1995_2004.csv"
csv_acm = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/acm_1995_2004.csv"

In [12]:
file_path = "/content/gdrive/MyDrive/TestMamun_2.txt"

In [13]:
spark = SparkSession.builder.appName("RDDPrintExample").getOrCreate()

# # Define the custom delimiter
# delimiter = "\n\n"

# # Create an RDD using newAPIHadoopFile with TextInputFormat
# rdd = spark.sparkContext.newAPIHadoopFile(
#     file_path,
#     "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
#     "org.apache.hadoop.io.LongWritable",
#     "org.apache.hadoop.io.Text",
#     conf={"textinputformat.record.delimiter": delimiter},
# )


In [14]:
# Define the schema
pub_schema = StructType([
    StructField("title", StringType(), True),
    StructField("authors", StringType(), True),
    StructField("year", StringType(), True),
    StructField("journal", StringType(), True),
    StructField("index", StringType(), True),
])

In [15]:
def create_dataframe_from_file(file_path, pub_schema):
    # Define the custom delimiter
    delimiter = "\n\n"

    # Create an RDD using newAPIHadoopFile with TextInputFormat
    rdd = spark.sparkContext.newAPIHadoopFile(
        file_path,
        "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        "org.apache.hadoop.io.LongWritable",
        "org.apache.hadoop.io.Text",
        conf={"textinputformat.record.delimiter": delimiter},
    )

    # Filter and map the RDD to extract relevant fields
    data_rdd = rdd.filter(lambda x: not x[1].startswith('#%')).map(lambda x: tuple(
        (
            next((field[2:] for field in x[1].splitlines() if field.startswith('#*')), None),  # Paper Title
            next((field[2:] for field in x[1].splitlines() if field.startswith('#@')), None),  # Authors
            next((field[2:] for field in x[1].splitlines() if field.startswith('#t')), None),  # Year
            next((field[2:] for field in x[1].splitlines() if field.startswith('#c')), None),  # Publication Venue
            next((field[6:] for field in x[1].splitlines() if field.startswith('#index')), None)  # Index ID
        )
    ))

    # Create DataFrame using the defined schema
    df = spark.createDataFrame(data_rdd, schema=pub_schema)

    return df

# df = create_dataframe_from_file(file_path, pub_schema)
# df.show(truncate=False)


In [16]:
def filter_and_clean_df(df):
    # Filter publications between 1995 and 2004 in VLDB and SIGMOD venues
    filtered_df = df.filter(
        (col("year").cast("int").between(1995, 2004)) &
        (col("journal").rlike("(?i)SIGMOD|VLDB"))
    )
    # # Clean the journal column
    # cleaned_df = filtered_df.withColumn("journal",
    #                                     regexp_replace(
    #                                         regexp_replace("journal", "(?i).*\\bVLDB\\b.*", "VLDB"),
    #                                         "(?i).*\\bSIGMOD\\b.*", "SIGMOD"))

    return filtered_df

In [17]:
def remove_null_rows(dataframe):
    return dataframe.dropna()

In [18]:
def remove_special_chars(df):
    # Remove special characters and replace with single space
    pattern = r'[^\wÄÖÜäöü,]|(\s{2,})'
    df = df.withColumn("title", regexp_replace("title", pattern, ' ')).withColumn("authors", regexp_replace("authors", pattern, ' '))
    return df


In [19]:
def write_to_csv(dataframe, path):
  dataframe.repartition(1).write.format('csv').option("header", "true").option("quote", "\"").option("escape", "\"").save(path, mode='overwrite')

In [57]:
def remove_inter_duplicates(df):
    # Convert titles, authors, and year to lowercase for case-insensitive comparison
    df = df.withColumn("lower_title", lower(col("title"))) \
           .withColumn("lower_authors", lower(col("authors")))

    # Find duplicates using window function
    windowSpec = Window.partitionBy("lower_title", "lower_authors", "year", "index")
    df_with_count = df.withColumn("count", count("*").over(windowSpec))

    # # Filter only rows with count > 1, indicating duplicates
    # duplicates_df = df_with_count.filter(col("count") > 1).orderBy(col("count").desc()) \
    #                              .drop("lower_title", "lower_authors")

    # Remove duplicates from original DataFrame
    df_unique = df_with_count.filter(col("count") == 1).drop("count", "lower_title", "lower_authors")

    return df_unique

In [21]:
def pipeline(txt_file_path, *funcs):
    df = create_dataframe_from_file(txt_file_path, pub_schema)
    for func in funcs:
        df = func(df)
    return df
df_acm = pipeline(file_path_acm, remove_null_rows, filter_and_clean_df, remove_special_chars, remove_inter_duplicates)
df_dlp = pipeline(file_path_dblp, remove_null_rows, filter_and_clean_df, remove_special_chars, remove_inter_duplicates)

In [22]:
#write_to_csv(df_acm, csv_acm)
#write_to_csv(df_dlp, csv_dlp)

In [23]:
def read_csv_with_schema(spark, file_path, schema):
    """
    Read records from a CSV file using a specified schema.

    Args:
    - spark: SparkSession object
    - file_path: path to the CSV file
    - schema: schema to be applied to the DataFrame

    Returns:
    - DataFrame containing the records from the CSV file with the specified schema
    """
    # Read CSV file with schema
    df = spark.read.csv(file_path, schema=schema, header=True, encoding="UTF-8")
    return df
df_acm = read_csv_with_schema(spark, csv_acm, pub_schema)
df_dlp = read_csv_with_schema(spark, csv_dlp, pub_schema)

In [24]:
ground_truth_duplicates_path = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/match_dblp_acm_1995_2004.csv"
blocked_duplicates_path = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/Matched_Entities.csv"
resolved_data_path = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/Resolved_Entities.csv"

In [59]:
def measure_execution_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time of {func.__name__}: {execution_time} seconds")
        return result
    return wrapper

In [26]:
@measure_execution_time
def matching_without_blocking(df1, df2, similarity_func, filter_func, output_path = None, threshold=0.5):
  sim_udf = udf(similarity_func, DoubleType())
  filter_udf = udf(filter_func, BooleanType())

  duplicates = find_duplicates_for_ground_truth_js(df1, df2, sim_udf, filter_udf, threshold)

  if output_path is not None:
    write_to_csv(duplicates, output_path)
  return duplicates

In [27]:
def jaccard_similarity_case_insensitive(str1, str2):
    if str1 is None or str2 is None:
      return 0.0
    set1 = set(str1.lower().split())
    set2 = set(str2.lower().split())

    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    similarity = intersection / union if union != 0 else 0
    return similarity

In [28]:
def trigram_similarity(s1, s2):
    # Function to generate trigrams from a string
    def generate_trigrams(string):
        return [string[i:i+3] for i in range(len(string) - 2)]

    # Generate trigrams for both strings
    trigrams_s1 = generate_trigrams(s1.lower())
    trigrams_s2 = generate_trigrams(s2.lower())

    # Calculate Jaccard similarity coefficient
    intersection = len(set(trigrams_s1) & set(trigrams_s2))
    union = len(set(trigrams_s1) | set(trigrams_s2))
    similarity = intersection / union if union != 0 else 0

    return similarity

In [60]:
def cosine_similarity(s1, s2):
    # Function to preprocess and tokenize strings
    def preprocess(text):
        return text.lower().split()

    # Tokenize the strings
    tokens1 = preprocess(s1)
    tokens2 = preprocess(s2)

    # Count the occurrences of each token in both strings
    counter1 = Counter(tokens1)
    counter2 = Counter(tokens2)

    # Compute the dot product of the token counts
    dot_product = sum(counter1[token] * counter2[token] for token in counter1.keys() & counter2.keys())

    # Compute the Euclidean norm of the token counts
    norm1 = math.sqrt(sum(counter1[token] ** 2 for token in counter1.keys()))
    norm2 = math.sqrt(sum(counter2[token] ** 2 for token in counter2.keys()))

    # Compute the cosine similarity
    similarity = dot_product / (norm1 * norm2) if (norm1 * norm2) != 0 else 0

    return similarity

In [56]:
def filter_duplicates(title_similarity, author_similarity, threshold, sameyear, threshold2 = 0.75):
    if (title_similarity == 1 and author_similarity >= threshold and sameyear == 1) or \
       (title_similarity >= threshold and author_similarity == 1 and sameyear == 1) or \
       (title_similarity >= threshold2 and author_similarity >= threshold2):  ### Not same year pub shoud have
       #atleast 75 percent similarity between years and authors
        return True
    else:
        return False

In [31]:
# Register the UDF to calculate Jaccard similarity
#jaccard_similarity_udf = udf(jaccard_similarity_case_insensitive, DoubleType())

def find_duplicates_for_ground_truth_js(df1, df2,jaccard_similarity_udf, filter_duplicates_udf, threshold=0.5):
    # Select only the "title" and "authors" columns from df1 and alias them
    df1_subset = df1.select(col("title").alias("title_1"), col("authors").alias("authors_1"), col("year").alias("year_1"))

    # Select only the "title" and "authors" columns from df2 and alias them
    df2_subset = df2.select(col("title").alias("title_2"), col("authors").alias("authors_2"), col("year").alias("year_2"))

    # Join the subsets of df1 and df2 containing "title" and "authors" columns
    joined_df = df1_subset.crossJoin(df2_subset)

    # Calculate the similarity score using Jaccard similarity on "title" and "authors"
    similarity_df = joined_df.withColumn("Title_Similarity", jaccard_similarity_udf(joined_df["title_1"], joined_df["title_2"])) \
                             .withColumn("Authors_Similarity", jaccard_similarity_udf(joined_df["authors_1"], joined_df["authors_2"]))\
                             .withColumn("Years_Similarity", when(joined_df["year_1"] == joined_df["year_2"], lit(1)).otherwise(lit(0)))

    # Filter the DataFrame to keep only the pairs with similarity scores meeting the refined criteria
    duplicates_df = similarity_df.filter(filter_duplicates_udf(similarity_df["Title_Similarity"],
                                                          similarity_df["Authors_Similarity"],
                                                          lit(threshold), similarity_df["Years_Similarity"]))

    # Select and rename the relevant columns
    duplicates_df = duplicates_df.select(duplicates_df["title_1"].alias("Title1"),
                                         duplicates_df["title_2"].alias("Title2"),
                                         duplicates_df["authors_1"].alias("Authors1"),
                                         duplicates_df["authors_2"].alias("Authors2"),
                                         duplicates_df["Title_Similarity"],
                                         duplicates_df["Authors_Similarity"],
                                         duplicates_df["year_1"].alias("Year1"),
                                         duplicates_df["year_2"].alias("Year2"))


    return duplicates_df

In [32]:
def generate_blocking_key(title, authors, year):
    # Split the authors' names by commas
    author_list = authors.split(',')

    # Sort author names by last name in ascending order (case-insensitive)
    sorted_author_list = sorted(author_list, key=lambda x: x.split()[-1].lower())

    print(sorted_author_list)
    # Extract the last name of the first author
    first_author_last_name = sorted_author_list[0].split()[-1]

    # Take the first letter of the last name for other authors
    other_authors_initials = ''.join([author.split()[-1][0].capitalize() for author in sorted_author_list[1:]])

    # Extract the last two digits of the year
    year_last_two_digits = str(year)[2:]

    blocking_key = f"{first_author_last_name}{other_authors_initials}{year_last_two_digits}"
    return blocking_key

In [84]:
def find_duplicates_with_blocking(df1, df2, jaccard_similarity_udf, blocking_key_func, threshold=0.5):

    # Alias all columns in each DataFrame to distinguish them after joining
    df1 = df1.select([col(col_name).alias(f"{col_name}_df1") for col_name in df1.columns])
    df2 = df2.select([col(col_name).alias(f"{col_name}_df2") for col_name in df2.columns])

    df2 = df2.withColumn("id_df2", monotonically_increasing_id())

    # Generate blocking keys for each DataFrame
    df1_blocked = df1.withColumn("blocking_key_df1", blocking_key_func(col("title_df1"), col("authors_df1"), col("year_df1")))
    df2_blocked = df2.withColumn("blocking_key_df2", blocking_key_func(col("title_df2"), col("authors_df2"), col("year_df2")))

    # Join DataFrames on blocking keys
    joined_df = df1_blocked.alias("df1").join(df2_blocked.alias("df2"), col("df1.blocking_key_df1") == col("df2.blocking_key_df2"), "inner")

    # Calculate similarity score using Jaccard similarity on "title" and "authors" only for records with the same blocking key
    similarity_df = joined_df.withColumn("Title_Similarity", when(col("df1.blocking_key_df1") == col("df2.blocking_key_df2"),
                                                                   jaccard_similarity_udf(col("df1.title_df1"), col("df2.title_df2"))).otherwise(lit(0))) \
                             .withColumn("Authors_Similarity", when(col("df1.blocking_key_df1") == col("df2.blocking_key_df2"),
                                                                     jaccard_similarity_udf(col("df1.authors_df1"), col("df2.authors_df2"))).otherwise(lit(0)))

    # Filter the DataFrame to keep only the pairs with similarity scores greater than or equal to the specified threshold
    duplicates_df = similarity_df.filter((similarity_df["Title_Similarity"] >= threshold) &
                                         (similarity_df["Authors_Similarity"] >= threshold))
    matched_df = duplicates_df.drop(*["Title_Similarity", "Authors_Similarity", "title_df2", "authors_df2", "year_df2", "journal_df2", "index_df2"])
    return matched_df

In [34]:
@measure_execution_time
def matching_with_blocking(df1, df2, similarity_func, blocking_func, output_path = None, threshold=0.5):
  sim_udf = udf(similarity_func, DoubleType())
  blocking_udf = udf(blocking_func, StringType())

  duplicates = find_duplicates_with_blocking(df1, df2, sim_udf, blocking_udf, threshold)

  if output_path is not None:
    write_to_csv(duplicates, output_path)
  return duplicates

In [48]:
dup_naive = matching_without_blocking(df_acm, df_dlp, cosine_similarity, filter_duplicates, threshold=0.65).cache()

Execution time of matching_without_blocking: 0.20383858680725098 seconds


In [49]:
dup_naive.show()

+--------------------+--------------------+--------------------+--------------------+------------------+------------------+-----+-----+
|              Title1|              Title2|            Authors1|            Authors2|  Title_Similarity|Authors_Similarity|Year1|Year2|
+--------------------+--------------------+--------------------+--------------------+------------------+------------------+-----+-----+
|Honey, I shrunk t...|Honey, I Shrunk t...|    Praveen Seshadri|    Praveen Seshadri|0.8432740427115678|0.9999999999999998| 1999| 1999|
|lgr  DB  an ODMG ...|lambda DB  An ODM...|Leonidas Fegaras,...|Leonidas Fegaras,...|0.8749999999999998|0.9999999999999998| 2000| 2000|
|One size fits all...|One Size Fits All...|     Clark D  French|     Clark D  French|0.9090909090909091|1.0000000000000002| 1995| 1995|
|Share your data, ...|Share your data, ...|Irini Fundulaki, ...|Irini Fundulaki, ...|0.9999999999999998|               1.0| 2004| 2004|
|1 Safe Algorithms...|1 Safe Algorithms...|Rune 

In [50]:
matched_entities_with_blocking = matching_with_blocking(df_acm, df_dlp,cosine_similarity, generate_blocking_key, threshold=0.65).cache()

Execution time of matching_with_blocking: 0.2215595245361328 seconds


In [51]:
matched_entities_with_blocking.show()

+--------------------+--------------------+--------+--------------------+--------------------+----------------+--------------------+--------------------+--------+-----------------+--------------------+------+----------------+------------------+------------------+
|           title_df1|         authors_df1|year_df1|         journal_df1|           index_df1|blocking_key_df1|           title_df2|         authors_df2|year_df2|      journal_df2|           index_df2|id_df2|blocking_key_df2|  Title_Similarity|Authors_Similarity|
+--------------------+--------------------+--------+--------------------+--------------------+----------------+--------------------+--------------------+--------+-----------------+--------------------+------+----------------+------------------+------------------+
|Honey, I shrunk t...|    Praveen Seshadri|    1999|SIGMOD '99 Procee...|539087f320f70186a...|      Seshadri99|Honey, I Shrunk t...|    Praveen Seshadri|    1999|SIGMOD Conference|53e9990db7602d970...|     0|

In [63]:
#write_to_csv(matched_entities_with_blocking, blocked_duplicates_path)

In [52]:
generate_blocking_key_udf = udf(generate_blocking_key)
tp_df_dup = dup_naive.withColumn("blocking_key_df1", generate_blocking_key_udf(col("Title1"), col("Authors1"), col("Year1")))

In [53]:
#Calculate True Positives (TP)
TP = tp_df_dup.join(matched_entities_with_blocking, on="blocking_key_df1", how='inner').count()

# Calculate False Positives (FP)
FP = TP - matched_entities_with_blocking.count()

# Calculate False Negatives (FN)
# FN is the number of duplicate pairs identified by the baseline method but not by the blocking method,
# which can be calculated as the difference between the total number of duplicate pairs identified by the baseline method and TP
FN = TP- tp_df_dup.count()

In [54]:
TP, FP, FN

(1664, 166, 57)

In [55]:
precision = TP / (TP + FP)

recall = TP / (TP + FN)
f_score = 2 * (precision * recall) / (precision + recall)
print("Precision:", precision)
print("Recall:", recall)
print("F-score:", f_score)

Precision: 0.9092896174863389
Recall: 0.9668797210923882
F-score: 0.9372007885102789


In [76]:
def add_alphabets_to_blocking_keys(df, partitionkey):
    # Define a window specification to partition by blocking key
    window_spec = Window.partitionBy(partitionkey).orderBy(partitionkey)

    # Assign unique row numbers within each partition
    df = df.withColumn("row_num", row_number().over(window_spec))

    # Generate alphabets for unique keys
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # Convert alphabet string to an array of characters
    alphabet_array = array([lit(char) for char in alphabet])


    # Check if the new blocking key already exists and assign a unique alphabet
    df = df.withColumn("alphabet", when(col("row_num") == 1, "").otherwise(
        alphabet_array.getItem(col("row_num"))
    ))

    # # Concatenate blocking key with the alphabet to create new blocking keys
    df = df.withColumn("new_blocking_key", concat(col(partitionkey), col("alphabet"))).drop(partitionkey)

    return df

In [None]:
def add_unique_key_to_df(acm, dblp, blocking_key_func, make_unique_blocking_key_func):
    blocking_key_func_udf = udf(blocking_key_func, StringType())
    dblp = dblp.withColumn("idDBLP", monotonically_increasing_id())

    acm = acm.withColumn("idACM_notunique", blocking_key_func_udf(col("title"), col("authors"), col("year")))
    acm = add_alphabets_to_blocking_keys(acm, "idACM_notunique")

    acm = acm.withColumnRenamed("idACM_notunique", "idACM")
    return acm, dblp
df_acm, df_dlp = add_unique_key_to_df(df_acm, df_dlp, generate_blocking_key, add_alphabets_to_blocking_keys)

In [85]:
matches = matching_with_blocking(df_acm, df_dlp,jaccard_similarity_case_insensitive, generate_blocking_key).cache()

Execution time of matching_with_blocking: 0.28238892555236816 seconds


In [87]:
matches.show()

+--------------------+--------------------+--------+--------------------+--------------------+-----------+------------+--------------------+----------------+----------+------+----------------+
|           title_df1|         authors_df1|year_df1|         journal_df1|           index_df1|row_num_df1|alphabet_df1|new_blocking_key_df1|blocking_key_df1|idDBLP_df2|id_df2|blocking_key_df2|
+--------------------+--------------------+--------+--------------------+--------------------+-----------+------------+--------------------+----------------+----------+------+----------------+
|Visual COKO  a de...|Daniel J  Abadi, ...|    2002|Proceedings of th...|5390882d20f70186a...|          1|            |            AbadiC02|        AbadiC02|      1992|  1992|        AbadiC02|
|Aurora  a new mod...|Daniel J  Abadi, ...|    2003|The VLDB Journal ...|5390958a20f70186a...|          1|            |     AbadiCCCELSTZ03| AbadiCCCELSTZ03|       246|   246| AbadiCCCELSTZ03|
|Hardware accelera...|Nagender Band

In [92]:
def resolve(matches_df, acm, dlp):
    matches = matches_df.select(col("new_blocking_key_df1"), col("idDBLP_df2"))
    ## TODO
    ## How to only get single instance from each dataset using this matches
    return matches
df_test = resolve(matches, df_acm, df_dlp)

In [44]:
def resolve_duplicates(matched_entities_with_blocking, df_acm, df_dlp):
    # Concatenate resolved entities with matched_entities_to_keep
    final_acm = df_acm.drop("index").union(
        matched_entities_with_blocking.select(
        col("title_df1").alias("title"),
        col("authors_df1").alias("authors"),
        col("year_df1").alias("year"),
        col("journal_df1").alias("journal")
        ))
    final_dlp = df_dlp.drop("index").union(
        matched_entities_with_blocking.select(
        col("title_df2").alias("title"),
        col("authors_df2").alias("authors"),
        col("year_df2").alias("year"),
        col("journal_df2").alias("journal")
        ))
    final_acm.cache()
    final_dlp.cache()
    return final_acm, final_dlp
resolved_acm, resolved_dlp = resolve_duplicates(matched_entities_with_blocking, df_acm, df_dlp)

In [46]:
path_resolved_acm = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/acm_resolved.csv"
path_resolved_dlp = "/content/gdrive/MyDrive/Colab Notebooks/Dia_Exercise/dlp.resolved.csv"

In [93]:
#write_to_csv(resolved_acm, path_resolved_acm)

In [94]:
def levenshtein_ratio(s1, s2):
    return ratio(s1.lower(), s2.lower())

In [95]:
levenshtein_ratio("Tutorial Designing an Ultra Highly Available dbms", "An Ultra Highly Available DBMS")

0.759493670886076

In [97]:
# Register the DataFrame as a temporary view
matches.createOrReplaceTempView("joined_data")

# Run SQL query to count occurrences of each blocking key
blocking_key_counts = spark.sql("""
    SELECT new_blocking_key_df1, COUNT(*) AS count_new_blocking_key
    FROM joined_data
    GROUP BY new_blocking_key_df1
    ORDER BY count_new_blocking_key DESC
""")

# Show the result
blocking_key_counts.show()

+--------------------+----------------------+
|new_blocking_key_df1|count_new_blocking_key|
+--------------------+----------------------+
|          BonnetS02c|                     3|
|           BonnetS02|                     3|
|           ndezSXY03|                     2|
|          ndezSXY03c|                     2|
|           Aberer03c|                     2|
|         Bhashyam96c|                     2|
|            Aberer03|                     2|
|          Bhashyam96|                     2|
|       BouganimNPW03|                     1|
|           BrodieC99|                     1|
|       ChaudhuriDL04|                     1|
|        FlorescuKL97|                     1|
|      GunopulosMVV04|                     1|
|           KoudasS96|                     1|
|          PacittiS00|                     1|
|           Wiegand02|                     1|
|    BlausteinLMRST95|                     1|
|         DeWittGJW03|                     1|
|         FreytagLN99|            