In [None]:
import os
import sys
import pandas
import pyspark

# Point both the Spark workers and the Spark driver at the Python interpreter
# that is running this notebook, so they all use the same environment.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used throughout this notebook.
ss = SparkSession.builder.appName('folder_read').getOrCreate()

# Read every *.csv file in the donation folder into one dataframe.
# Each file has a header row, "?" marks a missing value, and column
# types are inferred from the data.
df = (
    ss.read.format("csv")
    .options(
        header="true",
        nullValue="?",
        inferSchema="true",
        pathGlobFilter="*.csv",
    )
    .load("/home/lplab/Desktop/donation/")
)
df.show()

# Column names and inferred types.
df.printSchema()

# Total number of rows that were read.
print(f"Dataframe contains {df.count()} entries")




+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+
|53719|60579|           1|        null|           1|        null|      1|     1|     1|     1|      1|    true|
|58967|58973|           1|        null|           1|        null|      1|     1|     1|     1|      1|    true|
| 1499|23331|           1|        null|           1|        null|      1|     1|     1|     1|      1|    true|
|18441|36183|           1|           1|           1|        null|      1|     1|     1|     1|      1|    true|
| 8902|11508|           1|        null|           1|        null|      1|     1|     1|     1|      1|    true|
|17704|21348|           1|        null|           1|        null|      1|     1|     1|     1|      1|  

In [None]:
# Task: develop a PySpark script to clean and preprocess data before performing
# entity resolution. Include steps like tokenization and normalization.

# Cleaning data: drop null values, retain columns with a high amount of data,
# normalize integer values.
# First step: remove the id_1 and id_2 columns from the dataframe.
df = df.drop('id_1', 'id_2')

# Summary statistics of the dataframe after those columns were dropped.
summary = df.describe()

# Show the statistics for cmp_fname_c1 and cmp_fname_c2 only.
summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()


+-------+--------------------+------------------+
|summary|        cmp_fname_c1|      cmp_fname_c2|
+-------+--------------------+------------------+
|  count|             5748126|            103699|
|   mean|  0.7129023464241682|0.9000089989364239|
| stddev|  0.3887584395082915|0.2713306768152377|
|    min|                   0|                 0|
|    max|2.68694413843136e-05|                 1|
+-------+--------------------+------------------+



In [None]:
# Split the dataframe on the is_match column.
matches = df.filter("is_match = true")    # rows where is_match is true
misses = df.filter("is_match = false")    # rows where is_match is false

# Summary statistics for each of the two subsets.
matches_summary = matches.describe()
misses_summary = misses.describe()

# Display both summaries, matches first.
matches_summary.show()
misses_summary.show()

+-------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+
|summary|       cmp_fname_c1|       cmp_fname_c2|       cmp_lname_c1|       cmp_lname_c2|            cmp_sex|              cmp_bd|              cmp_bm|              cmp_by|            cmp_plz|
+-------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+
|  count|              20922|               1333|              20931|                475|              20931|               20925|               20925|               20925|              20902|
|   mean| 0.9973163859635039| 0.9898900320318174| 0.9970152595958819| 0.9693701678438521|  0.987291577086618|  0.9970848267622461|  0.9979450418160095|  0.9961290322580645| 0.9584250310975027|
| stddev|0.03650667584833679|0.0825

In [None]:
#obtain modified summaries
from pyspark.sql.types import DoubleType

def modify_sum(summary):
    """Transpose a summary dataframe so that each field becomes one row.

    The input is converted to pandas, indexed by its 'summary' column and
    transposed, so the former column names end up in a new 'field' column
    and the former 'summary' values become the column names. The result is
    turned back into a Spark dataframe (using the notebook-level session
    `ss`) with every column except 'field' cast to double.

    Prints the intermediate pandas frame and the schema of the result.

    Args:
        summary: Spark dataframe with a 'summary' column, such as the
            output of DataFrame.describe().

    Returns:
        The transposed Spark dataframe.
    """
    # Reshape in pandas: index by statistic name, flip rows and columns,
    # name the former column headers 'field', and clear the leftover axis name.
    summary_p = (
        summary.toPandas()
        .set_index('summary')
        .transpose()
        .reset_index()
        .rename(columns={'index': 'field'})
        .rename_axis(None, axis=1)
    )
    print(summary_p)

    # Back to Spark; cast every column except 'field' to double in one projection.
    raw_ss = ss.createDataFrame(summary_p)
    summary_ss = raw_ss.select(
        *[
            raw_ss[name] if name == 'field'
            else raw_ss[name].cast(DoubleType()).alias(name)
            for name in raw_ss.columns
        ]
    )
    summary_ss.printSchema()
    return summary_ss

In [None]:
# Transpose both summaries (matches first, then misses) so each field is a row.
match_summary_ss, miss_summary_ss = modify_sum(matches_summary), modify_sum(misses_summary)

          field  count                mean                stddev min max
0  cmp_fname_c1  20922  0.9973163859635039   0.03650667584833679   0   1
1  cmp_fname_c2   1333  0.9898900320318174   0.08251973727615235   0   1
2  cmp_lname_c1  20931  0.9970152595958819   0.04311880753394514   0   1
3  cmp_lname_c2    475  0.9693701678438521   0.15345280740388917   0   1
4       cmp_sex  20931   0.987291577086618   0.11201570591216437   0   1
5        cmp_bd  20925  0.9970848267622461  0.053914876598079815   0   1
6        cmp_bm  20925  0.9979450418160095  0.045286127452170664   0   1
7        cmp_by  20925  0.9961290322580645  0.062098048567310556   0   1
8       cmp_plz  20902  0.9584250310975027   0.19962063345931913   0   1
root
 |-- field: string (nullable = true)
 |-- count: double (nullable = true)
 |-- mean: double (nullable = true)
 |-- stddev: double (nullable = true)
 |-- min: double (nullable = true)
 |-- max: double (nullable = true)

          field    count                  mean

In [None]:
# Rank the fields by their count in both the match and the miss summaries.
# Fields with the most non-null values are preferred, because most of their
# values are present (not missing) and can therefore be compared.
match_summary_ss.orderBy("count", ascending=False).show()
miss_summary_ss.orderBy("count", ascending=False).show()

# Based on the counts, the columns chosen for the similarity evaluation.
top_cols = [
    "cmp_lname_c1",
    "cmp_sex",
    "cmp_by",
    "cmp_bd",
    "cmp_bm",
]

+------------+-------+------------------+--------------------+---+---+
|       field|  count|              mean|              stddev|min|max|
+------------+-------+------------------+--------------------+---+---+
|     cmp_sex|20931.0| 0.987291577086618| 0.11201570591216437|0.0|1.0|
|cmp_lname_c1|20931.0|0.9970152595958819| 0.04311880753394514|0.0|1.0|
|      cmp_bm|20925.0|0.9979450418160095|0.045286127452170664|0.0|1.0|
|      cmp_bd|20925.0|0.9970848267622461|0.053914876598079815|0.0|1.0|
|      cmp_by|20925.0|0.9961290322580645|0.062098048567310556|0.0|1.0|
|cmp_fname_c1|20922.0|0.9973163859635039| 0.03650667584833679|0.0|1.0|
|     cmp_plz|20902.0|0.9584250310975027| 0.19962063345931913|0.0|1.0|
|cmp_fname_c2| 1333.0|0.9898900320318174| 0.08251973727615235|0.0|1.0|
|cmp_lname_c2|  475.0|0.9693701678438521| 0.15345280740388917|0.0|1.0|
+------------+-------+------------------+--------------------+---+---+

+------------+---------+--------------------+-------------------+---+---+
| 

In [None]:
#Implement a PySpark program that computes similarity scores between records using a chosen similarity metric.
from pyspark.sql.functions import expr

# SQL expression that adds the chosen columns together ("col1+col2+...").
sum_exp = "+".join(top_cols)

# Fill missing values in the chosen columns with 0 before summing.
df = df.na.fill(0, subset=top_cols)

# Keep only the summed score and the is_match label; any remaining nulls
# that a 0 fill applies to are replaced as well.
df_scored = df.select(expr(sum_exp).alias('score'), 'is_match').fillna(0)
df_scored.show()
# To inspect the score distribution: df_scored.groupBy('score').count().show()

+-----+--------+
|score|is_match|
+-----+--------+
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
+-----+--------+
only showing top 20 rows



In [None]:
def crosstabs(df_cop, num1):
    """Cross-tabulate "score reaches the threshold" against is_match.

    Adds a boolean column 'above' that tells whether 'score' is greater than
    or equal to num1, groups by that column, and pivots on 'is_match' with
    the values "true" and "false" as output columns, counting the rows for
    each combination of 'above' and 'is_match'.

    Args:
        df_cop: dataframe with 'score' and 'is_match' columns.
        num1: score threshold.

    Returns:
        The pivoted count dataframe with columns 'above', 'true' and 'false'.
    """
    threshold_flag = "score >= {} as above".format(num1)
    flagged = df_cop.selectExpr(threshold_flag, "is_match")
    by_flag = flagged.groupBy("above")
    return by_flag.pivot("is_match", ("true", "false")).count()


In [None]:
# Evaluate the rule "score >= 4.0 means match" against the is_match label.
a = crosstabs(df_scored, 4.0)
a.show()

row_list = a.collect()

# Look up the confusion-matrix cells by (above, is_match) instead of by row
# position: groupBy() does not guarantee the order of the collected rows, so
# row_list[0] is not necessarily the above-threshold row. A pivot cell with
# no rows comes back as None and is counted as 0.
cells = {
    (row['above'], label): (row[label] or 0)
    for row in row_list
    for label in ('true', 'false')
}
true_pos = cells.get((True, 'true'), 0)     # at/above threshold, is_match true
false_pos = cells.get((True, 'false'), 0)   # at/above threshold, is_match false
false_neg = cells.get((False, 'true'), 0)   # below threshold, is_match true

# Precision, recall and F1 for the match class. Empty denominators yield 0.0
# instead of raising ZeroDivisionError.
precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score", f1)



+-----+-----+-------+
|above| true|  false|
+-----+-----+-------+
| true|20892| 160712|
|false|   39|5567489|
+-----+-----+-------+

Precision:  0.11504151890927512
False positive:  0.028758239640425374
F1 score 0.023006934105859467


In [None]:
# Evaluate the rule "score >= 2.0 means match" against the is_match label.
a = crosstabs(df_scored, 2.0)
a.show()

row_list = a.collect()

# Look up the confusion-matrix cells by (above, is_match) instead of by row
# position: groupBy() does not guarantee the order of the collected rows, so
# row_list[0] is not necessarily the above-threshold row. A pivot cell with
# no rows comes back as None and is counted as 0.
cells = {
    (row['above'], label): (row[label] or 0)
    for row in row_list
    for label in ('true', 'false')
}
true_pos = cells.get((True, 'true'), 0)     # at/above threshold, is_match true
false_pos = cells.get((True, 'false'), 0)   # at/above threshold, is_match false
false_neg = cells.get((False, 'true'), 0)   # below threshold, is_match true

# Precision, recall and F1 for the match class. Empty denominators yield 0.0
# instead of raising ZeroDivisionError.
precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score", f1)

+-----+-----+-------+
|above| true|  false|
+-----+-----+-------+
| true|20925|5028701|
|false|    6| 699500|
+-----+-----+-------+

Precision:  0.004143871249078644
False positive:  6.980186695353437
F1 score 0.004141412649966648
