In [0]:
%fs
ls /FileStore/merchant_category_large-1.csv


path,name,size,modificationTime
dbfs:/FileStore/merchant_category_large-1.csv,merchant_category_large-1.csv,117,1695567142000


In [0]:
%fs
ls /FileStore/merchant_large-1.csv


path,name,size,modificationTime
dbfs:/FileStore/merchant_large-1.csv,merchant_large-1.csv,23644832,1695567335000


In [0]:
%fs
ls /FileStore/transaction_large-1.csv


path,name,size,modificationTime
dbfs:/FileStore/transaction_large-1.csv,transaction_large-1.csv,54228538,1695534819000


In [0]:
merchant_category_large_cleaning_df = spark.read.format('csv').option("header","True").option("inferschema","true").load("dbfs:/FileStore/merchant_category_large-1.csv")
merchant_large_cleaning_df = spark.read.format('csv').option("header","True").option("inferschema","true").load("dbfs:/FileStore/merchant_large-1.csv")
transaction_large_cleaning_df = spark.read.format('csv').option("header","True").option("inferschema","true").load("dbfs:/FileStore/transaction_large-1.csv")

In [0]:

#Data consistency between tables
# Perform an inner join on the two dataframes

df_transactions = transaction_large_cleaning_df.withColumnRenamed('merchant_id', 'id_merchant_category')

df_merchant = merchant_large_cleaning_df.withColumnRenamed('id', 'id_merchant_category')

df_joined = df_merchant.join(df_transactions, on=['id_merchant_category'], how='inner')

# If there are any inconsistencies, the joined dataframe will have less rows than the original dataframes
if df_joined.count() < df_merchant.count() or df_joined.count() < df_transactions.count():
    print("There are inconsistencies in the 'id_merchant_category' between the two tables.")
else:
    print("The 'id_merchant_category' is consistent between the two tables.")


The 'id_merchant_category' is consistent between the two tables.


In [0]:
from pyspark.sql.functions import initcap

# Df is the DataFrame and "name" is the column with the names
df = df_merchant.withColumn("name", initcap(df_merchant["name"]))

# Show the DataFrame to verify the changes
df.show()


+--------------------+---------------+-----------------+
|id_merchant_category|           name|merchant_category|
+--------------------+---------------+-----------------+
|            MID80028|Kathryn Frazier|                5|
|            MID33354|Cameron Stevens|                1|
|            MID53976|  Amy Henderson|                1|
|            MID47112|     Wendy Leon|                5|
|            MID75112|Robert Mcdaniel|                5|
|            MID58145|           null|                1|
|            MID96735|Catherine Smith|                1|
|            MID23566|   Teresa Wolfe|                5|
|            MID50690|      Alan Wood|                1|
|            MID89747|   Megan Pierce|                5|
|            MID44098|  Julian Carney|                1|
|            MID21112|James Rasmussen|                1|
|            MID99662|Loretta Barrera|                1|
|            MID49518| Angela Bennett|                5|
|            MID74116|  Stanley

In [0]:
from pyspark.sql.functions import col


# Get the distinct merchant IDs from the transactions DataFrame
transaction_merchants = transaction_large_cleaning_df.select("merchant_id").distinct()

# Perform a left anti join on the merchant DataFrame with the transaction merchants
# This will give us all rows in the merchant DataFrame that do not have a matching merchant ID in the transactions DataFrame
df_merchant = merchant_large_cleaning_df.withColumnRenamed('id', 'merchant_id')
merchants_without_transactions = df_merchant.join(transaction_merchants, on="merchant_id", how="left_anti")

# Show the merchants without transactions
merchants_without_transactions.show()


+-----------+------------------+-----------------+
|merchant_id|              name|merchant_category|
+-----------+------------------+-----------------+
|   MID30110|    Brittany Jones|                5|
|   MID30110|     Dennis Parker|                1|
|   MID30110|   Kimberly Dawson|                5|
|   MID30110|         Tami Ross|                5|
|   MID30110|     Joseph Orozco|                1|
|   MID30110|    John Stevenson|                1|
|   MID30110|       Mario Mccoy|                1|
|   MID30110|  Michael Gonzalez|                1|
|   MID30110|Elizabeth Sullivan|                1|
|   MID00001|      Issac newton|                8|
+-----------+------------------+-----------------+



In [0]:
from pyspark.sql.functions import isnan, when, count, col

# Df is the DataFrame, "merchant_category" is the merchant category column, and "name" is the name column

# Find the records where 'merchant_category' or 'name' is missing
missing_merchant_category = merchant_large_cleaning_df.filter((merchant_large_cleaning_df["merchant_category"] == "") | merchant_large_cleaning_df["merchant_category"].isNull() | isnan(merchant_large_cleaning_df["merchant_category"]))

missing_name = merchant_large_cleaning_df.filter((merchant_large_cleaning_df["name"] == "") | merchant_large_cleaning_df["name"].isNull() | isnan(merchant_large_cleaning_df["name"]))

# Show the records with missing 'merchant_category' or 'name'
missing_merchant_category.show()
missing_name.show()


+--------+------------+-----------------+
|      id|        name|merchant_category|
+--------+------------+-----------------+
|MID84829| Kevin Lewis|             null|
|MID72394|Linda Warner|             null|
+--------+------------+-----------------+

+--------+----+-----------------+
|      id|name|merchant_category|
+--------+----+-----------------+
|MID58145|null|                1|
|MID92601|null|                5|
+--------+----+-----------------+



In [0]:
# Count the records with missing 'merchant_category' or 'name'
print("Number of records with missing merchant category: ", missing_merchant_category.count())
print("Number of records with missing name: ", missing_name.count())

Number of records with missing merchant category:  2
Number of records with missing name:  2
