In [0]:
%fs
ls /FileStore/credit_card_large-2.csv



path,name,size,modificationTime
dbfs:/FileStore/credit_card_large-2.csv,credit_card_large-2.csv,177863,1695551137000


In [0]:
%sql
create database credit_card_large_cleaning

In [0]:
credit_card_large_cleaning_df = spark.read.format('csv').option("header","True").option("inferschema","true").load("dbfs:/FileStore/credit_card_large-2.csv")

In [0]:
%fs
ls /FileStore/card_holder_large-1.csv



path,name,size,modificationTime
dbfs:/FileStore/card_holder_large-1.csv,card_holder_large-1.csv,201642,1695547797000


In [0]:
%sql
create database card_holder_large_cleaning

In [0]:
card_holder_large_cleaning_df = spark.read.format('csv').option("header","True").option("inferschema","true").load("dbfs:/FileStore/card_holder_large-1.csv")

In [0]:
from pyspark.sql.functions import when, col

In [0]:
# Replace null values in 'id_card_holder' with '9999'
df = credit_card_large_cleaning_df.withColumn("id_card_holder", when(col("id_card_holder").isNull(), "9999").otherwise(col("id_card_holder")))
df.show()

+--------------+--------------------+
|id_card_holder|         card_number|
+--------------+--------------------+
|          2306|          4.28219E12|
|          5253|          4.43817E15|
|          1466|          1.80019E14|
|          6227|           3.5995E15|
|          2489|          4.06665E15|
|          2720|          3.00387E13|
|          6855|          6.76342E11|
|          3883|       4.454223452E9|
|          9445|          4.08189E18|
|          7291|           3.5403E15|
|          9258|          3.51468E15|
|          3234|7.485848382838282...|
|          9149|          4.94379E15|
|          9999|          4.54953E15|
|          3505|          4.47428E15|
|          3625|          2.24191E15|
|          9999|          6.30478E11|
|          7338|          4.84069E12|
|          4815|          2.29413E15|
|          4006|          4.23601E12|
+--------------+--------------------+
only showing top 20 rows



In [0]:
df.count()

Out[24]: 10001

In [0]:
# Filter out records where 'card' field is missing
df2 = df.filter(df.card_number.isNotNull())

In [0]:
df2.count()

Out[26]: 9999

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, percentile_approx

# Convert 'id_card_holder' to numeric type
df = df2.withColumn("id_card_holder", df["id_card_holder"].cast("double"))

# Calculate Q1 and Q3
bounds = df.stat.approxQuantile("id_card_holder", [0.25, 0.75], 0)

# Calculate IQR
IQR = bounds[1] - bounds[0]

# Define lower and upper bounds for outliers
lower_bound = bounds[0] - 1.5 * IQR
upper_bound = bounds[1] + 1.5 * IQR

# Filter the DataFrame to include only outlier rows
df_outliers = df.filter((col("id_card_holder") < lower_bound) | (col("id_card_holder") > upper_bound))

# Show the result
df_outliers.show()


+--------------+--------------------+
|id_card_holder|         card_number|
+--------------+--------------------+
| 9.999999999E9|1.212121212121212E15|
+--------------+--------------------+



In [0]:
# Join the two DataFrames on the 'id_card_holder' and 'id' fields
joined_df = df2.join(card_holder_large_cleaning_df, df2.id_card_holder == card_holder_large_cleaning_df.id)

# Show the result
joined_df.show()


+--------------+--------------------+----+----------------+
|id_card_holder|         card_number|  id|            name|
+--------------+--------------------+----+----------------+
|          2306|          4.28219E12|2306|    Matthew Reed|
|          5253|          4.43817E15|5253|    Kelly Norris|
|          1466|          1.80019E14|1466|Natalie Williams|
|          2720|          3.00387E13|2720|  Carl Rodriguez|
|          7291|           3.5403E15|7291|   Andrea Burton|
|          7291|           3.5403E15|7291|  Teresa Cabrera|
|          9258|          3.51468E15|9258|   Jessica Gomez|
|          3234|7.485848382838282...|3234|  Jennifer Brown|
|          9149|          4.94379E15|9149|  Amy Montgomery|
|          9149|          4.94379E15|9149|   Linda Carroll|
|          9999|          4.54953E15|9999|    Anne Chapman|
|          3625|          2.24191E15|3625|  Robert Vasquez|
|          3625|          2.24191E15|3625|   Jared Perkins|
|          9999|          6.30478E11|999

In [0]:
joined_df.count()

Out[29]: 10078

In [0]:
df = joined_df.withColumn("card_length", length(joined_df.card_number))
df.show()

+--------------+--------------------+----+----------------+-----------+
|id_card_holder|         card_number|  id|            name|card_length|
+--------------+--------------------+----+----------------+-----------+
|          2306|          4.28219E12|2306|    Matthew Reed|         10|
|          5253|          4.43817E15|5253|    Kelly Norris|         10|
|          1466|          1.80019E14|1466|Natalie Williams|         10|
|          2720|          3.00387E13|2720|  Carl Rodriguez|         10|
|          7291|           3.5403E15|7291|   Andrea Burton|          9|
|          7291|           3.5403E15|7291|  Teresa Cabrera|          9|
|          9258|          3.51468E15|9258|   Jessica Gomez|         10|
|          3234|7.485848382838282...|3234|  Jennifer Brown|         21|
|          9149|          4.94379E15|9149|  Amy Montgomery|         10|
|          9149|          4.94379E15|9149|   Linda Carroll|         10|
|          9999|          4.54953E15|9999|    Anne Chapman|     

In [0]:
df_valid = df.withColumn("is_valid", when((col("card_length") >= 10) & (col("card_length") <= 18), True).otherwise(False))

In [0]:
df_valid.show()

+--------------+--------------------+----+----------------+-----------+--------+
|id_card_holder|         card_number|  id|            name|card_length|is_valid|
+--------------+--------------------+----+----------------+-----------+--------+
|          2306|          4.28219E12|2306|    Matthew Reed|         10|    true|
|          5253|          4.43817E15|5253|    Kelly Norris|         10|    true|
|          1466|          1.80019E14|1466|Natalie Williams|         10|    true|
|          2720|          3.00387E13|2720|  Carl Rodriguez|         10|    true|
|          7291|           3.5403E15|7291|   Andrea Burton|          9|   false|
|          7291|           3.5403E15|7291|  Teresa Cabrera|          9|   false|
|          9258|          3.51468E15|9258|   Jessica Gomez|         10|    true|
|          3234|7.485848382838282...|3234|  Jennifer Brown|         21|   false|
|          9149|          4.94379E15|9149|  Amy Montgomery|         10|    true|
|          9149|          4.

In [0]:
df_valid.write.format("csv").option("header","true").save("/FileStore/data/cleaned/credit_card_cleaned_data.csv")


In [0]:
#Card_holder_large
from pyspark.sql import SparkSession

# Option 1: Exclude these records from certain analyses
df_no_missing = card_holder_large_cleaning_df.filter(card_holder_large_cleaning_df['name'].isNotNull())

# Option 2: Flag them for further investigation by adding a new column
df_flagged = card_holder_large_cleaning_df.withColumn('name_missing', card_holder_large_cleaning_df['name'].isNull())


In [0]:
df_flagged.show()

+----+-------------------+------------+
|  id|               name|name_missing|
+----+-------------------+------------+
|9074|       Victor Petty|       false|
|7992|G%abrielle Ferguson|       false|
|5410|     William Davies|       false|
|4930|       Jesse Morris|       false|
| 634| Christopher Huerta|       false|
|9939|    Jer-emy Everett|       false|
|6508|               null|        true|
|4095|  Katherine Salazar|       false|
|2959|  Bethany Whitehead|       false|
| 210|        Scott Walsh|       false|
|8560|      Jerry Simmons|       false|
|1302|     Melissa Murray|       false|
|1747|        Troy Joseph|       false|
|7663|               null|        true|
|8842|       Charles Webb|       false|
|2609| Austin Carrillo II|       false|
| 659|    Michelle Nelson|       false|
|6499|        Luis Archer|       false|
|1923|        Lori Potter|       false|
|9744|       Allison Lowe|       false|
+----+-------------------+------------+
only showing top 20 rows



In [0]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType
import re

# Define a UDF that checks if a name is valid (contains only alphabets and spaces)
def is_valid_name(name):
    if name is None:
        return False
    return bool(re.fullmatch('[A-Za-z ]+', name))

# Register the UDF
spark.udf.register('is_valid_name', is_valid_name, BooleanType())

# Use the UDF to add a new column 'name_is_valid' that indicates if the name is valid
df_validated = df_flagged.withColumn('name_is_valid', udf(is_valid_name, BooleanType())(col('name')))

# Now df_validated contains an additional column 'name_is_valid' that is True if the name is valid and False otherwise


In [0]:
df_validated.show()

+----+-------------------+------------+-------------+
|  id|               name|name_missing|name_is_valid|
+----+-------------------+------------+-------------+
|9074|       Victor Petty|       false|         true|
|7992|G%abrielle Ferguson|       false|        false|
|5410|     William Davies|       false|         true|
|4930|       Jesse Morris|       false|         true|
| 634| Christopher Huerta|       false|         true|
|9939|    Jer-emy Everett|       false|        false|
|6508|               null|        true|        false|
|4095|  Katherine Salazar|       false|         true|
|2959|  Bethany Whitehead|       false|         true|
| 210|        Scott Walsh|       false|         true|
|8560|      Jerry Simmons|       false|         true|
|1302|     Melissa Murray|       false|         true|
|1747|        Troy Joseph|       false|         true|
|7663|               null|        true|        false|
|8842|       Charles Webb|       false|         true|
|2609| Austin Carrillo II|  

In [0]:
df_validated.write.format("csv").option("header","true").save("/FileStore/data/cleaned/card_holder_cleaned_data.csv")
