## STEP 1: Understand the Table Structure

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import col, trim, length

Read Data from Bronze layer

In [0]:
# Load the bronze source table, preview its first ten rows, then print the schema.
df_bronze = spark.read.table("dev_project.bronze.erp_cust_az12")
bronze_preview = df_bronze.limit(10)
display(bronze_preview)
df_bronze.printSchema()




## STEP 2: Get a Row Count & Sample Data

**What to observe:**
- Is the row count what you expect?
- Do the columns look reasonable?
- Are there obvious issues (extra spaces, weird formatting)?

In [0]:
# Basic profiling: total row count first, then the describe() summary table.
print(f"Total records: {df_bronze.count()}")
summary_stats = df_bronze.describe()
summary_stats.show()

In [0]:
%sql
-- Preview ten rows of the bronze source table.
SELECT *
FROM dev_project.bronze.erp_cust_az12
LIMIT 10;

## STEP 3: Analyze NULL/Missing Values


Trimming

In [0]:
# Trim leading/trailing spaces from every string column; other columns pass
# through untouched and the column order is preserved.
#
# NOTE: DataFrame.dtypes yields (name, type-name) pairs of plain Python
# strings such as ("cid", "string"), so an isinstance(..., StringType) check
# against those values never matches and nothing would be trimmed. The
# schema's fields carry real DataType objects, so the check is done on those.
df_bronze = df_bronze.select([
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df_bronze.schema.fields
])

display(df_bronze.limit(10))

Customer Id Cleaning 

In [0]:

# Drop the leading "NAS" prefix from customer ids that carry it; ids without
# the prefix (and NULL ids) are left unchanged.
#
# Column.substr is used instead of F.substring because the length argument is
# itself a Column: F.substring only documents integer pos/len on older PySpark
# releases, while Column.substr accepts Column arguments (both arguments must
# be of the same kind, hence F.lit(4) for the 1-based start position).
df_bronze = df_bronze.withColumn(
    "cid",
    F.when(
        col("cid").startswith("NAS"),
        col("cid").substr(F.lit(4), F.length(col("cid"))),
    ).otherwise(col("cid")),
)
display(df_bronze.limit(10))

Birthdate Validation

In [0]:
# Replace birthdates that lie after today's date with NULL; every other value
# (including existing NULLs) is kept as-is. Rows are not dropped.
# NOTE(review): the comparison assumes bdate is a date (or date-like) column;
# confirm against the printed schema.
is_future_birthdate = col("bdate") > F.current_date()
df_bronze = df_bronze.withColumn(
    "bdate",
    F.when(is_future_birthdate, None).otherwise(col("bdate")),
)
display(df_bronze.limit(10))


Gender Normalization


In [0]:
# Collapse the gender codes into three labels, matching case-insensitively:
#   F / FEMALE -> "Female", M / MALE -> "Male", anything else (incl. NULL) -> "n/a".
gender_code = F.upper(col("gen"))
gender_label = (
    F.when(gender_code.isin("F", "FEMALE"), "Female")
    .when(gender_code.isin("M", "MALE"), "Male")
    .otherwise("n/a")
)
df_bronze = df_bronze.withColumn("gen", gender_label)
   

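Date Casting:  
The Problem:
When data is loaded into a PySpark DataFrame (especially from CSV, JSON, or text files), date columns are often read as strings rather than as actual date values. Check the `printSchema()` output to confirm the type of `bdate`; if it is a string, cast it to a date (for example with `F.to_date`) before comparing it with other dates.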

Renaming Columns

In [0]:

# Maps each current column name to the name it should carry in the output.
RENAME_MAP = dict(
    cid="customer_number",
    bdate="birth_date",
    gen="gender",
)
# Apply the renames one at a time, in the order they are declared above.
for current_name in RENAME_MAP:
    df_bronze = df_bronze.withColumnRenamed(current_name, RENAME_MAP[current_name])


In [0]:

# Preview the first ten rows after the transformations above.
final_preview = df_bronze.limit(10)
final_preview.display()

In [0]:
# Save the frame as a Delta table in overwrite mode.
(
    df_bronze.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dev_project.silver.erp_customers")
)

Sanity checks of customer info table

In [0]:

%sql
-- Spot-check ten rows of the silver customers table.
SELECT *
FROM dev_project.silver.erp_customers
LIMIT 10