## STEP 1: Understand the Table Structure

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import col, trim, length

Read data from the Bronze layer

In [0]:
# Load the raw table from the bronze layer.
df_bronze = spark.read.table("dev_project.bronze.erp_px_cat_g1v2")

# Preview the first ten rows, then print the column names and types.
df_bronze.limit(10).display()
df_bronze.printSchema()




## STEP 2: Get a Row Count & Sample Data

**What to observe:**
- Is the row count what you expect?
- Do the columns look reasonable?
- Are there obvious issues (extra spaces, weird formatting)?

In [0]:
# Basic profiling: overall row count first, then per-column summary statistics.
record_count = df_bronze.count()
print(f"Total records: {record_count}")
df_bronze.describe().show()

In [0]:
%sql
-- Preview ten rows of the bronze source table.
SELECT *
FROM dev_project.bronze.erp_px_cat_g1v2
LIMIT 10;

## STEP 3: Analyze NULL/Missing Values


Trimming: remove leading and trailing spaces from every string column.

In [0]:
# Trim every string column in one projection.
# Non-string columns pass through untouched and the original column order is kept.
# A single select() is used instead of calling withColumn() in a loop, because each
# withColumn() call adds another projection to the query plan.
# StringType, trim and col come from the import cell at the top of the notebook.
trimmed_columns = [
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df_bronze.schema.fields
]
df_bronze = df_bronze.select(*trimmed_columns)

display(df_bronze.limit(10))

Customer Id Cleaning: remove hyphens from `cid`

In [0]:

# Strip hyphens from `cid`, but only when the frame actually has that column.
# Referencing a missing column raises an analysis error and would stop a full
# notebook run. The lookup ignores case so that a column such as `CID` is
# still cleaned.
# NOTE(review): the columns renamed later in this notebook (id, cat, subcat,
# maintenance) suggest this table may not have a `cid` column - confirm whether
# this step belongs here.
has_cid = any(name.lower() == "cid" for name in df_bronze.columns)
if has_cid:
    df_bronze = df_bronze.withColumn("cid", F.regexp_replace(col("cid"), "-", ""))
else:
    print("Column 'cid' not found; skipping hyphen removal.")
display(df_bronze.limit(10))

Country Normalization


Renaming Columns

In [0]:

# Source column name -> descriptive column name used in the saved table.
RENAME_MAP = dict(
    id="category_id",
    cat="category",
    subcat="subcategory",
    maintenance="maintenance_flag",
)

# Apply the renames one column at a time, in the order listed above.
for source_col in RENAME_MAP:
    df_bronze = df_bronze.withColumnRenamed(source_col, RENAME_MAP[source_col])

In [0]:

# Preview ten rows after the column renames.
display(df_bronze.limit(10))

In [0]:
# Save the transformed frame as a Delta table, overwriting any existing data.
(
    df_bronze.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dev_project.silver.erp_product_category")
)

Sanity checks of the product category table

In [0]:

%sql
-- Spot-check ten rows of the silver product category table.
SELECT *
FROM dev_project.silver.erp_product_category
LIMIT 10