## STEP 1: Understand the Table Structure

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import col, trim, length

Read Data from Bronze layer

In [0]:
# Load the raw CRM sales details from the bronze layer.
df_bronze = spark.table("dev_project.bronze.crm_sales_details")

# Preview a handful of rows, then print the column names and types.
df_bronze.limit(10).display()
df_bronze.printSchema()




## STEP 2: Get a Row Count & Sample Data

**What to observe:**
- Is the row count what you expect?
- Do the columns look reasonable?
- Are there obvious issues (extra spaces, weird formatting)?

In [0]:
# Row count first, then summary statistics from describe().
total_records = df_bronze.count()
print(f"Total records: {total_records}")

df_bronze.describe().show()

In [0]:
%sql
-- Preview ten rows of the bronze sales details table.
SELECT *
FROM dev_project.bronze.crm_sales_details
LIMIT 10;

## STEP 3: Analyze NULL/Missing Values


Trimming

In [0]:
# Strip leading/trailing whitespace from every string column; all other
# columns pass through unchanged and column order is preserved.
#
# The column types are read from `schema.fields` on purpose:
# `DataFrame.dtypes` yields (name, type-name) pairs of plain strings such as
# ("sls_ord_num", "string"), so an `isinstance(..., StringType)` check on
# those values is never true and no column would be trimmed.
df_bronze = df_bronze.select([
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df_bronze.schema.fields
])

display(df_bronze.limit(10))

Cleaning Dates

In [0]:
from pyspark.sql.functions import col, when, length, to_date

# Columns that hold dates encoded as yyyyMMdd values.
DATE_COLUMNS = ["sls_order_dt", "sls_ship_dt", "sls_due_dt"]


def _parse_yyyymmdd(column_name):
    """Build a Column expression that parses `column_name` as a yyyyMMdd date.

    The result is NULL when the value equals 0 or when its string form is not
    exactly 8 characters long; otherwise the string form is parsed with
    `to_date(..., "yyyyMMdd")`.

    Args:
        column_name: Name of the column to parse.

    Returns:
        A Column expression producing the parsed date or NULL.
    """
    raw_value = col(column_name)
    as_text = raw_value.cast("string")
    is_invalid = (raw_value == 0) | (length(as_text) != 8)
    return when(is_invalid, None).otherwise(to_date(as_text, "yyyyMMdd"))


for date_column in DATE_COLUMNS:
    # Skip columns that were already converted: a date's string form is
    # 10 characters ("yyyy-MM-dd"), so re-running the cell would otherwise
    # fail the 8-character check and wipe out every value.
    if isinstance(df_bronze.schema[date_column].dataType, DateType):
        continue
    df_bronze = df_bronze.withColumn(date_column, _parse_yyyymmdd(date_column))

Sales and Price Corrections

In [0]:
# Repair unusable prices: a price that is NULL, zero, or negative is replaced
# by sales / quantity. When the quantity is zero (or the comparison is NULL)
# no replacement can be derived and the price becomes NULL.
# NOTE(review): only sls_price is corrected here; sls_sales is left untouched
# even though the section heading mentions sales -- confirm this is intended.
price_is_unusable = F.col("sls_price").isNull() | (F.col("sls_price") <= 0)

# A `when` without `otherwise` yields NULL for rows that do not match.
derived_price = F.when(
    F.col("sls_quantity") != 0,
    F.col("sls_sales") / F.col("sls_quantity"),
)

df_bronze = df_bronze.withColumn(
    "sls_price",
    F.when(price_is_unusable, derived_price).otherwise(F.col("sls_price")),
)


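Date Casting:  
**The problem:** When data is loaded into a PySpark DataFrame (especially from CSV, JSON, or text files), date columns are often read as strings or numbers rather than as actual date values.  
**The fix:** The "Cleaning Dates" step above parses the `yyyyMMdd` values with `to_date`, so `sls_order_dt`, `sls_ship_dt`, and `sls_due_dt` become proper date columns.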

Renaming Columns

In [0]:
# Source column name -> business-friendly name used in the silver layer.
RENAME_MAP = dict([
    # identifiers
    ("sls_ord_num", "order_number"),
    ("sls_prd_key", "product_number"),
    ("sls_cust_id", "customer_id"),
    # dates
    ("sls_order_dt", "order_date"),
    ("sls_ship_dt", "ship_date"),
    ("sls_due_dt", "due_date"),
    # measures
    ("sls_sales", "sales_amount"),
    ("sls_quantity", "quantity"),
    ("sls_price", "price"),
])

# Apply the renames one at a time, in the order listed above.
for old_name in RENAME_MAP:
    new_name = RENAME_MAP[old_name]
    df_bronze = df_bronze.withColumnRenamed(old_name, new_name)


In [0]:

# Preview the cleaned and renamed rows before writing them out.
display(df_bronze.limit(10))

In [0]:
# Persist the transformed DataFrame as a Delta table in the silver schema,
# replacing any existing contents of the table.
(
    df_bronze.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dev_project.silver.crm_sales")
)

Sanity checks of the silver sales table

In [0]:

%sql
-- Preview ten rows of the silver sales table written above.
SELECT *
FROM dev_project.silver.crm_sales
LIMIT 10