In [0]:
# Evaluate the session handle so the notebook shows it as this cell's output.
# NOTE(review): `spark` is not created in the visible cells - presumably the
# notebook runtime injects it; confirm before running elsewhere.
spark

In [0]:
# Import required libraries
from pyspark.sql.functions import *

In [0]:
# Load the orders table from the bronze layer (stored as Delta) into a DataFrame
df_orders = (
    spark.read
    .format('Delta')
    .load('abfss://olist-data@retailds.dfs.core.windows.net/bronze/orders')
)

In [0]:
# Preview: render only the first five rows instead of the whole table
(
    df_orders
    .limit(5)
    .display()
)

In [0]:
# Print the column names and data types of the loaded orders DataFrame
df_orders.printSchema()

In [0]:
# Total number of rows in the orders DataFrame (shown as the cell output)
df_orders.count()

In [0]:
# Number of unique order_id values; compare with the total row count
# to spot duplicated orders
df_orders.agg(
    countDistinct('order_id').alias('Distinct_order_count')
).show()

In [0]:
# How many rows have no order_id at all
df_orders.where(col("order_id").isNull()).count()

In [0]:
# Null count for every column, returned as a single row keyed by column name.
# `when(...)` without `otherwise` yields NULL for non-matching rows, and
# `count` skips NULLs, so only the null rows of each column are counted.
df_orders.select(
    *(
        count(when(col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in df_orders.columns
    )
).display()

In [0]:
# Percentage of nulls per column, returned as a single row keyed by column name.
total_rows = df_orders.count()

if total_rows == 0:
    # An empty table would make every percentage a division by zero: Spark
    # returns NULL for it by default and raises an error when ANSI mode is
    # enabled. Report the situation explicitly instead.
    print("df_orders is empty - null percentages are undefined")
else:
    df_orders.select([
        # count() ignores the NULLs produced by when() for non-null rows,
        # so the numerator is the number of null values in column `c`.
        (count(when(col(c).isNull(), c)) / total_rows * 100).alias(c)
        for c in df_orders.columns
    ]).display()

In [0]:
# List every distinct value that appears in order_status
(
    df_orders
    .select("order_status")
    .distinct()
    .show()
)

In [0]:
# Number of orders in each order_status
(
    df_orders
    .groupBy("order_status")
    .count()
    .show()
)

In [0]:
# Break down the orders that have no customer delivery date by their status
(
    df_orders
    .where(col("order_delivered_customer_date").isNull())
    .groupBy("order_status")
    .count()
    .show()
)

In [0]:
# Inconsistent rows: status says 'delivered' but the customer delivery date is null.
# Two successive filters keep exactly the rows for which both conditions are true.
(
    df_orders
    .where(col("order_status") == 'delivered')
    .where(col("order_delivered_customer_date").isNull())
    .display()
)

In [0]:
# Inconsistent rows: status says 'shipped' but order_delivered_carrier_date is null
(
    df_orders
    .where(
        (col("order_status") == 'shipped')
        & col("order_delivered_carrier_date").isNull()
    )
    .display()
)

In [0]:
# Inconsistent rows: status says 'approved' but order_approved_at is null
(
    df_orders
    .where(
        (col("order_status") == 'approved')
        & col("order_approved_at").isNull()
    )
    .display()
)

In [0]:
# Add is_valid_delivery: 0 when the timestamp that the order's status implies
# is missing, 1 for every other row.
# Each status/timestamp pair is wrapped in its own parentheses so the grouping
# does not depend on `&` binding tighter than `|`.
df_orders = df_orders.withColumn(
    "is_valid_delivery",
    when(
        (
            (col("order_status") == 'delivered')
            & col("order_delivered_customer_date").isNull()
        )
        | (
            (col("order_status") == 'shipped')
            & col("order_delivered_carrier_date").isNull()
        )
        | (
            (col("order_status") == 'approved')
            & col("order_approved_at").isNull()
        ),
        0,
    ).otherwise(1),
)

In [0]:
# Inspect the rows that were flagged as invalid (is_valid_delivery == 0)
(
    df_orders
    .where(col("is_valid_delivery") == 0)
    .display()
)

In [0]:
# Add order_stage, a coarse grouping of order_status:
#   delivered                       -> completed
#   shipped / processing / invoiced -> in_progress
#   canceled / unavailable          -> failed
#   anything else (including a null status) -> early
df_orders = df_orders.withColumn(
    "order_stage",
    (
        when(col("order_status") == "delivered", "completed")
        .when(col("order_status").isin("shipped", "processing", "invoiced"), "in_progress")
        .when(col("order_status").isin("canceled", "unavailable"), "failed")
        .otherwise("early")
    ),
)

In [0]:
# Inspect the derived order_stage column
df_orders.select("order_stage").display()


In [0]:
# Add delivery_days: whole days from order_purchase_timestamp to
# order_delivered_customer_date (datediff takes the end date first).
from pyspark.sql.functions import datediff

df_orders = df_orders.withColumn(
    "delivery_days",
    datediff("order_delivered_customer_date", "order_purchase_timestamp"),
)


In [0]:
# Persist the enriched orders to the silver layer as Delta.
# mode("overwrite") replaces the existing data, and overwriteSchema lets the
# newly added columns replace the previously stored schema.
(
    df_orders.write
    .format('delta')
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save('abfss://olist-data@retailds.dfs.core.windows.net/silver/orders')
)