In [0]:
# import necessary libraries
from pyspark.sql.functions import *

In [0]:
# Read the order-payments Delta data from the bronze layer of the lake
df_order_payments = (
    spark.read
    .format("delta")
    .load("abfss://olist-data@retailds.dfs.core.windows.net/bronze/order_payments")
)

In [0]:
# Print the column names and data types of the bronze payments frame.
# Used to decide which columns need casting further down.
df_order_payments.printSchema()

In [0]:
# Preview a small sample (5 rows) of the bronze payments data
display(df_order_payments.limit(5))

In [0]:
# Total number of rows in the bronze payments frame.
# count() is a Spark action, so this cell triggers a job over the data.
df_order_payments.count()

In [0]:
# Check whether any NULL values exist in any of the columns.
# when(...) without otherwise() yields NULL for non-matching rows and count()
# skips NULLs, so each output column holds the number of NULLs in the source
# column of the same name.
display(
    df_order_payments.select(
        [
            count(when(col(column_name).isNull(), column_name)).alias(column_name)
            for column_name in df_order_payments.columns
        ]
    )
)

In [0]:
# Give the numeric payment columns explicit types; the result is the starting
# point for the silver payments frame.
df_payments_silver = (
    df_order_payments
    .withColumn("payment_value", col("payment_value").cast("double"))
    .withColumn("payment_installments", col("payment_installments").cast("integer"))
)

In [0]:
# Check for duplicate records: group on the five listed columns and show every
# combination that occurs more than once in the bronze data.
# The threshold is the integer 1 (not the string '1') so the comparison with the
# numeric `count` column does not rely on implicit string-to-number coercion.
(
    df_order_payments
    .groupBy(
        "order_id",
        "payment_sequential",
        "payment_type",
        "payment_installments",
        "payment_value",
    )
    .count()
    .filter(col("count") > 1)
    .display()
)

In [0]:
# Total amount paid per order, largest totals first.
# `sum` here is the Spark SQL aggregate brought in by the star import,
# not Python's builtin.
df_payment_agg = (
    df_payments_silver
    .groupBy("order_id")
    .agg(sum("payment_value").alias("total_payment_value"))
    .orderBy(col("total_payment_value").desc())
)

In [0]:
# Distinct payment types used for each order, collected into one array column
df_payment_methods = (
    df_payments_silver
    .groupBy("order_id")
    .agg(collect_set("payment_type").alias("payment_methods"))
)

In [0]:
# Attach the per-order aggregates to every payment row.
# Both lookup frames come from groupBy("order_id"), so they hold one row per
# order and the left joins keep the row count of df_payments_silver unchanged.
# Dropping the two derived columns first is a no-op on the first run and keeps
# this cell safe to re-run: without it, a second run would join onto a frame
# that already carries these columns and produce duplicate column names.
df_payments_silver = (
    df_payments_silver
    .drop("total_payment_value", "payment_methods")
    .join(df_payment_agg, on="order_id", how="left")
    .join(df_payment_methods, on="order_id", how="left")
)

In [0]:
# Flag each payment row: True when payment_value is strictly positive.
# A NULL payment_value yields a NULL flag rather than False.
df_payments_silver = df_payments_silver.withColumn(
    "is_payment_valid",
    col("payment_value") > lit(0),
)

In [0]:
# Referential check against silver.orders: the left-anti join keeps only the
# payment rows whose order_id has no match in the orders table.
display(
    df_payments_silver.join(
        spark.table("silver.orders"),
        on="order_id",
        how="left_anti",
    )
)

In [0]:
# Final look at the column list before writing to the silver layer.
# .columns only reads the schema; it does not run a Spark job.
df_payments_silver.columns

In [0]:
# Persist the silver payments frame as Delta, replacing whatever is stored at
# the target path. overwriteSchema allows the stored schema to be replaced too.
(
    df_payments_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://olist-data@retailds.dfs.core.windows.net/silver/payments")
)

In [0]:
# Read back through the catalog to validate the registered table.
# NOTE(review): the write above targets a storage path, not a table name;
# this presumably relies on silver.payments being registered over that
# location elsewhere. Confirm.
display(spark.sql("SELECT * FROM silver.payments"))