# Explanation of this notebook

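This notebook builds two Gold-layer Delta tables from the Silver orders table. It first loads the shared configuration, then reads the Silver orders, removes duplicate `order_id` rows, and derives an `approved_date` column from `event_date`. From that prepared data it writes (1) a daily operations-performance table with total orders, total products, and average products per order, and (2) a daily inconsistencies report with the number and share of orders whose `district` is flagged as missing or unknown. Use the sections below as a guide to the workflow and the logic behind each step.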

In [0]:
%run ../Transversal/config

In [0]:
from pyspark.sql.functions import to_timestamp, to_date, col


def data_preprocessing(table_silver):
    """Load a Silver table and prepare it for the Gold aggregations.

    Reads ``table_silver`` through the notebook-level ``spark`` session, keeps
    one row per ``order_id``, and adds (or replaces) an ``approved_date``
    column holding the calendar date parsed from ``event_date``.

    Args:
        table_silver: Table name, as accepted by ``spark.table``.

    Returns:
        The de-duplicated DataFrame including the ``approved_date`` column.
    """
    # event_date is parsed with the day-first pattern below; the time part is
    # then discarded by to_date so only the calendar date remains.
    event_ts = to_timestamp(col("event_date"), "dd/MM/yyyy HH:mm:ss")

    # NOTE(review): dropDuplicates gives no guarantee about WHICH row survives
    # for a repeated order_id -- confirm that any of them is acceptable.
    return (
        spark.table(table_silver)
        .dropDuplicates(["order_id"])
        .withColumn("approved_date", to_date(event_ts))
    )


# Prepared orders DataFrame (de-duplicated, with approved_date) that the Gold
# steps below take as input.
# NOTE(review): Silver_Orders is not defined in this notebook; presumably it
# is provided by the `%run ../Transversal/config` cell -- confirm.
silver_stream = data_preprocessing(table_silver=Silver_Orders)

StatementMeta(, eccefa97-dd8e-4132-a217-84543c132111, 7, Finished, Available, Finished)

In [0]:
from pyspark.sql.functions import col, count, sum as _sum, round


def performace_operation_batch(table_silver, table_gold):
    """Aggregate daily order volume and persist it as a Gold Delta table.

    Groups ``table_silver`` by ``approved_date`` and computes, per date, the
    count of ``order_id``, the sum of ``quantity_products`` and their ratio
    rounded to two decimals. The result overwrites ``table_gold``, schema
    included.

    Args:
        table_silver: DataFrame exposing ``approved_date``, ``order_id`` and
            ``quantity_products`` (a DataFrame despite the parameter name).
        table_gold: Name of the Delta table to overwrite.

    Returns:
        The aggregated DataFrame that was written.
    """
    orders_per_day = count("order_id").alias("total_orders")
    products_per_day = _sum("quantity_products").alias("total_products")
    # `round` is the Spark column function imported in this cell, not the
    # Python builtin.
    avg_per_order = round(col("total_products") / col("total_orders"), 2)

    daily_performance = (
        table_silver
        .groupBy("approved_date")
        .agg(orders_per_day, products_per_day)
        .withColumn("avg_products_per_order", avg_per_order)
    )

    # Full overwrite: both the data and the schema of the target are replaced.
    gold_writer = daily_performance.write.format("delta").mode("overwrite")
    gold_writer.option("overwriteSchema", "true").saveAsTable(table_gold)

    return daily_performance

StatementMeta(, ed8ffedc-de3b-4346-80a6-beb1a0003b67, 6, Finished, Available, Finished)

In [0]:
from pyspark.sql.functions import when


def inconsistencies_report(table_silver, table_gold):
    """Build the daily district-inconsistency report and persist it.

    For every ``approved_date`` the report contains the total number of
    orders, the number of orders whose ``district`` is missing or unknown,
    and the share of such orders rounded to three decimals. The result
    overwrites ``table_gold``, schema included.

    Both counts come from a single aggregation rather than two aggregations
    joined on ``approved_date``: an equi-join never matches NULL keys, so
    with a join the NULL-date group would always report zero errors.

    Args:
        table_silver: DataFrame exposing ``approved_date``, ``order_id`` and
            ``district`` (a DataFrame despite the parameter name).
        table_gold: Name of the Delta table to overwrite.

    Returns:
        The report DataFrame with columns ``approved_date``,
        ``total_orders``, ``total_error_orders`` and ``percent_errors``.
    """
    # Local import keeps the function independent of which cells ran before
    # it and avoids relying on `round` shadowing the Python builtin.
    from pyspark.sql import functions as F

    district = F.col("district")
    # A NULL never satisfies `==`, so real NULLs are tested explicitly in
    # addition to the placeholder strings.
    # NOTE(review): assumes a NULL district should count as an inconsistency,
    # like the "None" placeholder -- confirm with the data owner.
    has_bad_district = (
        district.isNull()
        | (district == "None")
        | (district == "DESCONOCIDA")
    )

    # count() skips NULLs and when() without otherwise() yields NULL for
    # non-matching rows, so this counts only the flagged orders and returns
    # 0 (never NULL) for dates without any.
    error_orders = F.count(F.when(has_bad_district, F.col("order_id")))

    error_ratio = F.when(
        F.col("total_orders") != 0,
        F.col("total_error_orders") / F.col("total_orders"),
    ).otherwise(0)

    df_final = (
        table_silver
        .groupBy("approved_date")
        .agg(
            F.count("order_id").alias("total_orders"),
            error_orders.alias("total_error_orders"),
        )
        .withColumn("percent_errors", F.round(error_ratio, 3))
    )

    # Full overwrite: both the data and the schema of the target are replaced.
    df_final.write\
    .format("delta")\
    .mode("overwrite")\
    .option("overwriteSchema", "true")\
    .saveAsTable(table_gold)

    return df_final

StatementMeta(, ed8ffedc-de3b-4346-80a6-beb1a0003b67, 4, Finished, Available, Finished)

In [0]:
# Build and persist the daily operations-performance Gold table.
# NOTE(review): Gold_Performance_Operations and Gold_Inconsistencies_Report
# are not defined in this notebook; presumably they are provided by the
# `%run ../Transversal/config` cell -- confirm.
performace_operation_batch(
    table_silver=silver_stream,
    table_gold=Gold_Performance_Operations
)

# Build and persist the inconsistencies Gold table. Being the last expression
# of the cell, its returned DataFrame is what the cell displays.
inconsistencies_report(
    table_silver=silver_stream,
    table_gold=Gold_Inconsistencies_Report
)

StatementMeta(, ed8ffedc-de3b-4346-80a6-beb1a0003b67, 7, Finished, Available, Finished)

DataFrame[approved_date: date, total_orders: bigint, total_error_orders: bigint, percent_errors: double]