#### Create an aggregate table that shows profit by:
- Year
- Product Category
- Product Sub Category
- Customer
#### Using SQL, output the following aggregates:
- Profit by Year
- Profit by Year + Product Category
- Profit by Customer
- Profit by Customer + Year


In [0]:
# Read the enriched silver-layer table into a Spark DataFrame.
enrich_df = (
    spark.read
    .table("pei_adb_proj.silver.enriched_silver_tb")
)

In [0]:
# Render the loaded DataFrame in the notebook output for a quick visual check.
# NOTE(review): `.display()` is presumably the Databricks notebook helper
# rather than a core PySpark method -- confirm before running elsewhere.
enrich_df.display()

In [0]:
# NOTE: this import rebinds the names `sum` and `round` to the Spark column
# functions, shadowing the Python builtins for the rest of the notebook.
from pyspark.sql.functions import year,sum,col,round

# Add a "Year" column extracted from Order_Date; it is used as a grouping key
# by the aggregation that follows.
enrich_df = enrich_df.withColumn(
    "Year",
    year(col("Order_Date")),
)

In [0]:
# Total profit per (Year, Category, Sub_Category, customer), rounded to
# 2 decimal places. `sum` and `round` here are the pyspark.sql.functions
# versions imported earlier in the notebook, not the Python builtins.
agg_df = (
    enrich_df
    .groupBy(
        "Year",
        "Category",
        "Sub_Category",
        "Customer_ID",
        "Customer_Name",
    )
    .agg(round(sum("Profit"), 2).alias("Total_Profit"))
)

# Show the aggregated rows in the notebook output.
agg_df.display()

#### Test Cases for Aggregated Details

In [0]:
# Test Case 1: every expected column must be present in the aggregate.
# This is a subset check, so additional columns do not fail the test.
expected_columns = {
    "Year",
    "Category",
    "Sub_Category",
    "Customer_ID",
    "Customer_Name",
    "Total_Profit",
}
assert expected_columns <= set(agg_df.columns), "Test Case 1 Failed: Schema is missing expected columns."
print("OK Test Case 1 Passed: Schema is correct.")


In [0]:
# Test Case 2: no aggregated row may carry a null Total_Profit.
null_profits = agg_df.where(col("Total_Profit").isNull())
assert (
    null_profits.count() == 0
), "Test Case 2 Failed: Null Total_Profit found in aggregated data."
print("OK Test Case 2 Passed: No nulls in Total_Profit.")


In [0]:
# Test Case 3: no aggregated row may carry a null Year.
null_years = agg_df.where(col("Year").isNull())
assert (
    null_years.count() == 0
), "Test Case 3 Failed: Null values found in Year field."
print("OK Test Case 3 Passed: Year field does not contain nulls.")


In [0]:
# Test Case 4: the aggregate must contain at least one row.
record_count = agg_df.count()
assert record_count >= 1, "Test Case 4 Failed: Aggregated table is empty."
print("OK Test Case 4 Passed: Aggregated table contains records.")


In [0]:
# Persist the aggregate as a Delta table in the gold schema; mode "overwrite"
# replaces whatever the table currently holds.
(
    agg_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("pei_adb_proj.gold.profit_aggregate_gold")
)