In [34]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast

In [2]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook:
# local master, Hive support enabled so catalog tables such as
# default.sales_fact can be read with spark.read.table.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Sales_processing")
    .enableHiveSupport()
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [3]:
# Load the sales fact table from the catalog as a DataFrame.
fact_df_read = spark.table("default.sales_fact")

# Preview the first 20 rows (long values truncated), i.e. the show() defaults.
fact_df_read.show(n=20, truncate=True)


+-----------+----------------+----------------+--------------+---------+----------+-----+----------+------------------+
|customer_id|transaction_date|  transaction_id|sales_agent_id|branch_id|product_id|units|unit_price|       final_price|
+-----------+----------------+----------------+--------------+---------+----------+-----+----------+------------------+
|      85469|      2023-05-20|trx-152546429674|             1|        2|        22|   10|     79.99|             799.9|
|      85512|      2022-10-25|trx-291375327542|             3|        1|        24|    5|     49.99|199.95999925509096|
|      85484|      2022-02-05|trx-312507679871|            10|        3|         4|    1|     99.99|             99.99|
|      85528|      2023-10-20|trx-193384855491|             7|        2|        25|    8|    499.99|           3999.92|
|      85500|      2022-11-17|trx-831626097654|             5|        1|        14|   10|    399.99| 3399.914976158738|
|      85545|      2022-09-27|trx-158496

In [4]:
# Load the sales-agents dimension.
# The refresh must target the same table that is read below; previously it
# refreshed default.agents_dim while the read used default.agents_dim2, so
# the refresh did not apply to the table actually being loaded.
agents_dim_table = "default.agents_dim2"
spark.sql(f"REFRESH TABLE {agents_dim_table}")
agents_dim = spark.read.table(agents_dim_table)

# Show the first few rows of the DataFrame
agents_dim.show()


+---------------+------------------+----------+
|sales_person_id|              name| hire_date|
+---------------+------------------+----------+
|              1|          John Doe|2020-06-03|
|              2|        Jane Smith|2018-05-13|
|              3|   Michael Johnson|2021-10-03|
|              4|       Emily Brown|2020-10-25|
|              5|      David Wilson|2021-04-08|
|              6|       Emma Taylor|2019-03-28|
|              7|Christopher Miller|2020-01-11|
|              8|      Olivia Davis|2021-10-24|
|              9|   Daniel Martinez|2018-10-08|
|             10|      Sophia Moore|2019-05-25|
+---------------+------------------+----------+



In [5]:
# Load the products dimension table from the catalog as a DataFrame.
products_dim = spark.table("default.products_dim2")

# Preview the first 20 rows (long values truncated), i.e. the show() defaults.
products_dim.show(n=20, truncate=True)


+----------+-----------------+----------------+
|product_id|     product_name|product_category|
+----------+-----------------+----------------+
|        29|Hair Straightener|      Appliances|
|        30|  Electric Kettle|      Appliances|
|        25|  Washing Machine|      Appliances|
|        26|   Vacuum Cleaner|      Appliances|
|        22|     Coffee Maker|      Appliances|
|         2|       Smartphone|     Electronics|
|         4|       Headphones|     Electronics|
|        28|       Hair Dryer|      Appliances|
|        23|          Toaster|      Appliances|
|        14|           Camera|     Electronics|
|        24|          Blender|      Appliances|
|         3|           Tablet|     Electronics|
|        19|          Sandals|        Footwear|
|         5|          T-Shirt|        Clothing|
|        15|           Hoodie|        Clothing|
|        17|           Blouse|        Clothing|
|        27|             Iron|      Appliances|
|         9|            Boots|        Fo

In [24]:
# Total units sold for every (sales agent, product) pair in the fact table.
# The aggregate is reached through the module alias F (imported in the first
# cell) instead of `from pyspark.sql.functions import sum`, which would
# shadow Python's built-in sum() for the rest of the kernel session.
result_df = fact_df_read.groupBy("sales_agent_id", "product_id").agg(
    F.sum("units").alias("total_units_sold")
)

In [25]:
# Preview the aggregated units per agent/product (first 20 rows, truncated values).
result_df.show(n=20, truncate=True)


+--------------+----------+----------------+
|sales_agent_id|product_id|total_units_sold|
+--------------+----------+----------------+
|             3|        30|              48|
|             3|        15|              28|
|            10|         2|              32|
|             5|        16|              30|
|             9|        10|              26|
|             4|        10|              40|
|             9|        16|               6|
|             6|        20|              32|
|             6|         1|              22|
|             3|         1|              42|
|             7|         4|              52|
|             2|         2|              94|
|             1|        25|              30|
|             9|         4|              50|
|             6|        22|               8|
|             8|         3|              18|
|             2|        26|              20|
|             6|        25|               4|
|            10|        15|              10|
|         

In [26]:
# Join the aggregated sales with the agents dimension.
# The agents dimension is a small table, so it is broadcast to avoid
# shuffling the larger side of the join.
result = result_df.join(
    broadcast(agents_dim),
    on=result_df.sales_agent_id == agents_dim.sales_person_id,
)

In [27]:
# Preview the sales aggregates enriched with agent details (first 20 rows).
result.show(n=20, truncate=True)

+--------------+----------+----------------+---------------+------------------+----------+
|sales_agent_id|product_id|total_units_sold|sales_person_id|              name| hire_date|
+--------------+----------+----------------+---------------+------------------+----------+
|             3|        30|              48|              3|   Michael Johnson|2021-10-03|
|             3|        15|              28|              3|   Michael Johnson|2021-10-03|
|            10|         2|              32|             10|      Sophia Moore|2019-05-25|
|             5|        16|              30|              5|      David Wilson|2021-04-08|
|             9|        10|              26|              9|   Daniel Martinez|2018-10-08|
|             4|        10|              40|              4|       Emily Brown|2020-10-25|
|             9|        16|               6|              9|   Daniel Martinez|2018-10-08|
|             6|        20|              32|              6|       Emma Taylor|2019-03-28|

In [28]:
# Row count of the products dimension; the next cell treats it as a small
# table and broadcasts it in the join.
products_dim.count()

30

In [29]:
# The products dimension is also a small table, so it is broadcast as well.
# Because the join condition is an expression (not a column name), the
# result keeps both product_id columns: one from each side of the join.
final_report = result.join(
    broadcast(products_dim),
    on=result.product_id == products_dim.product_id,
    how="inner",
)

In [30]:
# Preview the fully joined report (first 20 rows, truncated values).
final_report.show(n=20, truncate=True)

+--------------+----------+----------------+---------------+------------------+----------+----------+---------------+----------------+
|sales_agent_id|product_id|total_units_sold|sales_person_id|              name| hire_date|product_id|   product_name|product_category|
+--------------+----------+----------------+---------------+------------------+----------+----------+---------------+----------------+
|             3|        30|              48|              3|   Michael Johnson|2021-10-03|        30|Electric Kettle|      Appliances|
|             3|        15|              28|              3|   Michael Johnson|2021-10-03|        15|         Hoodie|        Clothing|
|            10|         2|              32|             10|      Sophia Moore|2019-05-25|         2|     Smartphone|     Electronics|
|             5|        16|              30|              5|      David Wilson|2021-04-08|        16|          Skirt|        Clothing|
|             9|        10|              26|           

In [33]:
# Keep only the columns needed for the exported report:
# agent name, product name and the total units sold.
selected_df = final_report.select(["name", "product_name", "total_units_sold"])

# Preview the report (first 20 rows, truncated values).
selected_df.show(n=20, truncate=True)


+------------------+---------------+----------------+
|              name|   product_name|total_units_sold|
+------------------+---------------+----------------+
|   Michael Johnson|Electric Kettle|              48|
|   Michael Johnson|         Hoodie|              28|
|      Sophia Moore|     Smartphone|              32|
|      David Wilson|          Skirt|              30|
|   Daniel Martinez|        Sandals|              26|
|       Emily Brown|        Sandals|              40|
|   Daniel Martinez|          Skirt|               6|
|       Emma Taylor|          Heels|              32|
|       Emma Taylor|         Laptop|              22|
|   Michael Johnson|         Laptop|              42|
|Christopher Miller|     Headphones|              52|
|        Jane Smith|     Smartphone|              94|
|          John Doe|Washing Machine|              30|
|   Daniel Martinez|     Headphones|              50|
|       Emma Taylor|   Coffee Maker|               8|
|      Olivia Davis|        

In [37]:
# Stamp the output location with the current local time so that each run
# writes to its own path.
current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"file:///data/output_{current_timestamp}.csv"

# Write the report as CSV with a header row, replacing any existing output
# at the same path.
# NOTE(review): Spark writes CSV output as a directory of part files, so
# output_path is expected to be a directory despite the ".csv" suffix.
(
    selected_df.write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

<hr>