In [1]:
import os
# Switch the working directory to the project folder and echo it.
# The path is a raw string because "\p" is not a valid escape sequence;
# ordinary string literals with such escapes trigger a warning on newer
# Python versions.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
PROJECT_DIR = r"H:\pyspark_advanced-coding_interview"
os.chdir(PROJECT_DIR)
print(os.getcwd())

from pyspark.sql import SparkSession

# Start a Spark session for this notebook (or reuse one that is already
# running). Only the application name is configured here.
spark = (
    SparkSession.builder
    .appName("OptimizedLocalSpark")
    .getOrCreate()
)
sc = spark.sparkContext  # the session's underlying SparkContext

H:\pyspark_advanced-coding_interview


In [3]:
from pyspark.sql import SparkSession

# Obtain the Spark session for the rollup example. getOrCreate() returns the
# already-running session when one exists, so after the first cell has run
# this mostly just rebinds `spark`.
spark = (
    SparkSession.builder
    .appName("RollupSubTotalGrandTotal")
    .getOrCreate()
)

# Sample sales rows, one tuple per row, in the column order listed below.
columns = ["TransactionDate", "ProductID", "Category", "Qty", "Price"]
data = [
    ("2022-09-01", 1, "A", 10, 100),
    ("2022-09-01", 2, "A", 5, 200),
    ("2022-09-02", 1, "B", 20, 150),
    ("2022-09-02", 2, "B", 15, 300),
]

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

df.show()


+---------------+---------+--------+---+-----+
|TransactionDate|ProductID|Category|Qty|Price|
+---------------+---------+--------+---+-----+
|     2022-09-01|        1|       A| 10|  100|
|     2022-09-01|        2|       A|  5|  200|
|     2022-09-02|        1|       B| 20|  150|
|     2022-09-02|        2|       B| 15|  300|
+---------------+---------+--------+---+-----+



In [4]:
# Expose the DataFrame to Spark SQL under the view name "sales".
df.createOrReplaceTempView("sales")

# ROLLUP(Category, ProductID) emits three levels of rows in one pass:
#   - one row per (Category, ProductID) pair,
#   - one subtotal row per Category (ProductID is NULL),
#   - one grand-total row (Category and ProductID are both NULL).
# GROUPING(col) is 1 on rows where `col` has been rolled up and 0 otherwise,
# which distinguishes total rows from detail rows.
rollup_query = """
    SELECT 
        Category,
        ProductID,
        SUM(Qty) AS TotalQty,
        SUM(Price) AS TotalPrice,
        GROUPING(Category) AS IsCategoryTotal,
        GROUPING(ProductID) AS IsProductTotal
    FROM sales
    GROUP BY ROLLUP(Category, ProductID)
    ORDER BY Category, ProductID
"""
result = spark.sql(rollup_query)

result.show()

+--------+---------+--------+----------+---------------+--------------+
|Category|ProductID|TotalQty|TotalPrice|IsCategoryTotal|IsProductTotal|
+--------+---------+--------+----------+---------------+--------------+
|    null|     null|      50|       750|              1|             1|
|       A|     null|      15|       300|              0|             1|
|       A|        1|      10|       100|              0|             0|
|       A|        2|       5|       200|              0|             0|
|       B|     null|      35|       450|              0|             1|
|       B|        1|      20|       150|              0|             0|
|       B|        2|      15|       300|              0|             0|
+--------+---------+--------+----------+---------------+--------------+



In [7]:
from pyspark.sql.functions import col, lit, sum as Fsum

# Build a "detail + subtotal + grand total" report with the DataFrame API:
# the original rows, one subtotal row per Category, and one grand-total row,
# stacked with a positional union and labelled through the "Type" column.


def _typed_null(column_name):
    """Return a NULL literal cast to the type `column_name` has in `df`.

    Typed placeholders give every branch of the union an explicit schema that
    matches the detail rows, instead of relying on Spark to coerce untyped
    NULL columns when the frames are unioned.
    """
    return lit(None).cast(df.schema[column_name].dataType)


# Aggregates shared by the subtotal and grand-total frames.
total_aggs = [Fsum("Qty").alias("TotalQty"), Fsum("Price").alias("TotalPrice")]

# Column order for the total frames; union() matches columns by position, so
# this must line up with the detail select below.
total_columns = ["TransactionDate", "ProductID", "Category", "TotalQty", "TotalPrice", "Type"]

# Subtotals: one row per Category, with date and product left NULL.
category_totals = (
    df.groupBy("Category")
    .agg(*total_aggs)
    .withColumn("TransactionDate", _typed_null("TransactionDate"))
    .withColumn("ProductID", _typed_null("ProductID"))
    .withColumn("Type", lit("Subtotal"))
)

# Grand total: a single row over the whole DataFrame, all key columns NULL.
grand_total = (
    df.agg(*total_aggs)
    .withColumn("TransactionDate", _typed_null("TransactionDate"))
    .withColumn("Category", _typed_null("Category"))
    .withColumn("ProductID", _typed_null("ProductID"))
    .withColumn("Type", lit("GrandTotal"))
)

# Tag the original rows so they can be told apart from the total rows.
df_with_type = df.withColumn("Type", lit("Detail"))

# Stack detail rows, subtotals and the grand total; the result keeps the
# column names of the first (detail) frame.
result_df = (
    df_with_type.select("TransactionDate", "ProductID", "Category", "Qty", "Price", "Type")
    .union(category_totals.select(*total_columns))
    .union(grand_total.select(*total_columns))
)

result_df.show()


+---------------+---------+--------+---+-----+----------+
|TransactionDate|ProductID|Category|Qty|Price|      Type|
+---------------+---------+--------+---+-----+----------+
|     2022-09-01|        1|       A| 10|  100|    Detail|
|     2022-09-01|        2|       A|  5|  200|    Detail|
|     2022-09-02|        1|       B| 20|  150|    Detail|
|     2022-09-02|        2|       B| 15|  300|    Detail|
|           null|     null|       A| 15|  300|  Subtotal|
|           null|     null|       B| 35|  450|  Subtotal|
|           null|     null|    null| 50|  750|GrandTotal|
+---------------+---------+--------+---+-----+----------+



In [6]:
# Bring every row back to the driver as a list of Row objects.
data = df.collect()

# Accumulators: per-category sums plus overall sums.
category_totals = {}
grand_total_qty = 0
grand_total_price = 0

# One pass over the rows feeds both the category buckets and the grand totals.
for record in data:
    qty, price = record["Qty"], record["Price"]

    # setdefault creates the zeroed bucket the first time a category appears.
    bucket = category_totals.setdefault(record["Category"], {"Qty": 0, "Price": 0})
    bucket["Qty"] += qty
    bucket["Price"] += price

    grand_total_qty += qty
    grand_total_price += price

# Report both levels of totals.
print("Category Totals:", category_totals)
print("Grand Total: Qty =", grand_total_qty, "Price =", grand_total_price)


Category Totals: {'A': {'Qty': 15, 'Price': 300}, 'B': {'Qty': 35, 'Price': 450}}
Grand Total: Qty = 50 Price = 750
