In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# Calculate Yearly, Quarterly, Monthly totals in a single SQL Query |Grouping Sets

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, quarter

# Initialize Spark Session
spark = SparkSession.builder.appName("ComplexAggregationExample").getOrCreate()

# Sample Data: Sales records with date, product, and sales amount
data = [
    (1, "2023-01-15", "Product A", 1500),
    (2, "2023-01-20", "Product B", 2000),
    (3, "2023-02-11", "Product A", 3000),
    (4, "2023-02-28", "Product C", 2500),
    (5, "2023-03-05", "Product B", 3200),
    (6, "2023-03-20", "Product C", 2700),
    (7, "2023-04-05", "Product A", 4000),
    (8, "2023-05-15", "Product B", 2300),
    (9, "2023-06-10", "Product C", 2900),
    (10, "2023-07-22", "Product A", 1500),
    (11, "2023-08-18", "Product B", 3300),
    (12, "2023-09-25", "Product C", 3400),
    (13, "2023-10-10", "Product A", 1900),
    (14, "2023-11-21", "Product B", 2500),
    (15, "2023-12-15", "Product C", 2700),
]

# Create DataFrame
df = spark.createDataFrame(data, ["sale_id", "sale_date", "product", "amount"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("sales_data")

# Show the Original DataFrame
df.show(truncate=False)


+-------+----------+---------+------+
|sale_id|sale_date |product  |amount|
+-------+----------+---------+------+
|1      |2023-01-15|Product A|1500  |
|2      |2023-01-20|Product B|2000  |
|3      |2023-02-11|Product A|3000  |
|4      |2023-02-28|Product C|2500  |
|5      |2023-03-05|Product B|3200  |
|6      |2023-03-20|Product C|2700  |
|7      |2023-04-05|Product A|4000  |
|8      |2023-05-15|Product B|2300  |
|9      |2023-06-10|Product C|2900  |
|10     |2023-07-22|Product A|1500  |
|11     |2023-08-18|Product B|3300  |
|12     |2023-09-25|Product C|3400  |
|13     |2023-10-10|Product A|1900  |
|14     |2023-11-21|Product B|2500  |
|15     |2023-12-15|Product C|2700  |
+-------+----------+---------+------+



In [3]:
res = spark.sql(""" 
                
SELECT 
    year(sale_date) AS year, 
    quarter(sale_date) AS quarter, 
    month(sale_date) AS month, 
    SUM(amount) AS total_sales
FROM sales_data
GROUP BY GROUPING SETS (
    (year(sale_date)),                 
    (year(sale_date), quarter(sale_date)), 
    (year(sale_date), month(sale_date))    
)
ORDER BY year, quarter, month;



              
                """)
res.show()

+----+-------+-----+-----------+
|year|quarter|month|total_sales|
+----+-------+-----+-----------+
|2023|   null| null|      39400|
|2023|   null|    1|       3500|
|2023|   null|    2|       5500|
|2023|   null|    3|       5900|
|2023|   null|    4|       4000|
|2023|   null|    5|       2300|
|2023|   null|    6|       2900|
|2023|   null|    7|       1500|
|2023|   null|    8|       3300|
|2023|   null|    9|       3400|
|2023|   null|   10|       1900|
|2023|   null|   11|       2500|
|2023|   null|   12|       2700|
|2023|      1| null|      14900|
|2023|      2| null|       9200|
|2023|      3| null|       8200|
|2023|      4| null|       7100|
+----+-------+-----+-----------+



In [6]:
#rollup("year", "quarter", "month") will group by year, then by year and quarter, and finally by year, quarter, and month.

from pyspark.sql.functions import sum as _sum

# Add columns for year, quarter, and month
df_with_periods = df.withColumn("year", year("sale_date")) \
                    .withColumn("quarter", quarter("sale_date")) \
                    .withColumn("month", month("sale_date"))

# Use rollup to simulate GROUPING SETS and calculate totals
df_grouped = df_with_periods.rollup("year", "quarter", "month").agg(_sum("amount").alias("total_sales")).orderBy("year", "quarter", "month")

# Show the result
df_grouped.show(truncate=False)



+----+-------+-----+-----------+
|year|quarter|month|total_sales|
+----+-------+-----+-----------+
|null|null   |null |39400      |
|2023|null   |null |39400      |
|2023|1      |null |14900      |
|2023|1      |1    |3500       |
|2023|1      |2    |5500       |
|2023|1      |3    |5900       |
|2023|2      |null |9200       |
|2023|2      |4    |4000       |
|2023|2      |5    |2300       |
|2023|2      |6    |2900       |
|2023|3      |null |8200       |
|2023|3      |7    |1500       |
|2023|3      |8    |3300       |
|2023|3      |9    |3400       |
|2023|4      |null |7100       |
|2023|4      |10   |1900       |
|2023|4      |11   |2500       |
|2023|4      |12   |2700       |
+----+-------+-----+-----------+



In [5]:
#Produces all possible combinations of the group-by keys. It can be more comprehensive than rollup if you need deeper insights.

# Use cube to get comprehensive combinations for year, quarter, and month
df_cubed = df_with_periods.cube("year", "quarter", "month").agg(_sum("amount").alias("total_sales")).orderBy("year", "quarter", "month")

# Show the result
df_cubed.show(truncate=False)


+----+-------+-----+-----------+
|year|quarter|month|total_sales|
+----+-------+-----+-----------+
|null|null   |null |39400      |
|null|null   |1    |3500       |
|null|null   |2    |5500       |
|null|null   |3    |5900       |
|null|null   |4    |4000       |
|null|null   |5    |2300       |
|null|null   |6    |2900       |
|null|null   |7    |1500       |
|null|null   |8    |3300       |
|null|null   |9    |3400       |
|null|null   |10   |1900       |
|null|null   |11   |2500       |
|null|null   |12   |2700       |
|null|1      |null |14900      |
|null|1      |1    |3500       |
|null|1      |2    |5500       |
|null|1      |3    |5900       |
|null|2      |null |9200       |
|null|2      |4    |4000       |
|null|2      |5    |2300       |
+----+-------+-----+-----------+
only showing top 20 rows



In [8]:
# Calculate totals separately and combine using union
from pyspark.sql.functions import sum as _sum, lit
yearly_totals = df_with_periods.groupBy("year").agg(_sum("amount").alias("total_sales")).withColumn("quarter", col("year") * lit(None)).withColumn("month", col("year") * lit(None))
quarterly_totals = df_with_periods.groupBy("year", "quarter").agg(_sum("amount").alias("total_sales")).withColumn("month", col("year") * lit(None))
monthly_totals = df_with_periods.groupBy("year", "month").agg(_sum("amount").alias("total_sales")).withColumn("quarter", col("year") * lit(None))

# Combine all totals
combined_totals = yearly_totals.union(quarterly_totals).union(monthly_totals).orderBy("year", "quarter", "month")

# Show the result
combined_totals.show(truncate=False)


+----+-----------+-------+-----+
|year|total_sales|quarter|month|
+----+-----------+-------+-----+
|2023|39400      |null   |null |
|2023|7          |1500   |null |
|2023|10         |1900   |null |
|2023|5          |2300   |null |
|2023|11         |2500   |null |
|2023|12         |2700   |null |
|2023|6          |2900   |null |
|2023|8          |3300   |null |
|2023|9          |3400   |null |
|2023|1          |3500   |null |
|2023|4          |4000   |null |
|2023|2          |5500   |null |
|2023|3          |5900   |null |
|2023|4          |7100   |null |
|2023|3          |8200   |null |
|2023|2          |9200   |null |
|2023|1          |14900  |null |
+----+-----------+-------+-----+

