# How to dynamically convert rows into columns | Dynamic Pivot

In [1]:
import os
# Switch the kernel's working directory to the folder below, then display it.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
# A raw string is used because "\p" is not a valid escape sequence and newer
# Python versions emit a warning for it in an ordinary string literal.
os.chdir(r"H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# PySpark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Reuse the active Spark session when one exists, otherwise start a new one
spark = (
    SparkSession.builder
    .appName("DynamicPivotExample")
    .getOrCreate()
)

# Column layout of the sales records: Year (int), Month (string), SalesAmount (double);
# every column is declared nullable
schema = (
    StructType()
    .add("Year", IntegerType(), True)
    .add("Month", StringType(), True)
    .add("SalesAmount", DoubleType(), True)
)

# Sample sales figures grouped per year; dict insertion order keeps 2023 ahead of 2022
sales_by_year = {
    2023: [
        ("January", 1000.0),
        ("February", 1500.0),
        ("March", 1800.0),
        ("April", 1100.0),
        ("May", 1300.0),
        ("June", 950.0),
        ("July", 1250.0),
        ("August", 1450.0),
        ("September", 1150.0),
        ("October", 1500.0),
        ("November", 1400.0),
        ("December", 1600.0),
    ],
    2022: [
        ("January", 900.0),
        ("February", 1400.0),
        ("March", 1700.0),
    ],
}

# Flatten into (Year, Month, SalesAmount) tuples, one per record
data = [
    (year, month, amount)
    for year, monthly_sales in sales_by_year.items()
    for month, amount in monthly_sales
]

# Build the DataFrame against the explicit schema and print every row untruncated
df = spark.createDataFrame(data, schema)
df.show(truncate=False)


+----+---------+-----------+
|Year|Month    |SalesAmount|
+----+---------+-----------+
|2023|January  |1000.0     |
|2023|February |1500.0     |
|2023|March    |1800.0     |
|2023|April    |1100.0     |
|2023|May      |1300.0     |
|2023|June     |950.0      |
|2023|July     |1250.0     |
|2023|August   |1450.0     |
|2023|September|1150.0     |
|2023|October  |1500.0     |
|2023|November |1400.0     |
|2023|December |1600.0     |
|2022|January  |900.0      |
|2022|February |1400.0     |
|2022|March    |1700.0     |
+----+---------+-----------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 62038)
Traceback (most recent call last):
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\spark\python\pyspark\accumulators.py", line 281, in handle
    poll(accum_updates)
  File "C:\spark\python\pyspark\accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "C:\spark\python\py

In [2]:
# Register the DataFrame as a temporary view named "SalesData" so that SQL text
# passed to spark.sql() can refer to it by that name
df.createOrReplaceTempView("SalesData")


In [3]:
# Calendar order of the month names; used both as the explicit pivot column list
# and as the sort order for the dynamically collected months below
month_order = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]

# Perform a static pivot using DataFrame API: one row per Year, one summed
# SalesAmount column per listed month
static_pivot_df = df.groupBy("Year").pivot("Month", month_order).sum("SalesAmount")
static_pivot_df.show(truncate=False)

# Collect unique months dynamically. NULL months are skipped: they cannot match
# an equality predicate, so they would never produce a usable pivot column.
distinct_months = [
    row["Month"]
    for row in df.select("Month").distinct().collect()
    if row["Month"] is not None
]

# distinct() gives no ordering guarantee, so sort explicitly to keep the column
# order of anything built from this list stable across runs: known month names
# in calendar order first, any other value afterwards in alphabetical order.
month_rank = {name: position for position, name in enumerate(month_order)}
unique_months = sorted(
    distinct_months,
    key=lambda name: (month_rank.get(name, len(month_order)), name),
)

print("Unique Months:", unique_months)

+----+-------+--------+------+------+------+-----+------+------+---------+-------+--------+--------+
|Year|January|February|March |April |May   |June |July  |August|September|October|November|December|
+----+-------+--------+------+------+------+-----+------+------+---------+-------+--------+--------+
|2023|1000.0 |1500.0  |1800.0|1100.0|1300.0|950.0|1250.0|1450.0|1150.0   |1500.0 |1400.0  |1600.0  |
|2022|900.0  |1400.0  |1700.0|null  |null  |null |null  |null  |null     |null   |null    |null    |
+----+-------+--------+------+------+------+-----+------+------+---------+-------+--------+--------+

Unique Months: ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']


In [4]:
# Build one conditional aggregate per collected month value.
# NOTE: MAX gives the same result as the sum-based DataFrame pivot only while
# each (Year, Month) pair has a single row.
pivot_columns = []
for month in unique_months:
    if month is None:
        # A NULL month can never satisfy `Month = ...`, so it gets no column
        continue
    # The value comes from the data, so escape it before embedding it in SQL text:
    # backslash-escape `\` and `'` for the single-quoted string literal ...
    month_literal = month.replace("\\", "\\\\").replace("'", "\\'")
    # ... and double any backtick for the back-quoted column alias
    month_alias = month.replace("`", "``")
    pivot_columns.append(
        f"MAX(CASE WHEN Month = '{month_literal}' THEN SalesAmount ELSE NULL END) AS `{month_alias}`"
    )

# Create dynamic SQL query. Joining "Year" with the generated columns keeps the
# statement valid (no dangling comma) even when no month values were collected.
select_list = ", ".join(["Year"] + pivot_columns)
pivot_query = f"""
SELECT {select_list}
FROM SalesData
GROUP BY Year
"""

# Execute the dynamic pivot query
dynamic_pivot_df = spark.sql(pivot_query)
dynamic_pivot_df.show(truncate=False)

+----+-------+--------+------+------+------+-----+------+------+---------+-------+--------+--------+
|Year|January|February|March |April |May   |June |July  |August|September|October|November|December|
+----+-------+--------+------+------+------+-----+------+------+---------+-------+--------+--------+
|2023|1000.0 |1500.0  |1800.0|1100.0|1300.0|950.0|1250.0|1450.0|1150.0   |1500.0 |1400.0  |1600.0  |
|2022|900.0  |1400.0  |1700.0|null  |null  |null |null  |null  |null     |null   |null    |null    |
+----+-------+--------+------+------+------+-----+------+------+---------+-------+--------+--------+



In [6]:
from pyspark.sql.functions import collect_list

# Distinct month names, ordered by name, brought back to the driver as a plain list
month_rows = df.select("Month").distinct().orderBy("Month").collect()
months_list = [row["Month"] for row in month_rows]

# Pivot on exactly those values: one row per Year, summed SalesAmount per month
dynamic_pivot_df_api = (
    df.groupBy("Year")
    .pivot("Month", months_list)
    .sum("SalesAmount")
)
dynamic_pivot_df_api.show(truncate=False)


+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|Year|April |August|December|February|January|July  |June |March |May   |November|October|September|
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|2023|1100.0|1450.0|1600.0  |1500.0  |1000.0 |1250.0|950.0|1800.0|1300.0|1400.0  |1500.0 |1150.0   |
|2022|null  |null  |null    |1400.0  |900.0  |null  |null |1700.0|null  |null    |null   |null     |
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+



In [8]:
from pyspark.sql.functions import expr

# Collect unique month names dynamically, ordered by name. NULL months are
# skipped: they cannot match the equality predicate built below.
months = [
    row["Month"]
    for row in df.select("Month").distinct().orderBy("Month").collect()
    if row["Month"] is not None
]
print("Unique Months:", months)

# Construct aggregation expressions dynamically: month name -> aliased Column
aggregations = {}
for month in months:
    # The value comes from the data, so backslash-escape `\` and `'` before
    # embedding it in the single-quoted SQL string literal parsed by expr()
    month_literal = month.replace("\\", "\\\\").replace("'", "\\'")
    aggregations[month] = expr(
        f"MAX(CASE WHEN Month = '{month_literal}' THEN SalesAmount ELSE NULL END)"
    ).alias(month)

# Create a pivot-like DataFrame by aggregating each month dynamically
df_dynamic_agg = df.groupBy("Year").agg(*aggregations.values())
df_dynamic_agg.show(truncate=False)


Unique Months: ['April', 'August', 'December', 'February', 'January', 'July', 'June', 'March', 'May', 'November', 'October', 'September']
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|Year|April |August|December|February|January|July  |June |March |May   |November|October|September|
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|2023|1100.0|1450.0|1600.0  |1500.0  |1000.0 |1250.0|950.0|1800.0|1300.0|1400.0  |1500.0 |1150.0   |
|2022|null  |null  |null    |1400.0  |900.0  |null  |null |1700.0|null  |null    |null   |null     |
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+



In [9]:
from pyspark.sql.functions import expr

# Construct aggregation expressions dynamically from the previously collected
# `months` list: month name -> aliased Column
aggregations = {}
for month in months:
    if month is None:
        # A NULL month can never satisfy `Month = ...`, so it gets no column
        continue
    # The value comes from the data, so backslash-escape `\` and `'` before
    # embedding it in the single-quoted SQL string literal parsed by expr()
    month_literal = month.replace("\\", "\\\\").replace("'", "\\'")
    aggregations[month] = expr(
        f"MAX(CASE WHEN Month = '{month_literal}' THEN SalesAmount ELSE NULL END)"
    ).alias(month)

# Create a pivot-like DataFrame by aggregating each month dynamically
df_dynamic_agg = df.groupBy("Year").agg(*aggregations.values())
df_dynamic_agg.show(truncate=False)


+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|Year|April |August|December|February|January|July  |June |March |May   |November|October|September|
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|2023|1100.0|1450.0|1600.0  |1500.0  |1000.0 |1250.0|950.0|1800.0|1300.0|1400.0  |1500.0 |1150.0   |
|2022|null  |null  |null    |1400.0  |900.0  |null  |null |1700.0|null  |null    |null   |null     |
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+



In [None]:
# Key every record by Year, then gather each year's (Month, SalesAmount) pairs
# into a dict of month -> amount
year_month_pairs = df.rdd.map(lambda row: (row.Year, (row.Month, row.SalesAmount)))
rdd_pivot = year_month_pairs.groupByKey().map(lambda kv: (kv[0], dict(kv[1])))

# One output tuple per Year: the year followed by the amount for each name in
# `months` (None where that year has no entry for the month)
pivot_rows = rdd_pivot.map(
    lambda kv: tuple([kv[0]] + [kv[1].get(month) for month in months])
)

# Transform back to a DataFrame whose columns are "Year" plus the month names
pivoted_df = spark.createDataFrame(pivot_rows, schema=["Year"] + months)
pivoted_df.show(truncate=False)


# Spark SQL

In [10]:

# Distinct month names ordered by name. NULL months are skipped: they cannot
# match the equality predicate built below.
months = [
    row["Month"]
    for row in df.select("Month").distinct().orderBy("Month").collect()
    if row["Month"] is not None
]

# Build one conditional aggregate per collected month value.
# NOTE: MAX gives the same result as the sum-based DataFrame pivot only while
# each (Year, Month) pair has a single row.
pivot_columns = []
for month in months:
    # The value comes from the data, so escape it before embedding it in SQL text:
    # backslash-escape `\` and `'` for the single-quoted string literal ...
    month_literal = month.replace("\\", "\\\\").replace("'", "\\'")
    # ... and double any backtick for the back-quoted column alias
    month_alias = month.replace("`", "``")
    pivot_columns.append(
        f"MAX(CASE WHEN Month = '{month_literal}' THEN SalesAmount ELSE NULL END) AS `{month_alias}`"
    )

# Create dynamic SQL query using the collected unique months. Joining "Year"
# with the generated columns keeps the statement valid (no dangling comma)
# even when no month values were collected.
select_list = ", ".join(["Year"] + pivot_columns)
pivot_query = f"""
SELECT {select_list}
FROM SalesData
GROUP BY Year
"""

# Execute the dynamic pivot query using Spark SQL; the view is (re)registered
# here so the query can resolve the SalesData name
df.createOrReplaceTempView("SalesData")
dynamic_sql_pivot = spark.sql(pivot_query)
dynamic_sql_pivot.show(truncate=False)


+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|Year|April |August|December|February|January|July  |June |March |May   |November|October|September|
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+
|2023|1100.0|1450.0|1600.0  |1500.0  |1000.0 |1250.0|950.0|1800.0|1300.0|1400.0  |1500.0 |1150.0   |
|2022|null  |null  |null    |1400.0  |900.0  |null  |null |1700.0|null  |null    |null   |null     |
+----+------+------+--------+--------+-------+------+-----+------+------+--------+-------+---------+

