In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("UDTFs") \
    .master("local[*]") \
    .enableHiveSupport() \
    .getOrCreate()

## Stack

In [14]:
# Stacks
# Sample wide data
metrics_data = [
    ("server1", 80.5, 65.2, 45.8, 12.1),
    ("server2", 75.3, 70.1, 50.2, 15.5)
]
metrics_df = spark.createDataFrame(metrics_data, ["server", "cpu", "memory", "disk", "network"])

# Using stack() in SQL parameters are number of pairs then value pairs
metrics_df.createOrReplaceTempView("metrics")
spark.sql("""
    SELECT server, metric_name, metric_value, metric_flag
    FROM metrics
    LATERAL VIEW stack(4, 
        'cpu_usage', cpu, case when cpu < 70 then 'low' else 'high' end,
        'memory_usage', memory, case when memory < 60 then 'low' else 'high' end,
        'disk_usage', disk, case when disk < 50 then 'low' else 'high' end,
        'network_usage', network, case when network < 15 then 'low' else 'high' end
    ) stacked AS metric_name, metric_value, metric_flag
""").show()

+-------+-------------+------------+-----------+
| server|  metric_name|metric_value|metric_flag|
+-------+-------------+------------+-----------+
|server1|    cpu_usage|        80.5|       high|
|server1| memory_usage|        65.2|       high|
|server1|   disk_usage|        45.8|        low|
|server1|network_usage|        12.1|        low|
|server2|    cpu_usage|        75.3|       high|
|server2| memory_usage|        70.1|       high|
|server2|   disk_usage|        50.2|       high|
|server2|network_usage|        15.5|       high|
+-------+-------------+------------+-----------+



In [4]:
# DataFrame API equivalent (more verbose)
from pyspark.sql.functions import lit, array, struct, explode, col
metrics_df.select(
    "server",
    explode(array(
        struct(lit("cpu_usage").alias("metric_name"), col("cpu").alias("metric_value")),
        struct(lit("memory_usage").alias("metric_name"), col("memory").alias("metric_value")),
        struct(lit("disk_usage").alias("metric_name"), col("disk").alias("metric_value")),
        struct(lit("network_usage").alias("metric_name"), col("network").alias("metric_value"))
    )).alias("metric")
).select("server", "metric.*").show()

+-------+-------------+------------+
| server|  metric_name|metric_value|
+-------+-------------+------------+
|server1|    cpu_usage|        80.5|
|server1| memory_usage|        65.2|
|server1|   disk_usage|        45.8|
|server1|network_usage|        12.1|
|server2|    cpu_usage|        75.3|
|server2| memory_usage|        70.1|
|server2|   disk_usage|        50.2|
|server2|network_usage|        15.5|
+-------+-------------+------------+



In [11]:
from pyspark.sql.functions import stack
# Using stack() in DataFrame API
metrics_df.select('*', 
        stack(lit(4),
            lit("cpu_usage"), col("cpu"), lit(1), 
            lit("memory_usage"), col("memory"), lit(1),
            lit("disk_usage"), col("disk"), lit(1),
            lit("network_usage"), col("network"), lit(1)
        ).alias("metric_name", "metric_value", "sdf")
).show(10, truncate=False)

+-------+----+------+----+-------+-------------+------------+---+
|server |cpu |memory|disk|network|metric_name  |metric_value|sdf|
+-------+----+------+----+-------+-------------+------------+---+
|server1|80.5|65.2  |45.8|12.1   |cpu_usage    |80.5        |1  |
|server1|80.5|65.2  |45.8|12.1   |memory_usage |65.2        |1  |
|server1|80.5|65.2  |45.8|12.1   |disk_usage   |45.8        |1  |
|server1|80.5|65.2  |45.8|12.1   |network_usage|12.1        |1  |
|server2|75.3|70.1  |50.2|15.5   |cpu_usage    |75.3        |1  |
|server2|75.3|70.1  |50.2|15.5   |memory_usage |70.1        |1  |
|server2|75.3|70.1  |50.2|15.5   |disk_usage   |50.2        |1  |
|server2|75.3|70.1  |50.2|15.5   |network_usage|15.5        |1  |
+-------+----+------+----+-------+-------------+------------+---+



## JSON

In [15]:
from pyspark.sql.functions import *

json_data = [
    ("Alice", '{"age": 25, "city": "NYC", "salary": 50000}'),
    ("Bob", '{"age": 30, "city": "LA", "salary": 60000}')
]
json_df = spark.createDataFrame(json_data, ["name", "json_str"])

In [17]:
from pyspark.sql.types import *

json_schema = StructType([
    StructField("age", IntegerType(), False),
    StructField("city", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("always_null", IntegerType(), True)
])
df_from = json_df.select('*', from_json(col('json_str'), json_schema).alias('json_data'))

df_from.select('name', 'json_data.*').show(20, truncate=False)

+-----+---+----+------+-----------+
|name |age|city|salary|always_null|
+-----+---+----+------+-----------+
|Alice|25 |NYC |50000 |NULL       |
|Bob  |30 |LA  |60000 |NULL       |
+-----+---+----+------+-----------+



In [None]:

ls_col = ['age', 'city', 'salary', 'always_null']
df_tuple = json_df.select('*', json_tuple(col('json_str'), *ls_col).alias(*ls_col))

df_tuple.show(20, truncate=False)

+-----+-------------------------------------------+---+----+------+-----------+
|name |json_str                                   |age|city|salary|always_null|
+-----+-------------------------------------------+---+----+------+-----------+
|Alice|{"age": 25, "city": "NYC", "salary": 50000}|25 |NYC |50000 |NULL       |
|Bob  |{"age": 30, "city": "LA", "salary": 60000} |30 |LA  |60000 |NULL       |
+-----+-------------------------------------------+---+----+------+-----------+



In [None]:

ls_col = ['age', 'city', 'salary', 'always_null']
df_tuple_unordered = json_df.select('*', json_tuple(col('json_str'), *ls_col).alias(*ls_col))

df_tuple_unordered.show(20, truncate=False)

In [34]:
data = [
    ("Alice", 25, "NYC", {"salary": 50000, "bonus": 5000, "department": "Engineering", "job_title": "Engineer"}),
    ("Bob", 30, "LA", {"salary": 60000, "bonus": 6000, "department": "Marketing", "job_title": "Manager"}),
    ("Charlie", 28, "SF", {"salary": 55000, "department": "Sales", "job_title": "Salesperson", "special_note": "Top performer"}),
]

df = spark.createDataFrame(data, ["name", "age", "city", "details"])
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- details: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)

+-------+---+----+------------------------------------------------------------------------------+
|name   |age|city|details                                                                       |
+-------+---+----+------------------------------------------------------------------------------+
|Alice  |25 |NYC |{department -> NULL, salary -> 50000, job_title -> NULL, bonus -> 5000}       |
|Bob    |30 |LA  |{department -> NULL, salary -> 60000, job_title -> NULL, bonus -> 6000}       |
|Charlie|28 |SF  |{department -> NULL, salary -> 55000, special_note -> NULL, job_title -> NULL}|
+-------+---+----+------------------------------------------------------------------------------+



In [35]:
data = [
    ("Alice", 25, "NYC", ("salary": 50000, "bonus": 5000, "department": "Engineering", "job_title": "Engineer")),
    ("Bob", 30, "LA", ("salary": 60000, "bonus": 6000, "department": "Marketing", "job_title": "Manager")),
    ("Charlie", 28, "SF", ("salary": 55000, "department": "Sales", "job_title": "Salesperson", "special_note": "Top performer")),
]

df = spark.createDataFrame(data, ["name", "age", "city", "details"])
df.printSchema()
df.show(truncate=False)

SyntaxError: invalid syntax (1503060548.py, line 2)

In [33]:
df.select(map_keys("details")).show(truncate=False)


+---------------------------------------------+
|map_keys(details)                            |
+---------------------------------------------+
|[department, salary, job_title, bonus]       |
|[department, salary, job_title, bonus]       |
|[department, salary, special_note, job_title]|
+---------------------------------------------+



In [32]:
df.select(col("details.department").alias("department"),
          col("details.salary").alias("salary"),
).show(20, truncate=False)

+----------+------+
|department|salary|
+----------+------+
|NULL      |50000 |
|NULL      |60000 |
|NULL      |55000 |
+----------+------+



In [25]:
from pyspark.sql.functions import map_keys, explode, collect_set

# Get all unique keys across all maps
df_keys = df.select(explode(map_keys("details")).alias("key"))
df_keys.show(10, truncate=False)

+----------+
|key       |
+----------+
|department|
|salary    |
|job_title |
|bonus     |
|department|
|salary    |
|job_title |
|bonus     |
|department|
|salary    |
+----------+
only showing top 10 rows



In [26]:

df_key_set = df_keys.select(collect_set("key").alias("keys"))
df_key_set.show(truncate=False)

+----------------------------------------------------+
|keys                                                |
+----------------------------------------------------+
|[special_note, salary, department, job_title, bonus]|
+----------------------------------------------------+



In [28]:
all_keys = df_key_set.collect()[0]["keys"]
print(type(all_keys), all_keys)

<class 'list'> ['special_note', 'salary', 'department', 'job_title', 'bonus']


In [30]:
# Create columns dynamically
select_expr = ["name"] + [f"details['{key}'] as {key}" for key in sorted(all_keys)]
print("Select Expression:", select_expr)
result = df.selectExpr(*select_expr)
result.show()

Select Expression: ['name', "details['bonus'] as bonus", "details['department'] as department", "details['job_title'] as job_title", "details['salary'] as salary", "details['special_note'] as special_note"]
+-------+-----+----------+---------+------+------------+
|   name|bonus|department|job_title|salary|special_note|
+-------+-----+----------+---------+------+------------+
|  Alice| 5000|      NULL|     NULL| 50000|        NULL|
|    Bob| 6000|      NULL|     NULL| 60000|        NULL|
|Charlie| NULL|      NULL|     NULL| 55000|        NULL|
+-------+-----+----------+---------+------+------------+



In [31]:
df.selectExpr('name', 
              "details['bonus'] as bonus", 
              "details['department'] as department", 
              "details['job_title'] as job_title", 
              "details['salary'] as salary", "details['special_note'] as special_note").show(truncate=False) 

+-------+-----+----------+---------+------+------------+
|name   |bonus|department|job_title|salary|special_note|
+-------+-----+----------+---------+------+------------+
|Alice  |5000 |NULL      |NULL     |50000 |NULL        |
|Bob    |6000 |NULL      |NULL     |60000 |NULL        |
|Charlie|NULL |NULL      |NULL     |55000 |NULL        |
+-------+-----+----------+---------+------+------------+

