In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json

# Build (or reuse, via getOrCreate) a Spark session running in local mode
# with Hive support switched on.
spark = (
    SparkSession.builder
    .appName("Test Spark JSONs")
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
)

# Sample rows: one (student name, course records) tuple per student, where
# each course record is a plain dict with "course" and "grade" keys.
complex_data = [
    (
        "John",
        [
            dict(course="Math", grade=85),
            dict(course="Science", grade=92),
        ],
    ),
    (
        "Jane",
        [
            dict(course="Math", grade=78),
            dict(course="Science", grade=88),
        ],
    ),
]

In [4]:
# No schema is supplied here, so Spark infers one: the tuple fields get the
# positional names _1/_2 and each course dict is inferred as a map.
df_raw = spark.createDataFrame(data=complex_data)
df_raw.show(n=20, truncate=True)
df_raw.printSchema()


+----+--------------------+
|  _1|                  _2|
+----+--------------------+
|John|[{course -> Math,...|
|Jane|[{course -> Math,...|
+----+--------------------+

root
 |-- _1: string (nullable = true)
 |-- _2: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [5]:
# Explicit schema: a student name plus an array of {course, grade} structs.
# Every field is nullable, and the array itself may contain null elements.
complex_schema = (
    StructType()
    .add("name", StringType(), nullable=True)
    .add(
        "courses",
        ArrayType(
            StructType()
            .add("course", StringType(), nullable=True)
            .add("grade", IntegerType(), nullable=True),
            containsNull=True,
        ),
        nullable=True,
    )
)

# With the schema applied, the dicts are read as structs and the columns
# carry the names declared above.
df_complex = spark.createDataFrame(data=complex_data, schema=complex_schema)
df_complex.show(truncate=False)
df_complex.printSchema()

+----+---------------------------+
|name|courses                    |
+----+---------------------------+
|John|[{Math, 85}, {Science, 92}]|
|Jane|[{Math, 78}, {Science, 88}]|
+----+---------------------------+

root
 |-- name: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- course: string (nullable = true)
 |    |    |-- grade: integer (nullable = true)



In [7]:
# Two ways of reaching into an array of structs:
#   - the dotted path "courses.course" collects `course` from every element,
#     giving an array per row;
#   - getItem(0).getField("grade") reads `grade` from the first element only.
df_complex.select(
    "name",
    col("courses.course").alias("course"),
    col("courses").getItem(0).getField("grade").alias("first_course_grade"),
).show(truncate=False)

+----+---------------+------------------+
|name|course         |first_course_grade|
+----+---------------+------------------+
|John|[Math, Science]|85                |
|Jane|[Math, Science]|78                |
+----+---------------+------------------+



In [23]:
# Turn each element of the courses array into its own row. The whole struct is
# exploded at once, so course and grade stay paired on every output row.
df_explode = df_complex.select("name", explode("courses").alias("courses"))
df_explode.show(truncate=False)
df_explode.printSchema()

+----+-------------+
|name|courses      |
+----+-------------+
|John|{Math, 85}   |
|John|{Science, 92}|
|Jane|{Math, 78}   |
|Jane|{Science, 88}|
+----+-------------+

root
 |-- name: string (nullable = true)
 |-- courses: struct (nullable = true)
 |    |-- course: string (nullable = true)
 |    |-- grade: integer (nullable = true)



In [24]:
# Pull both struct fields out of the exploded column. The dotted-path form is
# displayed as "course"; the item-access form is displayed as "courses.grade".
df_explode.select(
    col("courses.course"),
    col("courses")["grade"],
).show(truncate=False)

+-------+-------------+
|course |courses.grade|
+-------+-------------+
|Math   |85           |
|Science|92           |
|Math   |78           |
|Science|88           |
+-------+-------------+



In [29]:
# explode_outer still emits one row (with a NULL value) for a name whose
# array is null or empty, so no input row is dropped.
df_with_nulls = spark.createDataFrame(
    data=[
        ("John", [1, 2, 3]),
        ("Jane", None),
        ("Bob", []),
    ],
    schema=["name", "numbers"],
)

df_outer_exploded = df_with_nulls.select(
    "name",
    explode_outer("numbers").alias("number"),
)
df_outer_exploded.show(truncate=False)
df_outer_exploded.count()

+----+------+
|name|number|
+----+------+
|John|1     |
|John|2     |
|John|3     |
|Jane|NULL  |
|Bob |NULL  |
+----+------+



5

In [None]:
# Materialise raw_data as processed_data with audit columns attached.
# The statement is Spark SQL, so it is submitted through spark.sql() rather
# than left as bare SQL, which a Python kernel cannot parse.
# NOTE(review): assumes a table or view named raw_data is already registered
# in the session catalog -- none of the visible cells creates it; confirm.
# NOTE: monotonically_increasing_id() values are unique and increasing but not
# consecutive, so record_id should not be treated as a row number.
spark.sql(
    """
    CREATE TABLE processed_data AS
    SELECT *,
           monotonically_increasing_id() AS record_id,
           current_timestamp() AS processed_at,
           'system' AS created_by,
           1 AS version
    FROM raw_data
    """
)