In [1]:
# Execute the shared session notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines `spark`, which the cell below
# uses without creating it — confirm against nb0.spark-session.ipynb.
%run nb0.spark-session.ipynb

**StructType Overview**

- `StructType` defines the schema of a PySpark DataFrame row, and of any complex, nested (struct) column inside it.
- It holds an ordered collection of `StructField` objects, each describing a field's name, data type, and nullability.
- Use it to model records with nested structures (e.g., embedded addresses) so Spark can validate and optimize queries across those fields.

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Nested struct: every address component is a required (non-nullable) string.
address_schema = StructType(
    [
        StructField(field_name, StringType(), nullable=False)
        for field_name in ("street", "city", "zipCode")
    ]
)

# Top-level record. `shippingAddress` embeds the address struct defined above;
# the third positional argument of StructField is the nullability flag.
order_schema = StructType(
    [
        StructField("orderId", StringType(), False),
        StructField("shippingAddress", address_schema, False),
        StructField("itemCount", IntegerType(), False),
    ]
)

# Each row is (orderId, (street, city, zipCode), itemCount); the inner tuple
# fills the nested struct's fields in declaration order.
orders_df = spark.createDataFrame(
    data=[
        ("order-1001", ("123 Elm St", "Seattle", "98101"), 3),
        ("order-1002", ("456 Oak Ave", "Portland", "97205"), 5),
    ],
    schema=order_schema,
)

# Show the schema tree, then the full frame, then a projection that reaches
# into the struct using dotted paths.
orders_df.printSchema()
display(orders_df)
display(
    orders_df.select(
        "orderId",
        "shippingAddress.city",
        "shippingAddress.zipCode",
    )
)

root
 |-- orderId: string (nullable = false)
 |-- shippingAddress: struct (nullable = false)
 |    |-- street: string (nullable = false)
 |    |-- city: string (nullable = false)
 |    |-- zipCode: string (nullable = false)
 |-- itemCount: integer (nullable = false)



orderId,shippingAddress,itemCount
order-1001,"{'street': '123 Elm St', 'city': 'Seattle', 'zipCode': '98101'}",3
order-1002,"{'street': '456 Oak Ave', 'city': 'Portland', 'zipCode': '97205'}",5


orderId,city,zipCode
order-1001,Seattle,98101
order-1002,Portland,97205
