In [1]:
# Execute the shared setup notebook before anything else. `spark`, which the
# cells below use, is never defined in this notebook -- presumably it is
# created by _spark_init.ipynb (TODO confirm).
%run _spark_init.ipynb

- Write a structured query that “explodes” an array of structs (of open and close hours).

In [11]:
from pyspark.sql.types import ArrayType, MapType, StringType, StructField, StructType

In [22]:
# Sample input: a single business record whose `hours` entry maps a weekday
# name to that day's closing and opening times. Every day is 11:00-02:00
# except Sunday, which closes at 00:00. The weekday keys are inserted in the
# order Monday, Tuesday, Friday, Wednesday, Thursday, Sunday, Saturday
# (not calendar order).
data = [
    {
        "business_id": "abc",
        "full_address": "random_address",
        "hours": {
            weekday: {
                "close": "00:00" if weekday == "Sunday" else "02:00",
                "open": "11:00",
            }
            for weekday in (
                "Monday",
                "Tuesday",
                "Friday",
                "Wednesday",
                "Thursday",
                "Sunday",
                "Saturday",
            )
        },
    }
]

# One weekday entry: a struct holding the closing and opening time strings.
hours_schema = StructType([
    StructField("close", StringType(), True),
    StructField("open", StringType(), True),
])

# `hours` in `data` is a dict keyed by weekday name, so it is declared as a
# map of day -> hours struct. An ArrayType would not match that dict, and the
# later explode into ("day", "hours") needs a map to yield key and value.
schema = StructType([
    StructField("business_id", StringType(), True),
    StructField("full_address", StringType(), True),
    StructField("hours", MapType(StringType(), hours_schema), True),
])

# Pass the schema explicitly; without it the declared schema is never used
# and the column types are inferred from the Python dicts instead.
df = spark.createDataFrame(data, schema=schema)

In [23]:
df.show(truncate=False)

+-----------+--------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|business_id|full_address  |hours                                                                                                                                                                                                                                                                                                                |
+-----------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [42]:
from pyspark.sql.functions import col, explode

# 1. Explode `hours` into one row per entry: the key becomes `day`, the value
#    stays under the name `hours`.
# 2. Project the open/close values into flat columns; `hours` itself is left
#    out of the projection.
# 3. Sort by day name (plain string order) and print the result.
(
    df.select(
        "business_id",
        "full_address",
        explode("hours").alias("day", "hours"),
    )
    .select(
        "business_id",
        "full_address",
        "day",
        col("hours")["open"].alias("open_time"),
        col("hours")["close"].alias("close_time"),
    )
    .sort("day")
    .show()
)

+-----------+--------------+---------+---------+----------+
|business_id|  full_address|      day|open_time|close_time|
+-----------+--------------+---------+---------+----------+
|        abc|random_address|   Friday|    11:00|     02:00|
|        abc|random_address|   Monday|    11:00|     02:00|
|        abc|random_address| Saturday|    11:00|     02:00|
|        abc|random_address|   Sunday|    11:00|     00:00|
|        abc|random_address| Thursday|    11:00|     02:00|
|        abc|random_address|  Tuesday|    11:00|     02:00|
|        abc|random_address|Wednesday|    11:00|     02:00|
+-----------+--------------+---------+---------+----------+

