In [9]:
# Create (or reuse) the local Spark session used by every cell below.

from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Streaming Process Files")
    # Spark configuration keys are case-sensitive. The documented key is
    # "spark.streaming.stopGracefullyOnShutdown" (capital "O" and "S"); a
    # differently-cased key is not recognized, so the setting would not apply.
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .master("local[*]")  # run locally, using all available cores
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

In [10]:
# Batch (non-streaming) read of a single sample file, used to inspect the data.
# A batch JSON read infers the schema on its own, so the streaming schema
# inference setting is left disabled here:
# spark.conf.set("spark.sql.streaming.schemaInference", True)
streaming_df = spark.read.json(
    "/home/jupyter/streaming-spark/data/input/device_file/device_01.json"
)


In [13]:
# To see the schema of the data, place a sample JSON file and change readStream to read.
# printSchema() prints the DataFrame's column names and types as a tree.
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [15]:
# data.devices holds a list/array of device readings. explode() produces one
# output row per array element, exposed here as the struct column "data_devices".

from pyspark.sql.functions import explode

explode_df = streaming_df.select(
    "*",
    explode("data.devices").alias("data_devices"),
)

In [17]:
# Check the schema of explode_df. To run this cell, place a sample JSON file
# and change readStream to read.
# printSchema() prints the column tree; show(truncate=False) prints the rows
# without shortening long cell values.
explode_df.printSchema()
explode_df.show(truncate=False)

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- data_devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)

+----------+------------------------------------------------+------------------------------------+-----------+--------------+--------------------------+----------------------+
|customerId|data                     

In [20]:
# Flatten explode_df: promote each field of the "data_devices" struct to a
# top-level column, then discard the nested "data" and "data_devices" columns.
from pyspark.sql.functions import col

flattened_df = (
    explode_df
    .drop("data")
    .select(
        "*",
        col("data_devices.deviceId").alias("deviceId"),
        col("data_devices.measure").alias("measure"),
        col("data_devices.status").alias("status"),
        col("data_devices.temperature").alias("temperature"),
    )
    .drop("data_devices")
)

In [21]:
# Check the schema of flattened_df. To run this cell, place a sample JSON file
# and change readStream to read.
# printSchema() prints the column tree; show(truncate=False) prints the rows
# without shortening long cell values.
flattened_df.printSchema()
flattened_df.show(truncate=False)

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)

+----------+------------------------------------+-----------+--------------+--------------------------+--------+-------+-------+-----------+
|customerId|eventId                             |eventOffset|eventPublisher|eventTime                 |deviceId|measure|status |temperature|
+----------+------------------------------------+-----------+--------------+--------------------------+--------+-------+-------+-----------+
|CI00103   |e3cb26d3-41b2-49a2-84f3-0156ed8d7502|10001      |device        |2023-01-05 11:13:53.643364|D001    |C      |ERROR  |15         |
|CI00103   |e3cb26d3-41b2-49a2-84f3-0156ed8d7502|10001      |de

## Streaming

In [22]:
# Let the streaming reader infer the JSON schema from the input instead of
# requiring an explicitly declared schema.
spark.conf.set("spark.sql.streaming.schemaInference", True)
streaming_df = (
    spark
    .readStream
    .format("json")
    .options(
        cleanSource="archive",           # move source files away once processed ...
        sourceArchiveDir="archive_dir",  # ... into this archive directory
        maxFilesPerTrigger=1,            # consider at most one new file per trigger
    )
    .load("/home/jupyter/streaming-spark/data/input/device_file/device_01.json")
)


In [23]:
# To see the schema of the data, place a sample JSON file and change readStream to read.
# printSchema() prints the DataFrame's column names and types as a tree.
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [24]:
# data.devices holds a list/array of device readings. explode() produces one
# output row per array element, exposed here as the struct column "data_devices".

from pyspark.sql.functions import explode

explode_df = streaming_df.select(
    "*",
    explode("data.devices").alias("data_devices"),
)

In [25]:
# Check the schema of explode_df (column names and types printed as a tree).
# To also preview rows, place a sample JSON file and change readStream to read.
explode_df.printSchema()


root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- data_devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [26]:
# Flatten explode_df: promote each field of the "data_devices" struct to a
# top-level column, then discard the nested "data" and "data_devices" columns.
from pyspark.sql.functions import col

flattened_df = (
    explode_df
    .drop("data")
    .select(
        "*",
        col("data_devices.deviceId").alias("deviceId"),
        col("data_devices.measure").alias("measure"),
        col("data_devices.status").alias("status"),
        col("data_devices.temperature").alias("temperature"),
    )
    .drop("data_devices")
)

In [32]:
# Check the schema of flattened_df (column names and types printed as a tree).
# To also preview rows, place a sample JSON file and change readStream to read.
flattened_df.printSchema()


root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [38]:
# Write the flattened stream out as CSV files (append mode) and block this
# cell until the streaming query terminates.
# Note: the file sink treats "path" as an output directory, so the data files
# are created inside "data/output/device_data.csv" despite the ".csv" suffix.

(
    flattened_df
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("path", "data/output/device_data.csv")
    # Use the documented camelCase option name rather than relying on
    # case-insensitive option lookup.
    .option("checkpointLocation", "checkpoint_dir")
    .start()
    .awaitTermination()
)


