In [0]:
%fs ls /databricks/shared/databricks-datasets/structured-streaming/events/

path,name,size,modificationTime
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-00.json,file-00.json,72530,1676496581000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-01.json,file-01.json,72961,1676496582000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-02.json,file-02.json,73007,1676496582000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-03.json,file-03.json,72996,1676496583000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-04.json,file-04.json,72992,1676496583000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-05.json,file-05.json,72998,1676496584000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-06.json,file-06.json,72997,1676496584000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-07.json,file-07.json,73022,1676496585000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-08.json,file-08.json,72997,1676496585000
dbfs:/databricks/shared/databricks-datasets/structured-streaming/events/file-09.json,file-09.json,72970,1676496587000


In [0]:
%cat /dbfs/databricks/shared/databricks-datasets/structured-streaming/events/file-00.json

# Starting a stream, with the source's schema

The cell below first defines a schema for the streamed messages and then creates the stream by reading from the folder of JSON files.

The schema is built from [StructField](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.types.StructField.html) objects, which are defined in the PySpark SQL types API (`pyspark.sql.types`).

In [0]:
from pyspark.sql.types import * 
# Explicit schema for the JSON event records: a nullable timestamp column
# ("time") and a nullable string column ("action").
json_schema = (
    StructType()
    .add("time", TimestampType(), True)
    .add("action", StringType(), True)
)

# Folder holding the JSON event files that act as the streaming source.
file_path = "dbfs:/databricks/shared/databricks-datasets/structured-streaming/events"
# Checkpoint location; not used by the cells visible here —
# presumably consumed by a later sink (TODO confirm).
checkpoint_path = "/tmp/ss-tutorial/_checkpoint"

# Build the streaming DataFrame over the folder. maxFilesPerTrigger=1 caps
# each micro-batch at one new file.
streaming_input_df = (
    spark.readStream
    .schema(json_schema)
    .option("maxFilesPerTrigger", 1)
    .json(file_path)
)

In [0]:
from pyspark.sql.window import Window
import pyspark.sql.functions as f
# Count events per action within each one-hour window of the event time.
stream_counts_df = (
    streaming_input_df
    .groupBy(
        streaming_input_df["action"],
        f.window(streaming_input_df["time"], "1 hour"),
    )
    .count()
)

In [0]:
# Render the running windowed counts in the notebook (Databricks display).
display(stream_counts_df)

action,window,count
Close,"List(2016-07-26T13:00:00.000+0000, 2016-07-26T14:00:00.000+0000)",1028
Open,"List(2016-07-26T18:00:00.000+0000, 2016-07-26T19:00:00.000+0000)",1004
Close,"List(2016-07-27T02:00:00.000+0000, 2016-07-27T03:00:00.000+0000)",315
Open,"List(2016-07-26T05:00:00.000+0000, 2016-07-26T06:00:00.000+0000)",1000
Open,"List(2016-07-26T11:00:00.000+0000, 2016-07-26T12:00:00.000+0000)",991
Close,"List(2016-07-26T06:00:00.000+0000, 2016-07-26T07:00:00.000+0000)",1011
Open,"List(2016-07-26T10:00:00.000+0000, 2016-07-26T11:00:00.000+0000)",1007
Close,"List(2016-07-26T04:00:00.000+0000, 2016-07-26T05:00:00.000+0000)",815
Close,"List(2016-07-26T16:00:00.000+0000, 2016-07-26T17:00:00.000+0000)",984
Close,"List(2016-07-26T12:00:00.000+0000, 2016-07-26T13:00:00.000+0000)",960
