In [None]:
from pyspark.sql.types import *

# The layout of the incoming JSON messages is known up front, so the schema is
# declared explicitly instead of being inferred by Spark. Every field is nullable.
# Entries left commented out are currently excluded from the parsed result.
jsonSchema = (
    StructType()
    .add("last_reported", TimestampType(), True)
    .add("name", StringType(), True)
    .add("station_id", StringType(), True)
    # .add("region_id", StringType(), True)
    # .add("publisher", StringType(), True)
    .add("lat", FloatType(), True)
    .add("lon", FloatType(), True)
    .add("country_code", StringType(), True)
    # .add("num_bikes_available", IntegerType(), True)
    # .add("num_docks_available", IntegerType(), True)
    # .add("is_renting", IntegerType(), True)
    # .add("is_returning", IntegerType(), True)
)

## Stream Processing 

In [None]:
from pyspark.sql.functions import *

# Subscribe to the Kafka topic as a stream (`readStream` rather than `read`).
# Each record's `value` is cast to a string and decoded as JSON with `jsonSchema`;
# the Kafka record `timestamp` column is carried along next to the parsed struct.
parsed = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "satori-bike",
    })
    .load()
    .select(
        col("timestamp"),
        from_json(col("value").cast("string"), jsonSchema).alias("parsed_value"),
    )
)

# Flatten the parsed struct so every schema field becomes a top-level column.
bikedata = parsed.select("timestamp", "parsed_value.*")
bikedata.printSchema()
bikedata.isStreaming

In [None]:
#spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small
#query.stop()
#query = (
#  bikedata
#    .withWatermark("timestamp", "1 minutes")
#    .groupBy(
#       bikedata.country_code, 
#       window(bikedata.last_reported, "1 minutes", "30 seconds"))  
#    .count()
#    .select(to_json(struct("window")).alias("key"),
#            to_json(struct("country_code","count")).alias("value"))
#    .writeStream
#    .trigger(processingTime='10 seconds') # only write every 10 seconds to the output
#    .format("console")        # console = print each micro-batch to stdout (debugging only)
#    .outputMode("update")  # update = emit only the rows whose counts changed since the last trigger
#    .queryName("bikesharing")     # name under which the query is listed in spark.streams.active
#    .start()
#)

In [None]:
#query.stop()

In [None]:
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

# Count records per country in sliding event-time windows (1 hour long, a new
# window starting every 30 minutes) and publish the running counts to Kafka.
#
# NOTE(review): if ./checkpoints already holds state from an earlier definition of
# this query, confirm it restarts cleanly or point to a fresh checkpoint directory.
query = (
  bikedata
    # The watermark has to be declared on the same column the window is built on.
    # Declared on the Kafka `timestamp` column (as before) it is not applied to a
    # window over `last_reported`, so old window state is never dropped.
    .withWatermark("last_reported", "1 hour")
    .groupBy(
       # Columns are referenced by name so they resolve against the watermarked frame.
       col("country_code"),
       window(col("last_reported"), "1 hour", "30 minutes"))
    .count()
    # The Kafka sink reads a `value` column (and an optional `key` column), so both
    # are serialized to JSON strings here.
    .select(to_json(struct("country_code", "window")).alias("key"),
            to_json(struct("window.start","window.end","country_code","count")).alias("value"))
    .writeStream
    .trigger(processingTime='2 seconds')  # start a micro-batch every 2 seconds
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "aggr-bike")
    .option("checkpointLocation", "./checkpoints")
    .outputMode("update")  # update = emit only the windows whose count changed in this trigger
    .queryName("bikesharing")     # name under which the query is listed in spark.streams.active
    .start()
)

In [None]:
#print(query.lastProgress)

# check for running streams:
#spark.streams.active

#Terminate the query stream
#query.stop()
# some stats for the query
