In [None]:
from pyspark.sql.types import *

# The layout of the incoming JSON messages is known in advance, so the schema is
# declared explicitly rather than left for Spark to infer.
# Every field is nullable (third argument True).
jsonSchema = (
    StructType()
      .add("last_reported", TimestampType(), True)
      .add("name", StringType(), True)
      .add("station_id", StringType(), True)
      .add("region_id", StringType(), True)
      .add("publisher", StringType(), True)
      .add("lat", FloatType(), True)
      .add("lon", FloatType(), True)
      .add("country_code", StringType(), True)
      .add("num_bikes_available", IntegerType(), True)
      .add("num_docks_available", IntegerType(), True)
      .add("is_renting", IntegerType(), True)
      .add("is_returning", IntegerType(), True)
)

## Stream Processing 

In [None]:
from pyspark.sql.functions import *

# Same builder pattern as a static read, except that `readStream` is used in
# place of `read`, which makes the resulting DataFrame a streaming one.
parsed = (
    spark.readStream
    .format("kafka")
    .option("subscribe", "satori-bike")                   # topic to consume
    .option("kafka.bootstrap.servers", "localhost:9092")  # broker address
    .load()
    # Keep the Kafka record timestamp and decode the payload: the `value`
    # column is cast to a string and parsed as JSON using `jsonSchema`.
    .select(
        "timestamp",
        from_json(col("value").cast("string"), jsonSchema).alias("parsed_value"),
    )
)

# Flatten the parsed struct so each JSON field becomes a top-level column
bikedata = parsed.select("timestamp", "parsed_value.*")

bikedata.printSchema()
bikedata.isStreaming

In [None]:
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

# Starting a query under a name that is already active raises an error, so stop
# any earlier run of this cell first. This makes the cell safe to re-execute.
for active_query in spark.streams.active:
    if active_query.name == "bikesharing":
        active_query.stop()

# Count reports per country in 10-minute windows that slide every 5 minutes,
# and publish the running result as the in-memory table `bikesharing`.
query = (
  bikedata
    # The watermark has to be declared on the same event-time column that the
    # window aggregates on (`last_reported`), not on the Kafka `timestamp`.
    # Note: "complete" output mode keeps every aggregate, so the watermark does
    # not drop any state here; it only takes effect in append/update modes.
    .withWatermark("last_reported", "10 minutes")
    .groupBy(
       "country_code",
       window("last_reported", "10 minutes", "5 minutes"))
    .count()
    .writeStream
    .format("memory")             # memory = store in-memory table (for testing only)
    .outputMode("complete")       # complete = all the counts should be in the table
    .queryName("bikesharing")     # bikesharing = name of the in-memory table
    .start()
)

In [None]:
# Fetch every row of the in-memory `bikesharing` table back to the driver
(
    spark
      .sql("select * from bikesharing")
      .collect()
)

In [None]:
# Show the count per country and window in chronological order.
# The sort uses the real `window.end` timestamp: sorting on the formatted
# `time_end` text ('MMM-dd HH:mm') would order months alphabetically
# (e.g. "Apr" before "Jan") instead of by time.
spark.sql(
    "select country_code, "
    "date_format(window.start, 'MMM-dd HH:mm') as time_start, "
    "date_format(window.end, 'MMM-dd HH:mm') as time_end, "
    "count "
    "from bikesharing "
    "order by window.end, country_code"
).show()

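Also, let's see the total count per country, summed over all windows. Note that the 10-minute windows slide every 5 minutes, so each report falls into two overlapping windows and is counted twice in this total.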

In [None]:
# Sum the windowed counts per country and bring the result back to the driver.
# The windows overlap (10 minutes long, sliding every 5), so a report that
# lands in two windows contributes twice to `total_count`.
spark.sql(
    "select country_code, sum(count) as total_count "
    "from bikesharing "
    "group by country_code "
    "order by country_code"
).collect()

In [None]:
# Minimal web service (Flask's default port is 5000) that serves the
# per-country totals computed from the in-memory `bikesharing` table
from flask import Flask
import json

app = Flask(__name__)


# Root route: returns the aggregated data
@app.route("/")
def count_per_country():
    """Return the per-country totals from the `bikesharing` table as JSON text.

    Runs the aggregation query on each request, converts every result row to
    a JSON string with `toJSON()`, and serialises the list of those strings.
    """
    totals_sql = (
        "select country_code, sum(count) as total_count "
        "from bikesharing "
        "group by country_code "
        "order by country_code"
    )
    rows_as_json = spark.sql(totals_sql).toJSON().collect()
    # NOTE(review): each element is already a JSON string, so json.dumps
    # produces an array of JSON-encoded strings rather than an array of
    # objects. Confirm that consumers expect this double encoding.
    return json.dumps(rows_as_json)


if __name__ == "__main__":
    app.run()

In [None]:
# print(query.lastProgress)

# check for running streams:
#spark.streams.active

#Terminate the query stream
#query.stop()
# some stats for the query
