In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pbspark import from_protobuf
from seat_detection_pb2 import SeatDetection
import sys

In [None]:
# Maven coordinate of the Kafka source connector, passed to the Spark session
# through `spark.jars.packages`. It is assembled from group / artifact / version
# so the Scala suffix (2.12) and the connector release (3.2.1) are easy to spot.
# NOTE(review): these presumably have to match the installed Spark build - confirm.
jarsPackages = ":".join((
    "org.apache.spark",
    "spark-sql-kafka-0-10_2.12",
    "3.2.1",
))

In [None]:
# Create (or reuse) the Spark session for this notebook with the master URL
# `local[*]`; the connector listed in `jarsPackages` is requested through
# `spark.jars.packages`.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('seat-processor')
    .config("spark.jars.packages", jarsPackages)
    .getOrCreate()
)


In [None]:
# Streaming DataFrame subscribed to the Kafka topic `seat_detection`.
# `includeHeaders` is switched on because a later cell reads the record headers.
# NOTE(review): the broker address is hard-coded here; consider moving it to a
# configuration cell or an environment variable.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "103.146.34.72:9094")
    .option("subscribe", "seat_detection")
    .option("includeHeaders", "true")
    .load()
)

In [None]:
def message_proto(value):
    """Decode a protobuf-encoded column as a `SeatDetection` message.

    Args:
        value: The column (or column name) holding the serialized message;
            it is forwarded unchanged to `from_protobuf`.

    Returns:
        The result of `from_protobuf(value, SeatDetection)`. This notebook
        expands it with `parsed.*`, i.e. it is used as a struct column.
    """
    return from_protobuf(value, SeatDetection)

In [None]:
# Narrow the Kafka source down to the three columns the rest of the notebook
# references: the message value, the record headers and the record timestamp.
raw_df = df.select("value", "headers", 'timestamp')

In [None]:
# Decode every record and derive the columns used for partitioning:
#   1. `parsed` - the value column decoded by message_proto
#   2. `bus_id` - the `value` of the header at index 2, cast to string
#   3. keep `bus_id` plus every field of `parsed`
#   4. convert `timestamp` and split it into year/month/day/hour/minute
# NOTE(review): step 3 drops the Kafka `timestamp` column, so the `timestamp`
# used afterwards has to come from `parsed.*` - presumably a SeatDetection
# field; confirm this is the intended event time.
# NOTE(review): the header is selected by position, which depends on the
# producer always writing headers in the same order - TODO confirm.
query = (
    raw_df
    .withColumn('parsed', message_proto('value'))
    .withColumn(
        'bus_id',
        expr('headers').getItem(2).getField('value').cast('string'),
    )
    .select('bus_id', 'parsed.*')
    .withColumn('timestamp', to_timestamp('timestamp'))
    .withColumn('year', year('timestamp'))
    .withColumn('month', month('timestamp'))
    .withColumn('day', dayofmonth('timestamp'))
    .withColumn('hour', hour('timestamp'))
    .withColumn('minute', minute('timestamp'))
)

query.printSchema()

In [None]:
# Start the streaming sink that appends the decoded records as files under the
# HDFS job directory, partitioned by year/month/day/hour/minute.
# The sink format is spelled out rather than left to the session default, and
# the console-only `truncate` option is not set because this is a file sink.
# NOTE(review): partitioning down to the minute can create a very large number
# of small directories/files - confirm this granularity is intended.
show = (
    query.writeStream
    .queryName('RawSeatDetection')
    .format('parquet')
    .outputMode('append')
    .option('path', 'hdfs://localhost:9000/user/parallels/job/seat-occupancy/')
    .option('checkpointLocation', 'hdfs://localhost:9000/user/parallels/spark-checkpoint/seat-occupancy-checkpoint/')
    .partitionBy('year', 'month', 'day', 'hour', 'minute')
    .start()
)

# `awaitTermination()` returns None, so it is called separately: `show` keeps
# the StreamingQuery handle (usable for `show.stop()`, `show.status`, ...)
# while this cell blocks until the query terminates.
show.awaitTermination()