In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import MapType
import cantools
import json
from pbspark import MessageConverter
from canbus_test_pb2 import CANBusMessage
import struct

In [None]:
# Maven coordinates handed to `spark.jars.packages` when the session is built:
# the Kafka source for Structured Streaming and the Cassandra connector.
jarsPackages = ",".join(
    [
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1",
        "com.datastax.spark:spark-cassandra-connector_2.12:3.2.0",
    ]
)

In [None]:
# Build (or reuse) the session with master `local[*]`; the connector jars listed in
# `jarsPackages` are passed through the `spark.jars.packages` setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Spark Structured Streaming from Sensor EV-BUS')
    .config("spark.jars.packages", jarsPackages)
    .getOrCreate()
)

In [None]:
# CAN database used by parse_can_message: the first file creates the database,
# every following file is added to that same database object.
DBC_PATHS = ('./j1939.dbc', 'iso.dbc')

dbc = cantools.database.load_file(DBC_PATHS[0])
for extra_dbc_path in DBC_PATHS[1:]:
    dbc.add_dbc_file(extra_dbc_path)

In [None]:
# Shared pbspark converter; parseProtoKafka calls its from_protobuf() to decode
# protobuf-encoded columns.
mc = MessageConverter()

In [None]:
def parseProtoKafka(df, proto):
    """Decode the protobuf payload of a Kafka frame into flat CAN columns.

    Steps, in order:
      1. decode the Kafka ``value`` column with ``proto``;
      2. read ``topic`` from header 0 and ``bus_id`` from header 2 (by position);
      3. decode the ``canId`` field of the first result with the same ``proto``
         and keep only the fields of that second decode;
      4. rename ``canId`` to ``can_id`` and convert ``timestamp`` to a Spark timestamp.

    Args:
        df: DataFrame with at least the ``headers`` and ``value`` columns.
        proto: protobuf message class passed to ``mc.from_protobuf``.

    Returns:
        DataFrame with ``topic``, ``bus_id`` and the fields of the second decode.
    """

    def header_text(position):
        # Headers are addressed by index, so this relies on the producer's header order.
        return expr('headers')[position]['value'].cast('string')

    # First pass: decode the envelope and lift the routing metadata out of the headers.
    envelope = (
        df.withColumn('firstparsed', mc.from_protobuf('value', proto))
        .withColumn('topic', header_text(0))
        .withColumn('bus_id', header_text(2))
        .selectExpr('topic', 'bus_id', 'firstparsed.*')
    )

    # Second pass: the `canId` field is itself decoded with the same message type.
    frames = (
        envelope.withColumn('secondparsed', mc.from_protobuf('canId', proto))
        .selectExpr('topic', 'bus_id', 'secondparsed.*')
    )

    renamed = frames.withColumnRenamed('canId', 'can_id')
    # NOTE(review): the division by 1000 suggests `timestamp` is epoch milliseconds - confirm.
    return renamed.withColumn('timestamp', to_timestamp(col('timestamp') / 1000))

In [None]:
# Partially received payloads, keyed by the frame id of the DBC message they belong to.
# NOTE(review): when this runs inside a Spark UDF the dict presumably lives per Python
# worker process, so reassembly only works if consecutive frames reach the same worker,
# and frames from different source addresses share one buffer - confirm.
LONG_MESSAGE = {}

# Masks for the 29-bit identifier; fields are kept in place (not shifted down).
_PRIORITY_MASK = 0b00011100 << 24                             # bits 26-28
_PGN_MASK = (0b00000011 << 24) | (0xFF << 16) | (0xFF << 8)   # reserved, data page, PDU format, PDU specific
_DBC_SOURCE_ADDRESS = 0xFE                                    # low byte used for every DBC lookup
_BYTES_LIKE = (bytes, bytearray, memoryview)


def _decode_to_json(database, frame_id, payload, message_name):
    """Decode ``payload`` and return the signals plus ``MessageName`` as a JSON string."""
    decoded = database.decode_message(frame_id, payload, decode_choices=True)
    # Anything that is not a plain int/float/str (e.g. a named choice value) is stringified
    # so that json.dumps can serialise it.
    signals = {
        key: value if isinstance(value, (int, float, str)) else str(value)
        for key, value in (decoded or {}).items()
    }
    signals["MessageName"] = message_name
    return json.dumps(signals)


def parse_can_message(ID_HEX, DLC, DATA_HEX_STR, database=None):
    """Decode one CAN frame into a JSON string of signal values.

    The identifier is split with the masks above; the lookup id keeps priority,
    reserved bit, data page, PDU format and PDU specific, and replaces the source
    address (low byte) with 0xFE.

    If a frame cannot be decoded on its own, its bytes are kept in ``LONG_MESSAGE``
    and prepended to the next frame of the same message. A buffer is only kept
    while it is shorter than the message's declared length, so it cannot grow
    without bound.

    Args:
        ID_HEX: CAN identifier, either an ``int`` or big-endian bytes.
        DLC: accepted for interface compatibility; not used for decoding.
        DATA_HEX_STR: frame payload as a bytes-like object.
        database: object providing ``get_message_by_frame_id`` and
            ``decode_message``; defaults to the module-level ``dbc``.

    Returns:
        JSON text with the decoded signals and a ``"MessageName"`` entry, or
        ``None`` when the input is missing/unsupported, the identifier is not
        in the database, or the message is still incomplete.
    """
    # Null or unsupported inputs yield None instead of crashing the caller.
    if isinstance(ID_HEX, _BYTES_LIKE):
        can_id = int.from_bytes(ID_HEX, "big")
    elif isinstance(ID_HEX, int):
        can_id = ID_HEX
    else:
        return None
    if not isinstance(DATA_HEX_STR, _BYTES_LIKE):
        return None
    payload = bytes(DATA_HEX_STR)

    db = dbc if database is None else database
    dbc_id = (can_id & _PRIORITY_MASK) | (can_id & _PGN_MASK) | _DBC_SOURCE_ADDRESS

    try:
        message = db.get_message_by_frame_id(dbc_id)
    except KeyError:
        # Identifier not described by the database.
        return None

    try:
        return _decode_to_json(db, dbc_id, payload, message.name)
    except Exception:
        # Best effort: treat an undecodable frame as part of a longer message.
        pass

    buffered = LONG_MESSAGE.pop(message.frame_id, None)
    if buffered is not None:
        # Join as raw bytes; formatting bytes into a string would store their repr.
        payload = buffered + payload
        try:
            return _decode_to_json(db, dbc_id, payload, message.name)
        except Exception:
            pass

    # Keep waiting only while more bytes could still complete the message.
    if len(payload) < message.length:
        LONG_MESSAGE[message.frame_id] = payload
    return None

In [None]:
# Streaming source: subscribe to the `canbus_test` topic from the latest offsets and
# keep the record headers, which parseProtoKafka reads for `topic` and `bus_id`.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "10.252.62.70:9092")
    .option("subscribe", 'canbus_test')
    .option("includeHeaders", "true")
    .option('startingOffsets', 'latest')
    .load()
    .selectExpr("headers", "CAST(key AS STRING)", "value", "timestamp")
)

In [None]:
# Flatten the Kafka records into CAN columns, then expose the frame decoder as a
# Spark UDF taking (can_id, dlc, data).
parse_proto_kafka = parseProtoKafka(kafkaDF, CANBusMessage)
parserCan = udf(parse_can_message)

In [None]:
# Run the decoder UDF on each frame; its result is stored in `parser_value`.
can_frame_columns = ('can_id', 'dlc', 'data')
parserValue = parse_proto_kafka.withColumn("parser_value", parserCan(*can_frame_columns))

In [None]:
# Append each micro-batch to the console without truncating the columns.
query = (
    parserValue.writeStream
    .outputMode("append")
    .format("console")
    .option('truncate', False)
    .start()
)

# Block this cell until the streaming query stops.
query.awaitTermination()