In [1]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session for this notebook: application name
# "Kafka Streaming", master "local[3]".
spark = (
    SparkSession.builder
    .appName("Kafka Streaming")
    .master("local[3]")
    .getOrCreate()
)

# Settings for the Kafka source: broker address, the topic to subscribe to,
# and the offsets from which a new query starts reading.
kafka_source_options = {
    "kafka.bootstrap.servers": "iabd-virtualbox:9092",
    "subscribe": "facturas",
    "startingOffsets": "earliest",
}

# Streaming DataFrame backed by the "facturas" topic.
kafkaDF = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [2]:
# Print the column layout of the raw Kafka stream; as the output below shows,
# `key` and `value` arrive as binary, alongside topic/partition/offset/timestamp metadata.
kafkaDF.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType

# Struct stored under the "DeliveryAddress" field of an invoice.
delivery_address_type = (
    StructType()
    .add("AddressLine", StringType())
    .add("City", StringType())
    .add("State", StringType())
    .add("PinCode", StringType())
    .add("ContactNumber", StringType())
)

# Struct describing one element of the "InvoiceLineItems" array.
line_item_type = (
    StructType()
    .add("ItemCode", StringType())
    .add("ItemDescription", StringType())
    .add("ItemPrice", DoubleType())
    .add("ItemQty", IntegerType())
    .add("TotalValue", DoubleType())
)

# Full schema of an invoice document: scalar fields first, then the nested
# delivery address and the array of line items.
esquema = (
    StructType()
    .add("InvoiceNumber", StringType())
    .add("CreatedTime", LongType())
    .add("StoreID", StringType())
    .add("PosID", StringType())
    .add("CashierID", StringType())
    .add("CustomerType", StringType())
    .add("CustomerCardNo", StringType())
    .add("TotalAmount", DoubleType())
    .add("NumberOfItems", IntegerType())
    .add("PaymentMethod", StringType())
    .add("CGST", DoubleType())
    .add("SGST", DoubleType())
    .add("CESS", DoubleType())
    .add("DeliveryType", StringType())
    .add("DeliveryAddress", delivery_address_type)
    .add("InvoiceLineItems", ArrayType(line_item_type))
)

In [4]:
from pyspark.sql.functions import from_json, col

# The Kafka `value` column is binary: cast it to a string, then parse that
# string as JSON using the invoice schema.
value_as_text = col("value").cast("string")
parsed_invoice = from_json(value_as_text, esquema).alias("value")

# Keep only the parsed struct, exposed as a single column named "value".
valueDF = kafkaDF.select(parsed_invoice)
valueDF.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- InvoiceNumber: string (nullable = true)
 |    |-- CreatedTime: long (nullable = true)
 |    |-- StoreID: string (nullable = true)
 |    |-- PosID: string (nullable = true)
 |    |-- CashierID: string (nullable = true)
 |    |-- CustomerType: string (nullable = true)
 |    |-- CustomerCardNo: string (nullable = true)
 |    |-- TotalAmount: double (nullable = true)
 |    |-- NumberOfItems: integer (nullable = true)
 |    |-- PaymentMethod: string (nullable = true)
 |    |-- CGST: double (nullable = true)
 |    |-- SGST: double (nullable = true)
 |    |-- CESS: double (nullable = true)
 |    |-- DeliveryType: string (nullable = true)
 |    |-- DeliveryAddress: struct (nullable = true)
 |    |    |-- AddressLine: string (nullable = true)
 |    |    |-- City: string (nullable = true)
 |    |    |-- State: string (nullable = true)
 |    |    |-- PinCode: string (nullable = true)
 |    |    |-- ContactNumber: string (nullable = true)
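 |    |-- InvoiceLineItems: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- ItemCode: string (nullable = true)
 |    |    |    |-- ItemDescription: string (nullable = true)
 |    |    |    |-- ItemPrice: double (nullable = true)
 |    |    |    |-- ItemQty: integer (nullable = true)
 |    |    |    |-- TotalValue: double (nullable = true)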

In [5]:
from pyspark.sql.functions import expr

# Invoice-level fields pulled out of the parsed "value" struct.
invoice_level_columns = [
    "value.InvoiceNumber",
    "value.CreatedTime",
    "value.StoreID",
    "value.PosID",
    "value.CustomerType",
    "value.PaymentMethod",
    "value.DeliveryType",
    "value.DeliveryAddress.City",
    "value.DeliveryAddress.State",
    "value.DeliveryAddress.PinCode",
]

# Explode the line-item array so each item becomes its own row, carried in a
# struct column named "LineItem" next to the invoice-level fields.
explodeDF = valueDF.selectExpr(
    *invoice_level_columns,
    "explode(value.InvoiceLineItems) as LineItem",
)

# Promote each field of the "LineItem" struct to a top-level column (in this
# order), then discard the struct itself.
limpioDF = explodeDF
for item_field in ("ItemCode", "ItemDescription", "ItemPrice", "ItemQty", "TotalValue"):
    limpioDF = limpioDF.withColumn(item_field, expr("LineItem." + item_field))
limpioDF = limpioDF.drop("LineItem")

In [None]:
# Start the streaming query that appends the flattened invoice rows as JSON
# files under "salida", checkpointing to "chk-point-dir-03" and triggering a
# micro-batch every minute.
facturaWriterQuery = (
    limpioDF.writeStream
    .format("json")
    .queryName("Facturas Kafka Writer")
    .outputMode("append")
    .option("path", "salida")
    .option("checkpointLocation", "chk-point-dir-03")
    .trigger(processingTime="1 minute")
    .start()
)

try:
    # Blocks this cell until the query terminates or the cell is interrupted.
    facturaWriterQuery.awaitTermination()
finally:
    # Interrupting the cell only unblocks the Python side; stop the query
    # explicitly so it does not keep running in the live session, where it would
    # clash with a re-run of this cell that starts a query with the same name.
    # Calling stop() on a query that has already terminated is harmless.
    facturaWriterQuery.stop()