In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession for this notebook: local master with two
# worker threads, three shuffle partitions, and schema inference switched on
# for streaming sources so the file reader needs no explicit schema.
spark = (
    SparkSession.builder
    .appName("Streaming de Ficheros")
    .master("local[2]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 3)
    .config("spark.sql.streaming.schemaInference", "true")
    .getOrCreate()
)

# Streaming source: JSON files picked up from the "entrada" directory.
raw_df = (
    spark.readStream
    .format("json")
    .option("path", "entrada")
    .load()
)

# Print the schema of the incoming invoice records.
raw_df.printSchema()
# root
#  |-- CESS: double (nullable = true)
#  |-- CGST: double (nullable = true)
#  |-- CashierID: string (nullable = true)
#  |-- CreatedTime: long (nullable = true)
#  |-- CustomerCardNo: string (nullable = true)
#  |-- CustomerType: string (nullable = true)
#  |-- DeliveryAddress: struct (nullable = true)
#  |    |-- AddressLine: string (nullable = true)
#  |    |-- City: string (nullable = true)
#  |    |-- ContactNumber: string (nullable = true)
#  |    |-- PinCode: string (nullable = true)
#  |    |-- State: string (nullable = true)
#  |-- DeliveryType: string (nullable = true)
#  |-- InvoiceLineItems: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- ItemCode: string (nullable = true)
#  |    |    |-- ItemDescription: string (nullable = true)
#  |    |    |-- ItemPrice: double (nullable = true)
#  |    |    |-- ItemQty: long (nullable = true)
#  |    |    |-- TotalValue: double (nullable = true)
#  |-- InvoiceNumber: string (nullable = true)
#  |-- NumberOfItems: long (nullable = true)
#  |-- PaymentMethod: string (nullable = true)
#  |-- PosID: string (nullable = true)
#  |-- SGST: double (nullable = true)
#  |-- StoreID: string (nullable = true)
#  |-- TaxableAmount: double (nullable = true)
#  |-- TotalAmount: double (nullable = true)


root
 |-- CESS: double (nullable = true)
 |-- CGST: double (nullable = true)
 |-- CashierID: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- CustomerCardNo: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- DeliveryAddress: struct (nullable = true)
 |    |-- AddressLine: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- ContactNumber: string (nullable = true)
 |    |-- PinCode: string (nullable = true)
 |    |-- State: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- InvoiceLineItems: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCode: string (nullable = true)
 |    |    |-- ItemDescription: string (nullable = true)
 |    |    |-- ItemPrice: double (nullable = true)
 |    |    |-- ItemQty: long (nullable = true)
 |    |    |-- TotalValue: double (nullable = true)
 |-- InvoiceNumber: string (nullable = true)
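 |-- NumberOfItems: long (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- PosID: string (nullable = true)
 |-- SGST: double (nullable = true)
 |-- StoreID: string (nullable = true)
 |-- TaxableAmount: double (nullable = true)
 |-- TotalAmount: double (nullable = true)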

In [2]:
# Bare expression: the notebook shows the SparkSession object as this cell's output.
spark

In [3]:
# Keep the invoice-level columns and turn each element of InvoiceLineItems
# into its own row, exposed as the struct column "LineItem".
explode_df = raw_df.selectExpr(
    "InvoiceNumber",
    "CreatedTime",
    "StoreID",
    "PosID",
    "CustomerType",
    "PaymentMethod",
    "DeliveryType",
    "explode(InvoiceLineItems) as LineItem",
)

explode_df.printSchema()
# root
#  |-- InvoiceNumber: string (nullable = true)
#  |-- CreatedTime: long (nullable = true)
#  |-- StoreID: string (nullable = true)
#  |-- PosID: string (nullable = true)
#  |-- CustomerType: string (nullable = true)
#  |-- PaymentMethod: string (nullable = true)
#  |-- DeliveryType: string (nullable = true)
#  |-- LineItem: struct (nullable = true)
#  |    |-- ItemCode: string (nullable = true)
#  |    |-- ItemDescription: string (nullable = true)
#  |    |-- ItemPrice: double (nullable = true)
#  |    |-- ItemQty: long (nullable = true)
#  |    |-- TotalValue: double (nullable = true)


root
 |-- InvoiceNumber: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- StoreID: string (nullable = true)
 |-- PosID: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- LineItem: struct (nullable = true)
 |    |-- ItemCode: string (nullable = true)
 |    |-- ItemDescription: string (nullable = true)
 |    |-- ItemPrice: double (nullable = true)
 |    |-- ItemQty: long (nullable = true)
 |    |-- TotalValue: double (nullable = true)



In [4]:
from pyspark.sql.functions import expr
# Flatten the LineItem struct: append each of its fields as a top-level
# column (after the existing columns, in this order), then discard the struct.
limpio_df = explode_df.select(
    "*",
    *[
        expr("LineItem." + item_field).alias(item_field)
        for item_field in (
            "ItemCode",
            "ItemDescription",
            "ItemPrice",
            "ItemQty",
            "TotalValue",
        )
    ],
).drop("LineItem")

limpio_df.printSchema()
# root
#  |-- InvoiceNumber: string (nullable = true)
#  |-- CreatedTime: long (nullable = true)
#  |-- StoreID: string (nullable = true)
#  |-- PosID: string (nullable = true)
#  |-- CustomerType: string (nullable = true)
#  |-- PaymentMethod: string (nullable = true)
#  |-- DeliveryType: string (nullable = true)
#  |-- ItemCode: string (nullable = true)
#  |-- ItemDescription: string (nullable = true)
#  |-- ItemPrice: double (nullable = true)
#  |-- ItemQty: long (nullable = true)
#  |-- TotalValue: double (nullable = true)


root
 |-- InvoiceNumber: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- StoreID: string (nullable = true)
 |-- PosID: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- ItemCode: string (nullable = true)
 |-- ItemDescription: string (nullable = true)
 |-- ItemPrice: double (nullable = true)
 |-- ItemQty: long (nullable = true)
 |-- TotalValue: double (nullable = true)



In [5]:
# Start the streaming sink: flattened line items are appended as JSON files
# under "salida", with progress tracked in the checkpoint directory and a
# micro-batch triggered every minute.
facturaWriterQuery = (
    limpio_df.writeStream
    .format("json")
    .queryName("Facturas Writer")
    .outputMode("append")
    .option("path", "salida")
    .option("checkpointLocation", "chk-point-dir-caso2")
    .trigger(processingTime="1 minute")
    .start()
)


In [None]:
# Block this cell until the streaming query terminates (stopped or failed).
facturaWriterQuery.awaitTermination()

In [None]:
# Print the query's physical plan.
# NOTE(review): the previous cell blocks on awaitTermination(), so under
# "Run All" this cell is only reached after the query has stopped — confirm
# whether it is meant to be run manually while the query is still active.
facturaWriterQuery.explain()