In [1]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName("Streaming Bolsa") \
        .master("local[3]") \
        .config("spark.streaming.stopGracefullyOnShutdown", "true") \
        .config("spark.sql.shuffle.partitions", 3) \
        .getOrCreate()

In [2]:
spark

In [4]:
raw_df = spark.readStream \
        .format("json") \
        .option("path", "entrada") \
        .option("maxFilesPerTrigger", 1) \
        .load()

In [5]:
raw_df.printSchema()

root
 |-- CESS: double (nullable = true)
 |-- CGST: double (nullable = true)
 |-- CashierID: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- CustomerCardNo: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- DeliveryAddress: struct (nullable = true)
 |    |-- AddressLine: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- ContactNumber: string (nullable = true)
 |    |-- PinCode: string (nullable = true)
 |    |-- State: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- InvoiceLineItems: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCode: string (nullable = true)
 |    |    |-- ItemDescription: string (nullable = true)
 |    |    |-- ItemPrice: double (nullable = true)
 |    |    |-- ItemQty: long (nullable = true)
 |    |    |-- TotalValue: double (nullable = true)
 |-- InvoiceNumber: string (nullable = true)
 |-- NumberOfItems: long (nullable = t

In [5]:
explode_df = raw_df.selectExpr("InvoiceNumber", "CreatedTime", "StoreID", "PosID", "CustomerType", "PaymentMethod", "DeliveryType", "explode(InvoiceLineItems) as LineItem")
explode_df.printSchema()

root
 |-- InvoiceNumber: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- StoreID: string (nullable = true)
 |-- PosID: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- LineItem: struct (nullable = true)
 |    |-- ItemCode: string (nullable = true)
 |    |-- ItemDescription: string (nullable = true)
 |    |-- ItemPrice: double (nullable = true)
 |    |-- ItemQty: long (nullable = true)
 |    |-- TotalValue: double (nullable = true)



In [6]:
from pyspark.sql.functions import expr
limpio_df = explode_df \
    .withColumn("ItemCode", expr("LineItem.ItemCode")) \
    .withColumn("ItemDescription", expr("LineItem.ItemDescription")) \
    .withColumn("ItemPrice", expr("LineItem.ItemPrice")) \
    .withColumn("ItemQty", expr("LineItem.ItemQty")) \
    .withColumn("TotalValue", expr("LineItem.TotalValue")) \
    .drop("LineItem")
limpio_df.printSchema()

root
 |-- InvoiceNumber: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- StoreID: string (nullable = true)
 |-- PosID: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- ItemCode: string (nullable = true)
 |-- ItemDescription: string (nullable = true)
 |-- ItemPrice: double (nullable = true)
 |-- ItemQty: long (nullable = true)
 |-- TotalValue: double (nullable = true)



In [7]:
facturaWriterQuery = limpio_df.writeStream \
    .format("json") \
    .queryName("Facturas Writer") \
    .outputMode("append") \
    .option("path", "salida") \
    .option("checkpointLocation", "chk-point-dir") \
    .trigger(processingTime="1 minute") \
    .start()

In [8]:
facturaWriterQuery.awaitTermination()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [1]:
facturaWriterQuery.stop()

NameError: name 'facturaWriterQuery' is not defined