In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession that every later cell works with.
spark = (
    SparkSession.builder
    .appName("Streaming de Ficheros")
    .master("local[2]")  # local mode with two worker threads
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 3)  # few shuffle partitions for a small local run
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Explicit schema for the incoming JSON trade records. Every field is
# nullable; CreatedTime is declared as a plain string at this stage.
bolsaSchema = (
    StructType()
    .add("CreatedTime", StringType())
    .add("Type", StringType())
    .add("Amount", IntegerType())
    .add("BrokerCode", StringType())
)

In [4]:
# Streaming source: JSON files that appear in the "entrada" directory,
# read with the explicit bolsaSchema.
rawDF = (
    spark.readStream
    .schema(bolsaSchema)
    .format("json")
    .option("maxFilesPerTrigger", 1)  # consume at most one new file per micro-batch
    .option("path", "entrada")
    .load()
)

In [5]:
# Confirm the stream carries the declared schema (CreatedTime is still a string here).
rawDF.printSchema()

root
 |-- CreatedTime: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- BrokerCode: string (nullable = true)



In [6]:
from pyspark.sql.functions import col, to_timestamp, expr
# Derive the columns the aggregation needs:
#   CreatedTime -> parsed from its string form into a timestamp
#   Compras     -> Amount on BUY rows, 0 on every other row
#   Ventas      -> Amount on SELL rows, 0 on every other row
accionesDF = (
    rawDF
    .withColumn(
        "CreatedTime",
        to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"),
    )
    .withColumn("Compras", expr("case when Type == 'BUY' then Amount else 0 end"))
    .withColumn("Ventas", expr("case when Type == 'SELL' then Amount else 0 end"))
)

# Check the resulting column types.
accionesDF.printSchema()

root
 |-- CreatedTime: timestamp (nullable = true)
 |-- Type: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- BrokerCode: string (nullable = true)
 |-- Compras: integer (nullable = true)
 |-- Ventas: integer (nullable = true)



In [84]:
# Kept disabled: accionesDF is built from spark.readStream, and show() is not
# supported on a streaming DataFrame (it has to be run through writeStream).
# accionesDF.show()

In [16]:
from pyspark.sql.functions import window, sum
# Total purchases and sales per 15-minute event-time window, with a
# 10-minute watermark on CreatedTime for late data.
# `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
windowDF = (
    accionesDF
    .withWatermark("CreatedTime", "10 minutes")
    # Add col("BrokerCode") as a further grouping key to aggregate per broker.
    .groupBy(window(col("CreatedTime"), "15 minutes"))
    .agg(
        sum("Compras").alias("Compras"),
        sum("Ventas").alias("Ventas"),
    )
)

# Flatten the window struct into plain start/end columns for the sink.
salidaDF = windowDF.select(
    "window.start",
    "window.end",
    "Compras",
    "Ventas",
)

In [15]:
# Disabled alternative sink: stream salidaDF into the table "bolsaT".
# NOTE(review): it points at the same checkpoint directory ("chk-point-dir") as
# the parquet query further down; a checkpoint location should belong to one
# query only, so give this one its own directory if it is ever re-enabled.
#bolsaWriterQuery = salidaDF.writeStream \
#        .option("checkpointLocation", "chk-point-dir") \
#        .toTable("bolsaT")

In [71]:
#spark.read.table("bolsaT").show()

+-----+---+-------+------+
|start|end|Compras|Ventas|
+-----+---+-------+------+
+-----+---+-------+------+



In [73]:
#consulta = spark.sql("select * from bolsaT")
#consulta.show(3)

In [17]:
# Start the streaming query that appends the windowed totals as parquet
# files under "salida", checking for new input once per minute. Progress is
# tracked in "chk-point-dir".
bolsaWriterQuery = (
    salidaDF.writeStream
    .queryName("BolsaWQuery")
    .format("parquet")
    .outputMode("append")
    .trigger(processingTime="1 minute")
    .option("checkpointLocation", "chk-point-dir")
    .option("path", "salida")
    .start()
)

In [32]:
# Batch (non-streaming) read of the parquet files the streaming query has
# written to "salida" so far.
rawBolsaDF = (
    spark.read
    .format("parquet")
    .option("path", "salida")
    .load()
)

In [33]:
# Display the windows read back from disk. No orderBy is applied, so the row
# order is not guaranteed (the captured output is not chronological).
rawBolsaDF.show()

+-------------------+-------------------+-------+------+
|              start|                end|Compras|Ventas|
+-------------------+-------------------+-------+------+
|2022-05-09 10:00:00|2022-05-09 10:15:00|   2400|     0|
|2022-05-09 10:15:00|2022-05-09 10:30:00|   2400|  1200|
|2022-05-09 10:45:00|2022-05-09 11:00:00|      0|  2100|
|2022-05-09 10:30:00|2022-05-09 10:45:00|   2700|     0|
+-------------------+-------------------+-------+------+



In [37]:
# Import the functions module explicitly so this cell does not depend on the
# builtin-shadowing `sum` (or on `expr`) having been imported by an earlier
# cell: on a fresh kernel a bare sum("Compras") would call Python's builtin.
from pyspark.sql import Window, functions as F

# Running-total frame: every row from the first window up to the current one,
# ordered by window end. There is no partitionBy, so Spark evaluates it on a
# single partition -- acceptable for this small result set.
ventanaTotal = (
    Window.orderBy("end")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Replace the per-window totals with cumulative totals and add the net position.
# NOTE(review): this rebinds `salidaDF`, which earlier named the streaming
# DataFrame fed to writeStream; re-running the writer cell after this one would
# pick up this batch DataFrame instead. The name is kept for compatibility.
salidaDF = (
    rawBolsaDF
    .withColumn("Compras", F.sum("Compras").over(ventanaTotal))
    .withColumn("Ventas", F.sum("Ventas").over(ventanaTotal))
    .withColumn("Neto", F.expr("Compras - Ventas"))
)

salidaDF.show(truncate=False)

+-------------------+-------------------+-------+------+----+
|start              |end                |Compras|Ventas|Neto|
+-------------------+-------------------+-------+------+----+
|2022-05-09 10:00:00|2022-05-09 10:15:00|2400   |0     |2400|
|2022-05-09 10:15:00|2022-05-09 10:30:00|4800   |1200  |3600|
|2022-05-09 10:30:00|2022-05-09 10:45:00|7500   |1200  |6300|
|2022-05-09 10:45:00|2022-05-09 11:00:00|7500   |3300  |4200|
+-------------------+-------------------+-------+------+----+

