# Ejemplo Ventanas 3
Este ejemplo es similar a los anteriores, pero en este caso usamos datos de bolsa.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import string

# Create (or reuse) the Spark session for this notebook. It targets the
# master at spark-master:7077, uses the legacy time-parser policy and writes
# event logs to the HDFS history directory.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ejemplo_ventanas_3")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "hdfs:///spark/logs/history")
    .config("spark.history.fs.logDirectory", "hdfs:///spark/logs/history")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Usaremos el archivo **bolsa1.json**. Lo primero es crear un esquema apropiado para dicho archivo. 

In [2]:
# Schema of the input data: one field per attribute of the JSON records.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
bolsaSchema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("CreatedTime", StringType()),  # read as text; converted to a timestamp later
        ("Type", StringType()),
        ("Amount", IntegerType()),
        ("BrokerCode", StringType()),
    )
])

El archivo debe colocarse en el directorio **/user/jovyan/entrada**. Si se coloca en otro, hay que modificar la opción *path* de *readStream*.

In [3]:
# Streaming JSON source: reads the "entrada" directory with the explicit
# schema, with maxFilesPerTrigger set to 1.
rawDF = (
    spark.readStream
    .format("json")
    .schema(bolsaSchema)
    .option("path", "entrada")
    .option("maxFilesPerTrigger", 1)
    .load()
)

rawDF.printSchema()

root
 |-- CreatedTime: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- BrokerCode: string (nullable = true)



Creamos un nuevo DF a partir del anterior. A las columnas existentes añadiremos dos nuevas:
- **Compras**: tendrá el valor de la columna *Amount* si la columna *Type* tiene el valor *BUY*, 0 en caso contrario.
- **Ventas**: tendrá el valor de la columna *Amount* si la columna *Type* tiene el valor *SELL*, 0 en caso contrario.

In [4]:
from pyspark.sql.functions import to_timestamp, col, expr

# Convert CreatedTime from string to timestamp and derive two columns from
# Amount: Compras holds the amount of BUY rows, Ventas the amount of SELL
# rows; each is 0 for rows of the other type.
accionesDF = (
    rawDF
    .withColumn(
        "CreatedTime",
        to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"),
    )
    .withColumn("Compras", expr("case when Type == 'BUY' then Amount else 0 end"))
    .withColumn("Ventas", expr("case when Type == 'SELL' then Amount else 0 end"))
)

accionesDF.printSchema()

root
 |-- CreatedTime: timestamp (nullable = true)
 |-- Type: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- BrokerCode: string (nullable = true)
 |-- Compras: integer (nullable = true)
 |-- Ventas: integer (nullable = true)



Agrupamos por ventanas de **CreatedTime** de 15 minutos de duración, agregando mediante suma las columnas *Compras* y *Ventas*.

In [5]:
# NOTE: this import rebinds the name `sum` in the notebook namespace to the
# Spark column aggregate, shadowing Python's builtin from here on.
from pyspark.sql.functions import window, sum

# Group the rows into 15-minute windows over CreatedTime and add up the
# bought and sold amounts inside each window.
windowDF = (
    accionesDF
    # Add col("BrokerCode") as a first grouping key to aggregate per broker too.
    .groupBy(window(col("CreatedTime"), "15 minutes"))
    .agg(
        sum("Compras").alias("Compras"),
        sum("Ventas").alias("Ventas"),
    )
)

# Flatten the window struct into its start/end columns for the output.
salidaDF = windowDF.select("window.start", "window.end", "Compras", "Ventas")

Por último, iniciamos el procesamiento en streaming indicando la salida a consola.

In [8]:


# Start the streaming query; in "complete" mode the whole aggregated result
# table is written to the console sink on every batch.
bolsaWriterQuery = (
    salidaDF.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

# awaitTermination() blocks until the query ends, which in a notebook usually
# means interrupting the kernel. The finally clause stops the query in that
# case so it does not keep running in the background after the cell exits.
try:
    bolsaWriterQuery.awaitTermination()
finally:
    bolsaWriterQuery.stop()

25/05/05 14:28:22 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b74acf31-cddb-435c-8d5e-b942b019158c. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/05/05 14:28:22 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+-------------------+-------+------+
|              start|                end|Compras|Ventas|
+-------------------+-------------------+-------+------+
|2022-05-09 10:00:00|2022-05-09 10:15:00|    800|     0|
|2022-05-09 10:15:00|2022-05-09 10:30:00|    800|   400|
|2022-05-09 10:45:00|2022-05-09 11:00:00|      0|   700|
|2022-05-09 10:30:00|2022-05-09 10:45:00|    900|     0|
+-------------------+-------------------+-------+------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.9/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

Hay una serie de limitaciones: 
- Al usar agregaciones en operaciones con ventana no podemos usar algunos modos de salida, como ya se ha visto en el notebook 00. En concreto, sin *watermark* no se puede usar el modo *append* (los modos *complete* y *update* sí están permitidos).
- Para poder usar el *sink* tipo archivo es necesario el modo *append*.
- Para poder usar el modo *append* es necesario aplicar el concepto de *watermarking* que se verá en el notebook 04.d.

In [6]:
# Example: append mode cannot be used on a streaming aggregation without a
# watermark. Spark rejects the query when start() is called; the exception is
# the point of this cell, so it is caught and printed instead of aborting a
# "Run All" execution of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    bolsaWriterQuery = (
        salidaDF.writeStream
        .format("parquet")
        .queryName("BolsaWQuery")
        .outputMode("append")
        .option("path", "salida")
        .option("checkpointLocation", "chk-point-dir-caso7")
        .trigger(processingTime="1 minute")
        .start()
    )
except AnalysisException as error:
    print(f"AnalysisException: {error}")

25/05/05 14:29:41 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;
Project [window#27.start AS start#42, window#27.end AS end#43, Compras#35L, Ventas#37L]
+- Aggregate [window#38], [window#38 AS window#27, sum(Compras#14) AS Compras#35L, sum(Ventas#20) AS Ventas#37L]
   +- Project [named_struct(start, knownnullable(precisetimestampconversion(((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) + 900000000) ELSE ((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) END) - 0), LongType, TimestampType)), end, knownnullable(precisetimestampconversion((((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) + 900000000) ELSE ((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) END) - 0) + 900000000), LongType, TimestampType))) AS window#38, CreatedTime#8, Type#1, Amount#2, BrokerCode#3, Compras#14, Ventas#20]
      +- Filter isnotnull(CreatedTime#8)
         +- Project [CreatedTime#8, Type#1, Amount#2, BrokerCode#3, Compras#14, CASE WHEN (Type#1 = SELL) THEN Amount#2 ELSE 0 END AS Ventas#20]
            +- Project [CreatedTime#8, Type#1, Amount#2, BrokerCode#3, CASE WHEN (Type#1 = BUY) THEN Amount#2 ELSE 0 END AS Compras#14]
               +- Project [to_timestamp(CreatedTime#0, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false) AS CreatedTime#8, Type#1, Amount#2, BrokerCode#3]
                  +- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@4b5ec5ae,json,List(),Some(StructType(StructField(CreatedTime,StringType,true),StructField(Type,StringType,true),StructField(Amount,IntegerType,true),StructField(BrokerCode,StringType,true))),List(),None,Map(path -> entrada, maxFilesPerTrigger -> 1),None), FileSource[entrada], [CreatedTime#0, Type#1, Amount#2, BrokerCode#3]
