# Exemplo Xanelas 3
Este exemplo é semellante aos anteriores pero neste caso usamos datos de bolsa.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import string

spark = SparkSession.builder \
    .appName("xanelas-3") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

print("Versión: ",spark.version)

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/hadoop/.ivy2/cache
The jars for the packages stored in: /home/hadoop/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a8f3ad09-24c3-479c-8b74-f12633b43900;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.7 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.fi

Versión:  3.5.7


Usaremos o ficheiro **bolsa1.json**. O primeiro é crear un esquema axeitado para dito ficheiro.


In [2]:
# Definimos o esquema dos datos de entrada
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
bolsaSchema = StructType([
    StructField("CreatedTime", StringType()),
    StructField("Type", StringType()),
    StructField("Amount", IntegerType()),
    StructField("BrokerCode", StringType())
])

Deben colocarse no directorio **/datos/bolsa**. Se se coloca noutro hai que modificar a opción *path* de *readStream*.

In [3]:
# Configuramos la lectura de fichero en formato JSON
rawDF = spark.readStream \
        .format("json") \
        .option("path", "/data/bolsa") \
        .option("maxFilesPerTrigger", 1) \
        .schema(bolsaSchema) \
        .load()

rawDF.printSchema()

root
 |-- CreatedTime: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- BrokerCode: string (nullable = true)



Creamos un novo DF a partir do anterior. Ás columnas existentes engadiremos dúas novas:
- **Compras**: terá o valor da columna *Amount* se a columna *Type* ten o valor *BUY*, 0 no caso contrario.
- **Vendas**: terá o valor da columna *Amount* se a columna *Type* ten o valor *SELL*, 0 no caso contrario.


In [4]:
from pyspark.sql.functions import to_timestamp, col, expr
accionesDF = rawDF.withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss")) \
    .withColumn("Compras", expr("case when Type == 'BUY' then Amount else 0 end")) \
    .withColumn("Ventas", expr("case when Type == 'SELL' then Amount else 0 end"))

accionesDF.printSchema()

root
 |-- CreatedTime: timestamp (nullable = true)
 |-- Type: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- BrokerCode: string (nullable = true)
 |-- Compras: integer (nullable = true)
 |-- Ventas: integer (nullable = true)



Agrupamos por ventanas de **CreatedTime** de 15 minutos de duración, agregando mediante suma as columnas *compras*e *ventas*. 

In [5]:
from pyspark.sql.functions import window, sum
windowDF = accionesDF \
    .groupBy(  # col("BrokerCode"),
         window(col("CreatedTime"), "15 minutes")) \
    .agg(sum("Compras").alias("Compras"),
         sum("Ventas").alias("Ventas"))

salidaDF = windowDF.select("window.start", "window.end", "Compras", "Ventas")

Por último, iniciamos o procesamento en streaming indicando a saída a consola.

In [6]:


bolsaWriterQuery = salidaDF.writeStream \
    .format("console") \
    .outputMode("complete") \
    .start()

bolsaWriterQuery.awaitTermination()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+-------------------+-------+------+
|              start|                end|Compras|Ventas|
+-------------------+-------------------+-------+------+
|2022-05-09 10:45:00|2022-05-09 11:00:00|      0|   700|
|2022-05-09 10:30:00|2022-05-09 10:45:00|    900|     0|
|2022-05-09 10:00:00|2022-05-09 10:15:00|    800|     0|
|2022-05-09 10:15:00|2022-05-09 10:30:00|    800|   400|
+-------------------+-------------------+-------+------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-------------------+-------+------+
|              start|                end|Compras|Ventas|
+-------------------+-------------------+-------+------+
|2022-05-09 10:45:00|2022-05-09 11:00:00|      0|   700|
|2022-05-09 10:30:00|2022-05-09 10:45:00|    900|     0|
|2022-05-09 10:00:00|2022-05-09 10:15:00|    800|     0|
|2022-05-09 10:15:00|2022-05-09 10:30:00|    800|   400|
|2022-05-09 12:45:00|2022-05-09 13:00:00|    150|     0|
+-------------------+-------------------+-------+------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.8/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

Hai unha serie de limitacións:
- Ao usar agregacións en operacións con xanela non se poden empregar algúns modos de saída, como xa se viu no notebook 00. En concreto, non se poden usar os modos *update* e *append*.
- Para poder usar o *sink* de tipo ficheiro é necesario o modo *append*.
- Para poder usar o modo *append* é necesario aplicar o concepto de *watermarking*, que se verá no notebook 04.d.


In [6]:
# Exemplo: Non se pode usar o modo append sen watermark.
bolsaWriterQuery = salidaDF.writeStream \
    .format("parquet") \
    .queryName("BolsaWQuery") \
    .outputMode("append") \
    .option("path", "salida") \
    .option("checkpointLocation", "chk-point-dir-caso7") \
    .trigger(processingTime="1 minute") \
    .start()

25/05/05 14:29:41 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;
Project [window#27.start AS start#42, window#27.end AS end#43, Compras#35L, Ventas#37L]
+- Aggregate [window#38], [window#38 AS window#27, sum(Compras#14) AS Compras#35L, sum(Ventas#20) AS Ventas#37L]
   +- Project [named_struct(start, knownnullable(precisetimestampconversion(((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) + 900000000) ELSE ((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) END) - 0), LongType, TimestampType)), end, knownnullable(precisetimestampconversion((((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) + 900000000) ELSE ((precisetimestampconversion(CreatedTime#8, TimestampType, LongType) - 0) % 900000000) END) - 0) + 900000000), LongType, TimestampType))) AS window#38, CreatedTime#8, Type#1, Amount#2, BrokerCode#3, Compras#14, Ventas#20]
      +- Filter isnotnull(CreatedTime#8)
         +- Project [CreatedTime#8, Type#1, Amount#2, BrokerCode#3, Compras#14, CASE WHEN (Type#1 = SELL) THEN Amount#2 ELSE 0 END AS Ventas#20]
            +- Project [CreatedTime#8, Type#1, Amount#2, BrokerCode#3, CASE WHEN (Type#1 = BUY) THEN Amount#2 ELSE 0 END AS Compras#14]
               +- Project [to_timestamp(CreatedTime#0, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false) AS CreatedTime#8, Type#1, Amount#2, BrokerCode#3]
                  +- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@4b5ec5ae,json,List(),Some(StructType(StructField(CreatedTime,StringType,true),StructField(Type,StringType,true),StructField(Amount,IntegerType,true),StructField(BrokerCode,StringType,true))),List(),None,Map(path -> entrada, maxFilesPerTrigger -> 1),None), FileSource[entrada], [CreatedTime#0, Type#1, Amount#2, BrokerCode#3]
