### Ejemplo ventanas 1
En primer lugar creamos la sesión como en los casos anteriores:


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import string

spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("StructuredWordCount") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "hdfs:///spark/logs/history") \
    .config("spark.history.fs.logDirectory", "hdfs:///spark/logs/history") \
    .getOrCreate()




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


En este ejemplo vamos a leer, nuevamente, los  datos desde un socket. Antes de nada lo ponemos en marcha con el siguiente comando:
- nc -lk 9999

Lo siguiente es poner en marcha el *stream* de lectura, esta vez activando la opción *includeTimestamp*.

In [2]:
df_lineas = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", "9999") \
    .option('includeTimestamp', 'true')\
    .load()

df_lineas.printSchema()

25/04/28 16:26:29 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [3]:
from pyspark.sql.functions import explode, split
df_palabras = df_lineas.select(
    explode(split(df_lineas.value, ' ')).alias('palabra'),
    df_lineas.timestamp)

In [4]:
from pyspark.sql.functions import window
windowed_counts = df_palabras.groupBy(
    window(df_palabras.timestamp, "2 minutes", "1 minute"), df_palabras.palabra
).count().orderBy('window')

In [5]:
query = windowed_counts \
          .writeStream \
          .outputMode("complete") \
          .format("console") \
          .queryName("consulta1") \
          .option("truncate","false") \
          .start()

25/04/28 16:26:36 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-faea51a3-067f-4afa-9bb7-e9b402ff736b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/28 16:26:36 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+-----+
|window|palabra|count|
+------+-------+-----+
+------+-------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+--------+-----+
|window                                    |palabra |count|
+------------------------------------------+--------+-----+
|{2025-04-28 16:25:00, 2025-04-28 16:27:00}|caracola|1    |
|{2025-04-28 16:25:00, 2025-04-28 16:27:00}|hola    |1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|caracola|1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|hola    |1    |
+------------------------------------------+--------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-----------+-----+
|window                                    |palabra    |count|
+------------------------------------------+-----------+-----+
|{2025-04-28 16:25:00, 2025-04-28 16:27:00}|caracola   |1    |
|{2025-04-28 16:25:00, 2025-04-28 16:27:00}|hola       |1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|cifp       |1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|caracola   |1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|carballeira|1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|desde      |1    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|hola       |2    |
|{2025-04-28 16:26:00, 2025-04-28 16:28:00}|A          |1    |
|{2025-04-28 16:27:00, 2025-04-28 16:29:00}|desde      |1    |
|{2025-04-28 16:27:00, 2025-04-28 16:29:00}|carballeira|1    |
|{2025-04-28 16:27:00, 2025-04-28 16:29:00}|cifp       |1    |
|{2025-04-28 16:27:00

In [7]:
from IPython.display import display, clear_output
from time import sleep

while True:
    clear_output(wait=True)
    display(query.status)
    display(spark.sql('SELECT * FROM consulta1').show())
    sleep(1)

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

+--------------------+-----------+-----+
|              window|    palabra|count|
+--------------------+-----------+-----+
|{2025-04-28 16:22...|     desade|    1|
|{2025-04-28 16:22...|   caracola|    1|
|{2025-04-28 16:22...|          A|    1|
|{2025-04-28 16:22...|       hola|    2|
|{2025-04-28 16:22...|Carballeira|    1|
|{2025-04-28 16:23...|       hola|    2|
|{2025-04-28 16:23...|          A|    1|
|{2025-04-28 16:23...|   caracola|    2|
|{2025-04-28 16:23...|       mola|    1|
|{2025-04-28 16:23...|Carballeira|    1|
|{2025-04-28 16:23...|     desade|    1|
|{2025-04-28 16:24...|   caracola|    1|
|{2025-04-28 16:24...|       mola|    1|
+--------------------+-----------+-----+



None

KeyboardInterrupt: 