In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .appName("Ventana fija IABD WordCount") \
        .master("local[2]") \
        .config("spark.streaming.stopGracefullyOnShutdown", "true") \
        .config("spark.sql.shuffle.partitions", 3) \
        .getOrCreate()

dfLineas = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", "9999") \
    .option('includeTimestamp', 'true')\
    .load()

In [2]:
dfLineas.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [3]:
from pyspark.sql.functions import explode, split
dfPalabras = dfLineas.select(
    explode(split(dfLineas.value, ' ')).alias('palabra'),
    dfLineas.timestamp)

In [4]:
from pyspark.sql.functions import window
windowedCounts = dfPalabras.groupBy(
    window(dfPalabras.timestamp, "2 minutes"), dfPalabras.palabra
).count().orderBy('window')

In [5]:
palabrasQuery = windowedCounts.writeStream \
    .outputMode("complete") \
    .format("memory") \
    .queryName("consulta2") \
    .option('truncate', 'false')\
    .start()

In [10]:
from IPython.display import display, clear_output
display(palabrasQuery.status)
display(spark.sql('SELECT * FROM consulta2').show())

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

+--------------------+--------+-----+
|              window| palabra|count|
+--------------------+--------+-----+
|{2024-04-25 19:20...|    hola|    1|
|{2024-04-25 19:20...|  trav√©s|    1|
|{2024-04-25 19:20...|    toco|    1|
|{2024-04-25 19:20...|       a|    1|
|{2024-04-25 19:20...|     una|    1|
|{2024-04-25 19:20...|    Bebo|    1|
|{2024-04-25 19:20...|      la|    1|
|{2024-04-25 19:20...|      de|    1|
|{2024-04-25 19:20...|cacerola|    1|
|{2024-04-25 19:20...|mientras|    1|
|{2024-04-25 19:20...|cocacola|    1|
|{2024-04-25 19:20...| carcola|    1|
|{2024-04-25 19:20...|      me|    1|
|{2024-04-25 19:20...|  pirola|    1|
|{2024-04-25 19:22...|cacerola|    1|
|{2024-04-25 19:22...|cocacola|    1|
+--------------------+--------+-----+



None