Cuarto ejercicio de Spark Streaming usando el API DStream.
WordCount con ventana deslizante usando CountByValueAndWindow

In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
# Function to create and setup a new StreamingContext
def functionToCreateContext():
    # Create a local StreamingContext with two working thread and batch interval of 5 seconds
    sc = SparkContext("local[2]", "WindowedNetworkWordCount2")
    ssc = StreamingContext(sc, 5)
    
    # Mandatory set a checkpoint dir
    # http://spark.apache.org/docs/latest/streaming-programming-guide.html#checkpointing
    # Crear carpeta /checkpointDirectory2 dentro del directorio notebooks-spark o dentro del directorio Spark-Streaming-Python
    ssc.checkpoint("./checkpointDirectory4")  # set checkpoint directory
    return ssc

In [3]:
# Get StreamingContext from checkpoint data or create a new one
ssc = StreamingContext.getOrCreate(checkpointPath = "./checkpointDirectory4", setupFunc = functionToCreateContext)

In [4]:
# Create a DStream that will connect to hostname:port, like localhost:9999
# In command line -> nc -lk 9999
lines =ssc.socketTextStream("localhost", 9999)

In [5]:
# Split each line into words
words =lines.flatMap(lambda line:line.split(" "))

In [6]:
# Count each word in each batch
windowedWordCounts = words.countByValueAndWindow(30, 10)

In [None]:
# Print the first ten elements of each RDD generated in this DStream to the console
windowedWordCounts.pprint()

ssc.start() # Start the computation
ssc.awaitTermination() # Wait for the computation to terminate

-------------------------------------------
Time: 2022-06-01 11:00:10
-------------------------------------------

-------------------------------------------
Time: 2022-06-01 11:00:20
-------------------------------------------
('ejemplo', 1)
('nuevo', 1)

-------------------------------------------
Time: 2022-06-01 11:00:30
-------------------------------------------
('ejemplo', 1)
('con', 1)
('', 1)
('nuevo', 1)
('wordcount', 1)
('ventana', 1)
('deslizante', 1)

-------------------------------------------
Time: 2022-06-01 11:00:40
-------------------------------------------
('ejemplo', 1)
('con', 1)
('', 1)
('usando', 1)
('nuevo', 1)
('wordcount', 1)
('ventana', 1)
('deslizante', 1)
('countbyvalueandwindow', 1)

-------------------------------------------
Time: 2022-06-01 11:00:50
-------------------------------------------
('con', 1)
('', 2)
('usando', 1)
('streaming', 1)
('el', 1)
('api', 1)
('wordcount', 1)
('ventana', 1)
('deslizante', 1)
('countbyvalueandwindow', 1)
...

------