# Ejemplo de Spark Streaming con sockets

1. Creamos la spark-session:

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import string

spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("StructuredWordCount") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "hdfs:///spark/logs/history") \
    .config("spark.history.fs.logDirectory", "hdfs:///spark/logs/history") \
    .getOrCreate()




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2. Creamos un dataframe a partir de un *origen* de datos. En este caso *socket* que no se recomienda en producción:

In [3]:
lines = spark \
         .readStream \
         .format("socket") \
         .option("host", "localhost") \
         .option("port",9999) \
         .load()

25/04/09 17:50:09 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


3. Generamos un nuevo dataframe que cuenta las ocurrencias de cada palabra a partir del anterior.

In [4]:
from pyspark.sql.functions import regexp_replace
# Split the lines into words
words = lines.select(
        explode(
split(lines.value, " ")
   ).alias("word")
)

 # Generate running word count
wordCounts = words.groupBy("word").count()

4. Iniciamos la consulta indicando el destino (*sink*)

In [5]:
query = wordCounts \
          .writeStream \
          .outputMode("update") \
          .format("memory") \
          .queryName("consulta1") \
          .start()


25/04/09 17:50:16 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-5986546d-bf45-4686-8955-47d5288264cf. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/09 17:50:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.

5. Mostramos

In [6]:
from IPython.display import display, clear_output
from time import sleep

while True:
    clear_output(wait=True)
    display(query.status)
    display(spark.sql('SELECT * FROM consulta1').show())
    sleep(1)

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

+----+-----+
|word|count|
+----+-----+
| que|    1|
|hola|    1|
| tal|    1|
|hola|    2|
+----+-----+



None

KeyboardInterrupt: 