In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import string

spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("StructuredWordCount") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "hdfs:///spark/logs/history") \
    .config("spark.history.fs.logDirectory", "hdfs:///spark/logs/history") \
    .getOrCreate()





Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = spark \
         .readStream \
         .format("socket") \
         .option("host", "localhost") \
         .option("port",9999) \
         .load()


25/04/07 09:48:39 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [3]:
from pyspark.sql.functions import regexp_replace
# Split the lines into words
words = lines.select(
        explode(
split(lines.value, " ")
   ).alias("word")
)

 # Generate running word count
wordCounts = words.groupBy("word").count()

In [4]:
 # Start running the query that prints the running counts to the console
query = wordCounts \
          .writeStream \
          .outputMode("complete") \
          .format("memory") \
          .queryName("consulta1") \
          .start()



25/04/07 09:48:50 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-06400db8-bdc4-44ae-ab53-2177ac553a47. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/07 09:48:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
[Stage 1:>                                                        (0 + 2) / 200]

In [5]:
print(lines.isStreaming)

True


[Stage 1:=>                                                       (5 + 2) / 200]

In [6]:
# Start running the query that prints the running counts to the console
query = wordCounts \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

#query.awaitTermination()


25/04/07 09:39:04 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-8ec9c026-e5a5-4987-bcc5-11ae9d32909f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/07 09:39:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

In [5]:
from IPython.display import display, clear_output
from time import sleep

while True:
    clear_output(wait=True)
    display(query.status)
    display(spark.sql('SELECT * FROM consulta1').show())
    sleep(1)

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

+--------+-----+
|    word|count|
+--------+-----+
|    otra|    1|
|     vez|    1|
|probando|    1|
|       o|    1|
|      va|    1|
|     que|    1|
|    hola|    3|
|      no|    1|
|  parece|    1|
+--------+-----+



None

KeyboardInterrupt: 

In [None]:

# display(query.status)
# display(spark.sql('SELECT * FROM consulta1').show())
print (query.status)
print (spark.sql('SELECT * FROM consulta1').show())

In [None]:
from pyspark.sql.functions import to_json
kafka_output_topic = "salida"

kafka_output_config = {
    "kafka.bootstrap.servers": "kafka-1:9092",  # Coloca aquí los servidores de arranque de Kafka
    "topic": kafka_output_topic
}

query = wordCounts \
    .selectExpr("CAST(word AS STRING) AS key", "CAST(count AS STRING) AS value") \
    .writeStream \
    .format("kafka") \
    .outputMode("complete") \
    .options(**kafka_output_config) \
    .start()
