In [1]:
# Initialise the SparkSession object for Kafka

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
import os
import pyspark

# Work out the Maven coordinates of the Kafka connector that matches the
# installed PySpark version and, unless PYSPARK_SUBMIT_ARGS is already set,
# export them through that variable before the session is created.
scala_version = '2.12'  # TODO: Ensure this is correct
spark_version = pyspark.__version__

packages = [
    'org.apache.spark:spark-sql-kafka-0-10_' + scala_version + ':' + spark_version,
    'org.apache.kafka:kafka-clients:3.5.1',
]

args = os.environ.get('PYSPARK_SUBMIT_ARGS', '')
if args:
    # Submit args were already provided by the environment: leave them alone.
    print(f'Found existing args: {args}')
else:
    args = '--packages ' + ','.join(packages)
    print('Using packages', packages)
    os.environ['PYSPARK_SUBMIT_ARGS'] = args + ' pyspark-shell'

# Local-mode session (or the already running one, if any).
spark = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .getOrCreate()
)

Using packages ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0', 'org.apache.kafka:kafka-clients:3.5.1']


In [2]:
# Subscribe to 1 topic: open a streaming DataFrame over the Kafka topic
# "entrada" served by the broker at kafka-1:9092.
# The Kafka source always exposes a `timestamp` column, so no extra option is
# needed for it ('includeTimestamp' is a socket-source option and has no
# effect on the Kafka source).
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-1:9092")
    .option("subscribe", "entrada")
    .load()
)


In [None]:
# Print the schema of the streaming DataFrame loaded from Kafka.
df.printSchema()

In [3]:
from pyspark.sql.functions import explode, split
# Split every Kafka message into words (one output row per word) and keep the
# message timestamp next to each word.
# Kafka delivers `value` as binary, so it is cast to string explicitly instead
# of relying on an implicit cast inside split().
dfPalabras = df.select(
    explode(split(df.value.cast('string'), ' ')).alias('palabra'),
    df.timestamp,
)

In [None]:
# Print the schema of the per-word DataFrame (columns `palabra` and `timestamp`).
dfPalabras.printSchema()

In [5]:
from pyspark.sql.functions import window
# Count how many times each word appears per 2-minute window sliding every
# minute, with a 2-minute watermark on `timestamp`, ordered by window.
windowedCounts = (
    dfPalabras
    .withWatermark(eventTime="timestamp", delayThreshold="2 minutes")
    .groupBy(
        window(
            timeColumn=dfPalabras.timestamp,
            windowDuration="2 minutes",
            slideDuration="1 minutes",
        ),
        dfPalabras.palabra,
    )
    .count()
    .orderBy('window')
)


In [6]:
# Print the schema of the windowed word counts.
windowedCounts.printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- palabra: string (nullable = false)
 |-- count: long (nullable = false)



In [None]:
from pyspark.sql.functions import to_json
# Destination topic for the windowed word counts.
kafka_output_topic = "salidaw"

# Options handed to the Kafka sink.
kafka_output_config = {
    "kafka.bootstrap.servers": "kafka-1:9092",  # Put the Kafka bootstrap servers here
    "topic": kafka_output_topic,
}

# Each row is written with the word as `key` and the whole row serialised to
# JSON as `value`; the full result is emitted in "complete" output mode.
query2 = (
    windowedCounts
    .selectExpr(
        "CAST(palabra AS STRING) AS key",
        "to_json(struct(*)) AS value",
    )
    .writeStream
    .format("kafka")
    .options(**kafka_output_config)
    .option("checkpointLocation", "/home/jovyan/checkpoints")
    .outputMode("complete")
    .start()
)

In [7]:
# Start running the query that stores the running counts in the in-memory table "consulta1"
# Memory sink: the result is exposed as a table named after the query
# ("consulta1"), in "complete" output mode.
query = (
    windowedCounts.writeStream
    .format("memory")
    .queryName("consulta1")
    .outputMode("complete")
    .start()
)


In [None]:
from IPython.display import display, clear_output
from time import sleep

# Poll the streaming query every 5 seconds, refreshing the cell output with
# its status and the current contents of the in-memory table "consulta1".
# The loop ends by itself once the query is no longer active; interrupt the
# kernel to stop it earlier.
while query.isActive:
    clear_output(wait=True)
    display(query.status)
    # show() prints the table itself and returns None, so wrapping it in
    # display() only appended a stray "None" to the output.
    spark.sql('SELECT * FROM consulta1').show()
    sleep(5)

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

+--------------------+-------+-----+
|              window|palabra|count|
+--------------------+-------+-----+
|{2024-04-25 20:14...|   hola|    1|
|{2024-04-25 20:15...|   hola|    1|
+--------------------+-------+-----+



None