In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Spark session shared by every cell
# below. It runs in local mode with two threads and uses 3 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("Bizum Streaming IABD")
    .master("local[2]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 3)
    .getOrCreate()
)

In [2]:
# Schema of the JSON document carried in each input message.
# "timestamp" is declared as a string here; it is converted to a real
# timestamp in a later cell.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

kafkaSchema = (
    StructType()
    .add("timestamp", StringType())
    .add("nombre", StringType())
    .add("cantidad", IntegerType())
    .add("concepto", StringType())
)

In [3]:
# Streaming source: subscribe to the "iabd-bizum" Kafka topic on the
# iabd-virtualbox broker, starting from the earliest available offsets.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "iabd-virtualbox:9092")
    .option("subscribe", "iabd-bizum")
    .option("startingOffsets", "earliest")
    .load()
)

In [4]:
# Cast the Kafka "value" column to a string and parse it as JSON using
# kafkaSchema; the parsed struct is kept under the column name "value".
from pyspark.sql.functions import from_json, col

valueDF = kafkaDF.select(
    from_json(
        col("value").cast("string"),
        kafkaSchema,
    ).alias("value")
)

In [5]:
# Inspect the parsed schema: a single struct column "value" holding the
# timestamp, nombre, cantidad and concepto fields.
valueDF.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- timestamp: string (nullable = true)
 |    |-- nombre: string (nullable = true)
 |    |-- cantidad: integer (nullable = true)
 |    |-- concepto: string (nullable = true)



In [6]:
# Flatten the parsed struct into top-level columns and convert the
# "timestamp" string column into a real timestamp (microsecond precision
# pattern).
from pyspark.sql.functions import to_timestamp

bizumDF = (
    valueDF
    .select("value.*")
    .withColumn(
        "timestamp",
        to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss.SSSSSS"),
    )
)

In [7]:
# Check that the columns are now top-level and "timestamp" has timestamp type.
bizumDF.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- nombre: string (nullable = true)
 |-- cantidad: integer (nullable = true)
 |-- concepto: string (nullable = true)



In [8]:
# NOTE: this import rebinds the names sum, max and min in the notebook
# namespace to the Spark column functions, hiding the Python builtins.
from pyspark.sql.functions import window, sum, max, min, count, avg

# Aggregate "cantidad" over 3-minute windows of the event timestamp, with a
# 30-second watermark. Note that the count is aliased "cantidad", so in the
# result that name holds the number of rows, not an amount.
windowDF = (
    bizumDF
    .withWatermark("timestamp", "30 seconds")
    .groupBy(window(col("timestamp"), "3 minutes"))
    .agg(
        max("cantidad").alias("mayor"),
        min("cantidad").alias("menor"),
        sum("cantidad").alias("total"),
        count("cantidad").alias("cantidad"),
        avg("cantidad").alias("media"),
    )
)

In [9]:
# Inspect the aggregated schema: a "window" struct (start, end) plus the
# mayor, menor, total, cantidad and media columns.
windowDF.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- mayor: integer (nullable = true)
 |-- menor: integer (nullable = true)
 |-- total: long (nullable = true)
 |-- cantidad: long (nullable = false)
 |-- media: double (nullable = true)



In [10]:
# Flatten the "window" struct into plain start/end columns placed next to
# the aggregate columns.
salidaDF = windowDF.select(
    "window.start",
    "window.end",
    "mayor",
    "menor",
    "total",
    "cantidad",
    "media",
)

In [11]:
#bizumQuery = salidaDF.writeStream \
#    .format("console") \
#    .outputMode("complete") \
#    .start()

In [12]:
import os

# Stream the windowed aggregates into the "bizum" collection of the "iabd"
# MongoDB database, in append mode, checkpointing to a local directory.
#
# The connection URI embeds the user and password, so it is read from the
# MONGODB_URI environment variable instead of being written in the notebook.
# NOTE(review): the password that was previously hard-coded in this cell
# should be rotated.
mongoUri = os.environ.get("MONGODB_URI")
if not mongoUri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "URI (mongodb+srv://<user>:<password>@<host>) before starting the query."
    )

bizumQuery = (
    salidaDF.writeStream
    .format("mongodb")
    .outputMode("append")
    .option("checkpointLocation", "chk-point-dir-bizum-kafka")
    .option("connection.uri", mongoUri)
    .option("database", "iabd")
    .option("collection", "bizum")
    .start()
)