In [2]:
# Build (or reuse) the Spark session for this notebook: local master with
# 3 worker threads, graceful stop of streaming on shutdown, and only 2
# shuffle partitions.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("s8a Kafka join MariaDB")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 2)
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType, StructField, StringType
# Schema of the JSON payload carried in the Kafka message value: both
# fields arrive as plain strings.
kafkaSchema = StructType(
    [StructField(fieldName, StringType()) for fieldName in ("login_id", "login_time")]
)

In [4]:
# Streaming source: subscribe to the "usuariosk" topic and start reading
# from the earliest available offset.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "iabd-virtualbox:9092")
    .option("subscribe", "usuariosk")
    .option("startingOffsets", "earliest")
    .load()
)

In [5]:
from pyspark.sql.functions import from_json, col, to_timestamp
# Cast the Kafka `value` column to a string and parse it as JSON using
# kafkaSchema; the parsed struct is kept under the alias "value".
valueDF = kafkaDF.select(
    from_json(col("value").cast("string"), kafkaSchema).alias("value")
)

In [6]:
# Flatten the parsed struct into top-level columns, then convert
# login_time from string to a timestamp using the given pattern.
loginDF = (
    valueDF
    .select("value.*")
    .withColumn(
        "login_time",
        to_timestamp(col("login_time"), "yyyy-MM-dd HH:mm:ss"),
    )
)

In [7]:
import os

# Static (batch) read of the `spark.usuarios` table over JDBC.
# Credentials are taken from the environment so real secrets never need to
# be committed in the notebook; the fallbacks keep the previous behaviour
# when the variables are not set.
jdbcUser = os.environ.get("JDBC_USER", "iabd")
jdbcPassword = os.environ.get("JDBC_PASSWORD", "iabd")

jdbcDF = (
    spark.read
    .format("jdbc")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("url", "jdbc:mysql://localhost")
    .option("dbtable", "spark.usuarios")
    .option("user", jdbcUser)
    .option("password", jdbcPassword)
    .load()
)

In [8]:
# Preview the static table (at most 20 rows, long cell values truncated).
jdbcDF.show(n=20, truncate=True)

+---+--------------+-------------------+
| id|        nombre|        ultimoLogin|
+---+--------------+-------------------+
|  1| Aitor Medrano|2022-05-16 13:36:58|
|  2|   Pedro Casas|2022-05-16 13:36:58|
|  3|  Laura García|2022-05-16 13:36:58|
|  4|Mª Josep Vidal|2022-05-16 13:36:58|
+---+--------------+-------------------+



In [10]:
# Stream-static inner join: match each login event from Kafka to its user
# row from the database, then drop the duplicated key coming from the
# streaming side.
joinType = "inner"
joinExpr = loginDF["login_id"] == jdbcDF["id"]

joinDF = (
    loginDF
    .join(jdbcDF, on=joinExpr, how=joinType)
    .drop(loginDF["login_id"])
)
joinDF.printSchema()

root
 |-- login_time: timestamp (nullable = true)
 |-- id: decimal(20,0) (nullable = true)
 |-- nombre: string (nullable = true)
 |-- ultimoLogin: timestamp (nullable = true)



In [13]:
# Reorder the columns for output: user fields first, then the login time
# of the incoming event.
resultadoDF = joinDF.select(
    col("id"),
    col("nombre"),
    col("ultimoLogin"),
    col("login_time"),
)
resultadoDF.printSchema()

root
 |-- id: decimal(20,0) (nullable = true)
 |-- nombre: string (nullable = true)
 |-- ultimoLogin: timestamp (nullable = true)
 |-- login_time: timestamp (nullable = true)



In [16]:
# Start the streaming query: print the joined rows to the console in
# "update" mode, triggering a micro-batch every minute and checkpointing
# progress under chk-point-dir.
queryWriter = (
    resultadoDF.writeStream
    .format("console")
    .outputMode("update")
    .option("checkpointLocation", "chk-point-dir")
    .trigger(processingTime="1 minute")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates. If the wait is
# interrupted (e.g. kernel interrupt) or fails, stop the query so that it
# does not keep running in the background; any exception still propagates.
try:
    queryWriter.awaitTermination()
finally:
    queryWriter.stop()