In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType

spark = (
    SparkSession 
    .builder 
    .appName("Streaming from Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
    .master("local[*]") 
    .getOrCreate()
)


json_schema = StructType([
    StructField("id", StringType(), True),  # Le champ 'id' est une chaîne
    StructField("tripUpdate", StructType([  # Le champ 'tripUpdate' est un objet imbriqué
        StructField("trip", StructType([  # Le champ 'trip' est aussi un objet imbriqué
            StructField("tripId", StringType(), True),
            StructField("startTime", StringType(), True),
            StructField("startDate", StringType(), True),
            StructField("scheduleRelationship", StringType(), True),
            StructField("routeId", StringType(), True),
            StructField("directionId", LongType(), True)  # Utilisation de LongType pour les entiers longs
        ]), True),
        StructField("stopTimeUpdate", ArrayType(  # Le champ 'stopTimeUpdate' est un tableau
            StructType([  # Chaque élément du tableau est un objet
                StructField("stopSequence", LongType(), True),
                StructField("departure", StructType([  # Le champ 'departure' est un objet imbriqué
                    StructField("time", StringType(), True)
                ]), True),
                StructField("stopId", StringType(), True),
                StructField("scheduleRelationship", StringType(), True)
            ])
        ), True)
    ]), True),
    StructField("timestamp", StringType(), True)  # Le champ 'timestamp' est une chaîne
])


In [3]:
spark

In [4]:
kafka_df = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")  # Utiliser kafka.broker.list pour les anciennes versions de Kafka
    .option("subscribe", "tripupdate")  # Le nom du topic à lire
    .option("startingOffsets", "earliest")  # Lire depuis le début si c'est le premier démarrage
    .load()
)

In [5]:
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
#test 
kafka_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start() \
    .awaitTermination()

24/12/20 22:33:18 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-8c262c2b-3730-400f-b1fe-96af7252ad35. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/20 22:33:18 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/20 22:33:19 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/12/20 22:33:19 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/12/20 22:33:19 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/12/20 22:33:19 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+---+-----+
|key|value|
+---+-----+
+---+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+--------------------+
| key|               value|
+----+--------------------+
|NULL|{"id": "74_14674_...|
|NULL|{"id": "74_15266_...|
|NULL|{"id": "74_15266_...|
|NULL|{"id": "74_15266_...|
|NULL|{"id": "74_15266_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_15266_...|
+----+--------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+----+--------------------+
| key|               value|
+----+--------------------+
|NULL|{"id": "74_15266_...|
|NULL|{"id": "74_15626_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_14853_...|
|NULL|{"id": "74_14691_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_14942_...|
|NULL|{"id": "74_15641_...|
|NULL|{"id": "74_15627_...|
|NULL|{"id": "74_1467

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [7]:
messages_df = kafka_df.selectExpr("CAST(value AS STRING) as json")

In [8]:
json_df = messages_df.selectExpr("from_json(json, 'id STRING, tripUpdate STRUCT<trip:STRUCT<tripId:STRING, startTime:STRING, startDate:STRING, scheduleRelationship:STRING, routeId:STRING, directionId:LONG>, stopTimeUpdate:ARRAY<STRUCT<stopSequence:LONG, departure:STRUCT<time:STRING>, stopId:STRING, scheduleRelationship:STRING>>>, timestamp STRING') as parsed_data") \
    .select("parsed_data.*")

In [9]:
json_df

DataFrame[id: string, tripUpdate: struct<trip:struct<tripId:string,startTime:string,startDate:string,scheduleRelationship:string,routeId:string,directionId:bigint>,stopTimeUpdate:array<struct<stopSequence:bigint,departure:struct<time:string>,stopId:string,scheduleRelationship:string>>>, timestamp: string]

In [10]:
query = json_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()

24/12/20 22:34:45 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b02066b6-f4b5-4df4-93e6-c6bde2593011. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/20 22:34:45 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/20 22:34:45 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/12/20 22:34:45 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/12/20 22:34:45 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/12/20 22:34:45 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+--------------------+---------+
|                  id|          tripUpdate|timestamp|
+--------------------+--------------------+---------+
|74_14674_8544710_...|{{74_14674_854471...|     NULL|
|74_15266_9085747_...|{{74_15266_908574...|     NULL|
|74_15266_9085745_...|{{74_15266_908574...|     NULL|
|74_15266_9085746_...|{{74_15266_908574...|     NULL|
|74_15266_9085704_...|{{74_15266_908570...|     NULL|
|74_15627_9326657_...|{{74_15627_932665...|     NULL|
|74_15266_8882010_...|{{74_15266_888201...|     NULL|
|74_15266_9084930_...|{{74_15266_908493...|     NULL|
|74_15626_9323289_...|{{74_15626_932328...|     NULL|
|74_15627_9325981_...|{{74_15627_932598...|     NULL|
|74_15627_9326550_...|{{74_15627_932663...|     NULL|
|74_14853_8658325_...|{{74_14853_865832...|     NULL|
|74_14691_8930866_...|{{74_14691_893086...|     NULL|
|74_15627_9326551_...|{{74_15627_932655

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 