In [32]:
import os

from pyspark.sql import SparkSession

# Cassandra credentials are taken from the environment so they do not have to
# live in the notebook; the previous hard-coded values remain as defaults so
# an unconfigured local setup keeps working.
cassandra_username = os.environ.get("CASSANDRA_USERNAME", "cassandra")
cassandra_password = os.environ.get("CASSANDRA_PASSWORD", "cassandra")

# Local SparkSession with the Kafka source and the Cassandra connector pulled
# in through spark.jars.packages.
spark = (
    SparkSession
    .builder
    .appName("Streaming from Kafka")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,com.datastax.spark:spark-cassandra-connector_2.12:3.5.0")
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .config("spark.cassandra.auth.username", cassandra_username)
    .config("spark.cassandra.auth.password", cassandra_password)
    .master("local[*]")
    .getOrCreate()
)

In [33]:
# Echo the SparkSession as the cell output to confirm it was created.
spark

In [34]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, IntegerType

# Schema of one vehicle-position JSON message: two top-level string fields
# plus a nested "VP" struct holding the position payload. Every field is
# nullable.
schema_vp = (
    StructType()
    .add("vehicle_type", StringType(), True)
    .add("nextStop", StringType(), True)
    .add(
        "VP",
        StructType()
        .add("desi", StringType(), True)
        .add("dir", StringType(), True)
        .add("oper", IntegerType(), True)
        .add("veh", IntegerType(), True)
        .add("tst", StringType(), True)
        .add("tsi", LongType(), True)
        .add("spd", DoubleType(), True)
        .add("hdg", IntegerType(), True)
        .add("lat", DoubleType(), True)
        .add("long", DoubleType(), True)
        .add("acc", DoubleType(), True)
        .add("dl", IntegerType(), True)
        .add("odo", IntegerType(), True)  # corrected here
        .add("drst", IntegerType(), True)
        .add("oday", StringType(), True)
        .add("jrn", IntegerType(), True)
        .add("line", IntegerType(), True)
        .add("start", StringType(), True)
        .add("loc", StringType(), True)
        .add("stop", IntegerType(), True)
        .add("route", StringType(), True)
        .add("occu", IntegerType(), True),
        True,
    )
)

In [35]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# One entry of a trip update's stop list: the stop id and its arrival and
# departure times, all kept as nullable strings at parse time.
stop_time_update_schema = (
    StructType()
    .add("stop_id", StringType(), True)
    .add("arrival_time", StringType(), True)
    .add("departure_time", StringType(), True)
)

# Schema of one trip-update JSON message: trip identification fields plus an
# array of per-stop updates (each element follows stop_time_update_schema).
trip_update_schema = (
    StructType()
    .add("trip_id", StringType(), True)
    .add("route_id", StringType(), True)
    .add("start_date", StringType(), True)
    .add("start_time", StringType(), True)
    .add("direction_id", IntegerType(), True)
    .add("stop_time_updates", ArrayType(stop_time_update_schema), True)
)


In [36]:
# Streaming source for vehicle-position messages: subscribes to the "vp"
# topic on the local broker and starts from the earliest available offset.
kafka_df_vp = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "vp")
    .option("startingOffsets", "earliest")
    # Kafka client settings only reach the consumer when prefixed with
    # "kafka."; the consumer-side property that disables topic auto-creation
    # is "allow.auto.create.topics". The previous unprefixed
    # "enable.auto.create.topics" option was not forwarded to the consumer.
    .option("kafka.allow.auto.create.topics", "false")
    .load()
)

In [37]:
# Streaming source for trip-update messages: subscribes to the "trip" topic
# on the local broker and starts from the earliest available offset.
kafka_df_tp = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "trip")
    .option("startingOffsets", "earliest")
    # Kafka client settings only reach the consumer when prefixed with
    # "kafka."; the consumer-side property that disables topic auto-creation
    # is "allow.auto.create.topics". The previous unprefixed
    # "enable.auto.create.topics" option was not forwarded to the consumer.
    .option("kafka.allow.auto.create.topics", "false")
    .load()
)

In [38]:
from pyspark.sql.functions import expr

# Replace the record's "value" column with its string form; the remaining
# Kafka columns are carried over unchanged.
kafka_json_df_vp = kafka_df_vp.withColumn(
    "value",
    expr("cast(value as string)"),
)

In [39]:
from pyspark.sql.functions import expr

# Replace the record's "value" column with its string form; the remaining
# Kafka columns are carried over unchanged.
kafka_json_df_tp = kafka_df_tp.withColumn(
    "value",
    expr("cast(value as string)"),
)

In [40]:
from pyspark.sql.functions import from_json, col

# Keep only the message body and parse it as JSON against schema_vp; the
# result is a single struct column named "data".
streaming_df_vp = (
    kafka_json_df_vp
    .selectExpr("CAST(value AS STRING)")
    .select(
        from_json(col("value"), schema_vp).alias("data")
    )
)

In [41]:
from pyspark.sql.functions import from_json, col

# Keep only the message body and parse it as JSON against
# trip_update_schema; the result is a single struct column named "data".
streaming_df_tp = (
    kafka_json_df_tp
    .selectExpr("CAST(value AS STRING)")
    .select(
        from_json(col("value"), trip_update_schema).alias("data")
    )
)

In [42]:
# Show the parsed vehicle-position schema (a single "data" struct column).
streaming_df_vp.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- vehicle_type: string (nullable = true)
 |    |-- nextStop: string (nullable = true)
 |    |-- VP: struct (nullable = true)
 |    |    |-- desi: string (nullable = true)
 |    |    |-- dir: string (nullable = true)
 |    |    |-- oper: integer (nullable = true)
 |    |    |-- veh: integer (nullable = true)
 |    |    |-- tst: string (nullable = true)
 |    |    |-- tsi: long (nullable = true)
 |    |    |-- spd: double (nullable = true)
 |    |    |-- hdg: integer (nullable = true)
 |    |    |-- lat: double (nullable = true)
 |    |    |-- long: double (nullable = true)
 |    |    |-- acc: double (nullable = true)
 |    |    |-- dl: integer (nullable = true)
 |    |    |-- odo: integer (nullable = true)
 |    |    |-- drst: integer (nullable = true)
 |    |    |-- oday: string (nullable = true)
 |    |    |-- jrn: integer (nullable = true)
 |    |    |-- line: integer (nullable = true)
 |    |    |-- start: string (nullable = true)
 | 

In [43]:
# Show the parsed trip-update schema (a single "data" struct column).
streaming_df_tp.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- trip_id: string (nullable = true)
 |    |-- route_id: string (nullable = true)
 |    |-- start_date: string (nullable = true)
 |    |-- start_time: string (nullable = true)
 |    |-- direction_id: integer (nullable = true)
 |    |-- stop_time_updates: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- stop_id: string (nullable = true)
 |    |    |    |-- arrival_time: string (nullable = true)
 |    |    |    |-- departure_time: string (nullable = true)



In [44]:
# Flatten the parsed message: the two top-level fields first, then every
# field nested under data.VP, each promoted to a top-level column in the
# order listed below.
flattened_df_vp = streaming_df_vp.select(
    col("data.vehicle_type"),
    col("data.nextStop"),
    *[
        col(f"data.VP.{vp_field}")
        for vp_field in (
            "desi", "dir", "oper", "veh", "tst", "tsi",
            "spd", "hdg", "lat", "long", "acc", "dl",
            "odo", "drst", "oday", "jrn", "line", "start",
            "loc", "stop", "route", "occu",
        )
    ],
)

In [45]:
# Show the flattened vehicle-position schema (one top-level column per field).
flattened_df_vp.printSchema()

root
 |-- vehicle_type: string (nullable = true)
 |-- nextStop: string (nullable = true)
 |-- desi: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- oper: integer (nullable = true)
 |-- veh: integer (nullable = true)
 |-- tst: string (nullable = true)
 |-- tsi: long (nullable = true)
 |-- spd: double (nullable = true)
 |-- hdg: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- acc: double (nullable = true)
 |-- dl: integer (nullable = true)
 |-- odo: integer (nullable = true)
 |-- drst: integer (nullable = true)
 |-- oday: string (nullable = true)
 |-- jrn: integer (nullable = true)
 |-- line: integer (nullable = true)
 |-- start: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- stop: integer (nullable = true)
 |-- route: string (nullable = true)
 |-- occu: integer (nullable = true)



In [46]:
from pyspark.sql.functions import concat_ws, col, date_format

# Build the key used to match vehicle positions with trip updates:
# "<route>_<oday>_<start as HH:mm:ss>_<dir>".
# NOTE(review): `start` is a string column, so date_format relies on Spark's
# implicit string-to-timestamp cast - confirm it holds for every value sent.
trip_id_expr = concat_ws(
    "_",
    col("route"),                           # matches route_id in TripUpdate
    col("oday"),                            # matches start_date in TripUpdate
    date_format(col("start"), "HH:mm:ss"),  # start time rendered as HH:mm:ss
    col("dir"),                             # matches direction_id in TripUpdate
)
flattened_df_vp = flattened_df_vp.withColumn("trip_id", trip_id_expr)


In [47]:
from pyspark.sql.functions import explode
# Promote the trip-level fields to top-level columns and explode the
# stop_time_updates array so each stop update gets its own row. The trip id
# is renamed to trip_id_tp.
flattened_df_tp = streaming_df_tp.select(
    col("data.trip_id").alias("trip_id_tp"),
    *[
        col(f"data.{trip_field}")
        for trip_field in ("route_id", "start_date", "start_time", "direction_id")
    ],
    explode(col("data.stop_time_updates")).alias("stop_time_update"),  # one row per array element
)

In [48]:
# Unpack the exploded stop_time_update struct into plain columns next to the
# trip-level fields. The struct column itself is not kept.
flattened_df_tp = flattened_df_tp.select(
    *["trip_id_tp", "route_id", "start_date", "start_time", "direction_id"],
    *[
        col(f"stop_time_update.{stop_field}")
        for stop_field in ("stop_id", "arrival_time", "departure_time")
    ],
)

In [49]:
# Show the vehicle-position schema again; it now includes the derived trip_id.
flattened_df_vp.printSchema()

root
 |-- vehicle_type: string (nullable = true)
 |-- nextStop: string (nullable = true)
 |-- desi: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- oper: integer (nullable = true)
 |-- veh: integer (nullable = true)
 |-- tst: string (nullable = true)
 |-- tsi: long (nullable = true)
 |-- spd: double (nullable = true)
 |-- hdg: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- acc: double (nullable = true)
 |-- dl: integer (nullable = true)
 |-- odo: integer (nullable = true)
 |-- drst: integer (nullable = true)
 |-- oday: string (nullable = true)
 |-- jrn: integer (nullable = true)
 |-- line: integer (nullable = true)
 |-- start: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- stop: integer (nullable = true)
 |-- route: string (nullable = true)
 |-- occu: integer (nullable = true)
 |-- trip_id: string (nullable = false)



In [50]:
# Show the flattened trip-update schema (one row per stop update).
flattened_df_tp.printSchema()

root
 |-- trip_id_tp: string (nullable = true)
 |-- route_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- direction_id: integer (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- departure_time: string (nullable = true)



In [61]:
# Stream-stream inner join of vehicle positions with trip updates on the
# derived trip key (trip_id on the position side, trip_id_tp on the
# trip-update side).
# NOTE(review): neither input defines a watermark in the cells shown here;
# check Spark's stream-stream join guidance on state growth before running
# this for long periods.
join_condition = flattened_df_vp["trip_id"] == flattened_df_tp["trip_id_tp"]
joined_stream = flattened_df_vp.join(flattened_df_tp, on=join_condition, how="inner")

In [62]:
# Keep only the columns needed downstream: position fields from the
# vehicle-position side first, then schedule fields from the trip-update side.
joined_df = joined_stream.select(
    *[
        flattened_df_vp[vp_name]
        for vp_name in (
            "trip_id", "vehicle_type", "nextStop", "desi", "tst",
            "dl", "spd", "lat", "long",
        )
    ],
    *[
        flattened_df_tp[tp_name]
        for tp_name in (
            "route_id", "start_date", "start_time",
            "arrival_time", "departure_time",
        )
    ],
)

In [63]:
# Show the schema of the joined stream after column selection.
joined_df.printSchema()

root
 |-- trip_id: string (nullable = false)
 |-- vehicle_type: string (nullable = true)
 |-- nextStop: string (nullable = true)
 |-- desi: string (nullable = true)
 |-- tst: string (nullable = true)
 |-- dl: integer (nullable = true)
 |-- spd: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- route_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- departure_time: string (nullable = true)



In [64]:
from pyspark.sql.functions import concat_ws, col, to_timestamp, date_format

# Convertir 'arrival_time' et 'departure_time' au format timestamp
# Convert 'arrival_time' and 'departure_time' from strings to timestamps,
# replacing each column in place, using the pattern yyyy-MM-dd'T'HH:mm:ss.
for time_column in ("arrival_time", "departure_time"):
    joined_df = joined_df.withColumn(
        time_column,
        to_timestamp(col(time_column), "yyyy-MM-dd'T'HH:mm:ss"),
    )

In [65]:
# Show the schema after the timestamp conversion (arrival_time and
# departure_time are now timestamps).
joined_df.printSchema()
# Distinct-row view of the joined stream.
# NOTE(review): df_unique is not referenced by any later cell visible here -
# confirm whether it is still needed.
df_unique = joined_df.distinct()

root
 |-- trip_id: string (nullable = false)
 |-- vehicle_type: string (nullable = true)
 |-- nextStop: string (nullable = true)
 |-- desi: string (nullable = true)
 |-- tst: string (nullable = true)
 |-- dl: integer (nullable = true)
 |-- spd: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- route_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- arrival_time: timestamp (nullable = true)
 |-- departure_time: timestamp (nullable = true)



In [66]:
# Rename "nextStop" to the all-lowercase "nextstop". The previous
# withColumn("nextstop", col("nextStop")) only behaved as a rename because
# Spark resolves column names case-insensitively by default; with
# spark.sql.caseSensitive=true it would add a second column instead.
# withColumnRenamed states the intent directly and keeps the column position.
joined_df = joined_df.withColumnRenamed("nextStop", "nextstop")

In [67]:
# Show the final schema; the next-stop column is now named "nextstop".
joined_df.printSchema()

root
 |-- trip_id: string (nullable = false)
 |-- vehicle_type: string (nullable = true)
 |-- nextstop: string (nullable = true)
 |-- desi: string (nullable = true)
 |-- tst: string (nullable = true)
 |-- dl: integer (nullable = true)
 |-- spd: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- route_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- arrival_time: timestamp (nullable = true)
 |-- departure_time: timestamp (nullable = true)



In [70]:
# Append the joined stream to the Cassandra table test.vehicle_data_st,
# checkpointing to the relative directory "chetest".
# NOTE(review): "failOnDataLoss" is documented as a Kafka *source* option;
# set on this sink it probably has no effect - confirm and, if intended,
# move it to the readStream definitions.
query = (
    joined_df.writeStream
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "test")
    .option("table", "vehicle_data_st")
    .outputMode("append")
    .option("checkpointLocation", "chetest")
    .option("failOnDataLoss", "false")
    .start()
)
try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Stop the query on the way out, so interrupting the cell
    # (KeyboardInterrupt) does not leave it running in the background.
    query.stop()

25/01/20 21:44:28 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/20 21:44:28 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/01/20 21:44:28 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.p

KeyboardInterrupt: 

In [242]:
# Debug sink: print every micro-batch of the joined stream to the console
# without truncating cell values.
query = (
    joined_df.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .start()
)
try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Stop the query on the way out, so interrupting the cell
    # (KeyboardInterrupt) does not leave it running in the background.
    query.stop()

25/01/20 14:33:47 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cd4a1bf6-9d8d-4dc7-bd77-d3c496bdf55b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/01/20 14:33:47 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/20 14:33:47 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/01/20 14:33:47 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                             

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------------+------------+--------+----+------------------------+----+-----+---------+---------+--------+----------+----------+-------------------+-------------------+
|trip_id                   |vehicle_type|nextstop|desi|tst                     |dl  |spd  |lat      |long     |route_id|start_date|start_time|arrival_time       |departure_time     |
+--------------------------+------------+--------+----+------------------------+----+-----+---------+---------+--------+----------+----------+-------------------+-------------------+
|1059_2025-01-20_14:21:00_1|bus         |1333182 |59  |2025-01-20T13:13:04.755Z|-51 |8.29 |60.250694|24.851788|1059    |2025-01-20|14:21:00  |2025-01-20 13:19:24|2025-01-20 13:21:17|
|1059_2025-01-20_14:21:00_1|bus         |1333182 |59  |2025-01-20T13:13:05.755Z|-51 |8.06 |60.250753|24.851699|1059    |2025-01-20|14:21:00  |2025-01-20 13:19:24|2025-01-2

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [33]:
# Debug sink: print every micro-batch of the flattened vehicle-position
# stream to the console without truncating cell values.
query = (
    flattened_df_vp.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .start()
)
try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Stop the query on the way out, so interrupting the cell
    # (KeyboardInterrupt) does not leave it running in the background.
    query.stop()

25/01/20 13:09:26 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-fcb0493c-c5e5-42db-9d49-3d21599ab5d3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/01/20 13:09:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/20 13:09:26 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+------------+--------+----+---+----+----+------------------------+----------+-----+----+---------+---------+-----+------+-----+----+----------+----+----+-----+---+-------+-----+----+------------------------+
|vehicle_type|nextStop|desi|dir|oper|veh |tst                     |tsi       |spd  |hdg |lat      |long     |acc  |dl    |odo  |drst|oday      |jrn |line|start|loc|stop   |route|occu|trip_id                 |
+------------+--------+----+---+----+----+------------------------+----------+-----+----+---------+---------+-----+------+-----+----+----------+----+----+-----+---+-------+-----+----+------------------------+
|bus         |1383181 |71  |2  |22  |1186|2025-01-20T10:39:28.008Z|1737369568|0.0  |60  |60.240936|24.998722|0.0  |-12   |1804 |0   |2025-01-20|1362|94  |12:35|GPS|1383181|1071 |0   |1071_2025-01-20_12:35_2 |
|bus         |6040226 |171 |1  |6   |293 |2025-01-20T10:39:28.009Z|

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [102]:
# Debug sink: print every micro-batch of the flattened vehicle-position
# stream to the console using the console sink's default formatting.
query = (
    flattened_df_vp.writeStream
    .outputMode("append")
    .format("console")
    .start()
)
try:
    # Blocks the cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Stop the query on the way out, so interrupting the cell
    # (KeyboardInterrupt) does not leave it running in the background.
    query.stop()


25/01/20 12:37:47 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-215aeec1-1906-44eb-b342-88b55327a8a9. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/01/20 12:37:47 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/20 12:37:47 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+------------+--------+----+---+----+----+--------------------+----------+-----+----+---------+---------+-----+------+-----+----+----------+----+----+-----+---+-------+-----+----+--------------------+
|vehicle_type|nextStop|desi|dir|oper| veh|                 tst|       tsi|  spd| hdg|      lat|     long|  acc|    dl|  odo|drst|      oday| jrn|line|start|loc|   stop|route|occu|             trip_id|
+------------+--------+----+---+----+----+--------------------+----------+-----+----+---------+---------+-----+------+-----+----+----------+----+----+-----+---+-------+-----+----+--------------------+
|         bus| 1383181|  71|  2|  22|1186|2025-01-20T10:39:...|1737369568|  0.0|  60|60.240936|24.998722|  0.0|   -12| 1804|   0|2025-01-20|1362|  94|12:35|GPS|1383181| 1071|   0|1071_2025-01-20_1...|
|         bus| 6040226| 171|  1|   6| 293|2025-01-20T10:39:...|1737369568|12.19| 302|60.126188|24.4

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 