In [24]:
%run "SparkApp.ipynb"

In [25]:
spark

In [26]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

schema = StructType([
    StructField("id", StringType(), True),
    StructField("vehicle", StructType([
        StructField("trip", StructType([
            StructField("tripId", StringType(), True),
            StructField("startTime", StringType(), True),
            StructField("startDate", StringType(), True),
            StructField("scheduleRelationship", StringType(), True),
            StructField("routeId", StringType(), True),
            StructField("directionId", LongType(), True),
        ]), True),
        StructField("position", StructType([
            StructField("latitude", DoubleType(), True),
            StructField("longitude", DoubleType(), True),
            StructField("bearing", DoubleType(), True),
            StructField("speed", DoubleType(), True),
        ]), True),
        StructField("currentStopSequence", LongType(), True),
        StructField("currentStatus", StringType(), True),
        StructField("timestamp", StringType(), True),
        StructField("congestionLevel", StringType(), True),
        StructField("stopId", StringType(), True),
        StructField("vehicle", StructType([
            StructField("id", StringType(), True),
            StructField("label", StringType(), True),
            StructField("licensePlate", StringType(), True),
        ]), True),
    ]), True),
])


In [27]:
kafka_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "vehicleposition") \
    .option("startingOffsets","earliest") \
    .load()

In [28]:
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [29]:
from pyspark.sql.functions import expr

kafka_json_df = kafka_df.withColumn("value", expr("cast(value as string)"))

In [30]:
from pyspark.sql.functions import from_json,col

streaming_df = kafka_json_df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data"))

In [31]:
streaming_df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- vehicle: struct (nullable = true)
 |    |    |-- trip: struct (nullable = true)
 |    |    |    |-- tripId: string (nullable = true)
 |    |    |    |-- startTime: string (nullable = true)
 |    |    |    |-- startDate: string (nullable = true)
 |    |    |    |-- scheduleRelationship: string (nullable = true)
 |    |    |    |-- routeId: string (nullable = true)
 |    |    |    |-- directionId: long (nullable = true)
 |    |    |-- position: struct (nullable = true)
 |    |    |    |-- latitude: double (nullable = true)
 |    |    |    |-- longitude: double (nullable = true)
 |    |    |    |-- bearing: double (nullable = true)
 |    |    |    |-- speed: double (nullable = true)
 |    |    |-- currentStopSequence: long (nullable = true)
 |    |    |-- currentStatus: string (nullable = true)
 |    |    |-- timestamp: string (nullable = true)
 |    |    |-- congestionLevel: string (nullable = true)

In [32]:
flattened_df = streaming_df.select(
    col("data.id").alias("id"),
    col("data.vehicle.trip.tripId").alias("tripId"),
    col("data.vehicle.trip.startTime").alias("startTime"),
    col("data.vehicle.trip.startDate").alias("startDate"),
    col("data.vehicle.trip.scheduleRelationship").alias("scheduleRelationship"),
    col("data.vehicle.trip.routeId").alias("routeId"),
    col("data.vehicle.trip.directionId").alias("directionId"),
    col("data.vehicle.position.latitude").alias("latitude"),
    col("data.vehicle.position.longitude").alias("longitude"),
    col("data.vehicle.position.bearing").alias("bearing"),
    col("data.vehicle.position.speed").alias("speed"),
    col("data.vehicle.currentStopSequence").alias("currentStopSequence"),
    col("data.vehicle.currentStatus").alias("currentStatus"),
    col("data.vehicle.timestamp").alias("timestamp"),
    col("data.vehicle.congestionLevel").alias("congestionLevel"),
    col("data.vehicle.stopId").alias("stopId"),
    col("data.vehicle.vehicle.id").alias("vehicleId"),
    col("data.vehicle.vehicle.label").alias("vehicleLabel"),
    col("data.vehicle.vehicle.licensePlate").alias("licensePlate"),
)

### Pour retard :test

In [33]:
from pyspark.sql import functions as F

In [34]:
retard_df = streaming_df.select(
    col("data.id").alias("id"),
    col("data.vehicle.trip.startTime").alias("startTime"),
    col("data.vehicle.timestamp").alias("timestamp"),
    col("data.vehicle.trip.routeId").alias("routeId"),
    col("data.vehicle.vehicle.id").alias("vehicleId"),
)
retard_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- startTime: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- routeId: string (nullable = true)
 |-- vehicleId: string (nullable = true)



In [35]:
flattened_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- tripId: string (nullable = true)
 |-- startTime: string (nullable = true)
 |-- startDate: string (nullable = true)
 |-- scheduleRelationship: string (nullable = true)
 |-- routeId: string (nullable = true)
 |-- directionId: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- bearing: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- currentStopSequence: long (nullable = true)
 |-- currentStatus: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- congestionLevel: string (nullable = true)
 |-- stopId: string (nullable = true)
 |-- vehicleId: string (nullable = true)
 |-- vehicleLabel: string (nullable = true)
 |-- licensePlate: string (nullable = true)



In [85]:
flattened_df = flattened_df.withColumn(
    "timestamp_ts", F.from_unixtime("timestamp").cast("timestamp")
)

In [86]:
flattened_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- tripId: string (nullable = true)
 |-- startTime: string (nullable = true)
 |-- startDate: string (nullable = true)
 |-- scheduleRelationship: string (nullable = true)
 |-- routeId: string (nullable = true)
 |-- directionId: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- bearing: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- currentStopSequence: long (nullable = true)
 |-- currentStatus: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- congestionLevel: string (nullable = true)
 |-- stopId: string (nullable = true)
 |-- vehicleId: string (nullable = true)
 |-- vehicleLabel: string (nullable = true)
 |-- licensePlate: string (nullable = true)
 |-- timestamp_ts: timestamp (nullable = true)



In [87]:
from pyspark.sql.functions import col, current_timestamp, unix_timestamp, to_timestamp, expr, from_unixtime

In [92]:
ret = flattened_df.withColumn(
    "startTime_ts",
    to_timestamp(expr("CONCAT(startDate, ' ', startTime)"), "yyyyMMdd HH:mm:ss")  # Conversion en timestamp
)


In [89]:
ret = flattened_df.withColumn(
    "processing_timestamp", current_timestamp()  # Utilisation directe de current_timestamp
)

In [94]:
ret = ret.withColumn(
    "delay_en_seconde",
    (current_timestamp().cast("long") - col("startTime_ts").cast("long"))  # Différence en secondes
)

In [95]:
ret = ret.withColumn(
    "delay_en_minutes",
    (col("delay_en_seconde") / 60).cast("int")  # Calcul en minutes
)

In [97]:
ret.printSchema()

root
 |-- id: string (nullable = true)
 |-- tripId: string (nullable = true)
 |-- startTime: string (nullable = true)
 |-- startDate: string (nullable = true)
 |-- scheduleRelationship: string (nullable = true)
 |-- routeId: string (nullable = true)
 |-- directionId: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- bearing: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- currentStopSequence: long (nullable = true)
 |-- currentStatus: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- congestionLevel: string (nullable = true)
 |-- stopId: string (nullable = true)
 |-- vehicleId: string (nullable = true)
 |-- vehicleLabel: string (nullable = true)
 |-- licensePlate: string (nullable = true)
 |-- timestamp_ts: timestamp (nullable = true)
 |-- startTime_ts: timestamp (nullable = true)
 |-- delay_en_seconde: long (nullable = true)
 |-- delay_en_minutes: integer (nullable = true)



In [98]:
from pyspark.sql.functions import col, current_timestamp, unix_timestamp


# Timestamp de traitement actuel (au moment où la donnée est lue ou traitée)
#ret = flattened_df.withColumn(
#    "processing_timestamp", current_timestamp()  # Utilisation directe de current_timestamp
#)

# Calcul du retard (en secondes) - Assure-toi que "timestamp_str" est de type timestamp
#ret = ret.withColumn(
#    "delay_en_seconde", (col("startTime_ts").cast("long") - col("timestamp_ts").cast("long"))  # Calcul de la différence en secondes
#)

#Retard

In [100]:
query = ret.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()

24/12/26 13:22:45 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b4457e58-28dd-4726-bfce-269c6127a3f0. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/26 13:22:45 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/26 13:22:45 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/12/26 13:22:45 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/12/26 13:22:45 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/12/26 13:22:45 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+--------+----------------+---------+---------+--------------------+-------+-----------+---------+---------+-------+-----------+-------------------+-------------+----------+--------------------+------+---------+-------------+------------+-------------------+-------------------+----------------+----------------+
|      id|          tripId|startTime|startDate|scheduleRelationship|routeId|directionId| latitude|longitude|bearing|      speed|currentStopSequence|currentStatus| timestamp|     congestionLevel|stopId|vehicleId| vehicleLabel|licensePlate|       timestamp_ts|       startTime_ts|delay_en_seconde|delay_en_minutes|
+--------+----------------+---------+---------+--------------------+-------+-----------+---------+---------+-------+-----------+-------------------+-------------+----------+--------------------+------+---------+-------------+------------+-------------------+-------------------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
-------------------------------------------
Batch: 2
-------------------------------------------
-------------------------------------------
Batch: 2
-------------------------------------------
-------------------------------------------
Batch: 2
-------------------------------------------
-------------------------------------------
Batch: 1
-------------------------------------------
+----+------+---------+---------+--------------------+-------+-----------+--------+---------+-------+-----+-------------------+-------------+---------+---------------+------+---------+------------+------------+------------+--------------------+----------------+-------------+
|  id|tripId|startTime|startDate|scheduleRelationship|routeId|directionId|latitude|longitude|bearing|speed|currentStopSequence|currentStatus|timestamp|congestionLevel|stopId|vehicleId|vehicleLabel|licensePlate|timestamp_ts|processing_times

In [44]:
retard = streaming_df.select(
    F.col("data.id").alias("id"),
    F.col("data.vehicle.trip.startTime").alias("startTime"),
    F.col("data.vehicle.timestamp").alias("timestamp"),
    F.col("data.vehicle.trip.routeId").alias("routeId"),
    F.col("data.vehicle.vehicle.id").alias("vehicleId"),
)

# Convertir 'startTime' (HH:mm:ss) en Timestamp en ajoutant une date fictive
retard = retard.withColumn(
    "startTime_ts", F.to_timestamp(F.concat(F.lit("1970-01-01 "), F.col("startTime")), "yyyy-MM-dd HH:mm:ss")
)

# Convertir 'timestamp' (Unix timestamp) en Timestamp
retard = retard.withColumn(
    "timestamp_ts", F.from_unixtime("timestamp").cast("timestamp")
)

# Calcul du retard en secondes
retard = retard.withColumn(
    "delay_seconds", (F.col("timestamp_ts").cast("long") - F.col("startTime_ts").cast("long"))
)

# Vérifier si le véhicule est en retard
retard = retard.withColumn(
    "isDelayed", F.when(F.col("delay_seconds") > 0, True).otherwise(False)
)

# Afficher les résultats

# Démarrer le stream et afficher dans la console
query = retard.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()

24/12/21 15:59:43 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d25f823c-2dd2-4451-b01f-bcd32df3796e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/21 15:59:43 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/21 15:59:43 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/12/21 15:59:43 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/12/21 15:59:43 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/12/21 15:59:43 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+---------+----------+--------+---------+-------------------+-------------------+-------------+---------+
|       id|startTime| timestamp| routeId|vehicleId|       startTime_ts|       timestamp_ts|delay_seconds|isDelayed|
+---------+---------+----------+--------+---------+-------------------+-------------------+-------------+---------+
| 6921_105| 15:50:00|1734790095|   76921| 6921_105|1970-01-01 15:50:00|2024-12-21 15:08:15|   1734736695|     true|
| 6990_182| 15:25:00|1734790096|  266990| 6990_182|1970-01-01 15:25:00|2024-12-21 15:08:16|   1734738196|     true|
|10299_379| 15:35:00|1734790096|  610299|10299_379|1970-01-01 15:35:00|2024-12-21 15:08:16|   1734737596|     true|
| 47374_77| 16:05:00|1734790095|40A47374| 47374_77|1970-01-01 16:05:00|2024-12-21 15:08:15|   1734735795|     true|
| 47374_39| 15:25:00|1734790095|40C47374| 47374_39|1970-01-01 15:25:00|2024-12-21 15:08:15|

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

24/12/21 16:13:42 WARN NetworkClient: [AdminClient clientId=adminclient-5] Connection to node 0 (Ala./127.0.1.1:9092) could not be established. Broker may not be available.
24/12/21 16:13:42 WARN NetworkClient: [AdminClient clientId=adminclient-4] Connection to node 0 (Ala./127.0.1.1:9092) could not be established. Broker may not be available.
24/12/21 16:13:42 WARN NetworkClient: [AdminClient clientId=adminclient-7] Connection to node 0 (Ala./127.0.1.1:9092) could not be established. Broker may not be available.
24/12/21 16:13:42 WARN NetworkClient: [AdminClient clientId=adminclient-6] Connection to node 0 (Ala./127.0.1.1:9092) could not be established. Broker may not be available.
24/12/21 16:13:42 WARN NetworkClient: [AdminClient clientId=adminclient-4] Connection to node 0 (Ala./127.0.1.1:9092) could not be established. Broker may not be available.
24/12/21 16:13:42 WARN NetworkClient: [AdminClient clientId=adminclient-5] Connection to node 0 (Ala./127.0.1.1:9092) could not be esta

In [37]:
retard_df = retard_df.withColumn(
    "startTime_ts", F.to_timestamp("startTime", "HH:mm:ss")
).withColumn(
    "timestamp_ts", F.to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss")  # Assurez-vous que le format est correct
)

In [38]:
retard_df = retard_df.withColumn(
    "delay_seconds", (F.col("timestamp_ts").cast("long") - F.col("startTime_ts").cast("long"))
)

In [39]:
retard_df = retard_df.withColumn(
    "isDelayed", F.when(F.col("delay_seconds") > 0, True).otherwise(False)
)

In [41]:
query = retard_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()

24/12/21 15:54:58 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-113f91a7-ca7c-4b62-82bb-9c1f8329988a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/21 15:54:58 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/21 15:54:58 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/12/21 15:54:58 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/12/21 15:54:58 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/12/21 15:54:58 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+---------+----------+--------+---------+-------------------+------------+-------------+---------+
|       id|startTime| timestamp| routeId|vehicleId|       startTime_ts|timestamp_ts|delay_seconds|isDelayed|
+---------+---------+----------+--------+---------+-------------------+------------+-------------+---------+
| 6921_105| 15:50:00|1734790095|   76921| 6921_105|1970-01-01 15:50:00|        NULL|         NULL|    false|
| 6990_182| 15:25:00|1734790096|  266990| 6990_182|1970-01-01 15:25:00|        NULL|         NULL|    false|
|10299_379| 15:35:00|1734790096|  610299|10299_379|1970-01-01 15:35:00|        NULL|         NULL|    false|
| 47374_77| 16:05:00|1734790095|40A47374| 47374_77|1970-01-01 16:05:00|        NULL|         NULL|    false|
| 47374_39| 15:25:00|1734790095|40C47374| 47374_39|1970-01-01 15:25:00|        NULL|         NULL|    false|
| 56920_11| 15:48:00|1734790095

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [32]:
query = congestion_analysis_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query.awaitTermination()

24/12/21 15:48:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ae36b0d6-a6cd-40dd-b6a6-4fcf2191c18a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/21 15:48:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/21 15:48:11 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/12/21 15:48:11 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/12/21 15:48:11 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/12/21 15:48:11 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+----------------+--------------------+-----+
|          tripId|     congestionLevel|count|
+----------------+--------------------+-----+
|74_14944_9085164|UNKNOWN_CONGESTIO...|    1|
|74_14944_8882051|UNKNOWN_CONGESTIO...|    1|
|74_15628_9324685|UNKNOWN_CONGESTIO...|    1|
|74_15628_9326025|UNKNOWN_CONGESTIO...|    1|
|74_15628_9329045|UNKNOWN_CONGESTIO...|    1|
|74_15628_9324674|UNKNOWN_CONGESTIO...|    1|
|74_14944_9085026|UNKNOWN_CONGESTIO...|    1|
|74_15628_9326860|UNKNOWN_CONGESTIO...|    1|
|74_15628_9183916|UNKNOWN_CONGESTIO...|    1|
|74_15628_8541501|UNKNOWN_CONGESTIO...|    1|
|74_15628_9155482|UNKNOWN_CONGESTIO...|    1|
|74_15628_9005936|UNKNOWN_CONGESTIO...|    1|
|74_15628_9329950|UNKNOWN_CONGESTIO...|    1|
|74_15628_9326069|UNKNOWN_CONGESTIO...|    1|
|74_15628_9323382|UNKNOWN_CONGESTIO...|    1|
|74_15628_9325055|UNKNOWN_CONGESTIO...|    1|
|74_14944_8881898|UNKNOWN_CON

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 