In [1]:
from pyspark.sql import SparkSession
spark = (
    SparkSession 
    .builder 
    .appName("Streaming from Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,com.datastax.spark:spark-cassandra-connector_2.12:3.5.0")
    .config("spark.cassandra.connection.host", "127.0.0.1")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .config("spark.cassandra.auth.username", "cassandra")
    .config("spark.cassandra.auth.password", "cassandra")
    .master("local[*]") 
    .getOrCreate()
)

25/01/30 22:08:32 WARN Utils: Your hostname, Ala resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/01/30 22:08:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ala/.ivy2/cache
The jars for the packages stored in: /home/ala/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7c2c55da-9d3f-4672-bd33-87318b8bfc55;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in cent

In [2]:
spark

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, IntegerType

schema_vp = StructType([
    StructField("vehicle_type", StringType(), True),
    StructField("nextStop", StringType(), True),
    StructField("VP", StructType([
        StructField("desi", StringType(), True),
        StructField("dir", StringType(), True),
        StructField("oper", IntegerType(), True),
        StructField("veh", IntegerType(), True),
        StructField("tst", StringType(), True),
        StructField("tsi", LongType(), True),
        StructField("spd", DoubleType(), True),
        StructField("hdg", IntegerType(), True),
        StructField("lat", DoubleType(), True),
        StructField("long", DoubleType(), True),
        StructField("acc", DoubleType(), True),
        StructField("dl", IntegerType(), True),
        StructField("odo", IntegerType(), True),  # correction ici
        StructField("drst", IntegerType(), True),
        StructField("oday", StringType(), True),
        StructField("jrn", IntegerType(), True),
        StructField("line", IntegerType(), True),
        StructField("start", StringType(), True),
        StructField("loc", StringType(), True),
        StructField("stop", IntegerType(), True),
        StructField("route", StringType(), True),
        StructField("occu", IntegerType(), True)
    ]))
])

In [4]:
kafka_df_vp = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "vp") \
    .option("startingOffsets","earliest") \
    .option("enable.auto.create.topics", "false") \
    .load()

In [5]:
from pyspark.sql.functions import expr

kafka_json_df_vp = kafka_df_vp.withColumn("value", expr("cast(value as string)"))

In [6]:
from pyspark.sql.functions import from_json, col

streaming_df_vp = kafka_json_df_vp.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema_vp).alias("data"))

In [7]:
streaming_df_vp.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- vehicle_type: string (nullable = true)
 |    |-- nextStop: string (nullable = true)
 |    |-- VP: struct (nullable = true)
 |    |    |-- desi: string (nullable = true)
 |    |    |-- dir: string (nullable = true)
 |    |    |-- oper: integer (nullable = true)
 |    |    |-- veh: integer (nullable = true)
 |    |    |-- tst: string (nullable = true)
 |    |    |-- tsi: long (nullable = true)
 |    |    |-- spd: double (nullable = true)
 |    |    |-- hdg: integer (nullable = true)
 |    |    |-- lat: double (nullable = true)
 |    |    |-- long: double (nullable = true)
 |    |    |-- acc: double (nullable = true)
 |    |    |-- dl: integer (nullable = true)
 |    |    |-- odo: integer (nullable = true)
 |    |    |-- drst: integer (nullable = true)
 |    |    |-- oday: string (nullable = true)
 |    |    |-- jrn: integer (nullable = true)
 |    |    |-- line: integer (nullable = true)
 |    |    |-- start: string (nullable = true)
 | 

In [8]:
flattened_df_vp = streaming_df_vp.select(
    col("data.vehicle_type"),
    col("data.nextStop"),
    col("data.VP.desi"),
    col("data.VP.dir"),
    col("data.VP.oper"),
    col("data.VP.veh"),
    col("data.VP.tst"),
    col("data.VP.tsi"),
    col("data.VP.spd"),
    col("data.VP.hdg"),
    col("data.VP.lat"),
    col("data.VP.long"),
    col("data.VP.acc"),
    col("data.VP.dl"),
    col("data.VP.odo"),
    col("data.VP.drst"),
    col("data.VP.oday"),
    col("data.VP.jrn"),
    col("data.VP.line"),
    col("data.VP.start"),
    col("data.VP.loc"),
    col("data.VP.stop"),
    col("data.VP.route"),
    col("data.VP.occu")
)

In [9]:
flattened_df_vp_clean = flattened_df_vp.fillna({
    'vehicle_type': 'Unknown',
    'nextStop': 'Unknown',
    'desi': 'Unknown',
    'dir': 'Unknown',
    'oper': 0,
    'veh': 0,
    'tst': 'Unknown',
    'tsi': 0,
    'spd': 0.0,
    'hdg': 0,
    'lat': 0.0,
    'long': 0.0,
    'acc': 0.0,
    'dl': 0,
    'odo': 0,
    'drst': 0,
    'oday': 'Unknown',
    'jrn': 0,
    'line': 0,
    'start': 'Unknown',
    'loc': 'Unknown',
    'stop': 0,
    'route': 'Unknown',
    'occu': 0
})

In [10]:
from pyspark.sql import functions as F

In [11]:
flattened_df_vp_clean = flattened_df_vp_clean.withColumn("vehicle_type", F.col("vehicle_type").cast(StringType())) \
    .withColumn("nextstop", F.col("nextStop").cast(StringType())) \
    .withColumn("desi", F.col("desi").cast(StringType())) \
    .withColumn("dir", F.col("dir").cast(StringType())) \
    .withColumn("oper", F.col("oper").cast(IntegerType())) \
    .withColumn("veh", F.col("veh").cast(IntegerType())) \
    .withColumn("tst", F.col("tst").cast(StringType())) \
    .withColumn("tsi", F.col("tsi").cast(LongType())) \
    .withColumn("spd", (F.col("spd") * 3.6).cast(DoubleType())) \
    .withColumn("hdg", F.col("hdg").cast(IntegerType())) \
    .withColumn("lat", F.col("lat").cast(DoubleType())) \
    .withColumn("long", F.col("long").cast(DoubleType())) \
    .withColumn("acc", F.col("acc").cast(DoubleType())) \
    .withColumn("dl", F.col("dl").cast(IntegerType())) \
    .withColumn("odo", F.col("odo").cast(IntegerType())) \
    .withColumn("drst", F.col("drst").cast(IntegerType())) \
    .withColumn("oday", F.col("oday").cast(StringType())) \
    .withColumn("jrn", F.col("jrn").cast(IntegerType())) \
    .withColumn("line", F.col("line").cast(IntegerType())) \
    .withColumn("start", F.col("start").cast(StringType())) \
    .withColumn("loc", F.col("loc").cast(StringType())) \
    .withColumn("stop", F.col("stop").cast(IntegerType())) \
    .withColumn("route", F.col("route").cast(StringType())) \
    .withColumn("occu", F.col("occu").cast(IntegerType())).select("*")

In [12]:
flattened_df_vp_clean.printSchema()

root
 |-- vehicle_type: string (nullable = false)
 |-- nextstop: string (nullable = false)
 |-- desi: string (nullable = false)
 |-- dir: string (nullable = false)
 |-- oper: integer (nullable = false)
 |-- veh: integer (nullable = false)
 |-- tst: string (nullable = false)
 |-- tsi: long (nullable = false)
 |-- spd: double (nullable = false)
 |-- hdg: integer (nullable = false)
 |-- lat: double (nullable = false)
 |-- long: double (nullable = false)
 |-- acc: double (nullable = false)
 |-- dl: integer (nullable = false)
 |-- odo: integer (nullable = false)
 |-- drst: integer (nullable = false)
 |-- oday: string (nullable = false)
 |-- jrn: integer (nullable = false)
 |-- line: integer (nullable = false)
 |-- start: string (nullable = false)
 |-- loc: string (nullable = false)
 |-- stop: integer (nullable = false)
 |-- route: string (nullable = false)
 |-- occu: integer (nullable = false)



In [13]:
# Calculer la vitesse moyenne par vehicle_type
from pyspark.sql.functions import avg
from pyspark.sql.functions import window

In [14]:
flattened_df_vp_clean = flattened_df_vp_clean.withColumn("tst", F.col("tst").cast("timestamp"))

In [15]:
flattened_df_vp_clean = flattened_df_vp_clean.filter(F.col("spd").isNotNull())

In [16]:
fleet_analysis = flattened_df_vp_clean \
    .withWatermark("tst", "10 minutes") \
    .groupBy(
        F.col("vehicle_type"),
        F.window(F.col("tst"), "5 minutes", "2 minutes")  # Fenêtre plus petite
    ) \
    .agg(
        F.avg("spd").alias("avg_speed"),
        F.count("veh").alias("active_vehicles"),
        F.max("spd").alias("max_speed"),
        F.min("spd").alias("min_speed"),
        F.avg("occu").alias("avg_occupancy")
    ) \
    .select(
        F.col("vehicle_type"),
        F.col("window.start").alias("window_start"),
        F.col("window.end").alias("window_end"),
        F.col("avg_speed"),
        F.col("active_vehicles"),
        F.col("max_speed"),
        F.col("min_speed"),
        F.col("avg_occupancy")
    )

In [17]:
fleet_analysis.printSchema()

root
 |-- vehicle_type: string (nullable = false)
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- avg_speed: double (nullable = true)
 |-- active_vehicles: long (nullable = false)
 |-- max_speed: double (nullable = true)
 |-- min_speed: double (nullable = true)
 |-- avg_occupancy: double (nullable = true)



In [18]:
analysis_types = ['avg_speed', 'active_vehicles', 'max_speed', 'min_speed', 'avg_occupancy']

In [19]:
fleet_analysis_long = fleet_analysis.select(
    F.col("vehicle_type"),
    F.col("window_start"),
    F.col("window_end"),
    F.lit("avg_speed").alias("analysis_type"),
    F.col("avg_speed").alias("metric_value")
).union(
    fleet_analysis.select(
        F.col("vehicle_type"),
        F.col("window_start"),
        F.col("window_end"),
        F.lit("active_vehicles").alias("analysis_type"),
        F.col("active_vehicles").alias("metric_value")
    )
).union(
    fleet_analysis.select(
        F.col("vehicle_type"),
        F.col("window_start"),
        F.col("window_end"),
        F.lit("max_speed").alias("analysis_type"),
        F.col("max_speed").alias("metric_value")
    )
).union(
    fleet_analysis.select(
        F.col("vehicle_type"),
        F.col("window_start"),
        F.col("window_end"),
        F.lit("min_speed").alias("analysis_type"),
        F.col("min_speed").alias("metric_value")
    )
)

In [20]:
fleet_analysis_long = fleet_analysis_long \
    .withColumnRenamed("window_start", "time_window_start") \
    .withColumnRenamed("window_end", "time_window_end")

In [21]:
fleet_analysis_long.printSchema()

root
 |-- vehicle_type: string (nullable = false)
 |-- time_window_start: timestamp (nullable = true)
 |-- time_window_end: timestamp (nullable = true)
 |-- analysis_type: string (nullable = false)
 |-- metric_value: double (nullable = true)



In [22]:
from pyspark.sql.functions import col, from_json, avg, min, max, count, desc
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType

In [24]:
flattened_df_vp_clean.printSchema()

root
 |-- vehicle_type: string (nullable = false)
 |-- nextstop: string (nullable = false)
 |-- desi: string (nullable = false)
 |-- dir: string (nullable = false)
 |-- oper: integer (nullable = false)
 |-- veh: integer (nullable = false)
 |-- tst: timestamp (nullable = true)
 |-- tsi: long (nullable = false)
 |-- spd: double (nullable = false)
 |-- hdg: integer (nullable = false)
 |-- lat: double (nullable = false)
 |-- long: double (nullable = false)
 |-- acc: double (nullable = false)
 |-- dl: integer (nullable = false)
 |-- odo: integer (nullable = false)
 |-- drst: integer (nullable = false)
 |-- oday: string (nullable = false)
 |-- jrn: integer (nullable = false)
 |-- line: integer (nullable = false)
 |-- start: string (nullable = false)
 |-- loc: string (nullable = false)
 |-- stop: integer (nullable = false)
 |-- route: string (nullable = false)
 |-- occu: integer (nullable = false)



In [25]:
df_analysis = flattened_df_vp_clean.groupBy("line").agg(
    avg(col("spd")).alias("vitesse_moyenne"),
    min(col("spd")).alias("vitesse_min"),
    max(col("spd")).alias("vitesse_max"),
    avg(col("dl")).alias("retard_moyen"),
    count(col("veh")).alias("nb_vehicules_actifs")
).orderBy(desc("nb_vehicules_actifs"))

In [34]:
df_analysis.printSchema()

root
 |-- line: integer (nullable = false)
 |-- vitesse_moyenne: double (nullable = true)
 |-- vitesse_min: double (nullable = true)
 |-- vitesse_max: double (nullable = true)
 |-- retard_moyen: double (nullable = true)
 |-- nb_vehicules_actifs: long (nullable = false)



In [36]:
df_analysis = df_analysis.withColumn("nb_vehicules_actifs", col("nb_vehicules_actifs").cast(IntegerType()))


In [37]:
df_analysis.writeStream \
    .format("org.apache.spark.sql.cassandra") \
    .option("keyspace", "test") \
    .option("table", "analyses_temps_reel") \
    .outputMode("append") \
    .option("checkpointLocation", "checkpoint/check__Line") \
    .start() \
    .awaitTermination()

25/01/30 22:20:07 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;
Project [line#603, vitesse_moyenne#987, vitesse_min#989, vitesse_max#991, retard_moyen#993, cast(nb_vehicules_actifs#995L as int) AS nb_vehicules_actifs#1224]
+- Sort [nb_vehicules_actifs#995L DESC NULLS LAST], true
   +- Aggregate [line#603], [line#603, avg(spd#353) AS vitesse_moyenne#987, min(spd#353) AS vitesse_min#989, max(spd#353) AS vitesse_max#991, avg(dl#478) AS retard_moyen#993, count(veh#278) AS nb_vehicules_actifs#995L]
      +- Filter isnotnull(spd#353)
         +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, cast(tst#303 as timestamp) AS tst#777, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, start#628, loc#653, stop#678, route#703, occu#728]
            +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, start#628, loc#653, stop#678, route#703, occu#728]
               +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, start#628, loc#653, stop#678, route#703, cast(occu#128 as int) AS occu#728]
                  +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, start#628, loc#653, stop#678, cast(route#127 as string) AS route#703, occu#128]
                     +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, start#628, loc#653, cast(stop#126 as int) AS stop#678, route#127, occu#128]
                        +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, start#628, cast(loc#125 as string) AS loc#653, stop#126, route#127, occu#128]
                           +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, line#603, cast(start#124 as string) AS start#628, loc#125, stop#126, route#127, occu#128]
                              +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, jrn#578, cast(line#123 as int) AS line#603, start#124, loc#125, stop#126, route#127, occu#128]
                                 +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, oday#553, cast(jrn#122 as int) AS jrn#578, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                    +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, drst#528, cast(oday#121 as string) AS oday#553, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                       +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, odo#503, cast(drst#120 as int) AS drst#528, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                          +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, dl#478, cast(odo#119 as int) AS odo#503, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                             +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, acc#453, cast(dl#118 as int) AS dl#478, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, long#428, cast(acc#117 as double) AS acc#453, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                   +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, lat#403, cast(long#116 as double) AS long#428, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                      +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, hdg#378, cast(lat#115 as double) AS lat#403, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                         +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, spd#353, cast(hdg#114 as int) AS hdg#378, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                            +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, tsi#328L, cast((spd#113 * 3.6) as double) AS spd#353, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                               +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, tst#303, cast(tsi#112L as bigint) AS tsi#328L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                  +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, veh#278, cast(tst#111 as string) AS tst#303, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                     +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, oper#253, cast(veh#110 as int) AS veh#278, tst#111, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                        +- Project [vehicle_type#153, nextstop#178, desi#203, dir#228, cast(oper#109 as int) AS oper#253, veh#110, tst#111, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                           +- Project [vehicle_type#153, nextstop#178, desi#203, cast(dir#108 as string) AS dir#228, oper#109, veh#110, tst#111, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                              +- Project [vehicle_type#153, nextstop#178, cast(desi#107 as string) AS desi#203, dir#108, oper#109, veh#110, tst#111, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                                 +- Project [vehicle_type#153, cast(nextStop#106 as string) AS nextstop#178, desi#107, dir#108, oper#109, veh#110, tst#111, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                                    +- Project [cast(vehicle_type#105 as string) AS vehicle_type#153, nextStop#106, desi#107, dir#108, oper#109, veh#110, tst#111, tsi#112L, spd#113, hdg#114, lat#115, long#116, acc#117, dl#118, odo#119, drst#120, oday#121, jrn#122, line#123, start#124, loc#125, stop#126, route#127, occu#128]
                                                                                       +- Project [coalesce(vehicle_type#33, cast(Unknown as string)) AS vehicle_type#105, coalesce(nextStop#34, cast(Unknown as string)) AS nextStop#106, coalesce(desi#35, cast(Unknown as string)) AS desi#107, coalesce(dir#36, cast(Unknown as string)) AS dir#108, coalesce(oper#37, cast(0 as int)) AS oper#109, coalesce(veh#38, cast(0 as int)) AS veh#110, coalesce(tst#39, cast(Unknown as string)) AS tst#111, coalesce(tsi#40L, cast(0 as bigint)) AS tsi#112L, coalesce(nanvl(spd#41, cast(null as double)), cast(0.0 as double)) AS spd#113, coalesce(hdg#42, cast(0 as int)) AS hdg#114, coalesce(nanvl(lat#43, cast(null as double)), cast(0.0 as double)) AS lat#115, coalesce(nanvl(long#44, cast(null as double)), cast(0.0 as double)) AS long#116, coalesce(nanvl(acc#45, cast(null as double)), cast(0.0 as double)) AS acc#117, coalesce(dl#46, cast(0 as int)) AS dl#118, coalesce(odo#47, cast(0 as int)) AS odo#119, coalesce(drst#48, cast(0 as int)) AS drst#120, coalesce(oday#49, cast(Unknown as string)) AS oday#121, coalesce(jrn#50, cast(0 as int)) AS jrn#122, coalesce(line#51, cast(0 as int)) AS line#123, coalesce(start#52, cast(Unknown as string)) AS start#124, coalesce(loc#53, cast(Unknown as string)) AS loc#125, coalesce(stop#54, cast(0 as int)) AS stop#126, coalesce(route#55, cast(Unknown as string)) AS route#127, coalesce(occu#56, cast(0 as int)) AS occu#128]
                                                                                          +- Project [data#31.vehicle_type AS vehicle_type#33, data#31.nextStop AS nextStop#34, data#31.VP.desi AS desi#35, data#31.VP.dir AS dir#36, data#31.VP.oper AS oper#37, data#31.VP.veh AS veh#38, data#31.VP.tst AS tst#39, data#31.VP.tsi AS tsi#40L, data#31.VP.spd AS spd#41, data#31.VP.hdg AS hdg#42, data#31.VP.lat AS lat#43, data#31.VP.long AS long#44, data#31.VP.acc AS acc#45, data#31.VP.dl AS dl#46, data#31.VP.odo AS odo#47, data#31.VP.drst AS drst#48, data#31.VP.oday AS oday#49, data#31.VP.jrn AS jrn#50, data#31.VP.line AS line#51, data#31.VP.start AS start#52, data#31.VP.loc AS loc#53, data#31.VP.stop AS stop#54, data#31.VP.route AS route#55, data#31.VP.occu AS occu#56]
                                                                                             +- Project [from_json(StructField(vehicle_type,StringType,true), StructField(nextStop,StringType,true), StructField(VP,StructType(StructField(desi,StringType,true),StructField(dir,StringType,true),StructField(oper,IntegerType,true),StructField(veh,IntegerType,true),StructField(tst,StringType,true),StructField(tsi,LongType,true),StructField(spd,DoubleType,true),StructField(hdg,IntegerType,true),StructField(lat,DoubleType,true),StructField(long,DoubleType,true),StructField(acc,DoubleType,true),StructField(dl,IntegerType,true),StructField(odo,IntegerType,true),StructField(drst,IntegerType,true),StructField(oday,StringType,true),StructField(jrn,IntegerType,true),StructField(line,IntegerType,true),StructField(start,StringType,true),StructField(loc,StringType,true),StructField(stop,IntegerType,true),StructField(route,StringType,true),StructField(occu,IntegerType,true)),true), value#29, Some(Europe/Paris)) AS data#31]
                                                                                                +- Project [cast(value#21 as string) AS value#29]
                                                                                                   +- Project [key#7, cast(value#8 as string) AS value#21, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]
                                                                                                      +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@56564a40, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@7d3fce53, [startingOffsets=earliest, kafka.bootstrap.servers=localhost:9092, subscribe=vp, enable.auto.create.topics=false], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@596078e3,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> localhost:9092, subscribe -> vp, startingOffsets -> earliest, enable.auto.create.topics -> false),None), kafka, [key#0, value#1, topic#2, partition#3, offset#4L, timestamp#5, timestampType#6]


In [26]:
fleet_analysis_long.writeStream \
    .format("org.apache.spark.sql.cassandra") \
    .option("keyspace", "test") \
    .option("table", "analyse_metrics") \
    .outputMode("append") \
    .option("checkpointLocation", "checkpoint/spark-avrge") \
    .start() \
    .awaitTermination()


df_analysis.writeStream \
    .format("org.apache.spark.sql.cassandra") \
    .option("keyspace", "test") \
    .option("table", "analyses_temps_reel") \
    .outputMode("append") \
    .option("checkpointLocation", "checkpoint/check__Line") \
    .start() \
    .awaitTermination()

25/01/30 22:10:23 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/30 22:10:24 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
ERROR:root:KeyboardInterrupt while sending command.                             
Traceback (most recent call last):
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ala/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 