In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
# Create (or reuse) the Spark session for this notebook, running in local
# mode with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Batch Bus Location from Sensor EV-BUS')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/25 07:22:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Explicit schema for the raw bus-location records: a bus identifier, a
# latitude/longitude pair and the reading's timestamp.
# The third argument of each field is its `nullable` flag.
schemaRawBusLocation = (
    StructType()
    .add('bus_id', StringType(), False)
    .add('lat', DoubleType(), True)
    .add('long', DoubleType(), True)
    .add('timestamp', TimestampType(), False)
)

In [4]:
# Load the bus-location parquet data from HDFS using the explicit schema
# declared above instead of letting Spark infer one.
parquetDF = (
    spark.read
    .schema(schemaRawBusLocation)
    .parquet('hdfs://172.18.0.4:9000/user/parallels/job/topic=bus_location')
)

# Quick sanity check: print the resolved schema, then preview the rows
# without truncating long values.
parquetDF.printSchema()
parquetDF.show(truncate=False)

root
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- bus_id: string (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+--------+---------+-----------------------+---------------------+
|lat     |long     |timestamp              |bus_id               |
+--------+---------+-----------------------+---------------------+
|-7.35209|112.72481|2022-10-19 04:36:36.998|bus_LC06S24S0M4000001|
|-7.35209|112.72484|2022-10-19 04:36:38.002|bus_LC06S24S0M4000001|
|-7.3521 |112.72488|2022-10-19 04:36:39.003|bus_LC06S24S0M4000001|
|-7.35216|112.72508|2022-10-19 04:36:40.007|bus_LC06S24S0M4000001|
|-7.35226|112.72502|2022-10-19 04:36:41.011|bus_LC06S24S0M4000001|
|-7.35237|112.72494|2022-10-19 04:36:42.012|bus_LC06S24S0M4000001|
|-7.35243|112.7249 |2022-10-19 04:36:43.015|bus_LC06S24S0M4000001|
|-7.35248|112.72489|2022-10-19 04:36:44.019|bus_LC06S24S0M4000001|
|-7.35258|112.72486|2022-10-19 04:36:45.023|bus_LC06S24S0M4000001|
|-7.3526 |112.72485|2022-10-19 04:36:46.027|bus_LC06S24S0M4000001|
|-7.35277|112.72481|2022-10-19 04:36:47.028|bus_LC06S24S0M4000001|
|-7.35287|112.72478|2022-10-19 04:36:48.031|bus_LC06S24S0M4000

                                                                                

In [None]:
# Derive the calendar date of every reading so distances can be totalled per day.
query = parquetDF.withColumn('date', to_date(col('timestamp')))

# Window used to look up each reading's predecessor with lag().
# - Partition by bus AND date, so a reading is only ever paired with an
#   earlier reading of the same bus on the same day.
# - Order by timestamp, so "previous row" means "previous reading in time".
#   Ordering by 'date' (constant inside a partition) leaves the row order
#   undefined and makes lag() return arbitrary rows.
# NOTE: the name `window` shadows pyspark.sql.functions.window brought in by
# the star import; it is kept because later cells refer to it.
window = Window.partitionBy('bus_id', 'date').orderBy('timestamp')

def dist(long_x, lat_x, long_y, lat_y, radius_km=6371.0):
    """Build a Column holding the great-circle distance between two points.

    The distance is computed with the spherical law of cosines and rounded
    to 4 decimal places.

    Parameters
    ----------
    long_x, lat_x : str or Column
        Longitude and latitude of the first point, in degrees. A string is
        interpreted as a column name.
    long_y, lat_y : str or Column
        Longitude and latitude of the second point, in degrees. A string is
        interpreted as a column name.
    radius_km : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371.0.

    Returns
    -------
    Column
        0.0 when both points have identical coordinates, otherwise the
        rounded distance. If a coordinate is null (e.g. a lag() value on the
        first row of a window) the comparison is null, the `otherwise`
        branch is taken and the result is null.
    """
    def _as_column(value):
        # Accept either a column name or an already-built Column expression.
        return col(value) if isinstance(value, str) else value

    lon_a, lat_a = _as_column(long_x), _as_column(lat_x)
    lon_b, lat_b = _as_column(long_y), _as_column(lat_y)

    # Identical points are short-circuited to exactly 0.0 so that rounding
    # noise in the trigonometric expression never reaches acos().
    same_point = (lat_a == lat_b) & (lon_a == lon_b)

    # Cosine of the central angle between the two points. `radians` is used
    # throughout; `toRadians` is the deprecated spelling of the same function.
    cos_angle = (
        sin(radians(lat_a)) * sin(radians(lat_b))
        + cos(radians(lat_a)) * cos(radians(lat_b))
        * cos(radians(lon_a) - radians(lon_b))
    )

    # `round` here is pyspark.sql.functions.round (star import), not the builtin.
    return when(same_point, lit(0.0)).otherwise(
        round(acos(cos_angle) * lit(radius_km), 4)
    )

# For every reading, measure the distance from the previous row in the
# window (lag with offset 1) to the current position and store it in "dist".
# The result is sorted by timestamp and cached because the aggregation
# cells below reuse it.
distance = (
    query
    .withColumn(
        "dist",
        dist(
            "long",
            "lat",
            lag("long", 1).over(window),
            lag("lat", 1).over(window),
        ).alias("dist"),
    )
    .orderBy('timestamp')
    .cache()
)

In [None]:
# Total distance travelled per bus and per day, sorted by date.
# Despite the "miles" in the variable name, the aggregated column is
# aliased 'day_km'. `sum` is pyspark.sql.functions.sum (star import),
# not the Python builtin.
total_miles_per_day = (
    distance
    .groupBy('bus_id', 'date')
    .agg(sum("dist").alias('day_km'))
    .orderBy('date')
)

total_miles_per_day.printSchema()
total_miles_per_day.show(truncate=False)

In [None]:
# Roll the daily totals up into one overall figure per bus ('total_km').
# `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
total_miles_per_bus = (
    total_miles_per_day
    .groupBy('bus_id')
    .agg(sum('day_km').alias('total_km'))
)

total_miles_per_bus.show()