In [None]:
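"""Per-trip aggregation of drive data.

Joins drive records to vehicle specs, then computes per trip_id:
    + average engine load (%) and average velocity
    + fuel at trip start and at trip end (litres)

Variations to aggregate over:
    + per day, as in challenge 1
    + using consecutive trip ids
"""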

In [None]:
import os
from pathlib import Path

import pandas as pd
import pyarrow.parquet as pa
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [None]:
# Root folder of the supporting data. Set the DRIVE_DATA_PATH environment
# variable to point the notebook at another checkout; when it is unset the
# original local path is used, so existing runs behave exactly as before.
data_path = os.environ.get(
    'DRIVE_DATA_PATH',
    '/Users/athulparvelikudy/Personal/ACRTA/tech-test-data/supporting-data',
)

# The five drive partitions (snappy-compressed parquet), relative to data_path.
file_name_0 = 'drive/part-00000-tid-4109877695252048813-a3139a95-1807-419c-af03-4877385b4c8c-11-1-c000.snappy.parquet'
file_name_1 = 'drive/part-00001-tid-4109877695252048813-a3139a95-1807-419c-af03-4877385b4c8c-12-1-c000.snappy.parquet'
file_name_2 = 'drive/part-00002-tid-4109877695252048813-a3139a95-1807-419c-af03-4877385b4c8c-13-1-c000.snappy.parquet'
file_name_3 = 'drive/part-00003-tid-4109877695252048813-a3139a95-1807-419c-af03-4877385b4c8c-14-1-c000.snappy.parquet'
file_name_4 = 'drive/part-00004-tid-4109877695252048813-a3139a95-1807-419c-af03-4877385b4c8c-15-1-c000.snappy.parquet'
# Vehicle specification lookup table, relative to data_path.
vehicle_file = 'vehicle.csv'

# Absolute paths as plain strings, ready to hand to the Spark readers.
vehicle_csv = str(Path(data_path) / vehicle_file)

full_file_path0 = str(Path(data_path) / file_name_0)
full_file_path1 = str(Path(data_path) / file_name_1)
full_file_path2 = str(Path(data_path) / file_name_2)
full_file_path3 = str(Path(data_path) / file_name_3)
full_file_path4 = str(Path(data_path) / file_name_4)



In [None]:
# Obtain the Spark session for this notebook (getOrCreate returns an existing
# session if there is one, otherwise it starts a new one).
spark = SparkSession.builder.appName("ReadLocalParquet").getOrCreate()

In [None]:
# Drive records: only the first parquet partition (part-00000) is loaded here.
df_0 = spark.read.parquet(full_file_path0)

# Vehicle specs: the CSV's first row is the header and column types are inferred.
df_vehicle = spark.read.csv(vehicle_csv, header=True, inferSchema=True)

In [None]:
# Expose both frames to Spark SQL under the names the queries below use.
for _view_name, _frame in (("drive_data", df_0), ("vehicle_specs", df_vehicle)):
    _frame.createOrReplaceTempView(_view_name)


In [None]:
# Attach each drive record's fuel tank capacity by joining on vehicle_spec_id.
# INNER JOIN: drive rows without a matching vehicle spec are dropped.
drive_w_specs_query = """
    SELECT
        d.trip_id,
        d.datetime,
        d.vehicle_spec_id,
        d.eng_load,
        d.velocity,
        d.fuel_level,
        v.fuel_tank_capacity
    FROM drive_data d
    INNER JOIN vehicle_specs v
        ON d.vehicle_spec_id = v.vehicle_spec_id
"""
drive_w_specs = spark.sql(drive_w_specs_query)

In [None]:
# Make the joined frame queryable from SQL, then preview its first 10 rows
# without truncating column values.
drive_w_specs.createOrReplaceTempView("drive_w_specs")

drive_w_specs.show(n=10, truncate=False)

In [None]:
# Per-trip averages: eng_load rescaled from /255 to a percentage, and velocity.
trip_agg_query = """

       SELECT
        trip_id,
        AVG(100.0 * (eng_load / 255.0)) AS average_eng_load_perc,
        AVG(velocity) AS average_velocity
    FROM drive_w_specs
    GROUP BY trip_id

    """
trip_agg = spark.sql(trip_agg_query)

# Register the aggregate for later SQL use and preview the first 10 rows.
trip_agg.createOrReplaceTempView("trip_agg")
trip_agg.show(n=10, truncate=False)

In [None]:
# Number the rows of every trip in both time directions:
#   rn_start = 1 marks the earliest record of a trip,
#   rn_end   = 1 marks the latest record of a trip.
ranked_query = """

       SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY trip_id ORDER BY datetime ASC)  AS rn_start,
        ROW_NUMBER() OVER (PARTITION BY trip_id ORDER BY datetime DESC) AS rn_end
    FROM drive_w_specs

    """
ranked = spark.sql(ranked_query)

ranked.show(n=10, truncate=False)

In [None]:
# Register the ranked rows under their own view name. The fuel query reads
# `FROM ranked`, so this is the name it needs; registering the frame as
# "drive_w_specs" instead would leave `ranked` unresolved and would overwrite
# the joined view, making a re-run of the ranking query select rn_start/rn_end
# twice.
ranked.createOrReplaceTempView("ranked")

In [None]:
# Fuel volume at the first and last record of each trip.
# fuel_level / 255.0 is treated as the fraction of the tank that is full and is
# scaled by fuel_tank_capacity. Each CASE is non-NULL only on the row numbered 1,
# so MAX simply picks that single value out of the group.
fuel_query = """

       SELECT
        trip_id,

        -- fuel at trip start
        MAX(
            CASE WHEN rn_start = 1
            THEN (fuel_level / 255.0) * fuel_tank_capacity
            END
        ) AS start_fuel_litres,

        -- fuel at trip end
        MAX(
            CASE WHEN rn_end = 1
            THEN (fuel_level / 255.0) * fuel_tank_capacity
            END
        ) AS end_fuel_litres
    FROM ranked
    GROUP BY trip_id

    """
fuel = spark.sql(fuel_query)

# Register the result for later SQL use and preview the first 10 rows.
fuel.createOrReplaceTempView("fuel")
fuel.show(n=10, truncate=False)
