In [None]:
"""Join drive data with vehicle specs and rank rows within each vehicle spec and day.

Variations to aggregate over:
    + per day, as in challenge 1
    + using consecutive trip ids

    """

In [1]:
import os
from pathlib import Path

import pandas as pd
import pyarrow.parquet as pa
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
# Root directory of the supporting data. Set the TECH_TEST_DATA_DIR environment
# variable to point somewhere else; the fallback is the original local path, so
# behaviour is unchanged when the variable is not set.
data_path = os.environ.get(
    'TECH_TEST_DATA_DIR',
    '/Users/athulparvelikudy/Personal/ACRTA/tech-test-data/supporting-data',
)

# The five drive part files share one identifier and differ only in the part
# index (0-4) and the number that follows the identifier (11-15).
drive_part_tag = 'tid-4109877695252048813-a3139a95-1807-419c-af03-4877385b4c8c'
drive_file_names = [
    f'drive/part-{part:05d}-{drive_part_tag}-{part + 11}-1-c000.snappy.parquet'
    for part in range(5)
]
# Keep the individual names that the rest of the notebook refers to.
file_name_0, file_name_1, file_name_2, file_name_3, file_name_4 = drive_file_names
vehicle_file = 'vehicle.csv'

vehicle_csv = str(Path(data_path) / vehicle_file)

# Absolute path (as a string) of every drive part file, in part order.
drive_file_paths = [str(Path(data_path) / name) for name in drive_file_names]
(
    full_file_path0,
    full_file_path1,
    full_file_path2,
    full_file_path3,
    full_file_path4,
) = drive_file_paths



In [3]:
# Build the session in two steps: configure the builder, then fetch the
# existing session for this process or create one if none exists.
session_builder = SparkSession.builder.appName("ReadLocalParquet")
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/27 18:36:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Vehicle lookup table: the CSV has a header row and column types are inferred.
vehicle_reader = spark.read.options(header="true", inferSchema="true")
df_vehicle = vehicle_reader.csv(vehicle_csv)

# Drive data: only the first parquet part file is loaded in this cell.
df_0 = spark.read.parquet(full_file_path0)

In [5]:
# Expose both DataFrames to Spark SQL under the names the queries use.
for view_name, frame in (("drive_data", df_0), ("vehicle_specs", df_vehicle)):
    frame.createOrReplaceTempView(view_name)


In [12]:
# Attach fuel_tank_capacity to every drive row by matching on vehicle_spec_id.
# The join is INNER, so drive rows without a matching spec row are dropped.
drive_w_specs_sql = """
    SELECT
        d.trip_id,
        d.datetime,
        d.vehicle_spec_id,
        d.eng_load,
        d.velocity,
        d.fuel_level,
        v.fuel_tank_capacity
    FROM drive_data d
    INNER JOIN vehicle_specs v
        ON d.vehicle_spec_id = v.vehicle_spec_id
"""
drive_w_specs = spark.sql(drive_w_specs_sql)

In [15]:
# Register the joined frame for the SQL cells that follow, then preview it.
drive_w_specs.createOrReplaceTempView("drive_w_specs")
drive_w_specs.show(10, truncate=False)

+--------------------------------+-------------------+---------------+--------+--------+----------+------------------+
|trip_id                         |datetime           |vehicle_spec_id|eng_load|velocity|fuel_level|fuel_tank_capacity|
+--------------------------------+-------------------+---------------+--------+--------+----------+------------------+
|1e20465533c545f98332ff14d5a0af22|2017-01-06 14:41:24|1000502        |210.65  |61.01   |76.0      |55                |
|afe9ae193f1544398c289a0cb3ff611e|2017-02-14 08:21:22|1000501        |210.18  |70.2    |75.0      |60                |
|2df4799783604dd9bcf6c2692a6a6bf2|2017-02-14 07:08:26|1000502        |197.89  |68.41   |59.0      |55                |
|96c2aec9925445acb430607c2cd944cf|2017-01-08 11:19:41|1000503        |207.9   |69.44   |96.68     |66                |
|472c62cc56b8421f8c8080a2d2f14f99|2017-01-06 21:30:14|1000500        |214.13  |78.5    |198.19    |47                |
|af2509615c5c4b2db121d08dfba01f34|2017-01-06 05:

In [None]:
# Number the rows of each (vehicle_spec_id, calendar day) group twice:
#   rn_start counts up from the earliest datetime, rn_end from the latest,
# so rn_start = 1 marks the first row of the day and rn_end = 1 the last.
# NOTE(review): rows with the same datetime inside one group get an arbitrary
# relative order from ROW_NUMBER -- confirm timestamps are unique per group or
# add a tie-breaker column to both ORDER BY clauses.
ranked_sql = """

    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY vehicle_spec_id, to_date(datetime)  ORDER BY datetime ASC)  AS rn_start,
        ROW_NUMBER() OVER (PARTITION BY vehicle_spec_id, to_date(datetime) ORDER BY datetime DESC) AS rn_end
    FROM drive_w_specs

    """
ranked = spark.sql(ranked_sql)
ranked.show(10, truncate=False)



+--------------------------------+-------------------+---------------+--------+--------+----------+------------------+--------+------+
|trip_id                         |datetime           |vehicle_spec_id|eng_load|velocity|fuel_level|fuel_tank_capacity|rn_start|rn_end|
+--------------------------------+-------------------+---------------+--------+--------+----------+------------------+--------+------+
|05e3f9ef5204475981edd44e5939c4a4|2017-02-01 17:04:37|1000504        |222.77  |20.0    |116.0     |66                |779     |1     |
|05e3f9ef5204475981edd44e5939c4a4|2017-02-01 17:04:32|1000504        |214.75  |55.68   |124.0     |66                |778     |2     |
|05e3f9ef5204475981edd44e5939c4a4|2017-02-01 17:04:31|1000504        |206.32  |69.41   |115.0     |66                |777     |3     |
|05e3f9ef5204475981edd44e5939c4a4|2017-02-01 17:04:26|1000504        |208.72  |70.1    |119.0     |66                |776     |4     |
|05e3f9ef5204475981edd44e5939c4a4|2017-02-01 17:04:13|1

                                                                                